From 6a3c7f5c87854e948c3c234e5f5e745c7c553722 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov
Date: Thu, 28 May 2020 11:05:13 +0300
Subject: [PATCH 001/151] btrfs: don't balance btree inode pages from buffered
 write path

The call to btrfs_btree_balance_dirty has been there since the early
days of BTRFS, when the btree was directly modified from the write
path, hence dirtied btree inode pages. With the implementation of
b888db2bd7b6 ("Btrfs: Add delayed allocation to the extent based page
tree code") 13 years ago, the btree is no longer modified from the
write path, hence there is no point in calling this function. Just
remove it.

Signed-off-by: Nikolay Borisov
Signed-off-by: David Sterba
---
 fs/btrfs/file.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b0d2c976587e..52422e2b3344 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1800,8 +1800,6 @@ again:
 		cond_resched();
 
 		balance_dirty_pages_ratelimited(inode->i_mapping);
-		if (dirty_pages < (fs_info->nodesize >> PAGE_SHIFT) + 1)
-			btrfs_btree_balance_dirty(fs_info);
 
 		pos += copied;
 		num_written += copied;

From 9e22b925985e71c6acf0dba03f9b99a56806a137 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov
Date: Fri, 3 Apr 2020 16:40:34 +0300
Subject: [PATCH 002/151] btrfs: read stripe len directly in btrfs_rmap_block

extent_map::orig_block_len contains the size of a physical stripe when
it's used to describe block groups (calculated in read_one_chunk via
calc_stripe_length or calculated in decide_stripe_size and then
assigned to extent_map::orig_block_len in create_chunk). Exploit this
fact to get the size directly rather than opencoding the calculations.

No functional changes.

Signed-off-by: Nikolay Borisov
Signed-off-by: David Sterba
---
 fs/btrfs/block-group.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index c037ef514b64..956ae6283d33 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1657,19 +1657,12 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
 		return -EIO;
 
 	map = em->map_lookup;
-	data_stripe_length = em->len;
+	data_stripe_length = em->orig_block_len;
 	io_stripe_size = map->stripe_len;
 
-	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
-		data_stripe_length = div_u64(data_stripe_length,
-					     map->num_stripes / map->sub_stripes);
-	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
-		data_stripe_length = div_u64(data_stripe_length, map->num_stripes);
-	else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
-		data_stripe_length = div_u64(data_stripe_length,
-					     nr_data_stripes(map));
+	/* For RAID5/6 adjust to a full IO stripe length */
+	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
 		io_stripe_size = map->stripe_len * nr_data_stripes(map);
-	}
 
 	buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
 	if (!buf) {

From 96f9b0f2fa01c96e90abd4e981208a341a5da66e Mon Sep 17 00:00:00 2001
From: Nikolay Borisov
Date: Fri, 3 Apr 2020 16:40:35 +0300
Subject: [PATCH 003/151] btrfs: simplify checks when adding excluded ranges

Addresses held in the 'logical' array are always guaranteed to fall
within the boundaries of the block group. That is, 'start' can never be
smaller than cache->start. This invariant follows from the way the
addresses are calculated in btrfs_rmap_block:

    stripe_nr = physical - map->stripes[i].physical;
    stripe_nr = div64_u64(stripe_nr, map->stripe_len);
    bytenr = chunk_start + stripe_nr * io_stripe_size;

I.e. it's always some IO stripe within the given chunk.
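To make the invariant concrete with made up numbers (an illustration,
not part of the patch): for a chunk starting at logical address 1GiB
with an IO stripe size of 64KiB, a physical offset three stripes into
the device yields

    stripe_nr = (3 * 64KiB) / 64KiB = 3
    bytenr    = 1GiB + 3 * 64KiB

and since stripe_nr is unsigned and counts from zero, bytenr can never
fall below the chunk (and thus block group) start.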
Exploit this invariant to simplify the body of the loop by removing the
unnecessary 'if' since its 'else' part is the one always executed.

Signed-off-by: Nikolay Borisov
Signed-off-by: David Sterba
---
 fs/btrfs/block-group.c | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 956ae6283d33..e29e9629b246 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1741,25 +1741,12 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
 			return ret;
 
 		while (nr--) {
-			u64 start, len;
-
-			if (logical[nr] > cache->start + cache->length)
-				continue;
-
-			if (logical[nr] + stripe_len <= cache->start)
-				continue;
-
-			start = logical[nr];
-			if (start < cache->start) {
-				start = cache->start;
-				len = (logical[nr] + stripe_len) - start;
-			} else {
-				len = min_t(u64, stripe_len,
-					    cache->start + cache->length - start);
-			}
+			u64 len = min_t(u64, stripe_len,
+				cache->start + cache->length - logical[nr]);
 
 			cache->bytes_super += len;
-			ret = btrfs_add_excluded_extent(fs_info, start, len);
+			ret = btrfs_add_excluded_extent(fs_info, logical[nr],
+							len);
 			if (ret) {
 				kfree(logical);
 				return ret;

From 89d7da9bc592aa6a341d00f2d949615a89bb1eb7 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn
Date: Tue, 2 Jun 2020 19:05:56 +0900
Subject: [PATCH 004/151] btrfs: get mapping tree directly from fsinfo in
 find_first_block_group

We already have an fs_info in our function parameters, there's no need
to do the maths again and get fs_info from the extent_root just to get
the mapping_tree. Instead directly grab the mapping_tree from fs_info.

Reviewed-by: Nikolay Borisov
Signed-off-by: Johannes Thumshirn
Signed-off-by: David Sterba
---
 fs/btrfs/block-group.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index e29e9629b246..bfb5f4258336 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1566,7 +1566,7 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info,
 			struct extent_map_tree *em_tree;
 			struct extent_map *em;
 
-			em_tree = &root->fs_info->mapping_tree;
+			em_tree = &fs_info->mapping_tree;
 			read_lock(&em_tree->lock);
 			em = lookup_extent_mapping(em_tree, found_key.objectid,
 						   found_key.offset);

From e3ba67a108ff5a2657990014d3ed4488bd665be6 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn
Date: Tue, 2 Jun 2020 19:05:57 +0900
Subject: [PATCH 005/151] btrfs: factor out reading of bg from
 find_first_block_group

When find_first_block_group() finds a block group item in the
extent-tree, it does a lookup of the object in the extent mapping tree
and does further checks on the item.

Factor out this step from find_first_block_group() so we can further
simplify the code.

While we're at it, we can also just return early in
find_first_block_group(), if the tree slot isn't found.
Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 102 ++++++++++++++++++++++------------------- 1 file changed, 56 insertions(+), 46 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index bfb5f4258336..11244b67434e 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1532,21 +1532,70 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg) spin_unlock(&fs_info->unused_bgs_lock); } +static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key, + struct btrfs_path *path) +{ + struct extent_map_tree *em_tree; + struct extent_map *em; + struct btrfs_block_group_item bg; + struct extent_buffer *leaf; + int slot; + u64 flags; + int ret = 0; + + slot = path->slots[0]; + leaf = path->nodes[0]; + + em_tree = &fs_info->mapping_tree; + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, key->objectid, key->offset); + read_unlock(&em_tree->lock); + if (!em) { + btrfs_err(fs_info, + "logical %llu len %llu found bg but no related chunk", + key->objectid, key->offset); + return -ENOENT; + } + + if (em->start != key->objectid || em->len != key->offset) { + btrfs_err(fs_info, + "block group %llu len %llu mismatch with chunk %llu len %llu", + key->objectid, key->offset, em->start, em->len); + ret = -EUCLEAN; + goto out_free_em; + } + + read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot), + sizeof(bg)); + flags = btrfs_stack_block_group_flags(&bg) & + BTRFS_BLOCK_GROUP_TYPE_MASK; + + if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { + btrfs_err(fs_info, +"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", + key->objectid, key->offset, flags, + (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type)); + ret = -EUCLEAN; + } + +out_free_em: + free_extent_map(em); + return ret; +} + static int find_first_block_group(struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct btrfs_key *key) { struct btrfs_root *root = fs_info->extent_root; - int ret = 0; + int ret; struct btrfs_key found_key; struct extent_buffer *leaf; - struct btrfs_block_group_item bg; - u64 flags; int slot; ret = btrfs_search_slot(NULL, root, key, path, 0, 0); if (ret < 0) - goto out; + return ret; while (1) { slot = path->slots[0]; @@ -1563,49 +1612,10 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info, if (found_key.objectid >= key->objectid && found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { - struct extent_map_tree *em_tree; - struct extent_map *em; - - em_tree = &fs_info->mapping_tree; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, found_key.objectid, - found_key.offset); - read_unlock(&em_tree->lock); - if (!em) { - btrfs_err(fs_info, - "logical %llu len %llu found bg but no related chunk", - found_key.objectid, found_key.offset); - ret = -ENOENT; - } else if (em->start != found_key.objectid || - em->len != found_key.offset) { - btrfs_err(fs_info, - "block group %llu len %llu mismatch with chunk %llu len %llu", - found_key.objectid, found_key.offset, - em->start, em->len); - ret = -EUCLEAN; - } else { - read_extent_buffer(leaf, &bg, - btrfs_item_ptr_offset(leaf, slot), - sizeof(bg)); - flags = btrfs_stack_block_group_flags(&bg) & - BTRFS_BLOCK_GROUP_TYPE_MASK; - - if (flags != (em->map_lookup->type & - BTRFS_BLOCK_GROUP_TYPE_MASK)) { - btrfs_err(fs_info, -"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx", - found_key.objectid, - found_key.offset, flags, - (BTRFS_BLOCK_GROUP_TYPE_MASK & - 
em->map_lookup->type)); - ret = -EUCLEAN; - } else { - ret = 0; - } - } - free_extent_map(em); - goto out; + ret = read_bg_from_eb(fs_info, &found_key, path); + break; } + path->slots[0]++; } out: From f22f457a1a58b95c22a870f5e45b3fd7b23ebe7b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 1 Jun 2020 19:12:27 +0100 Subject: [PATCH 006/151] btrfs: remove no longer necessary chunk mutex locking cases Initially when the 'removed' flag was added to a block group to avoid races between block group removal and fitrim, by commit 04216820fe83d5 ("Btrfs: fix race between fs trimming and block group remove/allocation"), we had to lock the chunks mutex because we could be moving the block group from its current list, the pending chunks list, into the pinned chunks list, or we could just be adding it to the pinned chunks if it was not in the pending chunks list. Both lists were protected by the chunk mutex. However we no longer have those lists since commit 1c11b63eff2a67 ("btrfs: replace pending/pinned chunks lists with io tree"), and locking the chunk mutex is no longer necessary because of that. The same happens at btrfs_unfreeze_block_group(), we lock the chunk mutex because the block group's extent map could be part of the pinned chunks list and the call to remove_extent_mapping() could be deleting it from that list, which used to be protected by that mutex. So just remove those lock and unlock calls as they are not needed anymore. Reviewed-by: Nikolay Borisov Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 11244b67434e..31ca2cfb7e3e 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1111,7 +1111,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, if (ret < 0) goto out; - mutex_lock(&fs_info->chunk_mutex); spin_lock(&block_group->lock); block_group->removed = 1; /* @@ -1143,8 +1142,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, remove_em = (atomic_read(&block_group->frozen) == 0); spin_unlock(&block_group->lock); - mutex_unlock(&fs_info->chunk_mutex); - if (remove_em) { struct extent_map_tree *em_tree; @@ -3437,7 +3434,6 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) spin_unlock(&block_group->lock); if (cleanup) { - mutex_lock(&fs_info->chunk_mutex); em_tree = &fs_info->mapping_tree; write_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, block_group->start, @@ -3445,7 +3441,6 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) BUG_ON(!em); /* logic error, can't happen */ remove_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); - mutex_unlock(&fs_info->chunk_mutex); /* once for us and once for the tree */ free_extent_map(em); From 69b0e093c7859681e8012727d75f639154a7cfb3 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 3 Jun 2020 18:10:18 +0800 Subject: [PATCH 007/151] btrfs: let btrfs_return_cluster_to_free_space() return void __btrfs_return_cluster_to_free_space() returns only 0. And all its parent functions don't need the return value either so make this a void function. Further, as none of the callers of btrfs_return_cluster_to_free_space() is actually using the return from this function, make this function also return void. 
Reviewed-by: Nikolay Borisov
Reviewed-by: Johannes Thumshirn
Signed-off-by: Anand Jain
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/free-space-cache.c | 14 +++++---------
 fs/btrfs/free-space-cache.h | 2 +-
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 55955bd424d7..5442b6e4d490 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2703,8 +2703,7 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group)
  * pointed to by the cluster, someone else raced in and freed the
  * cluster already. In that case, we just return without changing anything
  */
-static int
-__btrfs_return_cluster_to_free_space(
+static void __btrfs_return_cluster_to_free_space(
 			     struct btrfs_block_group *block_group,
 			     struct btrfs_free_cluster *cluster)
 {
@@ -2756,7 +2755,6 @@ __btrfs_return_cluster_to_free_space(
 out:
 	spin_unlock(&cluster->lock);
 	btrfs_put_block_group(block_group);
-	return 0;
 }
 
 static void __btrfs_remove_free_space_cache_locked(
@@ -2907,12 +2905,11 @@ out:
  * Otherwise, it'll get a reference on the block group pointed to by the
  * cluster and remove the cluster from it.
  */
-int btrfs_return_cluster_to_free_space(
+void btrfs_return_cluster_to_free_space(
 			       struct btrfs_block_group *block_group,
 			       struct btrfs_free_cluster *cluster)
 {
 	struct btrfs_free_space_ctl *ctl;
-	int ret;
 
 	/* first, get a safe pointer to the block group */
 	spin_lock(&cluster->lock);
@@ -2920,12 +2917,12 @@ int btrfs_return_cluster_to_free_space(
 		block_group = cluster->block_group;
 		if (!block_group) {
 			spin_unlock(&cluster->lock);
-			return 0;
+			return;
 		}
 	} else if (cluster->block_group != block_group) {
 		/* someone else has already freed it don't redo their work */
 		spin_unlock(&cluster->lock);
-		return 0;
+		return;
 	}
 	atomic_inc(&block_group->count);
 	spin_unlock(&cluster->lock);
 
 	ctl = block_group->free_space_ctl;
 
@@ -2934,14 +2931,13 @@
 	/* now return any extents the cluster had on it */
 	spin_lock(&ctl->tree_lock);
-	ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
+	__btrfs_return_cluster_to_free_space(block_group, cluster);
 	spin_unlock(&ctl->tree_lock);
 
 	btrfs_discard_queue_work(&block_group->fs_info->discard_ctl,
 				 block_group);
 
 	/* finally drop our ref */
 	btrfs_put_block_group(block_group);
-	return ret;
 }
 
 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 2e0a8077aa74..e3d5e0ad8f8e 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -136,7 +136,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
 u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
 			     struct btrfs_free_cluster *cluster, u64 bytes,
 			     u64 min_start, u64 *max_extent_size);
-int btrfs_return_cluster_to_free_space(
+void btrfs_return_cluster_to_free_space(
 			       struct btrfs_block_group *block_group,
 			       struct btrfs_free_cluster *cluster);
 int btrfs_trim_block_group(struct btrfs_block_group *block_group,

From b5790d51809b3b0b5f0d012fa667b4e6cafd676e Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Wed, 3 Jun 2020 18:10:20 +0800
Subject: [PATCH 008/151] btrfs: use helper btrfs_get_block_group

Use the helper function where it is open coded to increment the
block_group reference count.

As btrfs_get_block_group() is a one-liner we could have open-coded it,
but its partner function btrfs_put_block_group() isn't a one-liner, as
it also does the free part.
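For reference, the helper pair looks roughly like the following
(a simplified sketch of the helpers in block-group.h/block-group.c; the
real btrfs_put_block_group() does more sanity checking on the way out):

	static inline void btrfs_get_block_group(struct btrfs_block_group *cache)
	{
		atomic_inc(&cache->count);
	}

	void btrfs_put_block_group(struct btrfs_block_group *cache)
	{
		if (atomic_dec_and_test(&cache->count)) {
			/* Last reference dropped: free the structure */
			kfree(cache->free_space_ctl);
			kfree(cache);
		}
	}

Using the get helper keeps the increment visibly paired with that
non-trivial put side.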
Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 5442b6e4d490..6943bdd6fc62 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2924,7 +2924,7 @@ void btrfs_return_cluster_to_free_space( spin_unlock(&cluster->lock); return; } - atomic_inc(&block_group->count); + btrfs_get_block_group(block_group); spin_unlock(&cluster->lock); ctl = block_group->free_space_ctl; @@ -3354,7 +3354,7 @@ int btrfs_find_space_cluster(struct btrfs_block_group *block_group, list_del_init(&entry->list); if (!ret) { - atomic_inc(&block_group->count); + btrfs_get_block_group(block_group); list_add_tail(&cluster->block_group_list, &block_group->cluster_list); cluster->block_group = block_group; From ab483009216ed556fb124176cae75242f8da0e2e Mon Sep 17 00:00:00 2001 From: Liao Pingfang Date: Thu, 11 Jun 2020 08:40:36 +0800 Subject: [PATCH 009/151] btrfs: check-integrity: remove unnecessary failure messages during memory allocation As there is a dump_stack() done on memory allocation failures, these messages might as well be deleted instead. Signed-off-by: Liao Pingfang Reviewed-by: David Sterba [ minor tweaks ] Signed-off-by: David Sterba --- fs/btrfs/check-integrity.c | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 32e11a23b47f..81a8c87a5afb 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -631,10 +631,8 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, int pass; selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS); - if (NULL == selected_super) { - pr_info("btrfsic: error, kmalloc failed!\n"); + if (!selected_super) return -ENOMEM; - } list_for_each_entry(device, dev_head, dev_list) { int i; @@ -795,7 +793,6 @@ static int btrfsic_process_superblock_dev_mirror( if (NULL == superblock_tmp) { superblock_tmp = btrfsic_block_alloc(); if (NULL == superblock_tmp) { - pr_info("btrfsic: error, kmalloc failed!\n"); ret = -1; goto out; } @@ -921,9 +918,7 @@ static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void) struct btrfsic_stack_frame *sf; sf = kzalloc(sizeof(*sf), GFP_NOFS); - if (NULL == sf) - pr_info("btrfsic: alloc memory failed!\n"); - else + if (sf) sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER; return sf; } @@ -1313,7 +1308,6 @@ static int btrfsic_create_link_to_next_block( if (NULL == l) { l = btrfsic_block_link_alloc(); if (NULL == l) { - pr_info("btrfsic: error, kmalloc failed!\n"); btrfsic_release_block_ctx(next_block_ctx); *next_blockp = NULL; return -1; @@ -1470,7 +1464,6 @@ static int btrfsic_handle_extent_data( mirror_num, &block_was_created); if (NULL == next_block) { - pr_info("btrfsic: error, kmalloc failed!\n"); btrfsic_release_block_ctx(&next_block_ctx); return -1; } @@ -2013,7 +2006,6 @@ again: block = btrfsic_block_alloc(); if (NULL == block) { - pr_info("btrfsic: error, kmalloc failed!\n"); btrfsic_release_block_ctx(&block_ctx); goto continue_loop; } @@ -2234,7 +2226,6 @@ static int btrfsic_process_written_superblock( mirror_num, &was_created); if (NULL == next_block) { - pr_info("btrfsic: error, kmalloc failed!\n"); btrfsic_release_block_ctx(&tmp_next_block_ctx); return -1; } @@ -2542,10 +2533,8 @@ static struct btrfsic_block_link 
*btrfsic_block_link_lookup_or_add( &state->block_link_hashtable); if (NULL == l) { l = btrfsic_block_link_alloc(); - if (NULL == l) { - pr_info("btrfsic: error, kmalloc failed!\n"); + if (!l) return NULL; - } l->block_ref_to = next_block; l->block_ref_from = from_block; @@ -2589,10 +2578,9 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add( struct btrfsic_dev_state *dev_state; block = btrfsic_block_alloc(); - if (NULL == block) { - pr_info("btrfsic: error, kmalloc failed!\n"); + if (!block) return NULL; - } + dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev->bd_dev); if (NULL == dev_state) { pr_info("btrfsic: error, lookup dev_state failed!\n"); @@ -2797,10 +2785,8 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info, return -1; } state = kvzalloc(sizeof(*state), GFP_KERNEL); - if (!state) { - pr_info("btrfs check-integrity: allocation failed!\n"); + if (!state) return -ENOMEM; - } if (!btrfsic_is_initialized) { mutex_init(&btrfsic_mutex); @@ -2829,7 +2815,6 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info, ds = btrfsic_dev_state_alloc(); if (NULL == ds) { - pr_info("btrfs check-integrity: kmalloc() failed!\n"); mutex_unlock(&btrfsic_mutex); return -ENOMEM; } From 46d4dac888ebe083d61f18acb16a6988e9062268 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 9 Jun 2020 11:19:33 +0100 Subject: [PATCH 010/151] btrfs: remove the start argument from btrfs_free_reserved_data_space_noquota() The start argument for btrfs_free_reserved_data_space_noquota() is only used to make sure the amount of bytes we decrement from the bytes_may_use counter of the data space_info object is aligned to the filesystem's sector size. It serves no other purpose. All its current callers always pass a length argument that is already aligned to the sector size, so we can make the start argument go away. In fact its presence makes it impossible to use it in a context where we just want to free a number of bytes for a range for which either we do not know its start offset or for freeing multiple ranges at once (which are not contiguous). This change is preparatory work for a patch (third patch in this series) that makes relocation of data block groups that are not full reserve less data space. Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delalloc-space.c | 11 ++++------- fs/btrfs/delalloc-space.h | 2 +- fs/btrfs/inode.c | 5 ++--- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 1245739a3a6e..d05648f882ca 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -255,7 +255,7 @@ int btrfs_check_data_free_space(struct inode *inode, /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); if (ret < 0) - btrfs_free_reserved_data_space_noquota(inode, start, len); + btrfs_free_reserved_data_space_noquota(inode, len); else ret = 0; return ret; @@ -269,16 +269,13 @@ int btrfs_check_data_free_space(struct inode *inode, * which we can't sleep and is sure it won't affect qgroup reserved space. * Like clear_bit_hook(). 
*/ -void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 len) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_space_info *data_sinfo; - /* Make sure the range is aligned to sectorsize */ - len = round_up(start + len, fs_info->sectorsize) - - round_down(start, fs_info->sectorsize); - start = round_down(start, fs_info->sectorsize); + ASSERT(IS_ALIGNED(len, fs_info->sectorsize)); data_sinfo = fs_info->data_sinfo; spin_lock(&data_sinfo->lock); @@ -303,7 +300,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, round_down(start, root->fs_info->sectorsize); start = round_down(start, root->fs_info->sectorsize); - btrfs_free_reserved_data_space_noquota(inode, start, len); + btrfs_free_reserved_data_space_noquota(inode, len); btrfs_qgroup_free_data(inode, reserved, start, len); } diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h index 54466fbd7075..fe8c6aafb25b 100644 --- a/fs/btrfs/delalloc-space.h +++ b/fs/btrfs/delalloc-space.h @@ -13,7 +13,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, void btrfs_delalloc_release_space(struct inode *inode, struct extent_changeset *reserved, u64 start, u64 len, bool qgroup_free); -void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 len); void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, bool qgroup_free); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6862cd7e21a9..07d20f634467 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2087,7 +2087,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, (*bits & EXTENT_CLEAR_DATA_RESV)) btrfs_free_reserved_data_space_noquota( &inode->vfs_inode, - state->start, len); + len); percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, fs_info->delalloc_batch); @@ -7278,8 +7278,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, * use the existing or preallocated extent, so does not * need to adjust btrfs_space_info's bytes_may_use. */ - btrfs_free_reserved_data_space_noquota(inode, start, - len); + btrfs_free_reserved_data_space_noquota(inode, len); goto skip_cow; } } From a89ef455dd2b59f476bdda4786cca54ed03cad14 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 9 Jun 2020 11:19:42 +0100 Subject: [PATCH 011/151] btrfs: use btrfs_alloc_data_chunk_ondemand() when allocating space for relocation We currently use btrfs_check_data_free_space() when allocating space for relocating data extents, but that is not necessary because that function combines btrfs_alloc_data_chunk_ondemand(), which does the actual space reservation, and btrfs_qgroup_reserve_data(). We can use btrfs_alloc_data_chunk_ondemand() directly because we know we do not need to reserve qgroup space since we are dealing with a relocation tree, which can never have qgroups (btrfs_qgroup_reserve_data() does nothing as is_fstree() returns false for a relocation tree). Conversely we can use btrfs_free_reserved_data_space_noquota() directly instead of btrfs_free_reserved_data_space(), since we had no qgroup reservation when allocating space. This change is preparatory work for another patch in this series that makes relocation reserve the exact amount of space it needs to relocate a data block group. 
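Conceptually, the decomposition being relied on here is (illustration
only, not real code):

	btrfs_check_data_free_space() ==
		btrfs_alloc_data_chunk_ondemand()	/* the actual space reservation */
		+ btrfs_qgroup_reserve_data();		/* qgroup accounting, a no-op for relocation trees */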
The function btrfs_check_data_free_space() has the inconvenience of
requiring a start offset argument, and we will want to be able to
allocate space for multiple ranges, which are not consecutive, at once.

Reviewed-by: Nikolay Borisov
Signed-off-by: Filipe Manana
Signed-off-by: David Sterba
---
 fs/btrfs/relocation.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 3bbae80c752f..11d156995446 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2585,13 +2585,12 @@ int prealloc_file_extent_cluster(struct inode *inode,
 	u64 prealloc_start = cluster->start - offset;
 	u64 prealloc_end = cluster->end - offset;
 	u64 cur_offset;
-	struct extent_changeset *data_reserved = NULL;
 
 	BUG_ON(cluster->start != cluster->boundary[0]);
 	inode_lock(inode);
 
-	ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start,
-					  prealloc_end + 1 - prealloc_start);
+	ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
+					      prealloc_end + 1 - prealloc_start);
 	if (ret)
 		goto out;
 
@@ -2606,8 +2605,8 @@ int prealloc_file_extent_cluster(struct inode *inode,
 		lock_extent(&BTRFS_I(inode)->io_tree, start, end);
 		num_bytes = end + 1 - start;
 		if (cur_offset < start)
-			btrfs_free_reserved_data_space(inode, data_reserved,
-					cur_offset, start - cur_offset);
+			btrfs_free_reserved_data_space_noquota(inode,
+					start - cur_offset);
 		ret = btrfs_prealloc_file_range(inode, 0, start,
 						num_bytes, num_bytes,
 						end + 1, &alloc_hint);
@@ -2618,11 +2617,10 @@ int prealloc_file_extent_cluster(struct inode *inode,
 		nr++;
 	}
 	if (cur_offset < prealloc_end)
-		btrfs_free_reserved_data_space(inode, data_reserved,
-				cur_offset, prealloc_end + 1 - cur_offset);
+		btrfs_free_reserved_data_space_noquota(inode,
+				prealloc_end + 1 - cur_offset);
 out:
 	inode_unlock(inode);
-	extent_changeset_free(data_reserved);
 	return ret;
 }

From 74ef00185eb864252156022ff129b01549504175 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Thu, 4 Jun 2020 15:18:06 +0800
Subject: [PATCH 012/151] btrfs: introduce "rescue=" mount option

This patch introduces a new "rescue=" mount option group for all mount
options for data recovery.

Different rescue sub options are separated by ':'. E.g.
"ro,rescue=nologreplay:usebackuproot".

The original plan was to use ';', but ';' needs to be escaped/quoted,
or it will be interpreted by bash, similar to '|'.

And obviously, users can specify rescue options one by one, like:
"ro,rescue=nologreplay,rescue=usebackuproot".
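As a usage example (device and mount point are of course placeholders):

	mount -o ro,rescue=nologreplay:usebackuproot /dev/sdb /mnt

is equivalent to passing the two sub options separately.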
The following mount options are converted to "rescue="; the old mount
options are deprecated but still available for compatibility purposes:

- usebackuproot
  Now it's "rescue=usebackuproot"

- nologreplay
  Now it's "rescue=nologreplay"

Signed-off-by: Qu Wenruo
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/super.c | 81 +++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 73 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c3826ae883f0..76ab6d5d01a9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -326,7 +326,6 @@ enum {
 	Opt_defrag, Opt_nodefrag,
 	Opt_discard, Opt_nodiscard,
 	Opt_discard_mode,
-	Opt_nologreplay,
 	Opt_norecovery,
 	Opt_ratio,
 	Opt_rescan_uuid_tree,
@@ -340,9 +339,13 @@ enum {
 	Opt_subvolid,
 	Opt_thread_pool,
 	Opt_treelog, Opt_notreelog,
-	Opt_usebackuproot,
 	Opt_user_subvol_rm_allowed,
 
+	/* Rescue options */
+	Opt_rescue,
+	Opt_usebackuproot,
+	Opt_nologreplay,
+
 	/* Deprecated options */
 	Opt_alloc_start,
 	Opt_recovery,
@@ -390,7 +393,6 @@ static const match_table_t tokens = {
 	{Opt_discard, "discard"},
 	{Opt_discard_mode, "discard=%s"},
 	{Opt_nodiscard, "nodiscard"},
-	{Opt_nologreplay, "nologreplay"},
 	{Opt_norecovery, "norecovery"},
 	{Opt_ratio, "metadata_ratio=%u"},
 	{Opt_rescan_uuid_tree, "rescan_uuid_tree"},
@@ -408,9 +410,15 @@ static const match_table_t tokens = {
 	{Opt_thread_pool, "thread_pool=%u"},
 	{Opt_treelog, "treelog"},
 	{Opt_notreelog, "notreelog"},
-	{Opt_usebackuproot, "usebackuproot"},
 	{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
 
+	/* Rescue options */
+	{Opt_rescue, "rescue=%s"},
+	/* Deprecated, with alias rescue=nologreplay */
+	{Opt_nologreplay, "nologreplay"},
+	/* Deprecated, with alias rescue=usebackuproot */
+	{Opt_usebackuproot, "usebackuproot"},
+
 	/* Deprecated options */
 	{Opt_alloc_start, "alloc_start=%s"},
 	{Opt_recovery, "recovery"},
@@ -433,6 +441,55 @@ static const match_table_t tokens = {
 	{Opt_err, NULL},
 };
 
+static const match_table_t rescue_tokens = {
+	{Opt_usebackuproot, "usebackuproot"},
+	{Opt_nologreplay, "nologreplay"},
+	{Opt_err, NULL},
+};
+
+static int parse_rescue_options(struct btrfs_fs_info *info, const char *options)
+{
+	char *opts;
+	char *orig;
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int ret = 0;
+
+	opts = kstrdup(options, GFP_KERNEL);
+	if (!opts)
+		return -ENOMEM;
+	orig = opts;
+
+	while ((p = strsep(&opts, ":")) != NULL) {
+		int token;
+
+		if (!*p)
+			continue;
+		token = match_token(p, rescue_tokens, args);
+		switch (token){
+		case Opt_usebackuproot:
+			btrfs_info(info,
+				   "trying to use backup root at mount time");
+			btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
+			break;
+		case Opt_nologreplay:
+			btrfs_set_and_info(info, NOLOGREPLAY,
+					   "disabling log replay at mount time");
+			break;
+		case Opt_err:
+			btrfs_info(info, "unrecognized rescue option '%s'", p);
+			ret = -EINVAL;
+			goto out;
+		default:
+			break;
+		}
+
+	}
+out:
+	kfree(orig);
+	return ret;
+}
+
 /*
  * Regular mount options parser. Everything that is needed only when
  * reading in a new superblock is parsed here.
@@ -689,6 +746,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_norecovery: case Opt_nologreplay: + btrfs_warn(info, + "'nologreplay' is deprecated, use 'rescue=nologreplay' instead"); btrfs_set_and_info(info, NOLOGREPLAY, "disabling log replay at mount time"); break; @@ -791,10 +850,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, "disabling auto defrag"); break; case Opt_recovery: - btrfs_warn(info, - "'recovery' is deprecated, use 'usebackuproot' instead"); - fallthrough; case Opt_usebackuproot: + btrfs_warn(info, + "'%s' is deprecated, use 'rescue=usebackuproot' instead", + token == Opt_recovery ? "recovery" : + "usebackuproot"); btrfs_info(info, "trying to use backup root at mount time"); btrfs_set_opt(info->mount_opt, USEBACKUPROOT); @@ -859,6 +919,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, } info->commit_interval = intarg; break; + case Opt_rescue: + ret = parse_rescue_options(info, args[0].from); + if (ret < 0) + goto out; + break; #ifdef CONFIG_BTRFS_DEBUG case Opt_fragment_all: btrfs_info(info, "fragmenting all space"); @@ -1344,7 +1409,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (btrfs_test_opt(info, NOTREELOG)) seq_puts(seq, ",notreelog"); if (btrfs_test_opt(info, NOLOGREPLAY)) - seq_puts(seq, ",nologreplay"); + seq_puts(seq, ",rescue=nologreplay"); if (btrfs_test_opt(info, FLUSHONCOMMIT)) seq_puts(seq, ",flushoncommit"); if (btrfs_test_opt(info, DISCARD_SYNC)) From a8b3a89074f8677533cdb3843da121d697c1938c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 29 May 2020 15:26:07 +0200 Subject: [PATCH 013/151] btrfs: scrub: remove kmap/kunmap of pages All pages that scrub uses in the scrub_block::pagev array are allocated with GFP_KERNEL and never part of any mapping, so kmap is not necessary, we only need to know the page address. In scrub_write_page_to_dev_replace we don't even need to call flush_dcache_page because of the same reason as above. 
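In short, the change is from the map/unmap pattern to a plain address
lookup, e.g. (taken from the first hunk below):

	/* Before: map the page around every access */
	void *mapped_buffer = kmap_atomic(spage->page);
	clear_page(mapped_buffer);
	flush_dcache_page(spage->page);
	kunmap_atomic(mapped_buffer);

	/* After: lowmem pages allocated with GFP_KERNEL are always mapped */
	clear_page(page_address(spage->page));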
Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 016a025e36c7..368791b17bac 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1616,13 +1616,9 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, struct scrub_page *spage = sblock->pagev[page_num]; BUG_ON(spage->page == NULL); - if (spage->io_error) { - void *mapped_buffer = kmap_atomic(spage->page); + if (spage->io_error) + clear_page(page_address(spage->page)); - clear_page(mapped_buffer); - flush_dcache_page(spage->page); - kunmap_atomic(mapped_buffer); - } return scrub_add_page_to_wr_bio(sblock->sctx, spage); } @@ -1805,7 +1801,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) on_disk_csum = sblock->pagev[0]->csum; page = sblock->pagev[0]->page; - buffer = kmap_atomic(page); + buffer = page_address(page); len = sctx->fs_info->sectorsize; index = 0; @@ -1813,7 +1809,6 @@ static int scrub_checksum_data(struct scrub_block *sblock) u64 l = min_t(u64, len, PAGE_SIZE); crypto_shash_update(shash, buffer, l); - kunmap_atomic(buffer); len -= l; if (len == 0) break; @@ -1821,7 +1816,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) BUG_ON(index >= sblock->page_count); BUG_ON(!sblock->pagev[index]->page); page = sblock->pagev[index]->page; - buffer = kmap_atomic(page); + buffer = page_address(page); } crypto_shash_final(shash, csum); @@ -1851,7 +1846,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) BUG_ON(sblock->page_count < 1); page = sblock->pagev[0]->page; - mapped_buffer = kmap_atomic(page); + mapped_buffer = page_address(page); h = (struct btrfs_header *)mapped_buffer; memcpy(on_disk_csum, h->csum, sctx->csum_size); @@ -1883,7 +1878,6 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) u64 l = min_t(u64, len, mapped_size); crypto_shash_update(shash, p, l); - kunmap_atomic(mapped_buffer); len -= l; if (len == 0) break; @@ -1891,7 +1885,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) BUG_ON(index >= sblock->page_count); BUG_ON(!sblock->pagev[index]->page); page = sblock->pagev[index]->page; - mapped_buffer = kmap_atomic(page); + mapped_buffer = page_address(page); mapped_size = PAGE_SIZE; p = mapped_buffer; } @@ -1925,7 +1919,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) BUG_ON(sblock->page_count < 1); page = sblock->pagev[0]->page; - mapped_buffer = kmap_atomic(page); + mapped_buffer = page_address(page); s = (struct btrfs_super_block *)mapped_buffer; memcpy(on_disk_csum, s->csum, sctx->csum_size); @@ -1946,7 +1940,6 @@ static int scrub_checksum_super(struct scrub_block *sblock) u64 l = min_t(u64, len, mapped_size); crypto_shash_update(shash, p, l); - kunmap_atomic(mapped_buffer); len -= l; if (len == 0) break; @@ -1954,7 +1947,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) BUG_ON(index >= sblock->page_count); BUG_ON(!sblock->pagev[index]->page); page = sblock->pagev[index]->page; - mapped_buffer = kmap_atomic(page); + mapped_buffer = page_address(page); mapped_size = PAGE_SIZE; p = mapped_buffer; } From b04852520ec260dd4080864cd2b309d163f76b5e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 29 May 2020 15:32:51 +0200 Subject: [PATCH 014/151] btrfs: scrub: unify naming of page address variables As the page mapping has been removed, rename the variables to 'kaddr' that we use everywhere else. 
The type is changed to 'char *' so pointer arithmetic works without casts. Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 368791b17bac..7dc4a090db57 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1788,7 +1788,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) u8 csum[BTRFS_CSUM_SIZE]; u8 *on_disk_csum; struct page *page; - void *buffer; + char *kaddr; u64 len; int index; @@ -1801,14 +1801,14 @@ static int scrub_checksum_data(struct scrub_block *sblock) on_disk_csum = sblock->pagev[0]->csum; page = sblock->pagev[0]->page; - buffer = page_address(page); + kaddr = page_address(page); len = sctx->fs_info->sectorsize; index = 0; for (;;) { u64 l = min_t(u64, len, PAGE_SIZE); - crypto_shash_update(shash, buffer, l); + crypto_shash_update(shash, kaddr, l); len -= l; if (len == 0) break; @@ -1816,7 +1816,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) BUG_ON(index >= sblock->page_count); BUG_ON(!sblock->pagev[index]->page); page = sblock->pagev[index]->page; - buffer = page_address(page); + kaddr = page_address(page); } crypto_shash_final(shash, csum); @@ -1835,7 +1835,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; struct page *page; - void *mapped_buffer; + char *kaddr; u64 mapped_size; void *p; u64 len; @@ -1846,8 +1846,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) BUG_ON(sblock->page_count < 1); page = sblock->pagev[0]->page; - mapped_buffer = page_address(page); - h = (struct btrfs_header *)mapped_buffer; + kaddr = page_address(page); + h = (struct btrfs_header *)kaddr; memcpy(on_disk_csum, h->csum, sctx->csum_size); /* @@ -1872,7 +1872,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE; mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; - p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; + p = kaddr + BTRFS_CSUM_SIZE; index = 0; for (;;) { u64 l = min_t(u64, len, mapped_size); @@ -1885,9 +1885,9 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) BUG_ON(index >= sblock->page_count); BUG_ON(!sblock->pagev[index]->page); page = sblock->pagev[index]->page; - mapped_buffer = page_address(page); + kaddr = page_address(page); mapped_size = PAGE_SIZE; - p = mapped_buffer; + p = kaddr; } crypto_shash_final(shash, calculated_csum); @@ -1906,7 +1906,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; struct page *page; - void *mapped_buffer; + char *kaddr; u64 mapped_size; void *p; int fail_gen = 0; @@ -1919,8 +1919,8 @@ static int scrub_checksum_super(struct scrub_block *sblock) BUG_ON(sblock->page_count < 1); page = sblock->pagev[0]->page; - mapped_buffer = page_address(page); - s = (struct btrfs_super_block *)mapped_buffer; + kaddr = page_address(page); + s = (struct btrfs_super_block *)kaddr; memcpy(on_disk_csum, s->csum, sctx->csum_size); if (sblock->pagev[0]->logical != btrfs_super_bytenr(s)) @@ -1934,7 +1934,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; - p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; + p = kaddr + BTRFS_CSUM_SIZE; index = 0; for (;;) { u64 l = min_t(u64, len, mapped_size); @@ -1947,9 +1947,9 @@ static int 
scrub_checksum_super(struct scrub_block *sblock)
 		BUG_ON(index >= sblock->page_count);
 		BUG_ON(!sblock->pagev[index]->page);
 		page = sblock->pagev[index]->page;
-		mapped_buffer = page_address(page);
+		kaddr = page_address(page);
 		mapped_size = PAGE_SIZE;
-		p = mapped_buffer;
+		p = kaddr;
 	}
 
 	crypto_shash_final(shash, calculated_csum);

From 83cf6d5eae54eb40727210e825f8d656a33e7d30 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Fri, 29 May 2020 15:40:36 +0200
Subject: [PATCH 015/151] btrfs: scrub: simplify superblock checksum
 calculation

BTRFS_SUPER_INFO_SIZE is 4096 and fits in a page on all supported
architectures, so we can calculate the checksum in one go.

Signed-off-by: David Sterba
---
 fs/btrfs/scrub.c | 31 ++++---------------------------
 1 file changed, 4 insertions(+), 27 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 7dc4a090db57..13ee43f29751 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1907,15 +1907,8 @@ static int scrub_checksum_super(struct scrub_block *sblock)
 	u8 on_disk_csum[BTRFS_CSUM_SIZE];
 	struct page *page;
 	char *kaddr;
-	u64 mapped_size;
-	void *p;
 	int fail_gen = 0;
 	int fail_cor = 0;
-	u64 len;
-	int index;
-
-	shash->tfm = fs_info->csum_shash;
-	crypto_shash_init(shash);
 
 	BUG_ON(sblock->page_count < 1);
 	page = sblock->pagev[0]->page;
@@ -1932,27 +1925,11 @@ static int scrub_checksum_super(struct scrub_block *sblock)
 	if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
 		++fail_cor;
 
-	len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
-	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
-	p = kaddr + BTRFS_CSUM_SIZE;
-	index = 0;
-	for (;;) {
-		u64 l = min_t(u64, len, mapped_size);
+	shash->tfm = fs_info->csum_shash;
+	crypto_shash_init(shash);
+	crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
+			    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
 
-		crypto_shash_update(shash, p, l);
-		len -= l;
-		if (len == 0)
-			break;
-		index++;
-		BUG_ON(index >= sblock->page_count);
-		BUG_ON(!sblock->pagev[index]->page);
-		page = sblock->pagev[index]->page;
-		kaddr = page_address(page);
-		mapped_size = PAGE_SIZE;
-		p = kaddr;
-	}
-
-	crypto_shash_final(shash, calculated_csum);
 	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
 		++fail_cor;

From 74710cf1fbdc74b1593be953e85b3f392c9f43a4 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Fri, 29 May 2020 15:43:14 +0200
Subject: [PATCH 016/151] btrfs: scrub: remove temporary csum array in
 scrub_checksum_super

The page contents with the checksum are available during the entire
function, so we don't need to make a copy.
Signed-off-by: David Sterba
---
 fs/btrfs/scrub.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 13ee43f29751..abb39c5255d2 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1904,7 +1904,6 @@ static int scrub_checksum_super(struct scrub_block *sblock)
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
 	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 	u8 calculated_csum[BTRFS_CSUM_SIZE];
-	u8 on_disk_csum[BTRFS_CSUM_SIZE];
 	struct page *page;
 	char *kaddr;
 	int fail_gen = 0;
@@ -1914,7 +1913,6 @@ static int scrub_checksum_super(struct scrub_block *sblock)
 	page = sblock->pagev[0]->page;
 	kaddr = page_address(page);
 	s = (struct btrfs_super_block *)kaddr;
-	memcpy(on_disk_csum, s->csum, sctx->csum_size);
 
 	if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
 		++fail_cor;
@@ -1930,7 +1928,7 @@ static int scrub_checksum_super(struct scrub_block *sblock)
 	crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
 			    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
 
-	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
+	if (memcmp(calculated_csum, s->csum, sctx->csum_size))
 		++fail_cor;
 
 	if (fail_cor + fail_gen) {

From c7460541093494f108adb6388351636d353715fb Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Fri, 29 May 2020 15:47:05 +0200
Subject: [PATCH 017/151] btrfs: scrub: clean up temporary page variables in
 scrub_checksum_super

Add proper variable for the scrub page and use it instead of repeatedly
dereferencing the other structures.

Signed-off-by: David Sterba
---
 fs/btrfs/scrub.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index abb39c5255d2..aecaf5c7f655 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1904,23 +1904,23 @@ static int scrub_checksum_super(struct scrub_block *sblock)
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
 	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 	u8 calculated_csum[BTRFS_CSUM_SIZE];
-	struct page *page;
+	struct scrub_page *spage;
 	char *kaddr;
 	int fail_gen = 0;
 	int fail_cor = 0;
 
 	BUG_ON(sblock->page_count < 1);
-	page = sblock->pagev[0]->page;
-	kaddr = page_address(page);
+	spage = sblock->pagev[0];
+	kaddr = page_address(spage->page);
 	s = (struct btrfs_super_block *)kaddr;
 
-	if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
+	if (spage->logical != btrfs_super_bytenr(s))
 		++fail_cor;
 
-	if (sblock->pagev[0]->generation != btrfs_super_generation(s))
+	if (spage->generation != btrfs_super_generation(s))
 		++fail_gen;
 
-	if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
+	if (!scrub_check_fsid(s->fsid, spage))
 		++fail_cor;
 
 	shash->tfm = fs_info->csum_shash;
@@ -1941,10 +1941,10 @@ static int scrub_checksum_super(struct scrub_block *sblock)
 		++sctx->stat.super_errors;
 		spin_unlock(&sctx->stat_lock);
 		if (fail_cor)
-			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
+			btrfs_dev_stat_inc_and_print(spage->dev,
 				BTRFS_DEV_STAT_CORRUPTION_ERRS);
 		else
-			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
+			btrfs_dev_stat_inc_and_print(spage->dev,
 				BTRFS_DEV_STAT_GENERATION_ERRS);
 	}

From 771aba0d12dd945132e2c3bbb512f96edc0efbaa Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Fri, 29 May 2020 15:54:41 +0200
Subject: [PATCH 018/151] btrfs: scrub: simplify data block checksum
 calculation

We have sectorsize equal to PAGE_SIZE, so the checksum can be
calculated in one go.
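For context, crypto_shash_digest() is the standard one-shot form of the
kernel shash API: it performs init, update and final in a single call,
so a loop built around

	crypto_shash_update(shash, kaddr, l);
	...
	crypto_shash_final(shash, csum);

collapses into

	crypto_shash_digest(shash, kaddr, PAGE_SIZE, csum);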
Signed-off-by: David Sterba
---
 fs/btrfs/scrub.c | 24 +++---------------------
 1 file changed, 3 insertions(+), 21 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index aecaf5c7f655..16c83130d884 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1789,37 +1789,19 @@ static int scrub_checksum_data(struct scrub_block *sblock)
 	u8 *on_disk_csum;
 	struct page *page;
 	char *kaddr;
-	u64 len;
-	int index;
 
 	BUG_ON(sblock->page_count < 1);
 	if (!sblock->pagev[0]->have_csum)
 		return 0;
 
-	shash->tfm = fs_info->csum_shash;
-	crypto_shash_init(shash);
-
 	on_disk_csum = sblock->pagev[0]->csum;
 	page = sblock->pagev[0]->page;
 	kaddr = page_address(page);
 
-	len = sctx->fs_info->sectorsize;
-	index = 0;
-	for (;;) {
-		u64 l = min_t(u64, len, PAGE_SIZE);
+	shash->tfm = fs_info->csum_shash;
+	crypto_shash_init(shash);
+	crypto_shash_digest(shash, kaddr, PAGE_SIZE, csum);
 
-		crypto_shash_update(shash, kaddr, l);
-		len -= l;
-		if (len == 0)
-			break;
-		index++;
-		BUG_ON(index >= sblock->page_count);
-		BUG_ON(!sblock->pagev[index]->page);
-		page = sblock->pagev[index]->page;
-		kaddr = page_address(page);
-	}
-
-	crypto_shash_final(shash, csum);
 	if (memcmp(csum, on_disk_csum, sctx->csum_size))
 		sblock->checksum_error = 1;

From d41ebef2005f3b1669da334183088668c168c74a Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Fri, 29 May 2020 16:20:35 +0200
Subject: [PATCH 019/151] btrfs: scrub: clean up temporary page variables in
 scrub_checksum_data

Add proper variable for the scrub page and use it instead of repeatedly
dereferencing the other structures.

Signed-off-by: David Sterba
---
 fs/btrfs/scrub.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 16c83130d884..19a64c72f38e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1786,23 +1786,21 @@ static int scrub_checksum_data(struct scrub_block *sblock)
 	struct btrfs_fs_info *fs_info = sctx->fs_info;
 	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 	u8 csum[BTRFS_CSUM_SIZE];
-	u8 *on_disk_csum;
-	struct page *page;
+	struct scrub_page *spage;
 	char *kaddr;
 
 	BUG_ON(sblock->page_count < 1);
-	if (!sblock->pagev[0]->have_csum)
+	spage = sblock->pagev[0];
+	if (!spage->have_csum)
 		return 0;
 
-	on_disk_csum = sblock->pagev[0]->csum;
-	page = sblock->pagev[0]->page;
-	kaddr = page_address(page);
+	kaddr = page_address(spage->page);
 
 	shash->tfm = fs_info->csum_shash;
 	crypto_shash_init(shash);
 	crypto_shash_digest(shash, kaddr, PAGE_SIZE, csum);
 
-	if (memcmp(csum, on_disk_csum, sctx->csum_size))
+	if (memcmp(csum, spage->csum, sctx->csum_size))
 		sblock->checksum_error = 1;
 
 	return sblock->checksum_error;

From 521e1022274000af1981411673e4019d44bdea7f Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Fri, 29 May 2020 15:54:41 +0200
Subject: [PATCH 020/151] btrfs: scrub: simplify tree block checksum
 calculation

Use a simpler iteration over tree block pages, the same as what
csum_tree_block does: the first page always exists, then loop over the
rest.
Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 19a64c72f38e..663bb2c22c50 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1814,15 +1814,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; + const int num_pages = sctx->fs_info->nodesize >> PAGE_SHIFT; + int i; struct page *page; char *kaddr; - u64 mapped_size; - void *p; - u64 len; - int index; - - shash->tfm = fs_info->csum_shash; - crypto_shash_init(shash); BUG_ON(sblock->page_count < 1); page = sblock->pagev[0]->page; @@ -1850,24 +1845,14 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) BTRFS_UUID_SIZE)) sblock->header_error = 1; - len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE; - mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; - p = kaddr + BTRFS_CSUM_SIZE; - index = 0; - for (;;) { - u64 l = min_t(u64, len, mapped_size); + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); + crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, + PAGE_SIZE - BTRFS_CSUM_SIZE); - crypto_shash_update(shash, p, l); - len -= l; - if (len == 0) - break; - index++; - BUG_ON(index >= sblock->page_count); - BUG_ON(!sblock->pagev[index]->page); - page = sblock->pagev[index]->page; - kaddr = page_address(page); - mapped_size = PAGE_SIZE; - p = kaddr; + for (i = 1; i < num_pages; i++) { + kaddr = page_address(sblock->pagev[i]->page); + crypto_shash_update(shash, kaddr, PAGE_SIZE); } crypto_shash_final(shash, calculated_csum); From 100aa5d9f9f9d1163218bbbaad21bffbd8ee3e8d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 29 May 2020 16:20:35 +0200 Subject: [PATCH 021/151] btrfs: scrub: clean up temporary page variables in scrub_checksum_tree_block Add proper variable for the scrub page and use it instead of repeatedly dereferencing the other structures. 
Signed-off-by: David Sterba
---
 fs/btrfs/scrub.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 663bb2c22c50..d935ac06323f 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1816,12 +1816,12 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
 	u8 on_disk_csum[BTRFS_CSUM_SIZE];
 	const int num_pages = sctx->fs_info->nodesize >> PAGE_SHIFT;
 	int i;
-	struct page *page;
+	struct scrub_page *spage;
 	char *kaddr;
 
 	BUG_ON(sblock->page_count < 1);
-	page = sblock->pagev[0]->page;
-	kaddr = page_address(page);
+	spage = sblock->pagev[0];
+	kaddr = page_address(spage->page);
 	h = (struct btrfs_header *)kaddr;
 	memcpy(on_disk_csum, h->csum, sctx->csum_size);
 
@@ -1830,15 +1830,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
 	 * a) don't have an extent buffer and
 	 * b) the page is already kmapped
 	 */
-	if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
+	if (spage->logical != btrfs_stack_header_bytenr(h))
 		sblock->header_error = 1;
 
-	if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) {
+	if (spage->generation != btrfs_stack_header_generation(h)) {
 		sblock->header_error = 1;
 		sblock->generation_error = 1;
 	}
 
-	if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
+	if (!scrub_check_fsid(h->fsid, spage))
 		sblock->header_error = 1;
 
 	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,

From 203f44c51982b80c437b49b32c843597c112f287 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Wed, 10 Jun 2020 09:04:40 +0800
Subject: [PATCH 022/151] btrfs: inode: refactor the parameters of
 insert_reserved_file_extent()

Function insert_reserved_file_extent() takes a long list of parameters,
which are all for btrfs_file_extent_item, even including two reserved
members, encryption and other_encoding. This makes the parameter list
unnecessarily long for a function which only gets called twice.

This patch refactors the parameter list by using btrfs_file_extent_item
directly as a parameter, which hugely reduces the number of parameters.

Also, there are only two callers: one in btrfs_finish_ordered_io(),
which inserts a file extent for an ordered extent, and one in
__btrfs_prealloc_file_range(). These two call sites have completely
different contexts: the ordered extent can be compressed, but will
always be a regular extent, while the preallocated one is never going
to be compressed and always has the PREALLOC type.

So use two small wrappers for these two different call sites to improve
readability.
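The core of the refactoring, distilled from the diff below: callers fill
an on-stack item using the stack setters and the item is then copied
into the leaf in one go (an illustrative fragment, not additional code):

	struct btrfs_file_extent_item stack_fi;

	memset(&stack_fi, 0, sizeof(stack_fi));
	btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
	btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, disk_bytenr);
	/* ... the remaining members ... */
	write_extent_buffer(leaf, &stack_fi,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    sizeof(stack_fi));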
Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 6 +++- fs/btrfs/inode.c | 94 ++++++++++++++++++++++++++++++++---------------- 2 files changed, 68 insertions(+), 32 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index d404cce8ae40..91a6c2b34c60 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2232,7 +2232,8 @@ static inline unsigned int leaf_data_end(const struct extent_buffer *leaf) } /* struct btrfs_file_extent_item */ -BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, + type, 8); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr, struct btrfs_file_extent_item, disk_bytenr, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset, @@ -2241,6 +2242,8 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation, struct btrfs_file_extent_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes, struct btrfs_file_extent_item, num_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_ram_bytes, + struct btrfs_file_extent_item, ram_bytes, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes, struct btrfs_file_extent_item, disk_num_bytes, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, @@ -2257,6 +2260,7 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize; } +BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, disk_bytenr, 64); BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 07d20f634467..0c4f5f796ea6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2467,16 +2467,16 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct inode *inode, u64 file_pos, - u64 disk_bytenr, u64 disk_num_bytes, - u64 num_bytes, u64 ram_bytes, - u8 compression, u8 encryption, - u16 other_encoding, int extent_type) + struct btrfs_file_extent_item *stack_fi) { struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_file_extent_item *fi; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key ins; + u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi); + u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi); + u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi); + u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi); u64 qg_released; int extent_inserted = 0; int ret; @@ -2496,7 +2496,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, */ ret = __btrfs_drop_extents(trans, root, inode, path, file_pos, file_pos + num_bytes, NULL, 0, - 1, sizeof(*fi), &extent_inserted); + 1, sizeof(*stack_fi), &extent_inserted); if (ret) goto out; @@ -2507,23 +2507,15 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &ins, - sizeof(*fi)); + sizeof(*stack_fi)); if (ret) goto out; } leaf = path->nodes[0]; - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_set_file_extent_type(leaf, fi, extent_type); - btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr); - 
btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes); - btrfs_set_file_extent_offset(leaf, fi, 0); - btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); - btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes); - btrfs_set_file_extent_compression(leaf, fi, compression); - btrfs_set_file_extent_encryption(leaf, fi, encryption); - btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); + btrfs_set_stack_file_extent_generation(stack_fi, trans->transid); + write_extent_buffer(leaf, stack_fi, + btrfs_item_ptr_offset(leaf, path->slots[0]), + sizeof(struct btrfs_file_extent_item)); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); @@ -2571,7 +2563,33 @@ static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, btrfs_put_block_group(cache); } -/* as ordered data IO finishes, this gets called so we can finish +static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, + struct inode *inode, + struct btrfs_ordered_extent *oe) +{ + struct btrfs_file_extent_item stack_fi; + u64 logical_len; + + memset(&stack_fi, 0, sizeof(stack_fi)); + btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG); + btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr); + btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, + oe->disk_num_bytes); + if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) + logical_len = oe->truncated_len; + else + logical_len = oe->num_bytes; + btrfs_set_stack_file_extent_num_bytes(&stack_fi, logical_len); + btrfs_set_stack_file_extent_ram_bytes(&stack_fi, logical_len); + btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); + /* Encryption and other encoding is reserved and all 0 */ + + return insert_reserved_file_extent(trans, inode, oe->file_offset, + &stack_fi); +} + +/* + * As ordered data IO finishes, this gets called so we can finish * an ordered extent if the range of bytes in the file it covers are * fully written. 
*/ @@ -2673,12 +2691,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) logical_len); } else { BUG_ON(root == fs_info->tree_root); - ret = insert_reserved_file_extent(trans, inode, start, - ordered_extent->disk_bytenr, - ordered_extent->disk_num_bytes, - logical_len, logical_len, - compress_type, 0, 0, - BTRFS_FILE_EXTENT_REG); + ret = insert_ordered_extent_file_extent(trans, inode, + ordered_extent); if (!ret) { clear_reserved_extent = false; btrfs_release_delalloc_bytes(fs_info, @@ -9583,6 +9597,27 @@ out_unlock: return err; } +static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans, + struct inode *inode, struct btrfs_key *ins, + u64 file_offset) +{ + struct btrfs_file_extent_item stack_fi; + u64 start = ins->objectid; + u64 len = ins->offset; + + memset(&stack_fi, 0, sizeof(stack_fi)); + + btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC); + btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start); + btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len); + btrfs_set_stack_file_extent_num_bytes(&stack_fi, len); + btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len); + btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); + /* Encryption and other encoding is reserved and all 0 */ + + return insert_reserved_file_extent(trans, inode, file_offset, + &stack_fi); +} static int __btrfs_prealloc_file_range(struct inode *inode, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint, @@ -9641,11 +9676,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, btrfs_dec_block_group_reservations(fs_info, ins.objectid); last_alloc = ins.offset; - ret = insert_reserved_file_extent(trans, inode, - cur_offset, ins.objectid, - ins.offset, ins.offset, - ins.offset, 0, 0, 0, - BTRFS_FILE_EXTENT_PREALLOC); + ret = insert_prealloc_file_extent(trans, inode, &ins, + cur_offset); if (ret) { btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); From 9729f10a608f235779f060636f32c87766ec615d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 10 Jun 2020 09:04:41 +0800 Subject: [PATCH 023/151] btrfs: inode: move qgroup reserved space release to the callers of insert_reserved_file_extent() This is to prepare for the incoming timing change of qgroup reserved data space and ordered extent. 
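As a rough sketch of the hunks below, the new caller contract is: release the qgroup reservation first, then hand the released byte count to insert_reserved_file_extent() instead of having it release the space internally:

  ret = btrfs_qgroup_release_data(inode, file_offset, len);
  if (ret < 0)
          return ret;
  /* ret now holds the number of released bytes. */
  return insert_reserved_file_extent(trans, inode, file_offset,
                                     &stack_fi, ret);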
Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0c4f5f796ea6..516192eccf52 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2467,7 +2467,8 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct inode *inode, u64 file_pos, - struct btrfs_file_extent_item *stack_fi) + struct btrfs_file_extent_item *stack_fi, + u64 qgroup_reserved) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; @@ -2477,7 +2478,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi); u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi); u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi); - u64 qg_released; int extent_inserted = 0; int ret; @@ -2531,17 +2531,9 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, if (ret) goto out; - /* - * Release the reserved range from inode dirty range map, as it is - * already moved into delayed_ref_head - */ - ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes); - if (ret < 0) - goto out; - qg_released = ret; ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(BTRFS_I(inode)), - file_pos, qg_released, &ins); + file_pos, qgroup_reserved, &ins); out: btrfs_free_path(path); @@ -2569,6 +2561,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, { struct btrfs_file_extent_item stack_fi; u64 logical_len; + int ret; memset(&stack_fi, 0, sizeof(stack_fi)); btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG); @@ -2584,8 +2577,11 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); /* Encryption and other encoding is reserved and all 0 */ + ret = btrfs_qgroup_release_data(inode, oe->file_offset, logical_len); + if (ret < 0) + return ret; return insert_reserved_file_extent(trans, inode, oe->file_offset, - &stack_fi); + &stack_fi, ret); } /* @@ -9604,6 +9600,7 @@ static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item stack_fi; u64 start = ins->objectid; u64 len = ins->offset; + int ret; memset(&stack_fi, 0, sizeof(stack_fi)); @@ -9615,8 +9612,11 @@ static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); /* Encryption and other encoding is reserved and all 0 */ + ret = btrfs_qgroup_release_data(inode, file_offset, len); + if (ret < 0) + return ret; return insert_reserved_file_extent(trans, inode, file_offset, - &stack_fi); + &stack_fi, ret); } static int __btrfs_prealloc_file_range(struct inode *inode, int mode, u64 start, u64 num_bytes, u64 min_size, From a7f8b1c2ac21bf081b41264c9cfd6260dffa6246 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 10 Jun 2020 09:04:42 +0800 Subject: [PATCH 024/151] btrfs: file: reserve qgroup space after the hole punch range is locked The incoming qgroup reserved space timing will move the data reservation to the ordered extent completely. However, btrfs_punch_hole_lock_range() will call btrfs_invalidate_page(), which will clear the QGROUP_RESERVED bit for the range.
At the current stage it's OK, but if we make ordered extents handle the reserved space, then btrfs_punch_hole_lock_range() can clear the QGROUP_RESERVED bit before we submit the ordered extent, leading to qgroup reserved space leakage. So change the timing to reserve data space after btrfs_punch_hole_lock_range(). The new timing is fine for both the current code and the new code. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 52422e2b3344..b15858e1d753 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3174,14 +3174,14 @@ reserve_space: if (ret < 0) goto out; space_reserved = true; - ret = btrfs_qgroup_reserve_data(inode, &data_reserved, - alloc_start, bytes_to_reserve); - if (ret) - goto out; ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state); if (ret) goto out; + ret = btrfs_qgroup_reserve_data(inode, &data_reserved, + alloc_start, bytes_to_reserve); + if (ret) + goto out; ret = btrfs_prealloc_file_range(inode, mode, alloc_start, alloc_end - alloc_start, i_blocksize(inode), From 7dbeaad0af7d0a1a2a8e41d04e90964368ddfcc5 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 10 Jun 2020 09:04:43 +0800 Subject: [PATCH 025/151] btrfs: change timing for qgroup reserved space for ordered extents to fix reserved space leak [BUG] The following simple workload from fsstress can lead to a qgroup reserved data space leak:

  0/0: creat f0 x:0 0 0
  0/0: creat add id=0,parent=-1
  0/1: write f0[259 1 0 0 0 0] [600030,27288] 0
  0/4: dwrite - xfsctl(XFS_IOC_DIOINFO) f0[259 1 0 0 64 627318] return 25, fallback to stat()
  0/4: dwrite f0[259 1 0 0 64 627318] [610304,106496] 0

This would cause btrfs qgroup to leak 20480 bytes of data reserved space. If the btrfs qgroup limit is enabled, such a leak can lead to unexpected early EDQUOT and unusable space. [CAUSE] When doing direct IO, the kernel will try to write back the existing buffered page cache, then invalidate it:

  generic_file_direct_write()
  |- filemap_write_and_wait_range();
  |- invalidate_inode_pages2_range();

However for btrfs, the bi_end_io hook doesn't finish all its heavy work right after the bio ends. In fact, it delays its work further:

  submit_extent_page(end_io_func=end_bio_extent_writepage);
  end_bio_extent_writepage()
  |- btrfs_writepage_endio_finish_ordered()
     |- btrfs_init_work(finish_ordered_fn);

  <<< Work queue execution >>>
  finish_ordered_fn()
  |- btrfs_finish_ordered_io();
     |- Clear qgroup bits

This means that when filemap_write_and_wait_range() returns, btrfs_finish_ordered_io() is not guaranteed to have been executed, thus the qgroup bits for the related range are not cleared. Now into how the leak happens; this will only focus on the part where buffered and direct IO overlap.

1. After buffered write
   The inode had the following range with the QGROUP_RESERVED bit:

   596K            616K
   |///////////////|

   Qgroup reserved data space: 20K

2. Writeback part for range [596K, 616K)
   Writeback finished, but btrfs_finish_ordered_io() has not been called yet. So we still have:

   596K            616K
   |///////////////|

   Qgroup reserved data space: 20K

3. Pages for range [596K, 616K) get released
   This clears all qgroup bits, but doesn't update the reserved data space. So we have:

   596K            616K
   |               |

   Qgroup reserved data space: 20K

   That number doesn't match the qgroup bit range anymore.

4.
Dio prepares space for range [596K, 700K)
   Qgroup reserves data space for that range, so we get:

   596K            616K                    700K
   |///////////////|///////////////////////|

   Qgroup reserved data space: 20K + 104K = 124K

5. btrfs_finish_ordered_range() gets executed for range [596K, 616K)
   Qgroup frees the reserved space for that range, so we get:

   596K            616K                    700K
   |               |///////////////////////|

   We need to free that range of reserved space.
   Qgroup reserved data space: 124K - 20K = 104K

6. btrfs_finish_ordered_range() gets executed for range [596K, 700K)
   However the qgroup bit for range [596K, 616K) was already cleared in the previous step, so we only free 84K of qgroup reserved space.

   596K            616K                    700K
   |               |                       |

   We need to free that range of reserved space.
   Qgroup reserved data space: 104K - 84K = 20K

Now there is no way to release that 20K unless disabling qgroups or unmounting the fs. [FIX] This patch changes the timing of the btrfs_qgroup_release/free_data() call. Here buffered COW write is used as an example:

                The new timing           |       The old timing
-----------------------------------------+---------------------------------------
 btrfs_buffered_write()                  | btrfs_buffered_write()
 |- btrfs_qgroup_reserve_data()          | |- btrfs_qgroup_reserve_data()
                                         |
 btrfs_run_delalloc_range()              | btrfs_run_delalloc_range()
 |- btrfs_add_ordered_extent()           |
    |- btrfs_qgroup_release_data()       |
       The reserved space is passed into |
       the btrfs_ordered_extent          |
       structure                         |
                                         |
 btrfs_finish_ordered_io()               | btrfs_finish_ordered_io()
 |- The reserved space is passed to      | |- btrfs_qgroup_release_data()
    btrfs_qgroup_record                  |    The reserved space is passed
                                         |    to btrfs_qgroup_record
                                         |
 btrfs_qgroup_account_extents()          | btrfs_qgroup_account_extents()
 |- btrfs_qgroup_free_refroot()          | |- btrfs_qgroup_free_refroot()

The point of this change is to ensure that, when ordered extents are submitted, the qgroup reserved space is already released, keeping the timing aligned with file_write_and_wait_range(), so that the qgroup data reserved space is all bound to the btrfs_ordered_extent and the timing mismatch is solved.
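In code terms, the new timing corresponds to the ordered-data hunk below; condensed, __btrfs_add_ordered_extent() now does roughly the following before allocating the entry:

  if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_PREALLOC) {
          /* No new data extent will be created, free the rsv right away. */
          ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
          if (ret < 0)
                  return ret;
          ret = 0;
  } else {
          /* Release the rsv; accounting later frees it via the qgroup record. */
          ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes);
          if (ret < 0)
                  return ret;
  }
  ...
  entry->qgroup_rsv = ret;        /* the released byte count travels along */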
Fixes: f695fdcef83a ("btrfs: qgroup: Introduce functions to release/free qgroup reserve data space") Suggested-by: Josef Bacik Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 15 +-------------- fs/btrfs/ordered-data.c | 22 +++++++++++++++++++++- fs/btrfs/ordered-data.h | 3 +++ 3 files changed, 25 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 516192eccf52..cdd9872b66f8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2561,7 +2561,6 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, { struct btrfs_file_extent_item stack_fi; u64 logical_len; - int ret; memset(&stack_fi, 0, sizeof(stack_fi)); btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG); @@ -2577,11 +2576,8 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); /* Encryption and other encoding is reserved and all 0 */ - ret = btrfs_qgroup_release_data(inode, oe->file_offset, logical_len); - if (ret < 0) - return ret; return insert_reserved_file_extent(trans, inode, oe->file_offset, - &stack_fi, ret); + &stack_fi, oe->qgroup_rsv); } /* @@ -2636,13 +2632,6 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ - /* - * For mwrite(mmap + memset to write) case, we still reserve - * space for NOCOW range. - * As NOCOW won't cause a new delayed ref, just free the space - */ - btrfs_qgroup_free_data(inode, NULL, start, - ordered_extent->num_bytes); btrfs_inode_safe_disk_i_size_write(inode, 0); if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); @@ -2679,8 +2668,6 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { BUG_ON(compress_type); - btrfs_qgroup_free_data(inode, NULL, start, - ordered_extent->num_bytes); ret = btrfs_mark_extent_written(trans, BTRFS_I(inode), ordered_extent->file_offset, ordered_extent->file_offset + diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index e13b3d28c063..c8bd7a4e67bb 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -15,6 +15,7 @@ #include "disk-io.h" #include "compression.h" #include "delalloc-space.h" +#include "qgroup.h" static struct kmem_cache *btrfs_ordered_extent_cache; @@ -152,7 +153,8 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, return ret; } -/* allocate and add a new ordered_extent into the per-inode tree. +/* + * Allocate and add a new ordered_extent into the per-inode tree. * * The tree is given a single reference on the ordered extent that was * inserted. @@ -167,7 +169,24 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry; + int ret; + if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_PREALLOC) { + /* For nocow write, we can release the qgroup rsv right now */ + ret = btrfs_qgroup_free_data(inode, NULL, file_offset, + num_bytes); + if (ret < 0) + return ret; + ret = 0; + } else { + /* + * The ordered extent has reserved qgroup space, release now + * and pass the reserved number for qgroup_record to free. 
+ */ + ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes); + if (ret < 0) + return ret; + } tree = &BTRFS_I(inode)->ordered_tree; entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS); if (!entry) @@ -181,6 +200,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->inode = igrab(inode); entry->compress_type = compress_type; entry->truncated_len = (u64)-1; + entry->qgroup_rsv = ret; if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) set_bit(type, &entry->flags); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index c01c9698250b..4a506c5598f8 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -92,6 +92,9 @@ struct btrfs_ordered_extent { /* compression algorithm */ int compress_type; + /* Qgroup reserved space */ + int qgroup_rsv; + /* reference count */ refcount_t refs; From 5958253cf65de42493f17f36877a901486a90365 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 10 Jun 2020 09:04:44 +0800 Subject: [PATCH 026/151] btrfs: qgroup: catch reserved space leaks at unmount time Before this patch, qgroup relied completely on the per-inode extent io tree to detect reserved data space leaks. However, the previous bug has already shown how releasing a page before btrfs_finish_ordered_io() could lead to a leak, and since that clears the QGROUP_RESERVED bit without updating the qgroup rsv, it can't be detected by the per-inode extent io tree. So this patch adds another (and hopefully the final) safety net to catch qgroup data reserved space leaks. At least the new safety net catches all the leaks during development, so it should be pretty useful in the real world. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 5 +++++ fs/btrfs/qgroup.c | 43 +++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/qgroup.h | 1 + 3 files changed, 49 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b1a148058773..a9cf6152d175 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4058,6 +4058,11 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) ASSERT(list_empty(&fs_info->delayed_iputs)); set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags); + if (btrfs_check_quota_leak(fs_info)) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + btrfs_err(fs_info, "qgroup reserved space leaked"); + } + btrfs_free_qgroup_config(fs_info); ASSERT(list_empty(&fs_info->delalloc_roots)); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5bd4089ad0e1..74eb98479109 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -505,6 +505,49 @@ out: return ret < 0 ? ret : 0; } +static u64 btrfs_qgroup_subvolid(u64 qgroupid) +{ + return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1)); +} + +/* + * Called in close_ctree() when quota is still enabled. This verifies we don't + * leak some reserved space. + * + * Return false if no reserved space is left. + * Return true if some reserved space is leaked. + */ +bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info) +{ + struct rb_node *node; + bool ret = false; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return ret; + /* + * Since we're unmounting, there is no race and no need to grab qgroup + * lock. And here we don't go post-order to provide a more user + * friendly sorted result.
+ */ + for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) { + struct btrfs_qgroup *qgroup; + int i; + + qgroup = rb_entry(node, struct btrfs_qgroup, node); + for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) { + if (qgroup->rsv.values[i]) { + ret = true; + btrfs_warn(fs_info, + "qgroup %llu/%llu has unreleased space, type %d rsv %llu", + btrfs_qgroup_level(qgroup->qgroupid), + btrfs_qgroup_subvolid(qgroup->qgroupid), + i, qgroup->rsv.values[i]); + } + } + } + return ret; +} + /* * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(), * first two are in single-threaded paths.And for the third one, we have set diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 1bc654459469..3be5198a3719 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -415,5 +415,6 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb); void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans); +bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info); #endif From ce6ef5abe68251cb60c4585df2cc73e218ec0dba Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 8 Jun 2020 16:06:07 +0200 Subject: [PATCH 027/151] btrfs: add little-endian optimized key helpers The CPU and on-disk keys are mapped to two different structures because of the endianness. There's an intermediate buffer used to do the conversion, but this is not necessary when CPU and on-disk endianness match. Add optimized versions of helpers that take disk_key and use the buffer directly for CPU keys or drop the intermediate buffer and conversion. This saves a lot of stack space across many functions and removes about 6K of generated binary code:

    text    data     bss     dec     hex filename
 1090439   17468   14912 1122819  112203 pre/btrfs.ko
 1084613   17456   14912 1116981  110b35 post/btrfs.ko

Delta: -5826

Reviewed-by: Johannes Thumshirn Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 17 +++++++++++++++++ fs/btrfs/ctree.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 82ab6e5a386d..70e49d8d4f6c 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1501,6 +1501,22 @@ static int close_blocks(u64 blocknr, u64 other, u32 blocksize) return 0; } +#ifdef __LITTLE_ENDIAN + +/* + * Compare two keys, on little-endian the disk order is same as CPU order and + * we can avoid the conversion. + */ +static int comp_keys(const struct btrfs_disk_key *disk_key, + const struct btrfs_key *k2) +{ + const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key; + + return btrfs_comp_cpu_keys(k1, k2); +} + +#else + /* * compare two keys in a memcmp fashion */ @@ -1513,6 +1529,7 @@ static int comp_keys(const struct btrfs_disk_key *disk, return btrfs_comp_cpu_keys(&k1, k2); } +#endif /* * same as comp_keys only with two btrfs_key's diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 91a6c2b34c60..9138c40be755 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1895,6 +1895,52 @@ BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); +#ifdef __LITTLE_ENDIAN + +/* + * Optimized helpers for little-endian architectures where CPU and on-disk + * structures have the same endianness and we can skip conversions.
+ */ + +static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu_key, + const struct btrfs_disk_key *disk_key) +{ + memcpy(cpu_key, disk_key, sizeof(struct btrfs_key)); +} + +static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk_key, + const struct btrfs_key *cpu_key) +{ + memcpy(disk_key, cpu_key, sizeof(struct btrfs_key)); +} + +static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *cpu_key, int nr) +{ + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; + + btrfs_node_key(eb, disk_key, nr); +} + +static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *cpu_key, int nr) +{ + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; + + btrfs_item_key(eb, disk_key, nr); +} + +static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, + const struct btrfs_dir_item *item, + struct btrfs_key *cpu_key) +{ + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; + + btrfs_dir_item_key(eb, item, disk_key); +} + +#else + static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, const struct btrfs_disk_key *disk) { @@ -1936,6 +1982,8 @@ static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, btrfs_disk_key_to_cpu(key, &disk_key); } +#endif + /* struct btrfs_header */ BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, From cd8d39f4aeb3b69af2478112366584256248eb2b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 15 Jun 2020 10:36:48 +0100 Subject: [PATCH 028/151] btrfs: remove no longer used log_list member of struct btrfs_ordered_extent The 'log_list' member of an ordered extent was used to keep track of which ordered extents we needed to wait for after logging metadata, but is not used anymore since commit 5636cf7d6dc86f ("btrfs: remove the logged extents infrastructure"), as we now always wait on ordered extent completion before logging metadata. So just remove it since it's doing nothing and making each ordered extent structure waste more memory (2 pointers).
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 2 -- fs/btrfs/ordered-data.h | 3 --- 2 files changed, 5 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index c8bd7a4e67bb..fd902996a1db 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -217,7 +217,6 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, INIT_LIST_HEAD(&entry->root_extent_list); INIT_LIST_HEAD(&entry->work_list); init_completion(&entry->completion); - INIT_LIST_HEAD(&entry->log_list); INIT_LIST_HEAD(&entry->trans_list); trace_btrfs_ordered_extent_add(inode, entry); @@ -449,7 +448,6 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) trace_btrfs_ordered_extent_put(entry->inode, entry); if (refcount_dec_and_test(&entry->refs)) { - ASSERT(list_empty(&entry->log_list)); ASSERT(list_empty(&entry->trans_list)); ASSERT(list_empty(&entry->root_extent_list)); ASSERT(RB_EMPTY_NODE(&entry->rb_node)); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 4a506c5598f8..435f93c46c32 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -104,9 +104,6 @@ struct btrfs_ordered_extent { /* list of checksums for insertion when the extent io is done */ struct list_head list; - /* If we need to wait on this to be done */ - struct list_head log_list; - /* If the transaction needs to wait on this ordered extent */ struct list_head trans_list; From 3ef64143a7963fd882ab52fee1cc1c9ba2e408e0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 15 Jun 2020 10:36:58 +0100 Subject: [PATCH 029/151] btrfs: remove no longer used trans_list member of struct btrfs_ordered_extent The 'trans_list' member of an ordered extent was used to keep track of the ordered extents for which a transaction commit had to wait. These were ordered extents that were started and logged by an fsync. However, we don't do that anymore, and before we stopped doing it we changed the approach to waiting for the ordered extents in commit 161c3549b45aee ("Btrfs: change how we wait for pending ordered extents"), which stopped using that list, so the 'trans_list' member has not been used since that commit. So just remove it since it's doing nothing and making each ordered extent structure waste memory (2 pointers).
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 2 -- fs/btrfs/ordered-data.h | 3 --- 2 files changed, 5 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index fd902996a1db..933efe52ec2c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -217,7 +217,6 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, INIT_LIST_HEAD(&entry->root_extent_list); INIT_LIST_HEAD(&entry->work_list); init_completion(&entry->completion); - INIT_LIST_HEAD(&entry->trans_list); trace_btrfs_ordered_extent_add(inode, entry); @@ -448,7 +447,6 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) trace_btrfs_ordered_extent_put(entry->inode, entry); if (refcount_dec_and_test(&entry->refs)) { - ASSERT(list_empty(&entry->trans_list)); ASSERT(list_empty(&entry->root_extent_list)); ASSERT(RB_EMPTY_NODE(&entry->rb_node)); if (entry->inode) diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 435f93c46c32..a24a1f2d5f9d 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -104,9 +104,6 @@ struct btrfs_ordered_extent { /* list of checksums for insertion when the extent io is done */ struct list_head list; - /* If the transaction needs to wait on this ordered extent */ - struct list_head trans_list; - /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ wait_queue_head_t wait; From da69fea9f785d8460059dd1ff8d654cea22e26df Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:01 +0300 Subject: [PATCH 030/151] btrfs: make __btrfs_add_ordered_extent take struct btrfs_inode This is an internal btrfs function that really needs the vfs_inode only for igrab and a tracepoint. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 933efe52ec2c..dafa977bd6a2 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -159,21 +159,21 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, * The tree is given a single reference on the ordered extent that was * inserted. */ -static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, +static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type, int dio, int compress_type) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_ordered_inode_tree *tree; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry; int ret; if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_PREALLOC) { /* For nocow write, we can release the qgroup rsv right now */ - ret = btrfs_qgroup_free_data(inode, NULL, file_offset, + ret = btrfs_qgroup_free_data(&inode->vfs_inode, NULL, file_offset, num_bytes); if (ret < 0) return ret; @@ -183,11 +183,11 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, * The ordered extent has reserved qgroup space, release now * and pass the reserved number for qgroup_record to free.
*/ - ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes); + ret = btrfs_qgroup_release_data(&inode->vfs_inode, file_offset, + num_bytes); if (ret < 0) return ret; } - tree = &BTRFS_I(inode)->ordered_tree; entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS); if (!entry) return -ENOMEM; @@ -197,7 +197,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->num_bytes = num_bytes; entry->disk_num_bytes = disk_num_bytes; entry->bytes_left = num_bytes; - entry->inode = igrab(inode); + entry->inode = igrab(&inode->vfs_inode); entry->compress_type = compress_type; entry->truncated_len = (u64)-1; entry->qgroup_rsv = ret; @@ -218,7 +218,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, INIT_LIST_HEAD(&entry->work_list); init_completion(&entry->completion); - trace_btrfs_ordered_extent_add(inode, entry); + trace_btrfs_ordered_extent_add(&inode->vfs_inode, entry); spin_lock_irq(&tree->lock); node = tree_insert(&tree->tree, file_offset, @@ -246,9 +246,9 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, * that work has been done at higher layers, so this is truly the * smallest the extent is going to get. */ - spin_lock(&BTRFS_I(inode)->lock); - btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, 1); + spin_unlock(&inode->lock); return 0; } @@ -257,7 +257,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type) { - return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, + return __btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, disk_bytenr, num_bytes, disk_num_bytes, type, 0, BTRFS_COMPRESS_NONE); } @@ -266,7 +266,7 @@ int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type) { - return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, + return __btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, disk_bytenr, num_bytes, disk_num_bytes, type, 1, BTRFS_COMPRESS_NONE); } @@ -276,7 +276,7 @@ int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, u64 disk_num_bytes, int type, int compress_type) { - return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, + return __btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, disk_bytenr, num_bytes, disk_num_bytes, type, 0, compress_type); } From 43c69849ae78d83adc2a9ed077bc4c6353b09bc5 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:02 +0300 Subject: [PATCH 031/151] btrfs: make get_extent_allocation_hint take btrfs_inode It doesn't use the vfs inode for anything, so it can just as easily take btrfs_inode. Follow-up patches will convert the callers as well.
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cdd9872b66f8..690549dc4073 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -929,10 +929,10 @@ out_free: goto again; } -static u64 get_extent_allocation_hint(struct inode *inode, u64 start, +static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, u64 num_bytes) { - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; u64 alloc_hint = 0; @@ -1032,7 +1032,8 @@ static noinline int cow_file_range(struct inode *inode, } } - alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); + alloc_hint = get_extent_allocation_hint(BTRFS_I(inode), start, + num_bytes); btrfs_drop_extent_cache(BTRFS_I(inode), start, start + num_bytes - 1, 0); @@ -6893,7 +6894,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, u64 alloc_hint; int ret; - alloc_hint = get_extent_allocation_hint(inode, start, len); + alloc_hint = get_extent_allocation_hint(BTRFS_I(inode), start, len); ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, 0, alloc_hint, &ins, 1, 1); if (ret) From c3504372699bff6daeda207b4e30256c39f584c1 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:03 +0300 Subject: [PATCH 032/151] btrfs: make btrfs_lookup_ordered_extent take btrfs_inode It doesn't use the generic vfs inode for anything, so use btrfs_inode directly. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file-item.c | 5 +++-- fs/btrfs/inode.c | 2 +- fs/btrfs/ioctl.c | 2 +- fs/btrfs/ordered-data.c | 6 +++--- fs/btrfs/ordered-data.h | 2 +- fs/btrfs/relocation.c | 2 +- 6 files changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 706a3128e192..9d311e834b20 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -522,10 +522,11 @@ fail: * means this bio can contains potentially discontigous bio vecs * so the logical offset of each should be calculated separately.
*/ -blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, +blk_status_t btrfs_csum_one_bio(struct inode *vfsinode, struct bio *bio, u64 file_start, int contig) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_inode *inode = BTRFS_I(vfsinode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct btrfs_ordered_sum *sums; struct btrfs_ordered_extent *ordered = NULL; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 690549dc4073..bd69367452c8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4558,7 +4558,7 @@ again: lock_extent_bits(io_tree, block_start, block_end, &cached_state); set_page_extent_mapped(page); - ordered = btrfs_lookup_ordered_extent(inode, block_start); + ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), block_start); if (ordered) { unlock_extent_cached(io_tree, block_start, block_end, &cached_state); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e8f7c5f00894..b3e4c632d80c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1265,7 +1265,7 @@ again: while (1) { lock_extent_bits(tree, page_start, page_end, &cached_state); - ordered = btrfs_lookup_ordered_extent(inode, + ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), page_start); unlock_extent_cached(tree, page_start, page_end, &cached_state); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index dafa977bd6a2..cb737729f610 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -714,14 +714,14 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) * find an ordered extent corresponding to file_offset. return NULL if * nothing is found, otherwise take a reference on the extent and return it */ -struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, +struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - tree = &BTRFS_I(inode)->ordered_tree; + tree = &inode->ordered_tree; spin_lock_irq(&tree->lock); node = tree_search(tree, file_offset); if (!node) @@ -819,7 +819,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int index = 0; - ordered = btrfs_lookup_ordered_extent(inode, offset); + ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), offset); if (!ordered) return 0; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index a24a1f2d5f9d..f2a78f8f6bce 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -163,7 +163,7 @@ int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, int compress_type); void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); -struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, +struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset); void btrfs_start_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry, int wait); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 11d156995446..8051aec5378e 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3880,7 +3880,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) u64 new_bytenr; LIST_HEAD(list); - ordered = btrfs_lookup_ordered_extent(inode, file_pos); + ordered = 
btrfs_lookup_ordered_extent(BTRFS_I(inode), file_pos); BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len); disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; From 7bfa9535019b1ca0696d0a0590a3fd657224ae2f Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:04 +0300 Subject: [PATCH 033/151] btrfs: make btrfs_reloc_clone_csums take btrfs_inode It really wants btrfs_inode and not a vfs inode. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/inode.c | 4 ++-- fs/btrfs/relocation.c | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9138c40be755..8fd063ca081b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3430,7 +3430,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_recover_relocation(struct btrfs_root *root); -int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); +int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len); int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *cow); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bd69367452c8..ba7d2043fb96 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1085,7 +1085,7 @@ static noinline int cow_file_range(struct inode *inode, if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { - ret = btrfs_reloc_clone_csums(inode, start, + ret = btrfs_reloc_clone_csums(BTRFS_I(inode), start, cur_alloc_size); /* * Only drop cache here, and process as normal. @@ -1743,7 +1743,7 @@ out_check: * extent_clear_unlock_delalloc() in error handler * from freeing metadata of created ordered extent. */ - ret = btrfs_reloc_clone_csums(inode, cur_offset, + ret = btrfs_reloc_clone_csums(BTRFS_I(inode), cur_offset, num_bytes); extent_clear_unlock_delalloc(inode, cur_offset, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 8051aec5378e..9235c671bef8 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3870,9 +3870,9 @@ out: * cloning checksum properly handles the nodatasum extents. * it also saves CPU time to re-calculate the checksum. 
*/ -int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) +int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_sum *sums; struct btrfs_ordered_extent *ordered; int ret; @@ -3880,10 +3880,10 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) u64 new_bytenr; LIST_HEAD(list); - ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_pos); + ordered = btrfs_lookup_ordered_extent(inode, file_pos); BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len); - disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; + disk_bytenr = file_pos + inode->index_cnt; ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr, disk_bytenr + len - 1, &list, 0); if (ret) From 4b67c11dd19cd6443944d888b027017bb7872514 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:05 +0300 Subject: [PATCH 034/151] btrfs: make create_io_em take btrfs_inode It really wants a btrfs_inode and will allow submit_compressed_extents to be completely converted to btrfs_inode in follow up patches. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ba7d2043fb96..7015bf1a65c0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -84,8 +84,8 @@ static noinline int cow_file_range(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, int unlock); -static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, - u64 orig_start, u64 block_start, +static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, + u64 len, u64 orig_start, u64 block_start, u64 block_len, u64 orig_block_len, u64 ram_bytes, int compress_type, int type); @@ -845,7 +845,7 @@ retry: * here we're doing allocation and writeback of the * compressed pages */ - em = create_io_em(inode, async_extent->start, + em = create_io_em(BTRFS_I(inode), async_extent->start, async_extent->ram_size, /* len */ async_extent->start, /* orig_start */ ins.objectid, /* block_start */ @@ -1064,7 +1064,7 @@ static noinline int cow_file_range(struct inode *inode, extent_reserved = true; ram_size = ins.offset; - em = create_io_em(inode, start, ins.offset, /* len */ + em = create_io_em(BTRFS_I(inode), start, ins.offset, /* len */ start, /* orig_start */ ins.objectid, /* block_start */ ins.offset, /* block_len */ @@ -1700,7 +1700,7 @@ out_check: u64 orig_start = found_key.offset - extent_offset; struct extent_map *em; - em = create_io_em(inode, cur_offset, num_bytes, + em = create_io_em(BTRFS_I(inode), cur_offset, num_bytes, orig_start, disk_bytenr, /* block_start */ num_bytes, /* block_len */ @@ -6861,7 +6861,7 @@ static struct extent_map *btrfs_create_dio_extent(struct inode *inode, int ret; if (type != BTRFS_ORDERED_NOCOW) { - em = create_io_em(inode, start, len, orig_start, + em = create_io_em(BTRFS_I(inode), start, len, orig_start, block_start, block_len, orig_block_len, ram_bytes, BTRFS_COMPRESS_NONE, /* compress_type */ @@ -7140,8 +7140,8 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, } /* The callers of this must take lock_extent() */ -static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, - u64 orig_start, u64 
block_start, + u64 len, u64 orig_start, u64 block_start, u64 block_len, u64 orig_block_len, u64 ram_bytes, int compress_type, int type) @@ -7155,7 +7155,7 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_REGULAR); - em_tree = &BTRFS_I(inode)->extent_tree; + em_tree = &inode->extent_tree; em = alloc_extent_map(); if (!em) return ERR_PTR(-ENOMEM); @@ -7177,8 +7177,8 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, } do { - btrfs_drop_extent_cache(BTRFS_I(inode), em->start, - em->start + em->len - 1, 0); + btrfs_drop_extent_cache(inode, em->start, + em->start + em->len - 1, 0); write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 1); write_unlock(&em_tree->lock); From ad7ff17b65a0567b826c88009d3ea080431816c3 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:06 +0300 Subject: [PATCH 035/151] btrfs: make extent_clear_unlock_delalloc take btrfs_inode It has one VFS and one btrfs inode usage, but converting it to the btrfs_inode interface will allow seamless conversion of its callers. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 7 +++---- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 29 ++++++++++++++++------------- 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 60278e52c37a..1b18960c9c5d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2018,15 +2018,14 @@ out: return err; } -void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, +void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, unsigned clear_bits, unsigned long page_ops) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0, - NULL); + clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL); - __process_pages_contig(inode->i_mapping, locked_page, + __process_pages_contig(inode->vfs_inode.i_mapping, locked_page, start >> PAGE_SHIFT, end >> PAGE_SHIFT, page_ops, NULL); } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 87f60a48f750..31c5a6aabd75 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -277,7 +277,7 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb); int extent_buffer_under_io(const struct extent_buffer *eb); void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); -void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, +void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, unsigned bits_to_clear, unsigned long page_ops); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7015bf1a65c0..cab56c37ec39 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -642,7 +642,8 @@ cont: * our outstanding extent for clearing delalloc for this * range. */ - extent_clear_unlock_delalloc(inode, start, end, NULL, + extent_clear_unlock_delalloc(BTRFS_I(inode), start, end, + NULL, clear_flags, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | @@ -878,7 +879,7 @@ retry: /* * clear dirty, set writeback and unlock the pages.
*/ - extent_clear_unlock_delalloc(inode, async_extent->start, + extent_clear_unlock_delalloc(BTRFS_I(inode), async_extent->start, async_extent->start + async_extent->ram_size - 1, NULL, EXTENT_LOCKED | EXTENT_DELALLOC, @@ -900,7 +901,7 @@ retry: btrfs_writepage_endio_finish_ordered(p, start, end, 0); p->mapping = NULL; - extent_clear_unlock_delalloc(inode, start, end, + extent_clear_unlock_delalloc(BTRFS_I(inode), start, end, NULL, 0, PAGE_END_WRITEBACK | PAGE_SET_ERROR); @@ -915,7 +916,7 @@ out_free_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_free: - extent_clear_unlock_delalloc(inode, async_extent->start, + extent_clear_unlock_delalloc(BTRFS_I(inode), async_extent->start, async_extent->start + async_extent->ram_size - 1, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | @@ -1017,7 +1018,8 @@ static noinline int cow_file_range(struct inode *inode, * our outstanding extent for clearing delalloc for this * range. */ - extent_clear_unlock_delalloc(inode, start, end, NULL, + extent_clear_unlock_delalloc(BTRFS_I(inode), start, end, + NULL, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | @@ -1115,7 +1117,7 @@ static noinline int cow_file_range(struct inode *inode, page_ops = unlock ? PAGE_UNLOCK : 0; page_ops |= PAGE_SET_PRIVATE2; - extent_clear_unlock_delalloc(inode, start, + extent_clear_unlock_delalloc(BTRFS_I(inode), start, start + ram_size - 1, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC, @@ -1160,7 +1162,7 @@ out_unlock: * it the flag EXTENT_CLEAR_DATA_RESV. */ if (extent_reserved) { - extent_clear_unlock_delalloc(inode, start, + extent_clear_unlock_delalloc(BTRFS_I(inode), start, start + cur_alloc_size - 1, locked_page, clear_bits, @@ -1169,7 +1171,7 @@ out_unlock: if (start >= end) goto out; } - extent_clear_unlock_delalloc(inode, start, end, locked_page, + extent_clear_unlock_delalloc(BTRFS_I(inode), start, end, locked_page, clear_bits | EXTENT_CLEAR_DATA_RESV, page_ops); goto out; @@ -1277,8 +1279,8 @@ static int cow_file_range_async(struct inode *inode, PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | PAGE_SET_ERROR; - extent_clear_unlock_delalloc(inode, start, end, locked_page, - clear_bits, page_ops); + extent_clear_unlock_delalloc(BTRFS_I(inode), start, end, + locked_page, clear_bits, page_ops); return -ENOMEM; } @@ -1468,7 +1470,8 @@ static noinline int run_delalloc_nocow(struct inode *inode, path = btrfs_alloc_path(); if (!path) { - extent_clear_unlock_delalloc(inode, start, end, locked_page, + extent_clear_unlock_delalloc(BTRFS_I(inode), start, end, + locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, PAGE_UNLOCK | @@ -1746,7 +1749,7 @@ out_check: ret = btrfs_reloc_clone_csums(BTRFS_I(inode), cur_offset, num_bytes); - extent_clear_unlock_delalloc(inode, cur_offset, + extent_clear_unlock_delalloc(BTRFS_I(inode), cur_offset, cur_offset + num_bytes - 1, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | @@ -1783,7 +1786,7 @@ error: btrfs_dec_nocow_writers(fs_info, disk_bytenr); if (ret && cur_offset < end) - extent_clear_unlock_delalloc(inode, cur_offset, end, + extent_clear_unlock_delalloc(BTRFS_I(inode), cur_offset, end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | From bd242a08a690e98d9e9eb7ab51580d4a86b76c6c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:07 +0300 Subject: [PATCH 036/151] btrfs: make btrfs_csum_one_bio take
btrfs_inode Will enable converting btrfs_submit_compressed_write to btrfs_inode more easily. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 5 +++-- fs/btrfs/ctree.h | 4 ++-- fs/btrfs/file-item.c | 3 +-- fs/btrfs/inode.c | 8 ++++---- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c6e648603f85..4f52cd8af517 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -475,7 +475,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, BUG_ON(ret); /* -ENOMEM */ if (!skip_sum) { - ret = btrfs_csum_one_bio(inode, bio, start, 1); + ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, + start, 1); BUG_ON(ret); /* -ENOMEM */ } @@ -507,7 +508,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, BUG_ON(ret); /* -ENOMEM */ if (!skip_sum) { - ret = btrfs_csum_one_bio(inode, bio, start, 1); + ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, start, 1); BUG_ON(ret); /* -ENOMEM */ } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8fd063ca081b..37db9753fc84 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2883,8 +2883,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); -blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, - u64 file_start, int contig); +blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, + u64 file_start, int contig); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 9d311e834b20..7d5ec71615b8 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -522,10 +522,9 @@ fail: * means this bio can contains potentially discontigous bio vecs * so the logical offset of each should be calculated separately. 
*/ -blk_status_t btrfs_csum_one_bio(struct inode *vfsinode, struct bio *bio, +blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, u64 file_start, int contig) { - struct btrfs_inode *inode = BTRFS_I(vfsinode); struct btrfs_fs_info *fs_info = inode->root->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct btrfs_ordered_sum *sums; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cab56c37ec39..d0af2027f898 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2167,7 +2167,7 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio, struct inode *inode = private_data; blk_status_t ret = 0; - ret = btrfs_csum_one_bio(inode, bio, 0, 0); + ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); BUG_ON(ret); /* -ENOMEM */ return 0; } @@ -2232,7 +2232,7 @@ static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, 0, inode, btrfs_submit_bio_start); goto out; } else if (!skip_sum) { - ret = btrfs_csum_one_bio(inode, bio, 0, 0); + ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); if (ret) goto out; } @@ -7572,7 +7572,7 @@ static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data, { struct inode *inode = private_data; blk_status_t ret; - ret = btrfs_csum_one_bio(inode, bio, offset, 1); + ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, offset, 1); BUG_ON(ret); /* -ENOMEM */ return 0; } @@ -7633,7 +7633,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, * If we aren't doing async submit, calculate the csum of the * bio now. */ - ret = btrfs_csum_one_bio(inode, bio, file_offset, 1); + ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, 1); if (ret) goto err; } else { From 906c448c3dc3189d83bf644ec453d49737371b00 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:08 +0300 Subject: [PATCH 037/151] btrfs: make __btrfs_drop_extents take btrfs_inode It has only 4 uses of a vfs_inode for inode_sub_bytes but unifies the interface with the non __ prefixed version. Will also make converting its callers to btrfs_inode easier. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/file.c | 23 ++++++++++++----------- fs/btrfs/inode.c | 4 ++-- fs/btrfs/tree-log.c | 2 +- 4 files changed, 16 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 37db9753fc84..a9cf11f2e8c7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3014,7 +3014,7 @@ void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end, int skip_pinned); extern const struct file_operations btrfs_file_operations; int __btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, + struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path, u64 start, u64 end, u64 *drop_end, int drop_cache, int replace_extent, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b15858e1d753..4e530c9ac3ef 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -731,7 +731,7 @@ next: * is deleted from the tree.
*/ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, + struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path, u64 start, u64 end, u64 *drop_end, int drop_cache, int replace_extent, @@ -744,7 +744,8 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_ref ref = { 0 }; struct btrfs_key key; struct btrfs_key new_key; - u64 ino = btrfs_ino(BTRFS_I(inode)); + struct inode *vfs_inode = &inode->vfs_inode; + u64 ino = btrfs_ino(inode); u64 search_start = start; u64 disk_bytenr = 0; u64 num_bytes = 0; @@ -762,9 +763,9 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, int leafs_visited = 0; if (drop_cache) - btrfs_drop_extent_cache(BTRFS_I(inode), start, end - 1, 0); + btrfs_drop_extent_cache(inode, start, end - 1, 0); - if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent) + if (start >= inode->disk_i_size && !replace_extent) modify_tree = 0; update_refs = (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || @@ -935,7 +936,7 @@ next_slot: extent_end - end); btrfs_mark_buffer_dirty(leaf); if (update_refs && disk_bytenr > 0) - inode_sub_bytes(inode, end - key.offset); + inode_sub_bytes(vfs_inode, end - key.offset); break; } @@ -955,7 +956,7 @@ next_slot: start - key.offset); btrfs_mark_buffer_dirty(leaf); if (update_refs && disk_bytenr > 0) - inode_sub_bytes(inode, extent_end - start); + inode_sub_bytes(vfs_inode, extent_end - start); if (end == extent_end) break; @@ -979,7 +980,7 @@ delete_extent_item: if (update_refs && extent_type == BTRFS_FILE_EXTENT_INLINE) { - inode_sub_bytes(inode, + inode_sub_bytes(vfs_inode, extent_end - key.offset); extent_end = ALIGN(extent_end, fs_info->sectorsize); @@ -993,7 +994,7 @@ delete_extent_item: key.offset - extent_offset); ret = btrfs_free_extent(trans, &ref); BUG_ON(ret); /* -ENOMEM */ - inode_sub_bytes(inode, + inode_sub_bytes(vfs_inode, extent_end - key.offset); } @@ -1082,8 +1083,8 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL, - drop_cache, 0, 0, NULL); + ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path, start, + end, NULL, drop_cache, 0, 0, NULL); btrfs_free_path(path); return ret; } @@ -2596,7 +2597,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, cur_offset = start; while (cur_offset < end) { - ret = __btrfs_drop_extents(trans, root, inode, path, + ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path, cur_offset, end + 1, &drop_end, 1, 0, 0, NULL); if (ret != -ENOSPC) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d0af2027f898..6f8c7d65c428 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -323,7 +323,7 @@ static noinline int cow_file_range_inline(struct inode *inode, u64 start, extent_item_size = btrfs_file_extent_calc_inline_size( inline_len); - ret = __btrfs_drop_extents(trans, root, inode, path, + ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path, start, aligned_end, NULL, 1, 1, extent_item_size, &extent_inserted); if (ret) { @@ -2498,7 +2498,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, * the caller is expected to unpin it and allow it to be merged * with the others. 
*/ - ret = __btrfs_drop_extents(trans, root, inode, path, file_pos, + ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path, file_pos, file_pos + num_bytes, NULL, 0, 1, sizeof(*stack_fi), &extent_inserted); if (ret) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index cd5348f352dd..df6d4e3e40b1 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4151,7 +4151,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start, + ret = __btrfs_drop_extents(trans, log, inode, path, em->start, em->start + em->len, NULL, 0, 1, sizeof(*fi), &extent_inserted); if (ret) From c171edd5c8e164bbcb46c61016ef4e88c8130edf Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 17 Jun 2020 12:10:42 +0300 Subject: [PATCH 038/151] btrfs: remove hole check in prealloc_file_extent_cluster Extents in the extent cluster are guaranteed to be contiguous, so the hole check inside the loop can never trigger. In fact this check was never functional since it was added in 18513091af94 ("btrfs: update btrfs_space_info's bytes_may_use timely") which came after the commit introducing clustered/contiguous extents 0257bb82d21b ("Btrfs: relocate file extents in clusters"). Let's just remove it as it adds noise to the source. Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 9235c671bef8..ade45fb4c49a 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2604,9 +2604,6 @@ int prealloc_file_extent_cluster(struct inode *inode, lock_extent(&BTRFS_I(inode)->io_tree, start, end); num_bytes = end + 1 - start; - if (cur_offset < start) - btrfs_free_reserved_data_space_noquota(inode, - start - cur_offset); ret = btrfs_prealloc_file_range(inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); From 214e61d07e1ab9e497c081d1bad78d05f7c33abf Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 17 Jun 2020 12:10:43 +0300 Subject: [PATCH 039/151] btrfs: perform data management operations outside of inode lock btrfs_alloc_data_chunk_ondemand and btrfs_free_reserved_data_space_noquota don't really use the guts of the inodes being passed to them. This implies it's not required to call them under the inode lock. Move code around in prealloc_file_extent_cluster to do the heavy data alloc/free operations outside of the lock. This also makes the 'out' label unnecessary, so remove it.
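Schematically, the resulting shape looks like this (a simplified sketch, not the actual code; alloc_data_space() and free_reserved_space() are stand-ins for the btrfs helpers): the heavy space management brackets the locked section instead of living inside it:

    /* heavy: reserve all the data space up front, no inode lock needed */
    ret = alloc_data_space(inode, prealloc_end + 1 - prealloc_start);
    if (ret)
            return ret;

    inode_lock(inode);
    /* ... per-extent preallocation that actually needs the inode lock ... */
    inode_unlock(inode);

    /* heavy: give back whatever was not consumed, again without the lock */
    if (cur_offset < prealloc_end)
            free_reserved_space(inode, prealloc_end + 1 - cur_offset);
    return ret;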
Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index ade45fb4c49a..f7feada38d19 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2584,17 +2584,15 @@ int prealloc_file_extent_cluster(struct inode *inode, int ret = 0; u64 prealloc_start = cluster->start - offset; u64 prealloc_end = cluster->end - offset; - u64 cur_offset; + u64 cur_offset = prealloc_start; BUG_ON(cluster->start != cluster->boundary[0]); - inode_lock(inode); - ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), prealloc_end + 1 - prealloc_start); if (ret) - goto out; + return ret; - cur_offset = prealloc_start; + inode_lock(inode); while (nr < cluster->nr) { start = cluster->boundary[nr] - offset; if (nr + 1 < cluster->nr) @@ -2613,11 +2611,11 @@ int prealloc_file_extent_cluster(struct inode *inode, break; nr++; } + inode_unlock(inode); + if (cur_offset < prealloc_end) btrfs_free_reserved_data_space_noquota(inode, prealloc_end + 1 - cur_offset); -out: - inode_unlock(inode); return ret; } From 4e9d0d0109d6b7ababf4d121168b0e97e4ae2f9d Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 17 Jun 2020 12:10:44 +0300 Subject: [PATCH 040/151] btrfs: use for loop in prealloc_file_extent_cluster This function iterates over all extents in the extent cluster; make this intention obvious by using a for loop. No functional changes. Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f7feada38d19..e3f3e2e70212 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2580,7 +2580,7 @@ int prealloc_file_extent_cluster(struct inode *inode, u64 end; u64 offset = BTRFS_I(inode)->index_cnt; u64 num_bytes; - int nr = 0; + int nr; int ret = 0; u64 prealloc_start = cluster->start - offset; u64 prealloc_end = cluster->end - offset; @@ -2593,7 +2593,7 @@ int prealloc_file_extent_cluster(struct inode *inode, return ret; inode_lock(inode); - while (nr < cluster->nr) { + for (nr = 0; nr < cluster->nr; nr++) { start = cluster->boundary[nr] - offset; if (nr + 1 < cluster->nr) end = cluster->boundary[nr + 1] - 1 - offset; @@ -2609,7 +2609,6 @@ int prealloc_file_extent_cluster(struct inode *inode, unlock_extent(&BTRFS_I(inode)->io_tree, start, end); if (ret) break; - nr++; } inode_unlock(inode); From 923eb5236597c90b11114926025e0c04e4c1da32 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 23 Jun 2020 17:40:07 +0900 Subject: [PATCH 041/151] btrfs: use free_root_extent_buffer to free root In btrfs_put_root() we're freeing a btrfs_root's 'node' and 'commit_root' extent buffers manually via free_extent_buffer(), while we're using free_root_extent_buffers() in the free_root_pointers() function above. free_root_extent_buffers() also NULLs the pointers after freeing, which mitigates potential double frees.
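The idiom free_root_extent_buffers() relies on can be sketched in plain C (illustrative only; the struct and helpers below are stand-ins, not the btrfs definitions):

    #include <stdlib.h>

    struct root {
            void *node;
            void *commit_root;
    };

    /* Free a buffer and clear the pointer so a repeated call is a no-op. */
    static void free_root_buffer(void **buf)
    {
            free(*buf);     /* free(NULL) is defined to do nothing */
            *buf = NULL;
    }

    static void put_root(struct root *root)
    {
            free_root_buffer(&root->node);
            free_root_buffer(&root->commit_root);
    }

A second put_root() on the same object then frees nothing instead of corrupting the heap.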
Reviewed-by: Anand Jain Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a9cf6152d175..b5b7055b7953 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2001,8 +2001,7 @@ void btrfs_put_root(struct btrfs_root *root) if (root->anon_dev) free_anon_bdev(root->anon_dev); btrfs_drew_lock_destroy(&root->snapshot_lock); - free_extent_buffer(root->node); - free_extent_buffer(root->commit_root); + free_root_extent_buffers(root); kfree(root->free_ino_ctl); kfree(root->free_ino_pinned); #ifdef CONFIG_BTRFS_DEBUG From 5af9d6ef3f6fe62c60f23ef3fd437e0843bea679 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Mon, 22 Jun 2020 23:18:41 +0300 Subject: [PATCH 042/151] btrfs: tests: remove if duplicate in __check_free_space_extents() num_extents is already checked in the next if condition, so the duplicate check can be safely removed. Signed-off-by: Denis Efremov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tests/free-space-tree-tests.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 914eea5ba6a7..2c783d2f5228 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -60,8 +60,6 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans, if (prev_bit == 0 && bit == 1) { extent_start = offset; } else if (prev_bit == 1 && bit == 0) { - if (i >= num_extents) - goto invalid; if (i >= num_extents || extent_start != extents[i].start || offset - extent_start != extents[i].length) From bab16e21e8bbd644067289cfa328f8a67f3e333d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 23 Jun 2020 20:56:12 +0200 Subject: [PATCH 043/151] btrfs: don't use UAPI types for fiemap callback The fiemap callback is not part of the UAPI interface and the prototypes don't have the __u64 types either.
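For context (an illustration, not part of the patch): the double-underscore types exist so that UAPI headers can be included from user space, while plain u64 is the in-kernel spelling from <linux/types.h>, so an in-kernel prototype has no reason to use the UAPI form:

    /* UAPI header, shared with user space, must use __u64 */
    struct fiemap_extent {
            __u64 fe_logical;
            /* ... */
    };

    /* in-kernel code uses the plain typedef */
    int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                      u64 start, u64 len);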
Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1b18960c9c5d..e3078ed2261b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4669,7 +4669,7 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, } int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len) + u64 start, u64 len) { int ret = 0; u64 off = start; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 31c5a6aabd75..00a88f2eb5ab 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -204,7 +204,7 @@ int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); void extent_readahead(struct readahead_control *rac); int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len); + u64 start, u64 len); void set_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6f8c7d65c428..e5feefa20b28 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7946,7 +7946,7 @@ out: } static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len) + u64 start, u64 len) { int ret; From a2570ef330b959eb37e3a437b9884d9cc4b6a3d9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 23 Jun 2020 21:23:54 +0200 Subject: [PATCH 044/151] btrfs: remove unused btrfs_root::defrag_trans_start Last touched in 2013 by commit de78b51a2852 ("btrfs: remove cache only arguments from defrag path") that was the only code that used the value. Now it's only set but never used for anything, so we can remove it. 
Reviewed-by: Nikolay Borisov Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 - fs/btrfs/disk-io.c | 4 ---- fs/btrfs/tree-defrag.c | 5 ++--- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a9cf11f2e8c7..033e411dd811 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1075,7 +1075,6 @@ struct btrfs_root { u64 highest_objectid; - u64 defrag_trans_start; struct btrfs_key defrag_progress; struct btrfs_key defrag_max; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b5b7055b7953..dbf90cd49513 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1141,10 +1141,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); - if (!dummy) - root->defrag_trans_start = fs_info->generation; - else - root->defrag_trans_start = 0; root->root_key.objectid = objectid; root->anon_dev = 0; diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 16c3a6d2586d..d3f28b8f4ff9 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -133,10 +133,9 @@ out: ret = 0; } done: - if (ret != -EAGAIN) { + if (ret != -EAGAIN) memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); - root->defrag_trans_start = trans->transid; - } + return ret; } From b547a88ea5776a8092f7f122ddc20d6720528782 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 18 Jun 2020 14:54:56 +0200 Subject: [PATCH 045/151] btrfs: start deprecation of mount option inode_cache The estimated time of removal of the functionality is 5.11; the option will still be parsed but will have no effect.
Reasons for deprecation and removal: - very poor naming choice of the mount option, it's supposed to cache and reuse the inode _numbers_, but it sounds like some generic cache for inodes - the only known use case where this option would make sense is on a 32bit architecture where inode numbers in one subvolume would be exhausted due to 32bit inode::i_ino - the cache is stored on disk, consumes space, needs to be loaded and written back - new inode number allocation is slower due to lookups into the cache (compared to a simple increment which is the default) - uses the free-space-cache code that is going to be deprecated as well in the future Known problems: - since 2011, returning EEXIST when there's not enough space in a page to store all checksums, see commit 4b9465cb9e38 ("Btrfs: add mount -o inode_cache") Remaining issues: - if the option was enabled, new inodes created, the option disabled again, the cache is still stored on the devices and there's currently no way to remove it Signed-off-by: David Sterba --- fs/btrfs/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 76ab6d5d01a9..dac04e28643e 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -821,6 +821,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, } break; case Opt_inode_cache: + btrfs_warn(info, + "the 'inode_cache' option is deprecated and will have no effect from 5.11"); btrfs_set_pending_and_info(info, INODE_MAP_CACHE, "enabling inode map caching"); break; From 6d4572a9d71d5fc2affee0258d8582d39859188c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 24 Jun 2020 07:23:50 +0800 Subject: [PATCH 046/151] btrfs: allow btrfs_truncate_block() to fallback to nocow for data space reservation [BUG] When the data space is exhausted, even if the inode has the NOCOW attribute, we will still refuse to truncate an unaligned range due to ENOSPC. The following script can reproduce it pretty easily: #!/bin/bash dev=/dev/test/test mnt=/mnt/btrfs umount $dev &> /dev/null umount $mnt &> /dev/null mkfs.btrfs -f $dev -b 1G mount -o nospace_cache $dev $mnt touch $mnt/foobar chattr +C $mnt/foobar xfs_io -f -c "pwrite -b 4k 0 4k" $mnt/foobar > /dev/null xfs_io -f -c "pwrite -b 4k 0 1G" $mnt/padding &> /dev/null sync xfs_io -c "fpunch 0 2k" $mnt/foobar umount $mnt Currently this will fail at the fpunch part. [CAUSE] Because btrfs_truncate_block() always reserves space without checking the NOCOW attribute. Since the writeback path follows the NOCOW bit, we only need to adjust the space reservation code in btrfs_truncate_block(). [FIX] Make btrfs_truncate_block() follow btrfs_buffered_write() to try to reserve data space first, and fall back to the NOCOW check only when we don't have enough space. This always-try-to-reserve behavior is an optimization introduced in btrfs_buffered_write() to avoid an expensive btrfs_check_can_nocow() call. This patch will export check_can_nocow() as btrfs_check_can_nocow(), and use it in btrfs_truncate_block() to fix the problem.
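Schematically, the reservation flow btrfs_truncate_block() ends up with looks like this (a simplified sketch of the diff below, error unwinding trimmed; can_nocow() stands in for the NODATACOW/PREALLOC flag test plus the btrfs_check_can_nocow() call):

    ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
                                      blocksize);
    if (ret < 0) {
            /* No data space left: usable only if the block can be nocow'ed */
            if (can_nocow(inode, block_start, &write_bytes))
                    only_release_metadata = true;   /* skip data reservation */
            else
                    return ret;                     /* genuine ENOSPC */
    }
    ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), blocksize);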
Reported-by: Martin Doucha Reviewed-by: Filipe Manana Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/file.c | 12 ++++++------ fs/btrfs/inode.c | 44 +++++++++++++++++++++++++++++++++++++------- 3 files changed, 45 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 033e411dd811..75dfb9b4d9e3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3033,6 +3033,8 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); +int btrfs_check_can_nocow(struct btrfs_inode *inode, loff_t pos, + size_t *write_bytes, bool nowait); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4e530c9ac3ef..7aa184493aea 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1533,8 +1533,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, return ret; } -static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes, bool nowait) +int btrfs_check_can_nocow(struct btrfs_inode *inode, loff_t pos, + size_t *write_bytes, bool nowait) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; @@ -1649,8 +1649,8 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, if (ret < 0) { if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) && - check_can_nocow(BTRFS_I(inode), pos, - &write_bytes, false) > 0) { + btrfs_check_can_nocow(BTRFS_I(inode), pos, + &write_bytes, false) > 0) { /* * For nodata cow case, no need to reserve * data space. 
@@ -1927,8 +1927,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, */ if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) || - check_can_nocow(BTRFS_I(inode), pos, &nocow_bytes, - true) <= 0) { + btrfs_check_can_nocow(BTRFS_I(inode), pos, &nocow_bytes, + true) <= 0) { inode_unlock(inode); return -EAGAIN; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e5feefa20b28..a0388b21c5cc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4512,11 +4512,13 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; char *kaddr; + bool only_release_metadata = false; u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; unsigned offset = from & (blocksize - 1); struct page *page; gfp_t mask = btrfs_alloc_write_mask(mapping); + size_t write_bytes = blocksize; int ret = 0; u64 block_start; u64 block_end; @@ -4528,11 +4530,27 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, block_start = round_down(from, blocksize); block_end = block_start + blocksize - 1; - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, - block_start, blocksize); - if (ret) - goto out; + ret = btrfs_check_data_free_space(inode, &data_reserved, block_start, + blocksize); + if (ret < 0) { + if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC)) && + btrfs_check_can_nocow(BTRFS_I(inode), block_start, + &write_bytes, false) > 0) { + /* For nocow case, no need to reserve data space */ + only_release_metadata = true; + } else { + goto out; + } + } + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), blocksize); + if (ret < 0) { + if (!only_release_metadata) + btrfs_free_reserved_data_space(inode, data_reserved, + block_start, blocksize); + goto out; + } again: page = find_or_create_page(mapping, index, mask); if (!page) { @@ -4601,14 +4619,26 @@ again: set_page_dirty(page); unlock_extent_cached(io_tree, block_start, block_end, &cached_state); + if (only_release_metadata) + set_extent_bit(&BTRFS_I(inode)->io_tree, block_start, + block_end, EXTENT_NORESERVE, NULL, NULL, + GFP_NOFS); + out_unlock: - if (ret) - btrfs_delalloc_release_space(inode, data_reserved, block_start, - blocksize, true); + if (ret) { + if (only_release_metadata) + btrfs_delalloc_release_metadata(BTRFS_I(inode), + blocksize, true); + else + btrfs_delalloc_release_space(inode, data_reserved, + block_start, blocksize, true); + } btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); unlock_page(page); put_page(page); out: + if (only_release_metadata) + btrfs_drew_write_unlock(&BTRFS_I(inode)->root->snapshot_lock); extent_changeset_free(data_reserved); return ret; } From e4ecaf90bc13e2a9c351d5cd86d4094844d7d7bd Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 24 Jun 2020 07:23:51 +0800 Subject: [PATCH 047/151] btrfs: add comments for btrfs_check_can_nocow() and can_nocow_extent() These two functions have extra conditions that their callers need to meet, and some not-that-common parameters used for return value. So adding some comments may save reviewers some time. 
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 21 +++++++++++++++++++++ fs/btrfs/inode.c | 21 +++++++++++++++++++-- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7aa184493aea..b750000b438a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1533,6 +1533,27 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, return ret; } +/* + * Check if we can do nocow write into the range [@pos, @pos + @write_bytes) + * + * @pos: File offset + * @write_bytes: The length to write, will be updated to the nocow writeable + * range + * @nowait: Whether this function could sleep + * + * This function will flush ordered extents in the range to ensure proper + * nocow checks for (nowait == false) case. + * + * Return: + * >0 and update @write_bytes if we can do nocow write + * 0 if we can't do nocow write + * -EAGAIN if we can't get the needed lock or there are ordered extents + * for * (nowait == true) case + * <0 if other error happened + * + * NOTE: For wait (nowait == false) calls, callers need to release the drew + * write lock of inode->root->snapshot_lock when return value > 0. + */ int btrfs_check_can_nocow(struct btrfs_inode *inode, loff_t pos, size_t *write_bytes, bool nowait) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a0388b21c5cc..37c864e6b5bc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6945,8 +6945,25 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, } /* - * returns 1 when the nocow is safe, < 1 on error, 0 if the - * block must be cow'd + * Check if we can do nocow write into the range [@offset, @offset + @len) + * + * @offset: File offset + * @len: The length to write, will be updated to the nocow writeable + * range + * @orig_start: (optional) Return the original file offset of the file extent + * @orig_len: (optional) Return the original on-disk length of the file extent + * @ram_bytes: (optional) Return the ram_bytes of the file extent + * + * This function will flush ordered extents in the range to ensure proper + * nocow checks for (nowait == false) case. + * + * Return: + * >0 and update @len if we can do nocow write + * 0 if we can't do nocow write + * <0 if error happened + * + * NOTE: This only checks the file extents, caller is responsible to wait for + * any ordered extents. */ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, From 38d37aa9c32938214ca071fe02762f55b89937fd Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 24 Jun 2020 07:23:52 +0800 Subject: [PATCH 048/151] btrfs: refactor btrfs_check_can_nocow() into two variants The function btrfs_check_can_nocow() now has two completely different call patterns. For the nowait variant, callers don't need to do any cleanup, while for the wait variant, callers need to release the lock if they can do a nocow write. This is somewhat confusing, and is already a problem for the exported btrfs_check_can_nocow(). So this patch will separate the different patterns into different functions. For the nowait variant, the function will be called check_nocow_nolock(). For the wait variant, the function pair will be btrfs_check_nocow_lock()/btrfs_check_nocow_unlock().
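The resulting call patterns, schematically (a sketch based on the diff below, not literal caller code):

    /* nowait flavour: a pure check, nothing to undo afterwards */
    if (check_nocow_nolock(inode, pos, &nocow_bytes) <= 0)
            return -EAGAIN;

    /* wait flavour: success implies holding the drew write lock */
    if (btrfs_check_nocow_lock(inode, pos, &write_bytes) > 0) {
            /* ... do the nocow write ... */
            btrfs_check_nocow_unlock(inode);
    }

so only the lock/unlock pair carries a cleanup obligation.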
Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 5 +-- fs/btrfs/file.c | 83 ++++++++++++++++++++++++++++-------------------- fs/btrfs/inode.c | 8 ++--- 3 files changed, 54 insertions(+), 42 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 75dfb9b4d9e3..654b99af1587 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3033,8 +3033,9 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); -int btrfs_check_can_nocow(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes, bool nowait); +int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, + size_t *write_bytes); +void btrfs_check_nocow_unlock(struct btrfs_inode *inode); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b750000b438a..760ddc11aa3f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1533,29 +1533,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, return ret; } -/* - * Check if we can do nocow write into the range [@pos, @pos + @write_bytes) - * - * @pos: File offset - * @write_bytes: The length to write, will be updated to the nocow writeable - * range - * @nowait: Whether this function could sleep - * - * This function will flush ordered extents in the range to ensure proper - * nocow checks for (nowait == false) case. - * - * Return: - * >0 and update @write_bytes if we can do nocow write - * 0 if we can't do nocow write - * -EAGAIN if we can't get the needed lock or there are ordered extents - * for * (nowait == true) case - * <0 if other error happened - * - * NOTE: For wait (nowait == false) calls, callers need to release the drew - * write lock of inode->root->snapshot_lock when return value > 0. - */ -int btrfs_check_can_nocow(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes, bool nowait) +static int check_can_nocow(struct btrfs_inode *inode, loff_t pos, + size_t *write_bytes, bool nowait) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; @@ -1563,6 +1542,9 @@ int btrfs_check_can_nocow(struct btrfs_inode *inode, loff_t pos, u64 num_bytes; int ret; + if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) + return 0; + if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock)) return -EAGAIN; @@ -1605,6 +1587,42 @@ out_unlock: return ret; } +static int check_nocow_nolock(struct btrfs_inode *inode, loff_t pos, + size_t *write_bytes) +{ + return check_can_nocow(inode, pos, write_bytes, true); +} + +/* + * Check if we can do nocow write into the range [@pos, @pos + @write_bytes) + * + * @pos: File offset + * @write_bytes: The length to write, will be updated to the nocow writeable + * range + * + * This function will flush ordered extents in the range to ensure proper + * nocow checks. + * + * Return: + * >0 and update @write_bytes if we can do nocow write + * 0 if we can't do nocow write + * -EAGAIN if we can't get the needed lock or there are ordered extents + * for * (nowait == true) case + * <0 if other error happened + * + * NOTE: Callers need to release the lock by btrfs_check_nocow_unlock(). 
+ */ +int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, + size_t *write_bytes) +{ + return check_can_nocow(inode, pos, write_bytes, false); +} + +void btrfs_check_nocow_unlock(struct btrfs_inode *inode) +{ + btrfs_drew_write_unlock(&inode->root->snapshot_lock); +} + static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) { @@ -1612,7 +1630,6 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, loff_t pos = iocb->ki_pos; struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; struct extent_changeset *data_reserved = NULL; u64 release_bytes = 0; @@ -1668,10 +1685,8 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, ret = btrfs_check_data_free_space(inode, &data_reserved, pos, write_bytes); if (ret < 0) { - if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | - BTRFS_INODE_PREALLOC)) && - btrfs_check_can_nocow(BTRFS_I(inode), pos, - &write_bytes, false) > 0) { + if (btrfs_check_nocow_lock(BTRFS_I(inode), pos, + &write_bytes) > 0) { /* * For nodata cow case, no need to reserve * data space. @@ -1700,7 +1715,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, data_reserved, pos, write_bytes); else - btrfs_drew_write_unlock(&root->snapshot_lock); + btrfs_check_nocow_unlock(BTRFS_I(inode)); break; } @@ -1804,7 +1819,7 @@ again: release_bytes = 0; if (only_release_metadata) - btrfs_drew_write_unlock(&root->snapshot_lock); + btrfs_check_nocow_unlock(BTRFS_I(inode)); if (only_release_metadata && copied > 0) { lockstart = round_down(pos, @@ -1831,7 +1846,7 @@ again: if (release_bytes) { if (only_release_metadata) { - btrfs_drew_write_unlock(&root->snapshot_lock); + btrfs_check_nocow_unlock(BTRFS_I(inode)); btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes, true); } else { @@ -1946,10 +1961,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, * We will allocate space in case nodatacow is not set, * so bail */ - if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | - BTRFS_INODE_PREALLOC)) || - btrfs_check_can_nocow(BTRFS_I(inode), pos, &nocow_bytes, - true) <= 0) { + if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes) + <= 0) { inode_unlock(inode); return -EAGAIN; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 37c864e6b5bc..bd51365e53fb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4534,10 +4534,8 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, ret = btrfs_check_data_free_space(inode, &data_reserved, block_start, blocksize); if (ret < 0) { - if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | - BTRFS_INODE_PREALLOC)) && - btrfs_check_can_nocow(BTRFS_I(inode), block_start, - &write_bytes, false) > 0) { + if (btrfs_check_nocow_lock(BTRFS_I(inode), block_start, + &write_bytes) > 0) { /* For nocow case, no need to reserve data space */ only_release_metadata = true; } else { @@ -4638,7 +4636,7 @@ out_unlock: put_page(page); out: if (only_release_metadata) - btrfs_drew_write_unlock(&BTRFS_I(inode)->root->snapshot_lock); + btrfs_check_nocow_unlock(BTRFS_I(inode)); extent_changeset_free(data_reserved); return ret; } From 3502a8c0dc1bd4b4970b59b06e348f22a1c05581 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 25 Jun 2020 12:35:28 +0200 Subject: [PATCH 049/151] btrfs: allow use of global block reserve for balance item deletion On a filesystem with exhausted metadata, but still enough to start balance, it's possible to 
hit this error: [324402.053842] BTRFS info (device loop0): 1 enospc errors during balance [324402.060769] BTRFS info (device loop0): balance: ended with status: -28 [324402.172295] BTRFS: error (device loop0) in reset_balance_state:3321: errno=-28 No space left It fails inside reset_balance_state and turns the filesystem read-only, which is unnecessary and should be fixed too, but the problem is caused by lack of space when the balance item is deleted. This is a one-time operation of the same rank as unlink, which is allowed to use the global block reserve. So do the same for the balance item. Status of the filesystem (100GiB) just after the balance fails: $ btrfs fi df mnt Data, single: total=80.01GiB, used=38.58GiB System, single: total=4.00MiB, used=16.00KiB Metadata, single: total=19.99GiB, used=19.48GiB GlobalReserve, single: total=512.00MiB, used=50.11MiB CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f403fb1e6d37..62ae89b078f4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3231,7 +3231,7 @@ static int del_balance_item(struct btrfs_fs_info *fs_info) if (!path) return -ENOMEM; - trans = btrfs_start_transaction(root, 0); + trans = btrfs_start_transaction_fallback_global_rsv(root, 0); if (IS_ERR(trans)) { btrfs_free_path(path); return PTR_ERR(trans); From 45e31869cc4fa8d7c7d2b890965b1b3abf0cfd6d Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 19 Jun 2020 15:24:46 +0300 Subject: [PATCH 050/151] btrfs: tracepoints: fix btrfs_trigger_flush symbolic string for flags When tracepoints use __print_symbolic to print the textual representation of a value that comes from an enum, each enum value needs to be exported to user space so that user space tools can convert the binary values to strings, as user space does not know what those enums are about. Doing a trace-cmd record && trace-cmd report currently results in: kworker/u8:1-61 [000] 66.299527: btrfs_flush_space: 5302ee13-c65e-45bb-98ef-8fe3835bd943: state=3(0x3) flags=4(METADATA) num_bytes=2621440 ret=0 I.e. the state is not translated to its symbolic counterpart. With this patch applied the output is: fio-370 [002] 56.762402: btrfs_trigger_flush: d04cd7ac-38e2-452f-a7f5-8157529fd5f0: preempt: flush=3(BTRFS_RESERVE_FLUSH_ALL) flags=4(METADATA) bytes=655360 See also 190f0b76ca49 ("mm: tracing: Export enums in tracepoints to user space").
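The core of the mechanism, reduced to a sketch (the real lists live in the diffs below): the same EM()/EMe() list is expanded twice, once to register every enum value with the tracing core via TRACE_DEFINE_ENUM() and once to build the { value, "name" } table consumed by __print_symbolic():

    #define FLUSH_ACTIONS                                            \
            EM( BTRFS_RESERVE_NO_FLUSH,  "BTRFS_RESERVE_NO_FLUSH")   \
            EMe(BTRFS_RESERVE_FLUSH_ALL, "BTRFS_RESERVE_FLUSH_ALL")

    /* first expansion: export the values to user space */
    #define EM(a, b)        TRACE_DEFINE_ENUM(a);
    #define EMe(a, b)       TRACE_DEFINE_ENUM(a);
    FLUSH_ACTIONS

    /* second expansion: build the value -> string table */
    #undef EM
    #undef EMe
    #define EM(a, b)        {a, b},
    #define EMe(a, b)       {a, b}

    /* ... __print_symbolic(__entry->flush, FLUSH_ACTIONS) ... */

EMe exists only so the last table entry can omit the trailing comma.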
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/trace/events/btrfs.h | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 360b0f9d2220..4ccd81fae385 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1042,11 +1042,33 @@ TRACE_EVENT(btrfs_space_reservation, __entry->bytes) ); -#define show_flush_action(action) \ - __print_symbolic(action, \ - { BTRFS_RESERVE_NO_FLUSH, "BTRFS_RESERVE_NO_FLUSH"}, \ - { BTRFS_RESERVE_FLUSH_LIMIT, "BTRFS_RESERVE_FLUSH_LIMIT"}, \ - { BTRFS_RESERVE_FLUSH_ALL, "BTRFS_RESERVE_FLUSH_ALL"}) +#define FLUSH_ACTIONS \ + EM( BTRFS_RESERVE_NO_FLUSH, "BTRFS_RESERVE_NO_FLUSH") \ + EM( BTRFS_RESERVE_FLUSH_LIMIT, "BTRFS_RESERVE_FLUSH_LIMIT") \ + EM( BTRFS_RESERVE_FLUSH_ALL, "BTRFS_RESERVE_FLUSH_ALL") \ + EMe(BTRFS_RESERVE_FLUSH_ALL_STEAL, "BTRFS_RESERVE_FLUSH_ALL_STEAL") + +/* + * First define the enums in the above macros to be exported to userspace via + * TRACE_DEFINE_ENUM(). + */ + +#undef EM +#undef EMe +#define EM(a, b) TRACE_DEFINE_ENUM(a); +#define EMe(a, b) TRACE_DEFINE_ENUM(a); + +FLUSH_ACTIONS + +/* + * Now redefine the EM and EMe macros to map the enums to the strings that will + * be printed in the output + */ + +#undef EM +#undef EMe +#define EM(a, b) {a, b}, +#define EMe(a, b) {a, b} TRACE_EVENT(btrfs_trigger_flush, @@ -1071,7 +1093,7 @@ TRACE_EVENT(btrfs_trigger_flush, TP_printk_btrfs("%s: flush=%d(%s) flags=%llu(%s) bytes=%llu", __get_str(reason), __entry->flush, - show_flush_action(__entry->flush), + __print_symbolic(__entry->flush, FLUSH_ACTIONS), __entry->flags, __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), From 0840dd28b52f0ceba91265ec62bffdcedb82b4a9 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 19 Jun 2020 15:24:47 +0300 Subject: [PATCH 051/151] btrfs: tracepoints: fix extent type symbolic name print An extent's type is an enum and this requires that the enum values be exported to user space so that user space tools can correctly map raw binary data to the symbolic names. Currently tracepoints using btrfs__file_extent_item_regular or btrfs__file_extent_item_inline result in the following output: fio-443 [002] 586.609450: btrfs_get_extent_show_fi_regular: f0c3bf8e-0174-4bcc-92aa-6c2d62430420:i root=5(FS_TREE) inode=258 size=2136457216 disk_isize=0 file extent range=[2126946304 2136457216] (num_bytes=9510912 ram_bytes=9510912 disk_bytenr=0 disk_num_bytes=0 extent_offset=0 type=0x1 compression=0 E.g. type is 0x1. With this patch applied the output is: disk_bytenr=141348864 disk_num_bytes=4096 extent_offset=0 type=REG compression=0 Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/trace/events/btrfs.h | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 4ccd81fae385..71401d4c3ccc 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -67,11 +67,22 @@ TRACE_DEFINE_ENUM(COMMIT_TRANS); (obj >= BTRFS_ROOT_TREE_OBJECTID && \ obj <= BTRFS_QUOTA_TREE_OBJECTID)) ?
__show_root_type(obj) : "-" -#define show_fi_type(type) \ - __print_symbolic(type, \ - { BTRFS_FILE_EXTENT_INLINE, "INLINE" }, \ - { BTRFS_FILE_EXTENT_REG, "REG" }, \ - { BTRFS_FILE_EXTENT_PREALLOC, "PREALLOC"}) +#define FI_TYPES \ + EM( BTRFS_FILE_EXTENT_INLINE, "INLINE") \ + EM( BTRFS_FILE_EXTENT_REG, "REG") \ + EMe(BTRFS_FILE_EXTENT_PREALLOC, "PREALLOC") + +#undef EM +#undef EMe +#define EM(a, b) TRACE_DEFINE_ENUM(a); +#define EMe(a, b) TRACE_DEFINE_ENUM(a); + +FI_TYPES + +#undef EM +#undef EMe +#define EM(a, b) {a, b}, +#define EMe(a, b) {a, b} #define show_qgroup_rsv_type(type) \ @@ -380,7 +391,7 @@ DECLARE_EVENT_CLASS(btrfs__file_extent_item_regular, __entry->disk_isize, __entry->extent_start, __entry->extent_end, __entry->num_bytes, __entry->ram_bytes, __entry->disk_bytenr, __entry->disk_num_bytes, - __entry->extent_offset, show_fi_type(__entry->extent_type), + __entry->extent_offset, __print_symbolic(__entry->extent_type, FI_TYPES), __entry->compression) ); @@ -421,7 +432,7 @@ DECLARE_EVENT_CLASS( "extent_type=%s compression=%u", show_root_type(__entry->root_obj), __entry->ino, __entry->isize, __entry->disk_isize, __entry->extent_start, - __entry->extent_end, show_fi_type(__entry->extent_type), + __entry->extent_end, __print_symbolic(__entry->extent_type, FI_TYPES), __entry->compression) ); From 5bca2c952c609b128b00a238fadb99cc0d3b65ab Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 19 Jun 2020 15:24:48 +0300 Subject: [PATCH 052/151] btrfs: tracepoints: move FLUSH_ACTIONS define Since all enums used in btrfs' tracepoints are going to be redefined to allow proper parsing of their values by userspace tools, let's rearrange when they are defined. This will allow using only a single #define EM/#undef EM sequence. No functional changes. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/trace/events/btrfs.h | 45 ++++++++++++++---------------------- 1 file changed, 17 insertions(+), 28 deletions(-) diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 71401d4c3ccc..c214622957c4 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -67,18 +67,35 @@ TRACE_DEFINE_ENUM(COMMIT_TRANS); (obj >= BTRFS_ROOT_TREE_OBJECTID && \ obj <= BTRFS_QUOTA_TREE_OBJECTID)) ? __show_root_type(obj) : "-" +#define FLUSH_ACTIONS \ + EM( BTRFS_RESERVE_NO_FLUSH, "BTRFS_RESERVE_NO_FLUSH") \ + EM( BTRFS_RESERVE_FLUSH_LIMIT, "BTRFS_RESERVE_FLUSH_LIMIT") \ + EM( BTRFS_RESERVE_FLUSH_ALL, "BTRFS_RESERVE_FLUSH_ALL") \ + EMe(BTRFS_RESERVE_FLUSH_ALL_STEAL, "BTRFS_RESERVE_FLUSH_ALL_STEAL") + #define FI_TYPES \ EM( BTRFS_FILE_EXTENT_INLINE, "INLINE") \ EM( BTRFS_FILE_EXTENT_REG, "REG") \ EMe(BTRFS_FILE_EXTENT_PREALLOC, "PREALLOC") +/* + * First define the enums in the above macros to be exported to userspace via + * TRACE_DEFINE_ENUM.
+ */ + #undef EM #undef EMe #define EM(a, b) TRACE_DEFINE_ENUM(a); #define EMe(a, b) TRACE_DEFINE_ENUM(a); FLUSH_ACTIONS FI_TYPES /* * Now redefine the EM and EMe macros to map the enums to the strings that will * be printed in the output */ #undef EM #undef EMe #define EM(a, b) {a, b}, @@ -1053,34 +1070,6 @@ TRACE_EVENT(btrfs_space_reservation, __entry->bytes) ); -#define FLUSH_ACTIONS \ - EM( BTRFS_RESERVE_NO_FLUSH, "BTRFS_RESERVE_NO_FLUSH") \ - EM( BTRFS_RESERVE_FLUSH_LIMIT, "BTRFS_RESERVE_FLUSH_LIMIT") \ - EM( BTRFS_RESERVE_FLUSH_ALL, "BTRFS_RESERVE_FLUSH_ALL") \ - EMe(BTRFS_RESERVE_FLUSH_ALL_STEAL, "BTRFS_RESERVE_FLUSH_ALL_STEAL") - -/* - * First define the enums in the above macros to be exported to userspace via - * TRACE_DEFINE_ENUM(). - */ - -#undef EM -#undef EMe -#define EM(a, b) TRACE_DEFINE_ENUM(a); -#define EMe(a, b) TRACE_DEFINE_ENUM(a); - -FLUSH_ACTIONS - -/* - * Now redefine the EM and EMe macros to map the enums to the strings that will - * be printed in the output - */ - -#undef EM -#undef EMe -#define EM(a, b) {a, b}, -#define EMe(a, b) {a, b} - TRACE_EVENT(btrfs_trigger_flush, TP_PROTO(const struct btrfs_fs_info *fs_info, u64 flags, u64 bytes, From 1cb1f0b2486b0893a3ebf20c42f2df27649ae2b4 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 19 Jun 2020 15:24:49 +0300 Subject: [PATCH 053/151] btrfs: tracepoints: fix qgroup reservation type printing Since qgroup's reservation types are defined in a macro, they must be exported to user space in order for user space tools to convert raw binary data to symbolic names. Currently trace-cmd report produces the following output: kworker/u8:2-459 [003] 1208.543587: qgroup_update_reserve: 2b742cae-e0e5-4def-9ef7-28a9b34a951e: qgid=5 type=0x2 cur_reserved=54870016 diff=-32768 With this fix the output is: kworker/u8:2-459 [003] 1208.543587: qgroup_update_reserve: 2b742cae-e0e5-4def-9ef7-28a9b34a951e: qgid=5 type=BTRFS_QGROUP_RSV_META_PREALLOC cur_reserved=54870016 diff=-32768 Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/trace/events/btrfs.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index c214622957c4..f0e95e3f1d1d 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -78,6 +78,11 @@ TRACE_DEFINE_ENUM(COMMIT_TRANS); EM( BTRFS_FILE_EXTENT_REG, "REG") \ EMe(BTRFS_FILE_EXTENT_PREALLOC, "PREALLOC") +#define QGROUP_RSV_TYPES \ + EM( BTRFS_QGROUP_RSV_DATA, "DATA") \ + EM( BTRFS_QGROUP_RSV_META_PERTRANS, "META_PERTRANS") \ + EMe(BTRFS_QGROUP_RSV_META_PREALLOC, "META_PREALLOC") + /* * First define the enums in the above macros to be exported to userspace via * TRACE_DEFINE_ENUM().
@@ -90,6 +95,7 @@ TRACE_DEFINE_ENUM(COMMIT_TRANS); FLUSH_ACTIONS FI_TYPES +QGROUP_RSV_TYPES /* * Now redefine the EM and EMe macros to map the enums to the strings that will @@ -101,12 +107,6 @@ FI_TYPES #define EM(a, b) {a, b}, #define EMe(a, b) {a, b} -#define show_qgroup_rsv_type(type) \ - __print_symbolic(type, \ - { BTRFS_QGROUP_RSV_DATA, "DATA" }, \ - { BTRFS_QGROUP_RSV_META_PERTRANS, "META_PERTRANS" }, \ - { BTRFS_QGROUP_RSV_META_PREALLOC, "META_PREALLOC" }) - #define show_extent_io_tree_owner(owner) \ __print_symbolic(owner, \ { IO_TREE_FS_PINNED_EXTENTS, "PINNED_EXTENTS" }, \ @@ -1712,7 +1712,7 @@ TRACE_EVENT(qgroup_update_reserve, ), TP_printk_btrfs("qgid=%llu type=%s cur_reserved=%llu diff=%lld", - __entry->qgid, show_qgroup_rsv_type(__entry->type), + __entry->qgid, __print_symbolic(__entry->type, QGROUP_RSV_TYPES), __entry->cur_reserved, __entry->diff) ); @@ -1736,7 +1736,7 @@ TRACE_EVENT(qgroup_meta_reserve, TP_printk_btrfs("refroot=%llu(%s) type=%s diff=%lld", show_root_type(__entry->refroot), - show_qgroup_rsv_type(__entry->type), __entry->diff) + __print_symbolic(__entry->type, QGROUP_RSV_TYPES), __entry->diff) ); TRACE_EVENT(qgroup_meta_convert, @@ -1757,8 +1757,8 @@ TRACE_EVENT(qgroup_meta_convert, TP_printk_btrfs("refroot=%llu(%s) type=%s->%s diff=%lld", show_root_type(__entry->refroot), - show_qgroup_rsv_type(BTRFS_QGROUP_RSV_META_PREALLOC), - show_qgroup_rsv_type(BTRFS_QGROUP_RSV_META_PERTRANS), + __print_symbolic(BTRFS_QGROUP_RSV_META_PREALLOC, QGROUP_RSV_TYPES), + __print_symbolic(BTRFS_QGROUP_RSV_META_PERTRANS, QGROUP_RSV_TYPES), __entry->diff) ); @@ -1784,7 +1784,7 @@ TRACE_EVENT(qgroup_meta_free_all_pertrans, TP_printk_btrfs("refroot=%llu(%s) type=%s diff=%lld", show_root_type(__entry->refroot), - show_qgroup_rsv_type(__entry->type), __entry->diff) + __print_symbolic(__entry->type, QGROUP_RSV_TYPES), __entry->diff) ); DECLARE_EVENT_CLASS(btrfs__prelim_ref, From c92bb3046ff639df5c39fea6cfedea6e9885e36d Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 19 Jun 2020 15:24:50 +0300 Subject: [PATCH 054/151] btrfs: tracepoints: switch extent_io_tree_owner to using EM macro This fixes the printout of the extent io tree owner in the btrfs_set_extent_bit/btrfs_clear_extent_bit/btrfs_convert_extent_bit tracepoints. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/trace/events/btrfs.h | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index f0e95e3f1d1d..880a5c0e0f21 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -83,6 +83,18 @@ TRACE_DEFINE_ENUM(COMMIT_TRANS); EM( BTRFS_QGROUP_RSV_META_PERTRANS, "META_PERTRANS") \ EMe(BTRFS_QGROUP_RSV_META_PREALLOC, "META_PREALLOC") +#define IO_TREE_OWNER \ + EM( IO_TREE_FS_PINNED_EXTENTS, "PINNED_EXTENTS") \ + EM( IO_TREE_FS_EXCLUDED_EXTENTS, "EXCLUDED_EXTENTS") \ + EM( IO_TREE_INODE_IO, "INODE_IO") \ + EM( IO_TREE_INODE_IO_FAILURE, "INODE_IO_FAILURE") \ + EM( IO_TREE_RELOC_BLOCKS, "RELOC_BLOCKS") \ + EM( IO_TREE_TRANS_DIRTY_PAGES, "TRANS_DIRTY_PAGES") \ + EM( IO_TREE_ROOT_DIRTY_LOG_PAGES, "ROOT_DIRTY_LOG_PAGES") \ + EM( IO_TREE_INODE_FILE_EXTENT, "INODE_FILE_EXTENT") \ + EM( IO_TREE_LOG_CSUM_RANGE, "LOG_CSUM_RANGE") \ + EMe(IO_TREE_SELFTEST, "SELFTEST") /* * First define the enums in the above macros to be exported to userspace via * TRACE_DEFINE_ENUM().
@@ -96,6 +108,7 @@ TRACE_DEFINE_ENUM(COMMIT_TRANS); FLUSH_ACTIONS FI_TYPES QGROUP_RSV_TYPES +IO_TREE_OWNER /* * Now redefine the EM and EMe macros to map the enums to the strings that will @@ -107,18 +120,6 @@ QGROUP_RSV_TYPES #define EM(a, b) {a, b}, #define EMe(a, b) {a, b} -#define show_extent_io_tree_owner(owner) \ - __print_symbolic(owner, \ - { IO_TREE_FS_PINNED_EXTENTS, "PINNED_EXTENTS" }, \ - { IO_TREE_FS_EXCLUDED_EXTENTS, "EXCLUDED_EXTENTS" }, \ - { IO_TREE_INODE_IO, "INODE_IO" }, \ - { IO_TREE_INODE_IO_FAILURE, "INODE_IO_FAILURE" }, \ - { IO_TREE_RELOC_BLOCKS, "RELOC_BLOCKS" }, \ - { IO_TREE_TRANS_DIRTY_PAGES, "TRANS_DIRTY_PAGES" }, \ - { IO_TREE_ROOT_DIRTY_LOG_PAGES, "ROOT_DIRTY_LOG_PAGES" }, \ - { IO_TREE_INODE_FILE_EXTENT, "INODE_FILE_EXTENT" }, \ - { IO_TREE_LOG_CSUM_RANGE, "LOG_CSUM_RANGE" }, \ - { IO_TREE_SELFTEST, "SELFTEST" }) #define BTRFS_GROUP_FLAGS \ { BTRFS_BLOCK_GROUP_DATA, "DATA"}, \ @@ -1942,7 +1943,7 @@ TRACE_EVENT(btrfs_set_extent_bit, TP_printk_btrfs( "io_tree=%s ino=%llu root=%llu start=%llu len=%llu set_bits=%s", - show_extent_io_tree_owner(__entry->owner), __entry->ino, + __print_symbolic(__entry->owner, IO_TREE_OWNER), __entry->ino, __entry->rootid, __entry->start, __entry->len, __print_flags(__entry->set_bits, "|", EXTENT_FLAGS)) ); @@ -1981,7 +1982,7 @@ TRACE_EVENT(btrfs_clear_extent_bit, TP_printk_btrfs( "io_tree=%s ino=%llu root=%llu start=%llu len=%llu clear_bits=%s", - show_extent_io_tree_owner(__entry->owner), __entry->ino, + __print_symbolic(__entry->owner, IO_TREE_OWNER), __entry->ino, __entry->rootid, __entry->start, __entry->len, __print_flags(__entry->clear_bits, "|", EXTENT_FLAGS)) ); @@ -2022,7 +2023,7 @@ TRACE_EVENT(btrfs_convert_extent_bit, TP_printk_btrfs( "io_tree=%s ino=%llu root=%llu start=%llu len=%llu set_bits=%s clear_bits=%s", - show_extent_io_tree_owner(__entry->owner), __entry->ino, + __print_symbolic(__entry->owner, IO_TREE_OWNER), __entry->ino, __entry->rootid, __entry->start, __entry->len, __print_flags(__entry->set_bits , "|", EXTENT_FLAGS), __print_flags(__entry->clear_bits, "|", EXTENT_FLAGS)) From f0cdd15c219dd0be8cae47f60b773efe1361336e Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 19 Jun 2020 15:24:51 +0300 Subject: [PATCH 055/151] btrfs: tracepoints: convert flush states to using EM macros Only 6 out of all flush states were being printed correctly since only they were exported via the TRACE_DEFINE_ENUM macro. This patch converts all flush states to use the newly introduced EM macro so that they can all be printed correctly. 
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/trace/events/btrfs.h | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 880a5c0e0f21..863335ecb7e8 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -31,13 +31,6 @@ struct extent_io_tree; struct prelim_ref; struct btrfs_space_info; -TRACE_DEFINE_ENUM(FLUSH_DELAYED_ITEMS_NR); -TRACE_DEFINE_ENUM(FLUSH_DELAYED_ITEMS); -TRACE_DEFINE_ENUM(FLUSH_DELALLOC); -TRACE_DEFINE_ENUM(FLUSH_DELALLOC_WAIT); -TRACE_DEFINE_ENUM(ALLOC_CHUNK); -TRACE_DEFINE_ENUM(COMMIT_TRANS); - #define show_ref_type(type) \ __print_symbolic(type, \ { BTRFS_TREE_BLOCK_REF_KEY, "TREE_BLOCK_REF" }, \ @@ -95,6 +88,18 @@ TRACE_DEFINE_ENUM(COMMIT_TRANS); EM( IO_TREE_LOG_CSUM_RANGE, "LOG_CSUM_RANGE") \ EMe(IO_TREE_SELFTEST, "SELFTEST") +#define FLUSH_STATES \ + EM( FLUSH_DELAYED_ITEMS_NR, "FLUSH_DELAYED_ITEMS_NR") \ + EM( FLUSH_DELAYED_ITEMS, "FLUSH_DELAYED_ITEMS") \ + EM( FLUSH_DELALLOC, "FLUSH_DELALLOC") \ + EM( FLUSH_DELALLOC_WAIT, "FLUSH_DELALLOC_WAIT") \ + EM( FLUSH_DELAYED_REFS_NR, "FLUSH_DELAYED_REFS_NR") \ + EM( FLUSH_DELAYED_REFS, "FLUSH_ELAYED_REFS") \ + EM( ALLOC_CHUNK, "ALLOC_CHUNK") \ + EM( ALLOC_CHUNK_FORCE, "ALLOC_CHUNK_FORCE") \ + EM( RUN_DELAYED_IPUTS, "RUN_DELAYED_IPUTS") \ + EMe(COMMIT_TRANS, "COMMIT_TRANS") + /* * First define the enums in the above macros to be exported to userspace via * TRACE_DEFINE_ENUM(). @@ -109,6 +114,7 @@ FLUSH_ACTIONS FI_TYPES QGROUP_RSV_TYPES IO_TREE_OWNER +FLUSH_STATES /* * Now redefine the EM and EMe macros to map the enums to the strings that will @@ -1101,18 +1107,6 @@ TRACE_EVENT(btrfs_trigger_flush, __entry->bytes) ); -#define show_flush_state(state) \ - __print_symbolic(state, \ - { FLUSH_DELAYED_ITEMS_NR, "FLUSH_DELAYED_ITEMS_NR"}, \ - { FLUSH_DELAYED_ITEMS, "FLUSH_DELAYED_ITEMS"}, \ - { FLUSH_DELALLOC, "FLUSH_DELALLOC"}, \ - { FLUSH_DELALLOC_WAIT, "FLUSH_DELALLOC_WAIT"}, \ - { FLUSH_DELAYED_REFS_NR, "FLUSH_DELAYED_REFS_NR"}, \ - { FLUSH_DELAYED_REFS, "FLUSH_ELAYED_REFS"}, \ - { ALLOC_CHUNK, "ALLOC_CHUNK"}, \ - { ALLOC_CHUNK_FORCE, "ALLOC_CHUNK_FORCE"}, \ - { RUN_DELAYED_IPUTS, "RUN_DELAYED_IPUTS"}, \ - { COMMIT_TRANS, "COMMIT_TRANS"}) TRACE_EVENT(btrfs_flush_space, @@ -1137,7 +1131,7 @@ TRACE_EVENT(btrfs_flush_space, TP_printk_btrfs("state=%d(%s) flags=%llu(%s) num_bytes=%llu ret=%d", __entry->state, - show_flush_state(__entry->state), + __print_symbolic(__entry->state, FLUSH_STATES), __entry->flags, __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), From df2cfd131fd33dbef1ce33be8b332b1f3d645f35 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:09 +0300 Subject: [PATCH 056/151] btrfs: make qgroup_free_reserved_data take btrfs_inode It only uses btrfs_inode so can just as easily take it as an argument. 
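For readers following the conversion series, the relationship between the two inode types can be sketched like this (simplified; only the embedding matters here):

    struct btrfs_inode {
            /* ... btrfs-specific fields ... */
            struct inode vfs_inode;         /* embedded VFS inode */
    };

    /* VFS inode -> btrfs inode: container_of() behind BTRFS_I() */
    struct btrfs_inode *bi = BTRFS_I(inode);

    /* btrfs inode -> VFS inode: just take the member's address */
    struct inode *vfs = &bi->vfs_inode;

Both directions are constant pointer arithmetic, which is why these interface conversions are free at runtime.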
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 74eb98479109..d0031afe993a 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3491,10 +3491,10 @@ cleanup: } /* Free ranges specified by @reserved, normally in error path */ -static int qgroup_free_reserved_data(struct inode *inode, +static int qgroup_free_reserved_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct ulist_node *unode; struct ulist_iterator uiter; struct extent_changeset changeset; @@ -3530,8 +3530,8 @@ static int qgroup_free_reserved_data(struct inode *inode, * EXTENT_QGROUP_RESERVED, we won't double free. * So not need to rush. */ - ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, - free_start, free_start + free_len - 1, + ret = clear_record_extent_bits(&inode->io_tree, free_start, + free_start + free_len - 1, EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) goto out; @@ -3560,7 +3560,8 @@ static int __btrfs_qgroup_release_data(struct inode *inode, /* In release case, we shouldn't have @reserved */ WARN_ON(!free && reserved); if (free && reserved) - return qgroup_free_reserved_data(inode, reserved, start, len); + return qgroup_free_reserved_data(BTRFS_I(inode), reserved, + start, len); extent_changeset_init(&changeset); ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len -1, EXTENT_QGROUP_RESERVED, &changeset); From 8769af96cf6355ae96c808265a05171226b4b6e7 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:10 +0300 Subject: [PATCH 057/151] btrfs: make __btrfs_qgroup_release_data take btrfs_inode It uses vfs_inode only for a tracepoint so convert its interface to take btrfs_inode directly. 
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index d0031afe993a..de2812ee73c1 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3545,7 +3545,7 @@ out: return ret; } -static int __btrfs_qgroup_release_data(struct inode *inode, +static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len, int free) { @@ -3553,28 +3553,26 @@ static int __btrfs_qgroup_release_data(struct inode *inode, int trace_op = QGROUP_RELEASE; int ret; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, - &BTRFS_I(inode)->root->fs_info->flags)) + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags)) return 0; /* In release case, we shouldn't have @reserved */ WARN_ON(!free && reserved); if (free && reserved) - return qgroup_free_reserved_data(BTRFS_I(inode), reserved, - start, len); + return qgroup_free_reserved_data(inode, reserved, start, len); extent_changeset_init(&changeset); - ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, - start + len -1, EXTENT_QGROUP_RESERVED, &changeset); + ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1, + EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) goto out; if (free) trace_op = QGROUP_FREE; - trace_btrfs_qgroup_release_data(inode, start, len, + trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len, changeset.bytes_changed, trace_op); if (free) - btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, - BTRFS_I(inode)->root->root_key.objectid, + btrfs_qgroup_free_refroot(inode->root->fs_info, + inode->root->root_key.objectid, changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); ret = changeset.bytes_changed; out: @@ -3597,7 +3595,7 @@ out: int btrfs_qgroup_free_data(struct inode *inode, struct extent_changeset *reserved, u64 start, u64 len) { - return __btrfs_qgroup_release_data(inode, reserved, start, len, 1); + return __btrfs_qgroup_release_data(BTRFS_I(inode), reserved, start, len, 1); } /* @@ -3617,7 +3615,7 @@ int btrfs_qgroup_free_data(struct inode *inode, */ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len) { - return __btrfs_qgroup_release_data(inode, NULL, start, len, 0); + return __btrfs_qgroup_release_data(BTRFS_I(inode), NULL, start, len, 0); } static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes, From 8b8a979f1fc69054f99abab80daeae89aba3f19b Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:11 +0300 Subject: [PATCH 058/151] btrfs: make btrfs_qgroup_free_data take btrfs_inode It passes btrfs_inode to its callee so change the interface. 
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delalloc-space.c | 2 +- fs/btrfs/inode.c | 7 ++++--- fs/btrfs/ordered-data.c | 3 +-- fs/btrfs/qgroup.c | 4 ++-- fs/btrfs/qgroup.h | 6 +++--- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index d05648f882ca..31440137accf 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -301,7 +301,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, start = round_down(start, root->fs_info->sectorsize); btrfs_free_reserved_data_space_noquota(inode, len); - btrfs_qgroup_free_data(inode, reserved, start, len); + btrfs_qgroup_free_data(BTRFS_I(inode), reserved, start, len); } /** diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bd51365e53fb..ec3d13303ebc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -354,7 +354,7 @@ out: * And at reserve time, it's always aligned to page size, so * just free one page here. */ - btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE); + btrfs_qgroup_free_data(BTRFS_I(inode), NULL, 0, PAGE_SIZE); btrfs_free_path(path); btrfs_end_transaction(trans); return ret; @@ -4994,7 +4994,8 @@ static void evict_inode_truncate_pages(struct inode *inode) * Note, end is the bytenr of last byte, so we need + 1 here. */ if (state_flags & EXTENT_DELALLOC) - btrfs_qgroup_free_data(inode, NULL, start, end - start + 1); + btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start, + end - start + 1); clear_extent_bit(io_tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC | @@ -8178,7 +8179,7 @@ again: * bit of its io_tree, and free the qgroup reserved data space. * Since the IO will never happen for this page. */ - btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); + btrfs_qgroup_free_data(BTRFS_I(inode), NULL, page_start, PAGE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index cb737729f610..5f9ba8280560 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -173,8 +173,7 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_PREALLOC) { /* For nocow write, we can release the qgroup rsv right now */ - ret = btrfs_qgroup_free_data(&inode->vfs_inode, NULL, file_offset, - num_bytes); + ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); if (ret < 0) return ret; ret = 0; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index de2812ee73c1..7795c7616fe8 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3592,10 +3592,10 @@ out: * * NOTE: This function may sleep for memory allocation. 
*/ -int btrfs_qgroup_free_data(struct inode *inode, +int btrfs_qgroup_free_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len) { - return __btrfs_qgroup_release_data(BTRFS_I(inode), reserved, start, len, 1); + return __btrfs_qgroup_release_data(inode, reserved, start, len, 1); } /* diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 3be5198a3719..0449bfd6fe45 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -347,9 +347,9 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, int btrfs_qgroup_reserve_data(struct inode *inode, struct extent_changeset **reserved, u64 start, u64 len); int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len); -int btrfs_qgroup_free_data(struct inode *inode, - struct extent_changeset *reserved, u64 start, u64 len); - +int btrfs_qgroup_free_data(struct btrfs_inode *inode, + struct extent_changeset *reserved, u64 start, + u64 len); int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type, bool enforce); /* Reserve metadata space for pertrans and prealloc type */ From a0349401c14f507990bbe052406498fd527f7df7 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:12 +0300 Subject: [PATCH 059/151] btrfs: make cow_file_range_inline take btrfs_inode It has only 2 uses for the vfs_inode - insert_inline_extent and i_size_read. On the flipside it will allow converting its callers to btrfs_inode, so convert it to taking btrfs_inode. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ec3d13303ebc..7a815f1d9e0d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -274,15 +274,15 @@ fail: * does the checks required to make sure the data is small enough * to fit as an inline extent. 
*/ -static noinline int cow_file_range_inline(struct inode *inode, u64 start, +static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start, u64 end, size_t compressed_size, int compress_type, struct page **compressed_pages) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; - u64 isize = i_size_read(inode); + u64 isize = i_size_read(&inode->vfs_inode); u64 actual_end = min(end + 1, isize); u64 inline_len = actual_end - start; u64 aligned_end = ALIGN(end, fs_info->sectorsize); @@ -314,7 +314,7 @@ static noinline int cow_file_range_inline(struct inode *inode, u64 start, btrfs_free_path(path); return PTR_ERR(trans); } - trans->block_rsv = &BTRFS_I(inode)->block_rsv; + trans->block_rsv = &inode->block_rsv; if (compressed_size && compressed_pages) extent_item_size = btrfs_file_extent_calc_inline_size( @@ -323,9 +323,9 @@ static noinline int cow_file_range_inline(struct inode *inode, u64 start, extent_item_size = btrfs_file_extent_calc_inline_size( inline_len); - ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path, - start, aligned_end, NULL, - 1, 1, extent_item_size, &extent_inserted); + ret = __btrfs_drop_extents(trans, root, inode, path, start, aligned_end, + NULL, 1, 1, extent_item_size, + &extent_inserted); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -334,7 +334,7 @@ static noinline int cow_file_range_inline(struct inode *inode, u64 start, if (isize > actual_end) inline_len = min_t(u64, isize, actual_end); ret = insert_inline_extent(trans, path, extent_inserted, - root, inode, start, + root, &inode->vfs_inode, start, inline_len, compressed_size, compress_type, compressed_pages); if (ret && ret != -ENOSPC) { @@ -345,8 +345,8 @@ static noinline int cow_file_range_inline(struct inode *inode, u64 start, goto out; } - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0); + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); + btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); out: /* * Don't forget to free the reserved space, as for inlined extent @@ -354,7 +354,7 @@ out: * And at reserve time, it's always aligned to page size, so * just free one page here. */ - btrfs_qgroup_free_data(BTRFS_I(inode), NULL, 0, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE); btrfs_free_path(path); btrfs_end_transaction(trans); return ret; @@ -616,11 +616,12 @@ cont: /* we didn't compress the entire range, try * to make an uncompressed inline extent. 
*/ - ret = cow_file_range_inline(inode, start, end, 0, - BTRFS_COMPRESS_NONE, NULL); + ret = cow_file_range_inline(BTRFS_I(inode), start, end, + 0, BTRFS_COMPRESS_NONE, + NULL); } else { /* try making a compressed inline extent */ - ret = cow_file_range_inline(inode, start, end, + ret = cow_file_range_inline(BTRFS_I(inode), start, end, total_compressed, compress_type, pages); } @@ -1009,7 +1010,7 @@ static noinline int cow_file_range(struct inode *inode, if (start == 0) { /* lets try to make an inline extent */ - ret = cow_file_range_inline(inode, start, end, 0, + ret = cow_file_range_inline(BTRFS_I(inode), start, end, 0, BTRFS_COMPRESS_NONE, NULL); if (ret == 0) { /* From e7fbf60453a7eae2a36ac9096c84ccb1067dabdf Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:13 +0300 Subject: [PATCH 060/151] btrfs: make btrfs_add_ordered_extent take btrfs_inode Preparation to converting its callers to taking btrfs_inode. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 9 +++++---- fs/btrfs/ordered-data.c | 4 ++-- fs/btrfs/ordered-data.h | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7a815f1d9e0d..2d053c33c380 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1081,8 +1081,9 @@ static noinline int cow_file_range(struct inode *inode, } free_extent_map(em); - ret = btrfs_add_ordered_extent(inode, start, ins.objectid, - ram_size, cur_alloc_size, 0); + ret = btrfs_add_ordered_extent(BTRFS_I(inode), start, + ins.objectid, ram_size, + cur_alloc_size, 0); if (ret) goto out_drop_extent_cache; @@ -1716,7 +1717,7 @@ out_check: goto error; } free_extent_map(em); - ret = btrfs_add_ordered_extent(inode, cur_offset, + ret = btrfs_add_ordered_extent(BTRFS_I(inode), cur_offset, disk_bytenr, num_bytes, num_bytes, BTRFS_ORDERED_PREALLOC); @@ -1728,7 +1729,7 @@ out_check: goto error; } } else { - ret = btrfs_add_ordered_extent(inode, cur_offset, + ret = btrfs_add_ordered_extent(BTRFS_I(inode), cur_offset, disk_bytenr, num_bytes, num_bytes, BTRFS_ORDERED_NOCOW); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 5f9ba8280560..156063373cc1 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -252,11 +252,11 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset return 0; } -int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, +int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type) { - return __btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, disk_bytenr, + return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, num_bytes, disk_num_bytes, type, 0, BTRFS_COMPRESS_NONE); } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index f2a78f8f6bce..6c42f307b87f 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -151,7 +151,7 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, u64 *file_offset, u64 io_size, int uptodate); -int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, +int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type); int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, From 6e26c442233b9e2ecdebdfb5b75fd114b15884df Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:14 +0300 Subject: 
[PATCH 061/151] btrfs: make cow_file_range take btrfs_inode All its children functions take btrfs_inode so convert it to taking btrfs_inode. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 52 ++++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2d053c33c380..f1b66901dc55 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -80,7 +80,7 @@ struct kmem_cache *btrfs_free_space_bitmap_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct inode *inode, bool skip_writeback); static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); -static noinline int cow_file_range(struct inode *inode, +static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, int unlock); @@ -789,7 +789,8 @@ retry: unsigned long nr_written = 0; /* allocate blocks */ - ret = cow_file_range(inode, async_chunk->locked_page, + ret = cow_file_range(BTRFS_I(inode), + async_chunk->locked_page, async_extent->start, async_extent->start + async_extent->ram_size - 1, @@ -976,13 +977,13 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * required to start IO on it. It may be clean and already done with * IO when we return. */ -static noinline int cow_file_range(struct inode *inode, +static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, int unlock) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; u64 alloc_hint = 0; u64 num_bytes; unsigned long ram_size; @@ -996,7 +997,7 @@ static noinline int cow_file_range(struct inode *inode, bool extent_reserved = false; int ret = 0; - if (btrfs_is_free_space_inode(BTRFS_I(inode))) { + if (btrfs_is_free_space_inode(inode)) { WARN_ON_ONCE(1); ret = -EINVAL; goto out_unlock; @@ -1006,11 +1007,11 @@ static noinline int cow_file_range(struct inode *inode, num_bytes = max(blocksize, num_bytes); ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy)); - inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K); + inode_should_defrag(inode, start, end, num_bytes, SZ_64K); if (start == 0) { /* lets try to make an inline extent */ - ret = cow_file_range_inline(BTRFS_I(inode), start, end, 0, + ret = cow_file_range_inline(inode, start, end, 0, BTRFS_COMPRESS_NONE, NULL); if (ret == 0) { /* @@ -1019,8 +1020,7 @@ static noinline int cow_file_range(struct inode *inode, * our outstanding extent for clearing delalloc for this * range. 
*/ - extent_clear_unlock_delalloc(BTRFS_I(inode), start, end, - NULL, + extent_clear_unlock_delalloc(inode, start, end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | @@ -1035,10 +1035,8 @@ static noinline int cow_file_range(struct inode *inode, } } - alloc_hint = get_extent_allocation_hint(BTRFS_I(inode), start, - num_bytes); - btrfs_drop_extent_cache(BTRFS_I(inode), start, - start + num_bytes - 1, 0); + alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); + btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); /* * Relocation relies on the relocated extents to have exactly the same @@ -1067,7 +1065,7 @@ static noinline int cow_file_range(struct inode *inode, extent_reserved = true; ram_size = ins.offset; - em = create_io_em(BTRFS_I(inode), start, ins.offset, /* len */ + em = create_io_em(inode, start, ins.offset, /* len */ start, /* orig_start */ ins.objectid, /* block_start */ ins.offset, /* block_len */ @@ -1081,15 +1079,14 @@ static noinline int cow_file_range(struct inode *inode, } free_extent_map(em); - ret = btrfs_add_ordered_extent(BTRFS_I(inode), start, - ins.objectid, ram_size, - cur_alloc_size, 0); + ret = btrfs_add_ordered_extent(inode, start, ins.objectid, + ram_size, cur_alloc_size, 0); if (ret) goto out_drop_extent_cache; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { - ret = btrfs_reloc_clone_csums(BTRFS_I(inode), start, + ret = btrfs_reloc_clone_csums(inode, start, cur_alloc_size); /* * Only drop cache here, and process as normal. @@ -1103,7 +1100,7 @@ static noinline int cow_file_range(struct inode *inode, * skip current ordered extent. */ if (ret) - btrfs_drop_extent_cache(BTRFS_I(inode), start, + btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); } @@ -1119,8 +1116,7 @@ static noinline int cow_file_range(struct inode *inode, page_ops = unlock ? PAGE_UNLOCK : 0; page_ops |= PAGE_SET_PRIVATE2; - extent_clear_unlock_delalloc(BTRFS_I(inode), start, - start + ram_size - 1, + extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); @@ -1144,7 +1140,7 @@ out: return ret; out_drop_extent_cache: - btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0); + btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); @@ -1164,7 +1160,7 @@ out_unlock: * it the flag EXTENT_CLEAR_DATA_RESV. 
*/ if (extent_reserved) { - extent_clear_unlock_delalloc(BTRFS_I(inode), start, + extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, locked_page, clear_bits, @@ -1173,7 +1169,7 @@ out_unlock: if (start >= end) goto out; } - extent_clear_unlock_delalloc(BTRFS_I(inode), start, end, locked_page, + extent_clear_unlock_delalloc(inode, start, end, locked_page, clear_bits | EXTENT_CLEAR_DATA_RESV, page_ops); goto out; @@ -1441,8 +1437,8 @@ static int fallback_to_cow(struct inode *inode, struct page *locked_page, 0, 0, NULL); } - return cow_file_range(inode, locked_page, start, end, page_started, - nr_written, 1); + return cow_file_range(BTRFS_I(inode), locked_page, start, end, + page_started, nr_written, 1); } /* @@ -1838,7 +1834,7 @@ int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page, page_started, 0, nr_written); } else if (!inode_can_compress(inode) || !inode_need_compress(inode, start, end)) { - ret = cow_file_range(inode, locked_page, start, end, + ret = cow_file_range(BTRFS_I(inode), locked_page, start, end, page_started, nr_written, 1); } else { set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, From 4cc612090ba5828fb301623fa8cdf0d7a165f91c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:15 +0300 Subject: [PATCH 062/151] btrfs: make btrfs_add_ordered_extent_compress take btrfs_inode It simply forwards its inode argument to __btrfs_add_ordered_extent which already takes btrfs_inode. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- fs/btrfs/ordered-data.c | 4 ++-- fs/btrfs/ordered-data.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f1b66901dc55..982fe4cca715 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -862,7 +862,7 @@ retry: goto out_free_reserve; free_extent_map(em); - ret = btrfs_add_ordered_extent_compress(inode, + ret = btrfs_add_ordered_extent_compress(BTRFS_I(inode), async_extent->start, ins.objectid, async_extent->ram_size, diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 156063373cc1..838fec9e245f 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -270,12 +270,12 @@ int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, BTRFS_COMPRESS_NONE); } -int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, +int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type, int compress_type) { - return __btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, disk_bytenr, + return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, num_bytes, disk_num_bytes, type, 0, compress_type); } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 6c42f307b87f..03865f721164 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -157,7 +157,7 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type); -int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, +int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type, int compress_type); From c7ee1819dc7169348eb93a088970ae143aa27435 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:16 +0300 Subject:
[PATCH 063/151] btrfs: make btrfs_submit_compressed_write take btrfs_inode Majority of its uses are for btrfs_inode so take it as an argument directly. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 15 +++++++-------- fs/btrfs/compression.h | 4 +++- fs/btrfs/inode.c | 2 +- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 4f52cd8af517..c2d5ca583dbf 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -405,7 +405,7 @@ out: * This also checksums the file bytes and gets things ready for * the end io hooks. */ -blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, +blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page **compressed_pages, @@ -413,7 +413,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned int write_flags, struct cgroup_subsys_state *blkcg_css) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio = NULL; struct compressed_bio *cb; unsigned long bytes_left; @@ -421,7 +421,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, struct page *page; u64 first_byte = disk_start; blk_status_t ret; - int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + int skip_sum = inode->flags & BTRFS_INODE_NODATASUM; WARN_ON(!PAGE_ALIGNED(start)); cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); @@ -429,7 +429,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, return BLK_STS_RESOURCE; refcount_set(&cb->pending_bios, 0); cb->errors = 0; - cb->inode = inode; + cb->inode = &inode->vfs_inode; cb->start = start; cb->len = len; cb->mirror_num = 0; @@ -455,7 +455,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, int submit = 0; page = compressed_pages[pg_index]; - page->mapping = inode->i_mapping; + page->mapping = inode->vfs_inode.i_mapping; if (bio->bi_iter.bi_size) submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio, 0); @@ -475,8 +475,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, BUG_ON(ret); /* -ENOMEM */ if (!skip_sum) { - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, - start, 1); + ret = btrfs_csum_one_bio(inode, bio, start, 1); BUG_ON(ret); /* -ENOMEM */ } @@ -508,7 +507,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, BUG_ON(ret); /* -ENOMEM */ if (!skip_sum) { - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, start, 1); + ret = btrfs_csum_one_bio(inode, bio, start, 1); BUG_ON(ret); /* -ENOMEM */ } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 284a3ad31350..9f3dbe372631 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -8,6 +8,8 @@ #include +struct btrfs_inode; + /* * We want to make sure that amount of RAM required to uncompress an extent is * reasonable, so we limit the total size in ram of a compressed extent to @@ -88,7 +90,7 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, unsigned long total_out, u64 disk_start, struct bio *bio); -blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, +blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page 
**compressed_pages, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 982fe4cca715..e019800eaab2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -887,7 +887,7 @@ retry: NULL, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK); - if (btrfs_submit_compressed_write(inode, + if (btrfs_submit_compressed_write(BTRFS_I(inode), async_extent->start, async_extent->ram_size, ins.objectid, From a0ff10dcc4a5185e9df2e9d5349f8b03cc909b23 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:17 +0300 Subject: [PATCH 064/151] btrfs: make submit_compressed_extents take btrfs_inode All but 3 uses require vfs_inode so convert the logic to have btrfs_inode be the main inode struct. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e019800eaab2..7aeaa9fe18a6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -764,14 +764,14 @@ static void free_async_extent_pages(struct async_extent *async_extent) */ static noinline void submit_compressed_extents(struct async_chunk *async_chunk) { - struct inode *inode = async_chunk->inode; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_inode *inode = BTRFS_I(async_chunk->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct async_extent *async_extent; u64 alloc_hint = 0; struct btrfs_key ins; struct extent_map *em; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_root *root = inode->root; + struct extent_io_tree *io_tree = &inode->io_tree; int ret = 0; again: @@ -789,8 +789,7 @@ retry: unsigned long nr_written = 0; /* allocate blocks */ - ret = cow_file_range(BTRFS_I(inode), - async_chunk->locked_page, + ret = cow_file_range(inode, async_chunk->locked_page, async_extent->start, async_extent->start + async_extent->ram_size - 1, @@ -805,7 +804,7 @@ retry: * all those pages down to the drive. */ if (!page_started && !ret) - extent_write_locked_range(inode, + extent_write_locked_range(&inode->vfs_inode, async_extent->start, async_extent->start + async_extent->ram_size - 1, @@ -835,7 +834,7 @@ retry: * will not submit these pages down to lower * layers. */ - extent_range_redirty_for_io(inode, + extent_range_redirty_for_io(&inode->vfs_inode, async_extent->start, async_extent->start + async_extent->ram_size - 1); @@ -848,7 +847,7 @@ retry: * here we're doing allocation and writeback of the * compressed pages */ - em = create_io_em(BTRFS_I(inode), async_extent->start, + em = create_io_em(inode, async_extent->start, async_extent->ram_size, /* len */ async_extent->start, /* orig_start */ ins.objectid, /* block_start */ @@ -862,7 +861,7 @@ retry: goto out_free_reserve; free_extent_map(em); - ret = btrfs_add_ordered_extent_compress(BTRFS_I(inode), + ret = btrfs_add_ordered_extent_compress(inode, async_extent->start, ins.objectid, async_extent->ram_size, @@ -870,8 +869,7 @@ retry: BTRFS_ORDERED_COMPRESSED, async_extent->compress_type); if (ret) { - btrfs_drop_extent_cache(BTRFS_I(inode), - async_extent->start, + btrfs_drop_extent_cache(inode, async_extent->start, async_extent->start + async_extent->ram_size - 1, 0); goto out_free_reserve; @@ -881,14 +879,13 @@ retry: /* * clear dirty, set writeback and unlock the pages. 
*/ - extent_clear_unlock_delalloc(BTRFS_I(inode), async_extent->start, + extent_clear_unlock_delalloc(inode, async_extent->start, async_extent->start + async_extent->ram_size - 1, NULL, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK); - if (btrfs_submit_compressed_write(BTRFS_I(inode), - async_extent->start, + if (btrfs_submit_compressed_write(inode, async_extent->start, async_extent->ram_size, ins.objectid, ins.offset, async_extent->pages, @@ -899,12 +896,11 @@ retry: const u64 start = async_extent->start; const u64 end = start + async_extent->ram_size - 1; - p->mapping = inode->i_mapping; + p->mapping = inode->vfs_inode.i_mapping; btrfs_writepage_endio_finish_ordered(p, start, end, 0); p->mapping = NULL; - extent_clear_unlock_delalloc(BTRFS_I(inode), start, end, - NULL, 0, + extent_clear_unlock_delalloc(inode, start, end, NULL, 0, PAGE_END_WRITEBACK | PAGE_SET_ERROR); free_async_extent_pages(async_extent); @@ -918,7 +914,7 @@ out_free_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_free: - extent_clear_unlock_delalloc(BTRFS_I(inode), async_extent->start, + extent_clear_unlock_delalloc(inode, async_extent->start, async_extent->start + async_extent->ram_size - 1, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | From 72b7d15bf1e1ee7b71da81e6c5d9afcd2e86c426 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:18 +0300 Subject: [PATCH 065/151] btrfs: make btrfs_qgroup_release_data take btrfs_inode It just forwards its argument to __btrfs_qgroup_release_data. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- fs/btrfs/ordered-data.c | 3 +-- fs/btrfs/qgroup.c | 4 ++-- fs/btrfs/qgroup.h | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7aeaa9fe18a6..49d0d3d528b5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9643,7 +9643,7 @@ static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); /* Encryption and other encoding is reserved and all 0 */ - ret = btrfs_qgroup_release_data(inode, file_offset, len); + ret = btrfs_qgroup_release_data(BTRFS_I(inode), file_offset, len); if (ret < 0) return ret; return insert_reserved_file_extent(trans, inode, file_offset, diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 838fec9e245f..855c60d23b4c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -182,8 +182,7 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset * The ordered extent has reserved qgroup space, release now * and pass the reserved number for qgroup_record to free. */ - ret = btrfs_qgroup_release_data(&inode->vfs_inode, file_offset, - num_bytes); + ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes); if (ret < 0) return ret; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 7795c7616fe8..01a2b03648be 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3613,9 +3613,9 @@ int btrfs_qgroup_free_data(struct btrfs_inode *inode, * * NOTE: This function may sleep for memory allocation. 
*/ -int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len) +int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len) { - return __btrfs_qgroup_release_data(BTRFS_I(inode), NULL, start, len, 0); + return __btrfs_qgroup_release_data(inode, NULL, start, len, 0); } static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes, diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 0449bfd6fe45..66e80dc91242 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -346,7 +346,7 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, /* New io_tree based accurate qgroup reserve API */ int btrfs_qgroup_reserve_data(struct inode *inode, struct extent_changeset **reserved, u64 start, u64 len); -int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len); +int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len); int btrfs_qgroup_free_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len); From c553f94df4d1c5f37ec253b0ff40a2362af03fc1 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:19 +0300 Subject: [PATCH 066/151] btrfs: make insert_reserved_file_extent take btrfs_inode Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 49d0d3d528b5..f3b711e39dbe 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2464,11 +2464,11 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) } static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, - struct inode *inode, u64 file_pos, + struct btrfs_inode *inode, u64 file_pos, struct btrfs_file_extent_item *stack_fi, u64 qgroup_reserved) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key ins; @@ -2492,14 +2492,14 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, * the caller is expected to unpin it and allow it to be merged * with the others.
*/ - ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path, file_pos, + ret = __btrfs_drop_extents(trans, root, inode, path, file_pos, file_pos + num_bytes, NULL, 0, 1, sizeof(*stack_fi), &extent_inserted); if (ret) goto out; if (!extent_inserted) { - ins.objectid = btrfs_ino(BTRFS_I(inode)); + ins.objectid = btrfs_ino(inode); ins.offset = file_pos; ins.type = BTRFS_EXTENT_DATA_KEY; @@ -2518,19 +2518,17 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - inode_add_bytes(inode, num_bytes); + inode_add_bytes(&inode->vfs_inode, num_bytes); ins.objectid = disk_bytenr; ins.offset = disk_num_bytes; ins.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), file_pos, - ram_bytes); + ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes); if (ret) goto out; - ret = btrfs_alloc_reserved_file_extent(trans, root, - btrfs_ino(BTRFS_I(inode)), + ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode), file_pos, qgroup_reserved, &ins); out: btrfs_free_path(path); @@ -2574,7 +2572,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); /* Encryption and other encoding is reserved and all 0 */ - return insert_reserved_file_extent(trans, inode, oe->file_offset, + return insert_reserved_file_extent(trans, BTRFS_I(inode), oe->file_offset, &stack_fi, oe->qgroup_rsv); } @@ -9646,7 +9644,7 @@ static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans, ret = btrfs_qgroup_release_data(BTRFS_I(inode), file_offset, len); if (ret < 0) return ret; - return insert_reserved_file_extent(trans, inode, file_offset, + return insert_reserved_file_extent(trans, BTRFS_I(inode), file_offset, &stack_fi, ret); } static int __btrfs_prealloc_file_range(struct inode *inode, int mode, @@ -9707,8 +9705,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, btrfs_dec_block_group_reservations(fs_info, ins.objectid); last_alloc = ins.offset; - ret = insert_prealloc_file_extent(trans, inode, &ins, - cur_offset); + ret = insert_prealloc_file_extent(trans, inode, &ins, cur_offset); if (ret) { btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); From 8ba96f3dd6a0b2dac1a3c48ece76885aa5e40e66 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:20 +0300 Subject: [PATCH 067/151] btrfs: make fallback_to_cow take btrfs_inode It really wants btrfs_inode and is preparation for converting run_delalloc_nocow to taking btrfs_inode.
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f3b711e39dbe..329b5ad7fe59 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1370,15 +1370,15 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, return 1; } -static int fallback_to_cow(struct inode *inode, struct page *locked_page, +static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, const u64 start, const u64 end, int *page_started, unsigned long *nr_written) { - const bool is_space_ino = btrfs_is_free_space_inode(BTRFS_I(inode)); - const bool is_reloc_ino = (BTRFS_I(inode)->root->root_key.objectid == + const bool is_space_ino = btrfs_is_free_space_inode(inode); + const bool is_reloc_ino = (inode->root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); const u64 range_bytes = end + 1 - start; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_io_tree *io_tree = &inode->io_tree; u64 range_start = start; u64 count; @@ -1418,7 +1418,7 @@ static int fallback_to_cow(struct inode *inode, struct page *locked_page, EXTENT_NORESERVE, 0); if (count > 0 || is_space_ino || is_reloc_ino) { u64 bytes = count; - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_space_info *sinfo = fs_info->data_sinfo; if (is_space_ino || is_reloc_ino) @@ -1433,8 +1433,8 @@ static int fallback_to_cow(struct inode *inode, struct page *locked_page, 0, 0, NULL); } - return cow_file_range(BTRFS_I(inode), locked_page, start, end, - page_started, nr_written, 1); + return cow_file_range(inode, locked_page, start, end, page_started, + nr_written, 1); } /* @@ -1685,8 +1685,8 @@ out_check: * NOCOW, following one which needs to be COW'ed */ if (cow_start != (u64)-1) { - ret = fallback_to_cow(inode, locked_page, cow_start, - found_key.offset - 1, + ret = fallback_to_cow(BTRFS_I(inode), locked_page, + cow_start, found_key.offset - 1, page_started, nr_written); if (ret) goto error; @@ -1769,8 +1769,8 @@ out_check: if (cow_start != (u64)-1) { cur_offset = end; - ret = fallback_to_cow(inode, locked_page, cow_start, end, - page_started, nr_written); + ret = fallback_to_cow(BTRFS_I(inode), locked_page, cow_start, + end, page_started, nr_written); if (ret) goto error; } From 968322c8c6d593fb15f26f73cb630c820c1a20f5 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:21 +0300 Subject: [PATCH 068/151] btrfs: make run_delalloc_nocow take btrfs_inode It only really uses btrfs_inode. 
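The declaration block in the diff that follows shows the other recurring simplification: once a function holds a btrfs_inode, everything previously fetched through the superblock comes straight off the inode's root. Side by side, as a sketch (btrfs_sb() and the field names are real; the two functions are illustrative):

/* Before: detour through the VFS inode and its superblock. */
static void fs_info_old_way(struct inode *inode)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;

	/* ... use root and fs_info ... */
	(void)fs_info;
	(void)root;
}

/* After: the btrfs inode knows its root, and the root its fs_info. */
static void fs_info_new_way(struct btrfs_inode *inode)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;

	/* ... use root and fs_info ... */
	(void)fs_info;
	(void)root;
}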
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 329b5ad7fe59..c0f8db1bca9c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1444,28 +1444,27 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, * If no cow copies or snapshots exist, we write directly to the existing * blocks on disk */ -static noinline int run_delalloc_nocow(struct inode *inode, +static noinline int run_delalloc_nocow(struct btrfs_inode *inode, struct page *locked_page, const u64 start, const u64 end, int *page_started, int force, unsigned long *nr_written) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_root *root = inode->root; struct btrfs_path *path; u64 cow_start = (u64)-1; u64 cur_offset = start; int ret; bool check_prev = true; - const bool freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode)); - u64 ino = btrfs_ino(BTRFS_I(inode)); + const bool freespace_inode = btrfs_is_free_space_inode(inode); + u64 ino = btrfs_ino(inode); bool nocow = false; u64 disk_bytenr = 0; path = btrfs_alloc_path(); if (!path) { - extent_clear_unlock_delalloc(BTRFS_I(inode), start, end, - locked_page, + extent_clear_unlock_delalloc(inode, start, end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, PAGE_UNLOCK | @@ -1685,7 +1684,7 @@ out_check: * NOCOW, following one which needs to be COW'ed */ if (cow_start != (u64)-1) { - ret = fallback_to_cow(BTRFS_I(inode), locked_page, + ret = fallback_to_cow(inode, locked_page, cow_start, found_key.offset - 1, page_started, nr_written); if (ret) @@ -1697,7 +1696,7 @@ out_check: u64 orig_start = found_key.offset - extent_offset; struct extent_map *em; - em = create_io_em(BTRFS_I(inode), cur_offset, num_bytes, + em = create_io_em(inode, cur_offset, num_bytes, orig_start, disk_bytenr, /* block_start */ num_bytes, /* block_len */ @@ -1709,19 +1708,18 @@ out_check: goto error; } free_extent_map(em); - ret = btrfs_add_ordered_extent(BTRFS_I(inode), cur_offset, + ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, num_bytes, num_bytes, BTRFS_ORDERED_PREALLOC); if (ret) { - btrfs_drop_extent_cache(BTRFS_I(inode), - cur_offset, + btrfs_drop_extent_cache(inode, cur_offset, cur_offset + num_bytes - 1, 0); goto error; } } else { - ret = btrfs_add_ordered_extent(BTRFS_I(inode), cur_offset, + ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, num_bytes, num_bytes, BTRFS_ORDERED_NOCOW); @@ -1740,10 +1738,10 @@ out_check: * extent_clear_unlock_delalloc() in error handler * from freeing metadata of created ordered extent. 
*/ - ret = btrfs_reloc_clone_csums(BTRFS_I(inode), cur_offset, + ret = btrfs_reloc_clone_csums(inode, cur_offset, num_bytes); - extent_clear_unlock_delalloc(BTRFS_I(inode), cur_offset, + extent_clear_unlock_delalloc(inode, cur_offset, cur_offset + num_bytes - 1, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | @@ -1769,8 +1767,8 @@ out_check: if (cow_start != (u64)-1) { cur_offset = end; - ret = fallback_to_cow(BTRFS_I(inode), locked_page, cow_start, - end, page_started, nr_written); + ret = fallback_to_cow(inode, locked_page, cow_start, end, + page_started, nr_written); if (ret) goto error; } @@ -1780,7 +1778,7 @@ error: btrfs_dec_nocow_writers(fs_info, disk_bytenr); if (ret && cur_offset < end) - extent_clear_unlock_delalloc(BTRFS_I(inode), cur_offset, end, + extent_clear_unlock_delalloc(inode, cur_offset, end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | @@ -1823,10 +1821,10 @@ int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page, int force_cow = need_force_cow(inode, start, end); if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) { - ret = run_delalloc_nocow(inode, locked_page, start, end, + ret = run_delalloc_nocow(BTRFS_I(inode), locked_page, start, end, page_started, 1, nr_written); } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) { - ret = run_delalloc_nocow(inode, locked_page, start, end, + ret = run_delalloc_nocow(BTRFS_I(inode), locked_page, start, end, page_started, 0, nr_written); } else if (!inode_can_compress(inode) || !inode_need_compress(inode, start, end)) { From 751b64318d4c6b09f6bb9be4313ed742ec2293f9 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:22 +0300 Subject: [PATCH 069/151] btrfs: make cow_file_range_async take btrfs_inode It only uses vfs inode for assigning it to the async_chunk function. 
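One wrinkle in this patch: the vfs inode is still what gets stored in the async_chunk work structs, because inode refcounting remains a VFS concept. The ihold() in the diff below pins the embedded vfs_inode for the callback's lifetime, paired with an iput() when the chunk completes. A reduced sketch of that idea (struct and function names other than ihold/iput/vfs_inode are hypothetical):

/* Hypothetical stand-in for the per-chunk async work descriptor. */
struct async_range_work {
	struct inode *inode;	/* pinned for the callback's lifetime */
	u64 start;
	u64 end;
};

static void queue_range_async(struct btrfs_inode *inode,
			      struct async_range_work *work, u64 start, u64 end)
{
	/* the refcount lives in the embedded VFS inode, so pin that one */
	ihold(&inode->vfs_inode);
	work->inode = &inode->vfs_inode;
	work->start = start;
	work->end = end;
}

static void range_async_done(struct async_range_work *work)
{
	iput(work->inode);	/* drop the reference taken at queue time */
}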
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c0f8db1bca9c..94ba5248e201 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1233,13 +1233,13 @@ static noinline void async_cow_free(struct btrfs_work *work) kvfree(async_chunk->pending); } -static int cow_file_range_async(struct inode *inode, +static int cow_file_range_async(struct btrfs_inode *inode, struct writeback_control *wbc, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc); struct async_cow *ctx; struct async_chunk *async_chunk; @@ -1251,9 +1251,9 @@ static int cow_file_range_async(struct inode *inode, unsigned nofs_flag; const unsigned int write_flags = wbc_to_write_flags(wbc); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end); + unlock_extent(&inode->io_tree, start, end); - if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS && + if (inode->flags & BTRFS_INODE_NOCOMPRESS && !btrfs_test_opt(fs_info, FORCE_COMPRESS)) { num_chunks = 1; should_compress = false; @@ -1273,8 +1273,8 @@ static int cow_file_range_async(struct inode *inode, PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | PAGE_SET_ERROR; - extent_clear_unlock_delalloc(BTRFS_I(inode), start, end, - locked_page, clear_bits, page_ops); + extent_clear_unlock_delalloc(inode, start, end, locked_page, + clear_bits, page_ops); return -ENOMEM; } @@ -1291,9 +1291,9 @@ static int cow_file_range_async(struct inode *inode, * igrab is called higher up in the call chain, take only the * lightweight reference for the callback lifetime */ - ihold(inode); + ihold(&inode->vfs_inode); async_chunk[i].pending = &ctx->num_chunks; - async_chunk[i].inode = inode; + async_chunk[i].inode = &inode->vfs_inode; async_chunk[i].start = start; async_chunk[i].end = cur_end; async_chunk[i].write_flags = write_flags; @@ -1833,7 +1833,7 @@ int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page, } else { set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags); - ret = cow_file_range_async(inode, wbc, locked_page, start, end, + ret = cow_file_range_async(BTRFS_I(inode), wbc, locked_page, start, end, page_started, nr_written); } if (ret) From 7095821ee1f57d9d4b179463738776e0fcd28d38 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:23 +0300 Subject: [PATCH 070/151] btrfs: make btrfs_dec_test_first_ordered_pending take btrfs_inode It doesn't really need vfs_inode but btrfs_inode. 
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 8 ++++---- fs/btrfs/ordered-data.c | 7 +++---- fs/btrfs/ordered-data.h | 2 +- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 94ba5248e201..20bb3a4072fe 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7578,10 +7578,10 @@ static void __endio_write_update_ordered(struct inode *inode, while (ordered_offset < offset + bytes) { last_offset = ordered_offset; - if (btrfs_dec_test_first_ordered_pending(inode, &ordered, - &ordered_offset, - ordered_bytes, - uptodate)) { + if (btrfs_dec_test_first_ordered_pending(BTRFS_I(inode), &ordered, + &ordered_offset, + ordered_bytes, + uptodate)) { btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL); btrfs_queue_work(wq, &ordered->work); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 855c60d23b4c..af1b444421fb 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -307,12 +307,12 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, * file_offset is updated to one byte past the range that is recorded as * complete. This allows you to walk forward in the file. */ -int btrfs_dec_test_first_ordered_pending(struct inode *inode, +int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 *file_offset, u64 io_size, int uptodate) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_ordered_inode_tree *tree; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; int ret; @@ -321,7 +321,6 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, u64 dec_start; u64 to_dec; - tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irqsave(&tree->lock, flags); node = tree_search(tree, *file_offset); if (!node) { diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 03865f721164..c2432f0165e1 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -147,7 +147,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, int btrfs_dec_test_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size, int uptodate); -int btrfs_dec_test_first_ordered_pending(struct inode *inode, +int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 *file_offset, u64 io_size, int uptodate); From b672b5c1563094d890e8a9c8545bce36d98baf12 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:24 +0300 Subject: [PATCH 071/151] btrfs: make __endio_write_update_ordered take btrfs_inode It really wants btrfs_inode. 
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 20bb3a4072fe..c718fdd57020 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -90,7 +90,7 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 ram_bytes, int compress_type, int type); -static void __endio_write_update_ordered(struct inode *inode, +static void __endio_write_update_ordered(struct btrfs_inode *inode, const u64 offset, const u64 bytes, const bool uptodate); @@ -134,7 +134,8 @@ static inline void btrfs_cleanup_ordered_extents(struct inode *inode, bytes -= PAGE_SIZE; } - return __endio_write_update_ordered(inode, offset, bytes, false); + return __endio_write_update_ordered(BTRFS_I(inode), offset, bytes, + false); } static int btrfs_dirty_inode(struct inode *inode); @@ -7474,7 +7475,8 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) return; if (bio_op(dip->dio_bio) == REQ_OP_WRITE) { - __endio_write_update_ordered(dip->inode, dip->logical_offset, + __endio_write_update_ordered(BTRFS_I(dip->inode), + dip->logical_offset, dip->bytes, !dip->dio_bio->bi_status); } else { @@ -7560,25 +7562,25 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, return err; } -static void __endio_write_update_ordered(struct inode *inode, +static void __endio_write_update_ordered(struct btrfs_inode *inode, const u64 offset, const u64 bytes, const bool uptodate) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_extent *ordered = NULL; struct btrfs_workqueue *wq; u64 ordered_offset = offset; u64 ordered_bytes = bytes; u64 last_offset; - if (btrfs_is_free_space_inode(BTRFS_I(inode))) + if (btrfs_is_free_space_inode(inode)) wq = fs_info->endio_freespace_worker; else wq = fs_info->endio_write_workers; while (ordered_offset < offset + bytes) { last_offset = ordered_offset; - if (btrfs_dec_test_first_ordered_pending(BTRFS_I(inode), &ordered, + if (btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, ordered_bytes, uptodate)) { @@ -7961,7 +7963,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) */ if (dio_data.unsubmitted_oe_range_start < dio_data.unsubmitted_oe_range_end) - __endio_write_update_ordered(inode, + __endio_write_update_ordered(BTRFS_I(inode), dio_data.unsubmitted_oe_range_start, dio_data.unsubmitted_oe_range_end - dio_data.unsubmitted_oe_range_start, From 64e1db566deb5fd5bd9ece981b609de7c540c12a Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:25 +0300 Subject: [PATCH 072/151] btrfs: make btrfs_cleanup_ordered_extents take btrfs_inode Preparation to converting btrfs_run_delalloc_range to using btrfs_inode without BTRFS_I() calls. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c718fdd57020..445085cd58be 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -104,7 +104,7 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode, * to be released, which we want to happen only when finishing the ordered * extent (btrfs_finish_ordered_io()). 
*/ -static inline void btrfs_cleanup_ordered_extents(struct inode *inode, +static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, struct page *locked_page, u64 offset, u64 bytes) { @@ -116,7 +116,7 @@ static inline void btrfs_cleanup_ordered_extents(struct inode *inode, struct page *page; while (index <= end_index) { - page = find_get_page(inode->i_mapping, index); + page = find_get_page(inode->vfs_inode.i_mapping, index); index++; if (!page) continue; @@ -134,8 +134,7 @@ static inline void btrfs_cleanup_ordered_extents(struct inode *inode, bytes -= PAGE_SIZE; } - return __endio_write_update_ordered(BTRFS_I(inode), offset, bytes, - false); + return __endio_write_update_ordered(inode, offset, bytes, false); } static int btrfs_dirty_inode(struct inode *inode); @@ -1838,7 +1837,7 @@ int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page, page_started, nr_written); } if (ret) - btrfs_cleanup_ordered_extents(inode, locked_page, start, + btrfs_cleanup_ordered_extents(BTRFS_I(inode), locked_page, start, end - start + 1); return ret; } From 99c88dc71cae52050892798e20c63d5d7053bb94 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:26 +0300 Subject: [PATCH 073/151] btrfs: make inode_can_compress take btrfs_inode Gets rid of superfluous BTRFS_I() calls. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 445085cd58be..0d3f0b3c1621 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -412,10 +412,10 @@ static noinline int add_async_extent(struct async_chunk *cow, /* * Check if the inode has flags compatible with compression */ -static inline bool inode_can_compress(struct inode *inode) +static inline bool inode_can_compress(struct btrfs_inode *inode) { - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW || - BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + if (inode->flags & BTRFS_INODE_NODATACOW || + inode->flags & BTRFS_INODE_NODATASUM) return false; return true; } @@ -428,7 +428,7 @@ static inline int inode_need_compress(struct inode *inode, u64 start, u64 end) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - if (!inode_can_compress(inode)) { + if (!inode_can_compress(BTRFS_I(inode))) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: unexpected compression for ino %llu\n", btrfs_ino(BTRFS_I(inode))); @@ -1826,7 +1826,7 @@ int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page, } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) { ret = run_delalloc_nocow(BTRFS_I(inode), locked_page, start, end, page_started, 0, nr_written); - } else if (!inode_can_compress(inode) || + } else if (!inode_can_compress(BTRFS_I(inode)) || !inode_need_compress(inode, start, end)) { ret = cow_file_range(BTRFS_I(inode), locked_page, start, end, page_started, nr_written, 1); From 808a12923203ee1b992a278b6f8d4e53b69d1508 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:27 +0300 Subject: [PATCH 074/151] btrfs: make inode_need_compress take btrfs_inode Simply gets rid of superfluous BTRFS_I() calls. 
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0d3f0b3c1621..64ac763d4a03 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -424,29 +424,30 @@ static inline bool inode_can_compress(struct btrfs_inode *inode) * Check if the inode needs to be submitted to compression, based on mount * options, defragmentation, properties or heuristics. */ -static inline int inode_need_compress(struct inode *inode, u64 start, u64 end) +static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, + u64 end) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; - if (!inode_can_compress(BTRFS_I(inode))) { + if (!inode_can_compress(inode)) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: unexpected compression for ino %llu\n", - btrfs_ino(BTRFS_I(inode))); + btrfs_ino(inode)); return 0; } /* force compress */ if (btrfs_test_opt(fs_info, FORCE_COMPRESS)) return 1; /* defrag ioctl */ - if (BTRFS_I(inode)->defrag_compress) + if (inode->defrag_compress) return 1; /* bad compression ratios */ - if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) + if (inode->flags & BTRFS_INODE_NOCOMPRESS) return 0; if (btrfs_test_opt(fs_info, COMPRESS) || - BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS || - BTRFS_I(inode)->prop_compress) - return btrfs_compress_heuristic(inode, start, end); + inode->flags & BTRFS_INODE_COMPRESS || + inode->prop_compress) + return btrfs_compress_heuristic(&inode->vfs_inode, start, end); return 0; } @@ -552,7 +553,7 @@ again: * inode has not been flagged as nocompress. This flag can * change at any time if we discover bad compression ratios. */ - if (inode_need_compress(inode, start, end)) { + if (inode_need_compress(BTRFS_I(inode), start, end)) { WARN_ON(pages); pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) { @@ -1827,7 +1828,7 @@ int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page, ret = run_delalloc_nocow(BTRFS_I(inode), locked_page, start, end, page_started, 0, nr_written); } else if (!inode_can_compress(BTRFS_I(inode)) || - !inode_need_compress(inode, start, end)) { + !inode_need_compress(BTRFS_I(inode), start, end)) { ret = cow_file_range(BTRFS_I(inode), locked_page, start, end, page_started, nr_written, 1); } else { From 0c4942258cc1de265619b48a50a29d81f85b86eb Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:28 +0300 Subject: [PATCH 075/151] btrfs: make need_force_cow take btrfs_inode Gets rid of superfluous BTRFS_I() calls and prepares for converting btrfs_run_delalloc_range to using btrfs_inode.
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 64ac763d4a03..eaa8e71288c8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1790,11 +1790,11 @@ error: return ret; } -static inline int need_force_cow(struct inode *inode, u64 start, u64 end) +static inline int need_force_cow(struct btrfs_inode *inode, u64 start, u64 end) { - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && - !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) + if (!(inode->flags & BTRFS_INODE_NODATACOW) && + !(inode->flags & BTRFS_INODE_PREALLOC)) return 0; /* @@ -1802,9 +1802,8 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end) * if is not zero, it means the file is defragging. * Force cow if given extent needs to be defragged. */ - if (BTRFS_I(inode)->defrag_bytes && - test_range_bit(&BTRFS_I(inode)->io_tree, start, end, - EXTENT_DEFRAG, 0, NULL)) + if (inode->defrag_bytes && + test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG, 0, NULL)) return 1; return 0; @@ -1819,7 +1818,7 @@ int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page, struct writeback_control *wbc) { int ret; - int force_cow = need_force_cow(inode, start, end); + int force_cow = need_force_cow(BTRFS_I(inode), start, end); if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) { ret = run_delalloc_nocow(BTRFS_I(inode), locked_page, start, end, From 98456b9c46c140275b0fe81970ddfad250c68ed4 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:29 +0300 Subject: [PATCH 076/151] btrfs: make btrfs_run_delalloc_range take btrfs_inode All of its children now take btrfs_inode, so convert btrfs_run_delalloc_range to take it as a parameter as well.
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/inode.c | 27 +++++++++++++-------------- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 654b99af1587..65e0630c9373 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2979,7 +2979,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); -int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page, +int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, struct writeback_control *wbc); int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e3078ed2261b..fde2fcaeefaa 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3439,7 +3439,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, delalloc_start = delalloc_end + 1; continue; } - ret = btrfs_run_delalloc_range(inode, page, delalloc_start, + ret = btrfs_run_delalloc_range(BTRFS_I(inode), page, delalloc_start, delalloc_end, &page_started, nr_written, wbc); if (ret) { SetPageError(page); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index eaa8e71288c8..aa48452e395f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1813,31 +1813,30 @@ static inline int need_force_cow(struct btrfs_inode *inode, u64 start, u64 end) * Function to process delayed allocation (create CoW) for ranges which are * being touched for the first time. */ -int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page, +int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, struct writeback_control *wbc) { int ret; - int force_cow = need_force_cow(BTRFS_I(inode), start, end); + int force_cow = need_force_cow(inode, start, end); - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) { - ret = run_delalloc_nocow(BTRFS_I(inode), locked_page, start, end, + if (inode->flags & BTRFS_INODE_NODATACOW && !force_cow) { + ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 1, nr_written); - } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) { - ret = run_delalloc_nocow(BTRFS_I(inode), locked_page, start, end, + } else if (inode->flags & BTRFS_INODE_PREALLOC && !force_cow) { + ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); - } else if (!inode_can_compress(BTRFS_I(inode)) || - !inode_need_compress(BTRFS_I(inode), start, end)) { - ret = cow_file_range(BTRFS_I(inode), locked_page, start, end, - page_started, nr_written, 1); + } else if (!inode_can_compress(inode) || + !inode_need_compress(inode, start, end)) { + ret = cow_file_range(inode, locked_page, start, end, + page_started, nr_written, 1); } else { - set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags); - ret = cow_file_range_async(BTRFS_I(inode), wbc, locked_page, start, end, + set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); + ret = cow_file_range_async(inode, wbc, locked_page, start, end, page_started, nr_written); } if (ret) - btrfs_cleanup_ordered_extents(BTRFS_I(inode), locked_page, start, + btrfs_cleanup_ordered_extents(inode, locked_page, 
start, end - start + 1); return ret; } From c1e095202caae87ac4a5c353691a492692d61857 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:30 +0300 Subject: [PATCH 077/151] btrfs: make btrfs_add_ordered_extent_dio take btrfs_inode It simply forwards its argument, so let's get rid of one extra BTRFS_I() call by taking btrfs_inode directly. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- fs/btrfs/ordered-data.c | 4 ++-- fs/btrfs/ordered-data.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index aa48452e395f..412693947484 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6890,7 +6890,7 @@ static struct extent_map *btrfs_create_dio_extent(struct inode *inode, if (IS_ERR(em)) goto out; } - ret = btrfs_add_ordered_extent_dio(inode, start, block_start, + ret = btrfs_add_ordered_extent_dio(BTRFS_I(inode), start, block_start, len, block_len, type); if (ret) { if (em) { diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index af1b444421fb..ebac13389e7e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -260,11 +260,11 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, BTRFS_COMPRESS_NONE); } -int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, +int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type) { - return __btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, disk_bytenr, + return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, num_bytes, disk_num_bytes, type, 1, BTRFS_COMPRESS_NONE); } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index c2432f0165e1..d61ea9c880a3 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -154,7 +154,7 @@ int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type); -int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, +int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type); int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset, From 64f54188ea4309a22eaf09df169ea5d0f0021640 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:31 +0300 Subject: [PATCH 078/151] btrfs: make btrfs_create_dio_extent take btrfs_inode Take btrfs_inode directly and stop using superfluous BTRFS_I calls.
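Errors from btrfs_create_dio_extent keep being propagated through the returned pointer rather than a separate status variable; a hypothetical caller-side sketch of that kernel idiom (not an exact call site from this patch):

    em = btrfs_create_dio_extent(inode, start, len, orig_start, block_start,
                                 block_len, orig_block_len, ram_bytes, type);
    if (IS_ERR(em))
            return PTR_ERR(em);     /* errno is encoded in the pointer */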
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 412693947484..7415a6486072 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6868,7 +6868,7 @@ out: return em; } -static struct extent_map *btrfs_create_dio_extent(struct inode *inode, +static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, const u64 start, const u64 len, const u64 orig_start, @@ -6882,21 +6882,19 @@ static struct extent_map *btrfs_create_dio_extent(struct inode *inode, int ret; if (type != BTRFS_ORDERED_NOCOW) { - em = create_io_em(BTRFS_I(inode), start, len, orig_start, - block_start, block_len, orig_block_len, - ram_bytes, + em = create_io_em(inode, start, len, orig_start, block_start, + block_len, orig_block_len, ram_bytes, BTRFS_COMPRESS_NONE, /* compress_type */ type); if (IS_ERR(em)) goto out; } - ret = btrfs_add_ordered_extent_dio(BTRFS_I(inode), start, block_start, - len, block_len, type); + ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len, + block_len, type); if (ret) { if (em) { free_extent_map(em); - btrfs_drop_extent_cache(BTRFS_I(inode), start, - start + len - 1, 0); + btrfs_drop_extent_cache(inode, start, start + len - 1, 0); } em = ERR_PTR(ret); } @@ -6921,7 +6919,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, if (ret) return ERR_PTR(ret); - em = btrfs_create_dio_extent(inode, start, ins.offset, start, + em = btrfs_create_dio_extent(BTRFS_I(inode), start, ins.offset, start, ins.objectid, ins.offset, ins.offset, ins.offset, BTRFS_ORDERED_REGULAR); btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -7295,7 +7293,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, btrfs_inc_nocow_writers(fs_info, block_start)) { struct extent_map *em2; - em2 = btrfs_create_dio_extent(inode, start, len, + em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, orig_start, block_start, len, orig_block_len, ram_bytes, type); From 9fc6f911a014fab57ed661b6918b7f730fa60c59 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:32 +0300 Subject: [PATCH 079/151] btrfs: make btrfs_new_extent_direct take btrfs_inode This function really needs a btrfs_inode and not a generic vfs one. Take it as a parameter and get rid of superfluous BTRFS_I() calls. 
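A detail worth noting in the diff below: with a btrfs_inode in hand, fs_info no longer needs to be derived through the super block. Assuming a fully set up inode, both forms resolve to the same structure (sketch):

    struct btrfs_fs_info *a = btrfs_sb(inode->vfs_inode.i_sb); /* via the VFS super block */
    struct btrfs_fs_info *b = inode->root->fs_info;            /* via the btrfs root */

The root-based form avoids the detour through generic VFS objects.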
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7415a6486072..5abb6d0a8cac 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6903,29 +6903,29 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, return em; } -static struct extent_map *btrfs_new_extent_direct(struct inode *inode, +static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, u64 start, u64 len) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_map *em; struct btrfs_key ins; u64 alloc_hint; int ret; - alloc_hint = get_extent_allocation_hint(BTRFS_I(inode), start, len); + alloc_hint = get_extent_allocation_hint(inode, start, len); ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, 0, alloc_hint, &ins, 1, 1); if (ret) return ERR_PTR(ret); - em = btrfs_create_dio_extent(BTRFS_I(inode), start, ins.offset, start, + em = btrfs_create_dio_extent(inode, start, ins.offset, start, ins.objectid, ins.offset, ins.offset, ins.offset, BTRFS_ORDERED_REGULAR); btrfs_dec_block_group_reservations(fs_info, ins.objectid); if (IS_ERR(em)) - btrfs_free_reserved_extent(fs_info, ins.objectid, - ins.offset, 1); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, + 1); return em; } @@ -7320,7 +7320,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, /* this will cow the extent */ len = bh_result->b_size; free_extent_map(em); - *map = em = btrfs_new_extent_direct(inode, start, len); + *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; From d4580fe25dd397b6f15bfc3eb628d28944ffd26b Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:33 +0300 Subject: [PATCH 080/151] btrfs: make __extent_writepage_io take btrfs_inode It has only a single use for a generic vfs inode vs 3 for btrfs_inode. 
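State that exists only on the generic inode stays reachable through the embedded member at no extra cost, e.g. (sketch):

    u64 blocksize = inode->vfs_inode.i_sb->s_blocksize;
    struct address_space *mapping = inode->vfs_inode.i_mapping;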
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fde2fcaeefaa..cb9969ffafbd 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3496,7 +3496,7 @@ done: * 0 if all went well (page still locked) * < 0 if there were errors (page still locked) */ -static noinline_for_stack int __extent_writepage_io(struct inode *inode, +static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, struct page *page, struct writeback_control *wbc, struct extent_page_data *epd, @@ -3504,7 +3504,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, unsigned long nr_written, int *nr_ret) { - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + struct extent_io_tree *tree = &inode->io_tree; u64 start = page_offset(page); u64 page_end = start + PAGE_SIZE - 1; u64 end; @@ -3536,7 +3536,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, update_nr_written(wbc, nr_written + 1); end = page_end; - blocksize = inode->i_sb->s_blocksize; + blocksize = inode->vfs_inode.i_sb->s_blocksize; while (cur <= end) { u64 em_end; @@ -3547,8 +3547,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, page_end, 1); break; } - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur, - end - cur + 1); + em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); if (IS_ERR_OR_NULL(em)) { SetPageError(page); ret = PTR_ERR_OR_ZERO(em); @@ -3585,7 +3584,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, btrfs_set_range_writeback(tree, cur, cur + iosize - 1); if (!PageWriteback(page)) { - btrfs_err(BTRFS_I(inode)->root->fs_info, + btrfs_err(inode->root->fs_info, "page %lu not writeback, cur %llu end %llu", page->index, cur, end); } @@ -3665,8 +3664,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, goto done; } - ret = __extent_writepage_io(inode, page, wbc, epd, - i_size, nr_written, &nr); + ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size, + nr_written, &nr); if (ret == 1) return 0; From cd4c0bf94292b60ab6b0f5dfa5e1297572800f1f Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 5 Jun 2020 10:42:10 +0300 Subject: [PATCH 081/151] btrfs: make writepage_delalloc take btrfs_inode Only find_lock_delalloc_range uses vfs_inode so let's take the btrfs_inode as a parameter. 
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cb9969ffafbd..abd36d92c077 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3419,7 +3419,7 @@ static void update_nr_written(struct writeback_control *wbc, * This returns 0 if all went well (page still locked) * This returns < 0 if there were errors (page still locked) */ -static noinline_for_stack int writepage_delalloc(struct inode *inode, +static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, struct page *page, struct writeback_control *wbc, u64 delalloc_start, unsigned long *nr_written) { @@ -3432,14 +3432,14 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, while (delalloc_end < page_end) { - found = find_lock_delalloc_range(inode, page, + found = find_lock_delalloc_range(&inode->vfs_inode, page, &delalloc_start, &delalloc_end); if (!found) { delalloc_start = delalloc_end + 1; continue; } - ret = btrfs_run_delalloc_range(BTRFS_I(inode), page, delalloc_start, + ret = btrfs_run_delalloc_range(inode, page, delalloc_start, delalloc_end, &page_started, nr_written, wbc); if (ret) { SetPageError(page); @@ -3657,7 +3657,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, set_page_extent_mapped(page); if (!epd->extent_locked) { - ret = writepage_delalloc(inode, page, wbc, start, &nr_written); + ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start, + &nr_written); if (ret == 1) return 0; if (ret) From c2566f22893c8b8cdf443505c043b2ca9f5023f6 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:35 +0300 Subject: [PATCH 082/151] btrfs: make btrfs_set_extent_delalloc take btrfs_inode Preparation to make btrfs_dirty_pages take btrfs_inode as a parameter.
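When reading the callers, keep in mind that the extent io tree works on inclusive byte ranges, which is why the function warns on a page-aligned end. A minimal caller sketch, assuming a sectorsize-aligned position and length:

    u64 start = round_down(pos, fs_info->sectorsize);
    u64 end = start + len - 1;      /* inclusive last byte */

    ret = btrfs_set_extent_delalloc(inode, start, end, 0, &cached_state);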
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/file.c | 2 +- fs/btrfs/inode.c | 12 ++++++------ fs/btrfs/reflink.c | 3 ++- fs/btrfs/relocation.c | 4 ++-- fs/btrfs/tests/inode-tests.c | 14 +++++++------- 6 files changed, 19 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 65e0630c9373..794706f4c3ae 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2926,7 +2926,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, int btrfs_start_delalloc_snapshot(struct btrfs_root *root); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr); -int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, +int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 760ddc11aa3f..b6c9921340f3 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -546,7 +546,7 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, } } - err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, + err = btrfs_set_extent_delalloc(BTRFS_I(inode), start_pos, end_of_last_block, extra_bits, cached); if (err) return err; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5abb6d0a8cac..0d711e525dbb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2260,13 +2260,13 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, return 0; } -int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, +int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state) { WARN_ON(PAGE_ALIGNED(end)); - return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, - extra_bits, cached_state); + return set_extent_delalloc(&inode->io_tree, start, end, extra_bits, + cached_state); } /* see btrfs_writepage_start_hook for details on why this is required */ @@ -2363,7 +2363,7 @@ again: goto again; } - ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, page_end, 0, &cached_state); if (ret) goto out_reserved; @@ -4581,7 +4581,7 @@ again: EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state); - ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), block_start, block_end, 0, &cached_state); if (ret) { unlock_extent_cached(io_tree, block_start, block_end, @@ -8296,7 +8296,7 @@ again: EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state); - ret2 = btrfs_set_extent_delalloc(inode, page_start, end, 0, + ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, &cached_state); if (ret2) { unlock_extent_cached(io_tree, page_start, page_end, diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 040009d1cc31..fe3e05b51691 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -84,7 +84,8 @@ static int copy_inline_to_page(struct inode *inode, clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, NULL); - ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL); + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), file_offset, range_end, + 0, NULL); if (ret) goto out_unlock; diff --git 
a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index e3f3e2e70212..73300c7e3c52 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2754,8 +2754,8 @@ static int relocate_file_extent_cluster(struct inode *inode, nr++; } - ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, - NULL); + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, + page_end, 0, NULL); if (ret) { unlock_page(page); put_page(page); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 24a8c714f56c..894a63a92236 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -954,8 +954,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) btrfs_test_inode_set_ops(inode); /* [BTRFS_MAX_EXTENT_SIZE] */ - ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, 0, - NULL); + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), 0, + BTRFS_MAX_EXTENT_SIZE - 1, 0, NULL); if (ret) { test_err("btrfs_set_extent_delalloc returned %d", ret); goto out; } @@ -968,7 +968,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ - ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE, + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), BTRFS_MAX_EXTENT_SIZE, BTRFS_MAX_EXTENT_SIZE + sectorsize - 1, 0, NULL); if (ret) { @@ -999,7 +999,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ - ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1, + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), BTRFS_MAX_EXTENT_SIZE >> 1, (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, 0, NULL); @@ -1017,7 +1017,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize HOLE][BTRFS_MAX_EXTENT_SIZE+sectorsize] */ - ret = btrfs_set_extent_delalloc(inode, + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize, (BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1, 0, NULL); @@ -1035,7 +1035,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize][BTRFS_MAX_EXTENT_SIZE+sectorsize] */ - ret = btrfs_set_extent_delalloc(inode, + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL); if (ret) { @@ -1069,7 +1069,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) * Refill the hole again just for good measure, because I thought it * might fail and I'd rather satisfy my paranoia at this point. */ - ret = btrfs_set_extent_delalloc(inode, + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL); if (ret) { From 088545f6e442605d4b897456ef0b34eae06bdc07 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:36 +0300 Subject: [PATCH 083/151] btrfs: make btrfs_dirty_pages take btrfs_inode There is a single use of the generic vfs_inode so let's take btrfs_inode as a parameter and remove a couple of redundant BTRFS_I() calls.
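The sector rounding in this function relies on sectorsize being a power of two, which makes the open-coded mask and round_down() interchangeable (sketch, not the verbatim kernel lines):

    /* both compute the same value for a power-of-two sectorsize */
    start_pos = pos & ~((u64)fs_info->sectorsize - 1);
    start_pos = round_down(pos, fs_info->sectorsize);

    num_bytes = round_up(pos + write_bytes, fs_info->sectorsize) - start_pos;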
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/file.c | 24 ++++++++++++------------ fs/btrfs/free-space-cache.c | 5 +++-- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 794706f4c3ae..0ea3d89beb26 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3029,7 +3029,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 start, u64 end); int btrfs_release_file(struct inode *inode, struct file *file); -int btrfs_dirty_pages(struct inode *inode, struct page **pages, +int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b6c9921340f3..49013c6f2c14 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -500,18 +500,18 @@ next: * this also makes the decision about creating an inline extent vs * doing real data extents, marking pages dirty and delalloc as required. */ -int btrfs_dirty_pages(struct inode *inode, struct page **pages, +int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; int err = 0; int i; u64 num_bytes; u64 start_pos; u64 end_of_last_block; u64 end_pos = pos + write_bytes; - loff_t isize = i_size_read(inode); + loff_t isize = i_size_read(&inode->vfs_inode); unsigned int extra_bits = 0; start_pos = pos & ~((u64) fs_info->sectorsize - 1); @@ -524,13 +524,13 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, * The pages may have already been dirty, clear out old accounting so * we can set things up properly */ - clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block, + clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, cached); - if (!btrfs_is_free_space_inode(BTRFS_I(inode))) { + if (!btrfs_is_free_space_inode(inode)) { if (start_pos >= isize && - !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) { + !(inode->flags & BTRFS_INODE_PREALLOC)) { /* * There can't be any extents following eof in this case * so just set the delalloc new bit for the range @@ -538,15 +538,14 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, */ extra_bits |= EXTENT_DELALLOC_NEW; } else { - err = btrfs_find_new_delalloc_bytes(BTRFS_I(inode), - start_pos, + err = btrfs_find_new_delalloc_bytes(inode, start_pos, num_bytes, cached); if (err) return err; } } - err = btrfs_set_extent_delalloc(BTRFS_I(inode), start_pos, end_of_last_block, + err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, extra_bits, cached); if (err) return err; @@ -564,7 +563,7 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, * at this time. 
*/ if (end_pos > isize) - i_size_write(inode, end_pos); + i_size_write(&inode->vfs_inode, end_pos); return 0; } @@ -1795,8 +1794,9 @@ again: fs_info->sectorsize); if (copied > 0) - ret = btrfs_dirty_pages(inode, pages, dirty_pages, - pos, copied, &cached_state); + ret = btrfs_dirty_pages(BTRFS_I(inode), pages, + dirty_pages, pos, copied, + &cached_state); /* * If we have not locked the extent range, because the range's diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 6943bdd6fc62..6d961e11639e 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1334,8 +1334,9 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, io_ctl_zero_remaining_pages(io_ctl); /* Everything is written out, now we dirty the pages in the file. */ - ret = btrfs_dirty_pages(inode, io_ctl->pages, io_ctl->num_pages, 0, - i_size_read(inode), &cached_state); + ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, + io_ctl->num_pages, 0, i_size_read(inode), + &cached_state); if (ret) goto out_nospc; From 7661a3e033ab782366e0e1f4b6aad0df3555fcbd Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:37 +0300 Subject: [PATCH 084/151] btrfs: make btrfs_qgroup_reserve_data take btrfs_inode There's only a single use of vfs_inode in a tracepoint so let's take btrfs_inode directly. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delalloc-space.c | 2 +- fs/btrfs/file.c | 7 ++++--- fs/btrfs/qgroup.c | 10 +++++----- fs/btrfs/qgroup.h | 2 +- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 31440137accf..d3c43e41c275 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -253,7 +253,7 @@ int btrfs_check_data_free_space(struct inode *inode, return ret; /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ - ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); + ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), reserved, start, len); if (ret < 0) btrfs_free_reserved_data_space_noquota(inode, len); else diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 49013c6f2c14..8d3c62b81088 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3213,7 +3213,7 @@ reserve_space: &cached_state); if (ret) goto out; - ret = btrfs_qgroup_reserve_data(inode, &data_reserved, + ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, alloc_start, bytes_to_reserve); if (ret) goto out; @@ -3383,8 +3383,9 @@ static long btrfs_fallocate(struct file *file, int mode, free_extent_map(em); break; } - ret = btrfs_qgroup_reserve_data(inode, &data_reserved, - cur_offset, last_byte - cur_offset); + ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), + &data_reserved, cur_offset, + last_byte - cur_offset); if (ret < 0) { cur_offset = last_byte; free_extent_map(em); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 01a2b03648be..657c09be159a 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3435,11 +3435,11 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) * same @reserved, caller must ensure when error happens it's OK * to free *ALL* reserved space. 
*/ -int btrfs_qgroup_reserve_data(struct inode *inode, +int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, struct extent_changeset **reserved_ret, u64 start, u64 len) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct ulist_node *unode; struct ulist_iterator uiter; struct extent_changeset *reserved; @@ -3462,12 +3462,12 @@ int btrfs_qgroup_reserve_data(struct inode *inode, reserved = *reserved_ret; /* Record already reserved space */ orig_reserved = reserved->bytes_changed; - ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start, + ret = set_record_extent_bits(&inode->io_tree, start, start + len -1, EXTENT_QGROUP_RESERVED, reserved); /* Newly reserved space */ to_reserve = reserved->bytes_changed - orig_reserved; - trace_btrfs_qgroup_reserve_data(inode, start, len, + trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len, to_reserve, QGROUP_RESERVE); if (ret < 0) goto cleanup; @@ -3481,7 +3481,7 @@ cleanup: /* cleanup *ALL* already reserved ranges */ ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(&reserved->range_changed, &uiter))) - clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, + clear_extent_bit(&inode->io_tree, unode->val, unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL); /* Also free data bytes of already reserved one */ btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 66e80dc91242..2a232967f8e3 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -344,7 +344,7 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, #endif /* New io_tree based accurate qgroup reserve API */ -int btrfs_qgroup_reserve_data(struct inode *inode, +int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len); int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len); int btrfs_qgroup_free_data(struct btrfs_inode *inode, From 9db5d510ac5bfeaedd1e05e6afa300dda1ea7f4f Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:38 +0300 Subject: [PATCH 085/151] btrfs: make btrfs_free_reserved_data_space_noquota take btrfs_fs_info No point in taking an inode only to get btrfs_fs_info from it; instead, take btrfs_fs_info directly. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delalloc-space.c | 7 +++---- fs/btrfs/delalloc-space.h | 2 +- fs/btrfs/inode.c | 6 ++---- fs/btrfs/relocation.c | 2 +- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index d3c43e41c275..fc7496c58609 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -255,7 +255,7 @@ int btrfs_check_data_free_space(struct inode *inode, /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), reserved, start, len); if (ret < 0) - btrfs_free_reserved_data_space_noquota(inode, len); + btrfs_free_reserved_data_space_noquota(fs_info, len); else ret = 0; return ret; @@ -269,10 +269,9 @@ int btrfs_check_data_free_space(struct inode *inode, * which we can't sleep and is sure it won't affect qgroup reserved space. * Like clear_bit_hook().
*/ -void btrfs_free_reserved_data_space_noquota(struct inode *inode, +void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, u64 len) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_space_info *data_sinfo; ASSERT(IS_ALIGNED(len, fs_info->sectorsize)); @@ -300,7 +299,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, round_down(start, root->fs_info->sectorsize); start = round_down(start, root->fs_info->sectorsize); - btrfs_free_reserved_data_space_noquota(inode, len); + btrfs_free_reserved_data_space_noquota(root->fs_info, len); btrfs_qgroup_free_data(BTRFS_I(inode), reserved, start, len); } diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h index fe8c6aafb25b..bf3beec1b276 100644 --- a/fs/btrfs/delalloc-space.h +++ b/fs/btrfs/delalloc-space.h @@ -13,7 +13,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, void btrfs_delalloc_release_space(struct inode *inode, struct extent_changeset *reserved, u64 start, u64 len, bool qgroup_free); -void btrfs_free_reserved_data_space_noquota(struct inode *inode, +void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, u64 len); void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, bool qgroup_free); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0d711e525dbb..f700e3897937 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2080,9 +2080,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && do_list && !(state->state & EXTENT_NORESERVE) && (*bits & EXTENT_CLEAR_DATA_RESV)) - btrfs_free_reserved_data_space_noquota( - &inode->vfs_inode, - len); + btrfs_free_reserved_data_space_noquota(fs_info, len); percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, fs_info->delalloc_batch); @@ -7312,7 +7310,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, * use the existing or preallocated extent, so does not * need to adjust btrfs_space_info's bytes_may_use. */ - btrfs_free_reserved_data_space_noquota(inode, len); + btrfs_free_reserved_data_space_noquota(fs_info, len); goto skip_cow; } } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 73300c7e3c52..4a2959ebece4 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2613,7 +2613,7 @@ int prealloc_file_extent_cluster(struct inode *inode, inode_unlock(inode); if (cur_offset < prealloc_end) - btrfs_free_reserved_data_space_noquota(inode, + btrfs_free_reserved_data_space_noquota(btrfs_sb(inode->i_sb), prealloc_end + 1 - cur_offset); return ret; } From 25ce28caaa1ddc2ef8848c5a09e63a9bc0a5d455 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:39 +0300 Subject: [PATCH 086/151] btrfs: make btrfs_free_reserved_data_space take btrfs_inode It only uses btrfs_inode internally so take it as a parameter. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delalloc-space.c | 18 +++++++++--------- fs/btrfs/delalloc-space.h | 2 +- fs/btrfs/file.c | 13 +++++++------ fs/btrfs/inode.c | 6 +++--- 4 files changed, 20 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index fc7496c58609..8f872ae9732d 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -289,18 +289,18 @@ void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, * This one will handle the per-inode data rsv map for accurate reserved * space framework. 
*/ -void btrfs_free_reserved_data_space(struct inode *inode, +void btrfs_free_reserved_data_space(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = inode->root->fs_info; /* Make sure the range is aligned to sectorsize */ - len = round_up(start + len, root->fs_info->sectorsize) - - round_down(start, root->fs_info->sectorsize); - start = round_down(start, root->fs_info->sectorsize); + len = round_up(start + len, fs_info->sectorsize) - + round_down(start, fs_info->sectorsize); + start = round_down(start, fs_info->sectorsize); - btrfs_free_reserved_data_space_noquota(root->fs_info, len); - btrfs_qgroup_free_data(BTRFS_I(inode), reserved, start, len); + btrfs_free_reserved_data_space_noquota(fs_info, len); + btrfs_qgroup_free_data(inode, reserved, start, len); } /** @@ -563,7 +563,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode, return ret; ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); if (ret < 0) - btrfs_free_reserved_data_space(inode, *reserved, start, len); + btrfs_free_reserved_data_space(BTRFS_I(inode), *reserved, start, len); return ret; } @@ -584,5 +584,5 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len, bool qgroup_free) { btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free); - btrfs_free_reserved_data_space(inode, reserved, start, len); + btrfs_free_reserved_data_space(BTRFS_I(inode), reserved, start, len); } diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h index bf3beec1b276..57194275923d 100644 --- a/fs/btrfs/delalloc-space.h +++ b/fs/btrfs/delalloc-space.h @@ -8,7 +8,7 @@ struct extent_changeset; int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); int btrfs_check_data_free_space(struct inode *inode, struct extent_changeset **reserved, u64 start, u64 len); -void btrfs_free_reserved_data_space(struct inode *inode, +void btrfs_free_reserved_data_space(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len); void btrfs_delalloc_release_space(struct inode *inode, struct extent_changeset *reserved, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8d3c62b81088..96f2238a361b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1710,7 +1710,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, reserve_bytes); if (ret) { if (!only_release_metadata) - btrfs_free_reserved_data_space(inode, + btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved, pos, write_bytes); else @@ -3232,7 +3232,7 @@ reserve_space: ret = btrfs_fallocate_update_isize(inode, offset + len, mode); out: if (ret && space_reserved) - btrfs_free_reserved_data_space(inode, data_reserved, + btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved, alloc_start, bytes_to_reserve); extent_changeset_free(data_reserved); @@ -3397,8 +3397,9 @@ static long btrfs_fallocate(struct file *file, int mode, * range, free reserved data space first, otherwise * it'll result in false ENOSPC error. 
*/ - btrfs_free_reserved_data_space(inode, data_reserved, - cur_offset, last_byte - cur_offset); + btrfs_free_reserved_data_space(BTRFS_I(inode), + data_reserved, cur_offset, + last_byte - cur_offset); } free_extent_map(em); cur_offset = last_byte; @@ -3415,7 +3416,7 @@ static long btrfs_fallocate(struct file *file, int mode, range->len, i_blocksize(inode), offset + len, &alloc_hint); else - btrfs_free_reserved_data_space(inode, + btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved, range->start, range->len); list_del(&range->list); @@ -3436,7 +3437,7 @@ out: inode_unlock(inode); /* Let go of our reservation. */ if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE)) - btrfs_free_reserved_data_space(inode, data_reserved, + btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved, cur_offset, alloc_end - cur_offset); extent_changeset_free(data_reserved); return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f700e3897937..59e04bec4b35 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4532,8 +4532,8 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), blocksize); if (ret < 0) { if (!only_release_metadata) - btrfs_free_reserved_data_space(inode, data_reserved, - block_start, blocksize); + btrfs_free_reserved_data_space(BTRFS_I(inode), + data_reserved, block_start, blocksize); goto out; } again: @@ -9772,7 +9772,7 @@ next: btrfs_end_transaction(trans); } if (clear_offset < end) - btrfs_free_reserved_data_space(inode, NULL, clear_offset, + btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset, end - clear_offset + 1); return ret; } From 86d52921a2ba51a78e5bfb71c75aedcbd9e61a5c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:40 +0300 Subject: [PATCH 087/151] btrfs: make btrfs_delalloc_release_space take btrfs_inode It needs btrfs_inode so take it as a parameter directly. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delalloc-space.c | 6 +++--- fs/btrfs/delalloc-space.h | 2 +- fs/btrfs/file.c | 5 +++-- fs/btrfs/inode.c | 26 ++++++++++++++------------ fs/btrfs/ioctl.c | 4 ++-- fs/btrfs/reflink.c | 4 ++-- 6 files changed, 25 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 8f872ae9732d..aabc883489c1 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -579,10 +579,10 @@ int btrfs_delalloc_reserve_space(struct inode *inode, * list if there are no delalloc bytes left. * Also it will handle the qgroup reserved space. 
*/ -void btrfs_delalloc_release_space(struct inode *inode, +void btrfs_delalloc_release_space(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len, bool qgroup_free) { - btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free); - btrfs_free_reserved_data_space(BTRFS_I(inode), reserved, start, len); + btrfs_delalloc_release_metadata(inode, len, qgroup_free); + btrfs_free_reserved_data_space(inode, reserved, start, len); } diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h index 57194275923d..656807a8afa6 100644 --- a/fs/btrfs/delalloc-space.h +++ b/fs/btrfs/delalloc-space.h @@ -10,7 +10,7 @@ int btrfs_check_data_free_space(struct inode *inode, struct extent_changeset **reserved, u64 start, u64 len); void btrfs_free_reserved_data_space(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len); -void btrfs_delalloc_release_space(struct inode *inode, +void btrfs_delalloc_release_space(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len, bool qgroup_free); void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 96f2238a361b..f6c5d94f30f9 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1784,7 +1784,7 @@ again: __pos = round_down(pos, fs_info->sectorsize) + (dirty_pages << PAGE_SHIFT); - btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, __pos, release_bytes, true); } @@ -1850,7 +1850,8 @@ again: btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes, true); } else { - btrfs_delalloc_release_space(inode, data_reserved, + btrfs_delalloc_release_space(BTRFS_I(inode), + data_reserved, round_down(pos, fs_info->sectorsize), release_bytes, true); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 59e04bec4b35..5a1dfbb45734 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2328,7 +2328,8 @@ again: if (!ret) { btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - btrfs_delalloc_release_space(inode, data_reserved, + btrfs_delalloc_release_space(BTRFS_I(inode), + data_reserved, page_start, PAGE_SIZE, true); } @@ -2378,8 +2379,8 @@ again: out_reserved: btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); if (free_delalloc_space) - btrfs_delalloc_release_space(inode, data_reserved, page_start, - PAGE_SIZE, true); + btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, + page_start, PAGE_SIZE, true); unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state); out_page: @@ -4539,7 +4540,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, again: page = find_or_create_page(mapping, index, mask); if (!page) { - btrfs_delalloc_release_space(inode, data_reserved, + btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, block_start, blocksize, true); btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); ret = -ENOMEM; @@ -4615,7 +4616,7 @@ out_unlock: btrfs_delalloc_release_metadata(BTRFS_I(inode), blocksize, true); else - btrfs_delalloc_release_space(inode, data_reserved, + btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, block_start, blocksize, true); } btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); @@ -7947,8 +7948,9 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) current->journal_info = NULL; if (ret < 0 && ret != -EIOCBQUEUED) { if (dio_data.reserve) - btrfs_delalloc_release_space(inode, data_reserved, - offset, dio_data.reserve, true); 
+ btrfs_delalloc_release_space(BTRFS_I(inode), + data_reserved, offset, dio_data.reserve, + true); /* * On error we might have left some ordered extents * without submitting corresponding bios for them, so @@ -7963,7 +7965,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) dio_data.unsubmitted_oe_range_start, false); } else if (ret >= 0 && (size_t)ret < count) - btrfs_delalloc_release_space(inode, data_reserved, + btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, offset, count - (size_t)ret, true); btrfs_delalloc_release_extents(BTRFS_I(inode), count); } @@ -8277,9 +8279,9 @@ again: fs_info->sectorsize); if (reserved_space < PAGE_SIZE) { end = page_start + reserved_space - 1; - btrfs_delalloc_release_space(inode, data_reserved, - page_start, PAGE_SIZE - reserved_space, - true); + btrfs_delalloc_release_space(BTRFS_I(inode), + data_reserved, page_start, + PAGE_SIZE - reserved_space, true); } } @@ -8334,7 +8336,7 @@ out_unlock: unlock_page(page); out: btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - btrfs_delalloc_release_space(inode, data_reserved, page_start, + btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, reserved_space, (ret != 0)); out_noreserve: sb_end_pagefault(inode->i_sb); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b3e4c632d80c..90083fb04928 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1333,7 +1333,7 @@ again: spin_lock(&BTRFS_I(inode)->lock); btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); spin_unlock(&BTRFS_I(inode)->lock); - btrfs_delalloc_release_space(inode, data_reserved, + btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, start_index << PAGE_SHIFT, (page_cnt - i_done) << PAGE_SHIFT, true); } @@ -1361,7 +1361,7 @@ out: unlock_page(pages[i]); put_page(pages[i]); } - btrfs_delalloc_release_space(inode, data_reserved, + btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT, true); btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index fe3e05b51691..9da0f101548f 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -134,8 +134,8 @@ out_unlock: put_page(page); } if (ret) - btrfs_delalloc_release_space(inode, data_reserved, file_offset, - block_size, true); + btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, + file_offset, block_size, true); btrfs_delalloc_release_extents(BTRFS_I(inode), block_size); out: extent_changeset_free(data_reserved); From 36ea6f3e931391c2adbb38af8c5dd4a043d26ac5 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:41 +0300 Subject: [PATCH 088/151] btrfs: make btrfs_check_data_free_space take btrfs_inode Instead of calling BTRFS_I on the passed vfs_inode take btrfs_inode directly. 
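The data space helpers come in reserve/release pairs, and every error path is expected to return what it took. A minimal sketch of the pattern, with a hypothetical failure point, mirroring what btrfs_delalloc_reserve_space does internally:

    ret = btrfs_check_data_free_space(inode, &reserved, start, len);
    if (ret < 0)
            return ret;
    ret = do_write_work();          /* hypothetical placeholder step */
    if (ret < 0)
            btrfs_free_reserved_data_space(inode, reserved, start, len);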
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 3 ++- fs/btrfs/delalloc-space.c | 10 +++++----- fs/btrfs/delalloc-space.h | 2 +- fs/btrfs/file.c | 3 ++- fs/btrfs/inode.c | 4 ++-- 5 files changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 31ca2cfb7e3e..6ade9d345e66 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2508,7 +2508,8 @@ again: num_pages *= 16; num_pages *= PAGE_SIZE; - ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages); + ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0, + num_pages); if (ret) goto out_put; diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index aabc883489c1..0073123342d5 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -237,10 +237,10 @@ commit_trans: return 0; } -int btrfs_check_data_free_space(struct inode *inode, +int btrfs_check_data_free_space(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret; /* align the range */ @@ -248,12 +248,12 @@ int btrfs_check_data_free_space(struct inode *inode, round_down(start, fs_info->sectorsize); start = round_down(start, fs_info->sectorsize); - ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len); + ret = btrfs_alloc_data_chunk_ondemand(inode, len); if (ret < 0) return ret; /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ - ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), reserved, start, len); + ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); if (ret < 0) btrfs_free_reserved_data_space_noquota(fs_info, len); else @@ -558,7 +558,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode, { int ret; - ret = btrfs_check_data_free_space(inode, reserved, start, len); + ret = btrfs_check_data_free_space(BTRFS_I(inode), reserved, start, len); if (ret < 0) return ret; ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h index 656807a8afa6..582ba26dbe6c 100644 --- a/fs/btrfs/delalloc-space.h +++ b/fs/btrfs/delalloc-space.h @@ -6,7 +6,7 @@ struct extent_changeset; int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); -int btrfs_check_data_free_space(struct inode *inode, +int btrfs_check_data_free_space(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len); void btrfs_free_reserved_data_space(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f6c5d94f30f9..841c516079a9 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1681,7 +1681,8 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, fs_info->sectorsize); extent_changeset_release(data_reserved); - ret = btrfs_check_data_free_space(inode, &data_reserved, pos, + ret = btrfs_check_data_free_space(BTRFS_I(inode), + &data_reserved, pos, write_bytes); if (ret < 0) { if (btrfs_check_nocow_lock(BTRFS_I(inode), pos, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5a1dfbb45734..1cc2c68206a7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4519,8 +4519,8 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, block_end = block_start + blocksize - 1; - ret = btrfs_check_data_free_space(inode, &data_reserved, block_start, - 
blocksize); + ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, + block_start, blocksize); if (ret < 0) { if (btrfs_check_nocow_lock(BTRFS_I(inode), block_start, &write_bytes) > 0) { From e5b7231e2009967b63a39ce0ec6a022307616b82 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:42 +0300 Subject: [PATCH 089/151] btrfs: make btrfs_delalloc_reserve_space take btrfs_inode All of its children take btrfs_inode so bubble up this requirement to btrfs_delalloc_reserve_space's interface and stop calling BTRFS_I internally. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delalloc-space.c | 8 ++++---- fs/btrfs/delalloc-space.h | 2 +- fs/btrfs/inode-map.c | 3 ++- fs/btrfs/inode.c | 11 +++++------ fs/btrfs/ioctl.c | 2 +- fs/btrfs/reflink.c | 4 ++-- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 0073123342d5..0e354e9e57d0 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -553,17 +553,17 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) * Return 0 for success * Return <0 for error(-ENOSPC or -EQUOT) */ -int btrfs_delalloc_reserve_space(struct inode *inode, +int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len) { int ret; - ret = btrfs_check_data_free_space(BTRFS_I(inode), reserved, start, len); + ret = btrfs_check_data_free_space(inode, reserved, start, len); if (ret < 0) return ret; - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); + ret = btrfs_delalloc_reserve_metadata(inode, len); if (ret < 0) - btrfs_free_reserved_data_space(BTRFS_I(inode), *reserved, start, len); + btrfs_free_reserved_data_space(inode, *reserved, start, len); return ret; } diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h index 582ba26dbe6c..28bf5c3ef430 100644 --- a/fs/btrfs/delalloc-space.h +++ b/fs/btrfs/delalloc-space.h @@ -17,7 +17,7 @@ void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, u64 len); void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, bool qgroup_free); -int btrfs_delalloc_reserve_space(struct inode *inode, +int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len); #endif /* BTRFS_DELALLOC_SPACE_H */ diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 6009e0e939b5..76d2e43817ea 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -495,7 +495,8 @@ again: /* Just to make sure we have enough space */ prealloc += 8 * PAGE_SIZE; - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 0, prealloc); + ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, 0, + prealloc); if (ret) goto out_put; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1cc2c68206a7..cd092a7a69cc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2297,8 +2297,8 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) * This is similar to page_mkwrite, we need to reserve the space before * we take the page lock. 
*/ - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, - PAGE_SIZE); + ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, + page_start, PAGE_SIZE); again: lock_page(page); @@ -4518,7 +4518,6 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, block_start = round_down(from, blocksize); block_end = block_start + blocksize - 1; - ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, block_start, blocksize); if (ret < 0) { @@ -7916,7 +7915,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) inode_unlock(inode); relock = true; } - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, + ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, offset, count); if (ret) goto out; @@ -8231,8 +8230,8 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. */ - ret2 = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, - reserved_space); + ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, + page_start, reserved_space); if (!ret2) { ret2 = file_update_time(vmf->vma->vm_file); reserved = 1; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 90083fb04928..4adfdfa28e53 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1243,7 +1243,7 @@ static int cluster_pages_for_defrag(struct inode *inode, page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, + ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT); if (ret) diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 9da0f101548f..834eb6d98caa 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -68,8 +68,8 @@ static int copy_inline_to_page(struct inode *inode, * reservation here. Also we must not do the reservation while holding * a transaction open, otherwise we would deadlock. */ - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset, - block_size); + ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, + file_offset, block_size); if (ret) goto out; From 056d9beca3e5052134f9834dd2a159f3f113981c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 5 Jun 2020 10:51:51 +0300 Subject: [PATCH 090/151] btrfs: remove BTRFS_I calls in btrfs_writepage_fixup_worker All of its child functions use btrfs_inode.
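The idiom in the diff that follows is a single conversion at function entry: the work item keeps storing a VFS inode pointer and the worker switches types once, after which every helper takes the btrfs_inode directly (shape only):

    fixup = container_of(work, struct btrfs_writepage_fixup, work);
    inode = BTRFS_I(fixup->inode);  /* one conversion, used throughout */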
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cd092a7a69cc..0a038834bf71 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2281,7 +2281,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; struct page *page; - struct inode *inode; + struct btrfs_inode *inode; u64 page_start; u64 page_end; int ret = 0; @@ -2289,7 +2289,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) fixup = container_of(work, struct btrfs_writepage_fixup, work); page = fixup->page; - inode = fixup->inode; + inode = BTRFS_I(fixup->inode); page_start = page_offset(page); page_end = page_offset(page) + PAGE_SIZE - 1; @@ -2297,8 +2297,8 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) * This is similar to page_mkwrite, we need to reserve the space before * we take the page lock. */ - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - page_start, PAGE_SIZE); + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, + PAGE_SIZE); again: lock_page(page); @@ -2326,10 +2326,8 @@ again: * when the page was already properly dealt with. */ if (!ret) { - btrfs_delalloc_release_extents(BTRFS_I(inode), - PAGE_SIZE); - btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, + btrfs_delalloc_release_extents(inode, PAGE_SIZE); + btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE, true); } @@ -2344,25 +2342,23 @@ again: if (ret) goto out_page; - lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, - &cached_state); + lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ if (PagePrivate2(page)) goto out_reserved; - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, - PAGE_SIZE); + ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); if (ordered) { - unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, - page_end, &cached_state); + unlock_extent_cached(&inode->io_tree, page_start, page_end, + &cached_state); unlock_page(page); - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } - ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, page_end, 0, + ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, &cached_state); if (ret) goto out_reserved; @@ -2377,11 +2373,11 @@ again: BUG_ON(!PageDirty(page)); free_delalloc_space = false; out_reserved: - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_extents(inode, PAGE_SIZE); if (free_delalloc_space) - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - page_start, PAGE_SIZE, true); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, + btrfs_delalloc_release_space(inode, data_reserved, page_start, + PAGE_SIZE, true); + unlock_extent_cached(&inode->io_tree, page_start, page_end, &cached_state); out_page: if (ret) { @@ -2404,7 +2400,7 @@ out_page: * that could need flushing space. Recursing back to fixup worker would * deadlock. 
*/ - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(&inode->vfs_inode); } /* From 056d9beca3e5052134f9834dd2a159f3f113981c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:44 +0300 Subject: [PATCH 091/151] btrfs: make prealloc_file_extent_cluster take btrfs_inode The vfs inode is only used for a pair of inode_lock/unlock calls; all other uses call for btrfs_inode. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 4a2959ebece4..523d2e5fab8f 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2571,14 +2571,14 @@ out_free_blocks: return err; } -static noinline_for_stack -int prealloc_file_extent_cluster(struct inode *inode, - struct file_extent_cluster *cluster) +static noinline_for_stack int prealloc_file_extent_cluster( + struct btrfs_inode *inode, + struct file_extent_cluster *cluster) { u64 alloc_hint = 0; u64 start; u64 end; - u64 offset = BTRFS_I(inode)->index_cnt; + u64 offset = inode->index_cnt; u64 num_bytes; int nr; int ret = 0; @@ -2587,12 +2587,12 @@ int prealloc_file_extent_cluster(struct inode *inode, u64 cur_offset = prealloc_start; BUG_ON(cluster->start != cluster->boundary[0]); - ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), + ret = btrfs_alloc_data_chunk_ondemand(inode, prealloc_end + 1 - prealloc_start); if (ret) return ret; - inode_lock(inode); + inode_lock(&inode->vfs_inode); for (nr = 0; nr < cluster->nr; nr++) { start = cluster->boundary[nr] - offset; if (nr + 1 < cluster->nr) @@ -2600,20 +2600,20 @@ int prealloc_file_extent_cluster(struct inode *inode, else end = cluster->end - offset; - lock_extent(&BTRFS_I(inode)->io_tree, start, end); + lock_extent(&inode->io_tree, start, end); num_bytes = end + 1 - start; - ret = btrfs_prealloc_file_range(inode, 0, start, + ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); cur_offset = end + 1; - unlock_extent(&BTRFS_I(inode)->io_tree, start, end); + unlock_extent(&inode->io_tree, start, end); if (ret) break; } - inode_unlock(inode); + inode_unlock(&inode->vfs_inode); if (cur_offset < prealloc_end) - btrfs_free_reserved_data_space_noquota(btrfs_sb(inode->i_sb), + btrfs_free_reserved_data_space_noquota(inode->root->fs_info, prealloc_end + 1 - cur_offset); return ret; } @@ -2682,7 +2682,7 @@ static int relocate_file_extent_cluster(struct inode *inode, if (!ra) return -ENOMEM; - ret = prealloc_file_extent_cluster(inode, cluster); + ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster); if (ret) goto out; From d90944141b4a7704fac7f2b8fcc566511176e4b4 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 5 Jun 2020 10:41:13 +0300 Subject: [PATCH 092/151] btrfs: make btrfs_set_inode_last_trans take btrfs_inode Instead of making multiple calls to BTRFS_I, simply take btrfs_inode as an input parameter.
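The resulting calling convention, sketched for illustration (the callers shown are representative of the diff below):

    /* before: the helper did the BTRFS_I() conversions internally */
    btrfs_set_inode_last_trans(trans, inode);

    /* after: the caller converts once at the VFS boundary */
    btrfs_set_inode_last_trans(trans, BTRFS_I(inode));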
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 6 +++--- fs/btrfs/transaction.h | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0a038834bf71..07dd8aa4f708 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3481,7 +3481,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, fill_inode_item(trans, leaf, inode_item, inode); btrfs_mark_buffer_dirty(leaf); - btrfs_set_inode_last_trans(trans, inode); + btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); ret = 0; failed: btrfs_free_path(path); @@ -3511,7 +3511,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, ret = btrfs_delayed_update_inode(trans, root, inode); if (!ret) - btrfs_set_inode_last_trans(trans, inode); + btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); return ret; } @@ -6053,7 +6053,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, inode_tree_add(inode); trace_btrfs_inode_new(inode); - btrfs_set_inode_last_trans(trans, inode); + btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); btrfs_update_root_times(trans, root); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index bf102e64bfb2..6f65fff6cf50 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -156,13 +156,13 @@ struct btrfs_pending_snapshot { }; static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, - struct inode *inode) + struct btrfs_inode *inode) { - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->last_trans = trans->transaction->transid; - BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; - BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + inode->last_trans = trans->transaction->transid; + inode->last_sub_trans = inode->root->log_transid; + inode->last_log_commit = inode->root->last_log_commit; + spin_unlock(&inode->lock); } /* From cfdd45921571eb24073e0737fa0bd44b4218f914 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Jun 2020 08:55:46 +0300 Subject: [PATCH 093/151] btrfs: make btrfs_qgroup_check_reserved_leak take btrfs_inode vfs_inode is used only for the inode number; everything else requires btrfs_inode.
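A note on the format strings touched below: with a btrfs_inode in hand the inode number comes from btrfs_ino(), which returns u64 and therefore pairs with %llu rather than the %lu used for the VFS i_ino. The call-site conversion is a one-liner (illustrative; see the diff):

    btrfs_qgroup_check_reserved_leak(BTRFS_I(inode));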
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba [ use btrfs_ino ] Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- fs/btrfs/qgroup.c | 14 +++++++------- fs/btrfs/qgroup.h | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 07dd8aa4f708..5d2ce8092531 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8631,7 +8631,7 @@ void btrfs_destroy_inode(struct inode *inode) btrfs_put_ordered_extent(ordered); } } - btrfs_qgroup_check_reserved_leak(inode); + btrfs_qgroup_check_reserved_leak(BTRFS_I(inode)); inode_tree_del(inode); btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); btrfs_inode_clear_file_extent_range(BTRFS_I(inode), 0, (u64)-1); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 657c09be159a..787128d7e196 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3784,7 +3784,7 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) * Check qgroup reserved space leaking, normally at destroy inode * time */ -void btrfs_qgroup_check_reserved_leak(struct inode *inode) +void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode) { struct extent_changeset changeset; struct ulist_node *unode; @@ -3792,19 +3792,19 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) int ret; extent_changeset_init(&changeset); - ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1, EXTENT_QGROUP_RESERVED, &changeset); WARN_ON(ret < 0); if (WARN_ON(changeset.bytes_changed)) { ULIST_ITER_INIT(&iter); while ((unode = ulist_next(&changeset.range_changed, &iter))) { - btrfs_warn(BTRFS_I(inode)->root->fs_info, - "leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu", - inode->i_ino, unode->val, unode->aux); + btrfs_warn(inode->root->fs_info, + "leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu", + btrfs_ino(inode), unode->val, unode->aux); } - btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, - BTRFS_I(inode)->root->root_key.objectid, + btrfs_qgroup_free_refroot(inode->root->fs_info, + inode->root->root_key.objectid, changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 2a232967f8e3..f7d6f06ab555 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -399,7 +399,7 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root); */ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes); -void btrfs_qgroup_check_reserved_leak(struct inode *inode); +void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode); /* btrfs_qgroup_swapped_blocks related functions */ void btrfs_qgroup_init_swapped_blocks( From 06f67c47076e5c3ee65276171596479dcc3a3941 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 28 Jun 2020 13:07:14 +0800 Subject: [PATCH 094/151] btrfs: use __u16 for the return value of btrfs_qgroup_level() The qgroup level is limited to u16, so no need to use u64 for it. 
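A quick worked example of the id layout (illustrative only): the level lives in the top 16 bits of the 64-bit qgroup id, above BTRFS_QGROUP_LEVEL_SHIFT (48), which is why __u16 is wide enough:

    __u64 qgroupid = (1ULL << BTRFS_QGROUP_LEVEL_SHIFT) | 5;

    btrfs_qgroup_level(qgroupid);    /* 1, fits in a __u16 */
    btrfs_qgroup_subvolid(qgroupid); /* 5, the low 48 bits */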
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 2 +- include/uapi/linux/btrfs_tree.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 787128d7e196..229e461dbfc3 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -538,7 +538,7 @@ bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info) if (qgroup->rsv.values[i]) { ret = true; btrfs_warn(fs_info, - "qgroup %llu/%llu has unreleased space, type %d rsv %llu", + "qgroup %hu/%llu has unreleased space, type %d rsv %llu", btrfs_qgroup_level(qgroup->qgroupid), btrfs_qgroup_subvolid(qgroup->qgroupid), i, qgroup->rsv.values[i]); diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index a3f3975df0de..9ba64ca6b4ac 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -913,9 +913,9 @@ struct btrfs_free_space_info { #define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0) #define BTRFS_QGROUP_LEVEL_SHIFT 48 -static inline __u64 btrfs_qgroup_level(__u64 qgroupid) +static inline __u16 btrfs_qgroup_level(__u64 qgroupid) { - return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT; + return (__u16)(qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT); } /* From 49e5fb46211de0744170928466e68166eabbd8b3 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 28 Jun 2020 13:07:15 +0800 Subject: [PATCH 095/151] btrfs: qgroup: export qgroups in sysfs This patch will add the following sysfs interface: /sys/fs/btrfs/<UUID>/qgroups/<level>_<id>/referenced /sys/fs/btrfs/<UUID>/qgroups/<level>_<id>/exclusive /sys/fs/btrfs/<UUID>/qgroups/<level>_<id>/max_referenced /sys/fs/btrfs/<UUID>/qgroups/<level>_<id>/max_exclusive /sys/fs/btrfs/<UUID>/qgroups/<level>_<id>/limit_flags Which is also available in the output of "btrfs qgroup show". /sys/fs/btrfs/<UUID>/qgroups/<level>_<id>/rsv_data /sys/fs/btrfs/<UUID>/qgroups/<level>_<id>/rsv_meta_pertrans /sys/fs/btrfs/<UUID>/qgroups/<level>_<id>/rsv_meta_prealloc The last 3 rsv related members are not visible to users, but can be very useful to debug qgroup limit related bugs. Also, to avoid '/' being used in <level>/<id>, the separator between qgroup level and qgroup id is changed to '_'. The interface is not hidden behind 'debug' as we want this interface to be included into production build and to provide another way to read the qgroup information besides the ioctls. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/qgroup.c | 44 +++++++++++--- fs/btrfs/qgroup.h | 11 ++++ fs/btrfs/sysfs.c | 148 ++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/sysfs.h | 7 +++ 5 files changed, 202 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0ea3d89beb26..2b0d95a4b0da 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -779,6 +779,7 @@ struct btrfs_fs_info { u32 thread_pool_size; struct kobject *space_info_kobj; + struct kobject *qgroups_kobj; u64 total_pinned; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 229e461dbfc3..ee0ad33b659c 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -22,6 +22,7 @@ #include "extent_io.h" #include "qgroup.h" #include "block-group.h" +#include "sysfs.h" /* TODO XXX FIXME * - subvol delete -> delete when ref goes to 0? delete limits also?
@@ -220,10 +221,12 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, return qgroup; } -static void __del_qgroup_rb(struct btrfs_qgroup *qgroup) +static void __del_qgroup_rb(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup) { struct btrfs_qgroup_list *list; + btrfs_sysfs_del_one_qgroup(fs_info, qgroup); list_del(&qgroup->dirty); while (!list_empty(&qgroup->groups)) { list = list_first_entry(&qgroup->groups, @@ -252,7 +255,7 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) return -ENOENT; rb_erase(&qgroup->node, &fs_info->qgroup_tree); - __del_qgroup_rb(qgroup); + __del_qgroup_rb(fs_info, qgroup); return 0; } @@ -351,6 +354,9 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) goto out; } + ret = btrfs_sysfs_add_qgroups(fs_info); + if (ret < 0) + goto out; /* default this to quota off, in case no status key is found */ fs_info->qgroup_flags = 0; @@ -412,6 +418,10 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) goto out; } } + ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); + if (ret < 0) + goto out; + switch (found_key.type) { case BTRFS_QGROUP_INFO_KEY: { struct btrfs_qgroup_info_item *ptr; @@ -500,16 +510,12 @@ out: ulist_free(fs_info->qgroup_ulist); fs_info->qgroup_ulist = NULL; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + btrfs_sysfs_del_qgroups(fs_info); } return ret < 0 ? ret : 0; } -static u64 btrfs_qgroup_subvolid(u64 qgroupid) -{ - return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1)); -} - /* * Called in close_ctree() when quota is still enabled. This verifies we don't * leak some reserved space. @@ -562,7 +568,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) while ((n = rb_first(&fs_info->qgroup_tree))) { qgroup = rb_entry(n, struct btrfs_qgroup, node); rb_erase(n, &fs_info->qgroup_tree); - __del_qgroup_rb(qgroup); + __del_qgroup_rb(fs_info, qgroup); } /* * We call btrfs_free_qgroup_config() when unmounting @@ -571,6 +577,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) */ ulist_free(fs_info->qgroup_ulist); fs_info->qgroup_ulist = NULL; + btrfs_sysfs_del_qgroups(fs_info); } static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, @@ -943,6 +950,9 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) goto out; } + ret = btrfs_sysfs_add_qgroups(fs_info); + if (ret < 0) + goto out; /* * 1 for quota root item * 1 for BTRFS_QGROUP_STATUS item @@ -1030,6 +1040,11 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) btrfs_abort_transaction(trans, ret); goto out_free_path; } + ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out_free_path; + } } ret = btrfs_next_item(tree_root, path); if (ret < 0) { @@ -1054,6 +1069,11 @@ out_add_root: btrfs_abort_transaction(trans, ret); goto out_free_path; } + ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out_free_path; + } ret = btrfs_commit_transaction(trans); trans = NULL; @@ -1089,6 +1109,7 @@ out: fs_info->qgroup_ulist = NULL; if (trans) btrfs_end_transaction(trans); + btrfs_sysfs_del_qgroups(fs_info); } mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; @@ -1441,8 +1462,11 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) qgroup = add_qgroup_rb(fs_info, qgroupid); spin_unlock(&fs_info->qgroup_lock); - if (IS_ERR(qgroup)) + if (IS_ERR(qgroup)) { ret = PTR_ERR(qgroup); + goto out; + } + ret = 
btrfs_sysfs_add_one_qgroup(fs_info, qgroup); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; @@ -2861,6 +2885,8 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, unlock: spin_unlock(&fs_info->qgroup_lock); + if (!ret) + ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup); out: if (!committing) mutex_unlock(&fs_info->qgroup_ioctl_lock); diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index f7d6f06ab555..50dea9a2d8fb 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -8,6 +8,7 @@ #include <linux/spinlock.h> #include <linux/rbtree.h> +#include <linux/kobject.h> #include "ulist.h" #include "delayed-ref.h" @@ -223,8 +224,18 @@ struct btrfs_qgroup { */ u64 old_refcnt; u64 new_refcnt; + + /* + * Sysfs kobjectid + */ + struct kobject kobj; }; +static inline u64 btrfs_qgroup_subvolid(u64 qgroupid) +{ + return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1)); +} + /* * For qgroup event trace points only */ diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index a39bff64ff24..5885abe57c3e 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -19,6 +19,7 @@ #include "volumes.h" #include "space-info.h" #include "block-group.h" +#include "qgroup.h" struct btrfs_feature_attr { struct kobj_attribute kobj_attr; @@ -1455,6 +1456,153 @@ failure: return error; } +static inline struct btrfs_fs_info *qgroup_kobj_to_fs_info(struct kobject *kobj) +{ + return to_fs_info(kobj->parent->parent); +} + +#define QGROUP_ATTR(_member, _show_name) \ +static ssize_t btrfs_qgroup_show_##_member(struct kobject *qgroup_kobj, \ + struct kobj_attribute *a, \ + char *buf) \ +{ \ + struct btrfs_fs_info *fs_info = qgroup_kobj_to_fs_info(qgroup_kobj); \ + struct btrfs_qgroup *qgroup = container_of(qgroup_kobj, \ + struct btrfs_qgroup, kobj); \ + return btrfs_show_u64(&qgroup->_member, &fs_info->qgroup_lock, buf); \ +} \ +BTRFS_ATTR(qgroup, _show_name, btrfs_qgroup_show_##_member) + +#define QGROUP_RSV_ATTR(_name, _type) \ +static ssize_t btrfs_qgroup_rsv_show_##_name(struct kobject *qgroup_kobj, \ + struct kobj_attribute *a, \ + char *buf) \ +{ \ + struct btrfs_fs_info *fs_info = qgroup_kobj_to_fs_info(qgroup_kobj); \ + struct btrfs_qgroup *qgroup = container_of(qgroup_kobj, \ + struct btrfs_qgroup, kobj); \ + return btrfs_show_u64(&qgroup->rsv.values[_type], \ + &fs_info->qgroup_lock, buf); \ +} \ +BTRFS_ATTR(qgroup, rsv_##_name, btrfs_qgroup_rsv_show_##_name) + +QGROUP_ATTR(rfer, referenced); +QGROUP_ATTR(excl, exclusive); +QGROUP_ATTR(max_rfer, max_referenced); +QGROUP_ATTR(max_excl, max_exclusive); +QGROUP_ATTR(lim_flags, limit_flags); +QGROUP_RSV_ATTR(data, BTRFS_QGROUP_RSV_DATA); +QGROUP_RSV_ATTR(meta_pertrans, BTRFS_QGROUP_RSV_META_PERTRANS); +QGROUP_RSV_ATTR(meta_prealloc, BTRFS_QGROUP_RSV_META_PREALLOC); + +static struct attribute *qgroup_attrs[] = { + BTRFS_ATTR_PTR(qgroup, referenced), + BTRFS_ATTR_PTR(qgroup, exclusive), + BTRFS_ATTR_PTR(qgroup, max_referenced), + BTRFS_ATTR_PTR(qgroup, max_exclusive), + BTRFS_ATTR_PTR(qgroup, limit_flags), + BTRFS_ATTR_PTR(qgroup, rsv_data), + BTRFS_ATTR_PTR(qgroup, rsv_meta_pertrans), + BTRFS_ATTR_PTR(qgroup, rsv_meta_prealloc), + NULL +}; +ATTRIBUTE_GROUPS(qgroup); + +static void qgroup_release(struct kobject *kobj) +{ + struct btrfs_qgroup *qgroup = container_of(kobj, struct btrfs_qgroup, kobj); + + memset(&qgroup->kobj, 0, sizeof(*kobj)); +} + +static struct kobj_type qgroup_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = qgroup_release, + .default_groups = qgroup_groups, +}; + +int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup) +{ + 
struct kobject *qgroups_kobj = fs_info->qgroups_kobj; + int ret; + + if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + return 0; + if (qgroup->kobj.state_initialized) + return 0; + if (!qgroups_kobj) + return -EINVAL; + + ret = kobject_init_and_add(&qgroup->kobj, &qgroup_ktype, qgroups_kobj, + "%hu_%llu", btrfs_qgroup_level(qgroup->qgroupid), + btrfs_qgroup_subvolid(qgroup->qgroupid)); + if (ret < 0) + kobject_put(&qgroup->kobj); + + return ret; +} + +void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info) +{ + struct btrfs_qgroup *qgroup; + struct btrfs_qgroup *next; + + if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + return; + + rbtree_postorder_for_each_entry_safe(qgroup, next, + &fs_info->qgroup_tree, node) + btrfs_sysfs_del_one_qgroup(fs_info, qgroup); + kobject_del(fs_info->qgroups_kobj); + kobject_put(fs_info->qgroups_kobj); + fs_info->qgroups_kobj = NULL; +} + +/* Called when qgroups get initialized, thus there is no need for locking */ +int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info) +{ + struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj; + struct btrfs_qgroup *qgroup; + struct btrfs_qgroup *next; + int ret = 0; + + if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + return 0; + + ASSERT(fsid_kobj); + if (fs_info->qgroups_kobj) + return 0; + + fs_info->qgroups_kobj = kobject_create_and_add("qgroups", fsid_kobj); + if (!fs_info->qgroups_kobj) { + ret = -ENOMEM; + goto out; + } + rbtree_postorder_for_each_entry_safe(qgroup, next, + &fs_info->qgroup_tree, node) { + ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); + if (ret < 0) + goto out; + } + +out: + if (ret < 0) + btrfs_sysfs_del_qgroups(fs_info); + return ret; +} + +void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup) +{ + if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + return; + + if (qgroup->kobj.state_initialized) { + kobject_del(&qgroup->kobj); + kobject_put(&qgroup->kobj); + } +} /* * Change per-fs features in /sys/fs/btrfs/UUID/features to match current diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 718a26c97833..cf839c46a131 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -36,4 +36,11 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info); void btrfs_sysfs_update_devid(struct btrfs_device *device); +int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup); +void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info); +int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info); +void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup); + #endif From 851fd730a743e072badaf67caf39883e32439431 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 16 Jun 2020 10:17:34 +0800 Subject: [PATCH 096/151] btrfs: don't allocate anonymous block device for user invisible roots [BUG] When a lot of subvolumes are created, there is a user report about transaction aborted: BTRFS: Transaction aborted (error -24) WARNING: CPU: 17 PID: 17041 at fs/btrfs/transaction.c:1576 create_pending_snapshot+0xbc4/0xd10 [btrfs] RIP: 0010:create_pending_snapshot+0xbc4/0xd10 [btrfs] Call Trace: create_pending_snapshots+0x82/0xa0 [btrfs] btrfs_commit_transaction+0x275/0x8c0 [btrfs] btrfs_mksubvol+0x4b9/0x500 [btrfs] btrfs_ioctl_snap_create_transid+0x174/0x180 [btrfs] btrfs_ioctl_snap_create_v2+0x11c/0x180 [btrfs] btrfs_ioctl+0x11a4/0x2da0 [btrfs] 
do_vfs_ioctl+0xa9/0x640 ksys_ioctl+0x67/0x90 __x64_sys_ioctl+0x1a/0x20 do_syscall_64+0x5a/0x110 entry_SYSCALL_64_after_hwframe+0x44/0xa9 ---[ end trace 33f2f83f3d5250e9 ]--- BTRFS: error (device sda1) in create_pending_snapshot:1576: errno=-24 unknown BTRFS info (device sda1): forced readonly BTRFS warning (device sda1): Skipping commit of aborted transaction. BTRFS: error (device sda1) in cleanup_transaction:1831: errno=-24 unknown [CAUSE] The error is EMFILE (Too many files open) and comes from the anonymous block device allocation. The ids are in a shared pool of size 1<<20. The ids are assigned to live subvolumes, i.e. the root structure exists in memory (e.g. after creation or after the root appears in some path). The pool could be exhausted if the numbers are not reclaimed fast enough, after subvolume deletion or if some other system component uses the anon block devices. [WORKAROUND] Since it's not possible to completely solve the problem, we can only minimize the time the id is allocated to a subvolume root. Firstly, we can reduce the use of anon_dev by trees that are not subvolume roots, like the data reloc tree. This patch adds an extra check on the root objectid, to skip roots that don't need anon_dev. Currently it's only the data reloc tree and orphan roots. Reported-by: Greed Rong Link: https://lore.kernel.org/linux-btrfs/CA+UqX+NTrZ6boGnWHhSeZmEY5J76CTqmYjO2S+=tHJX7nb9DPw@mail.gmail.com/ CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index dbf90cd49513..c90edf04a9db 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1424,9 +1424,16 @@ static int btrfs_init_fs_root(struct btrfs_root *root) spin_lock_init(&root->ino_cache_lock); init_waitqueue_head(&root->ino_cache_wait); - ret = get_anon_bdev(&root->anon_dev); - if (ret) - goto fail; + /* + * Don't assign anonymous block device to roots that are not exposed to + * userspace, the id pool is limited to 1M + */ + if (is_fstree(root->root_key.objectid) && + btrfs_root_refs(&root->root_item) > 0) { + ret = get_anon_bdev(&root->anon_dev); + if (ret) + goto fail; + } mutex_lock(&root->objectid_mutex); ret = btrfs_find_highest_objectid(root, From 082b6c970f02fefd278c7833880cda29691a5f34 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 16 Jun 2020 10:17:37 +0800 Subject: [PATCH 097/151] btrfs: free anon block device right after subvolume deletion [BUG] When a lot of subvolumes are created, there is a user report about transaction aborted caused by slow anonymous block device reclaim: BTRFS: Transaction aborted (error -24) WARNING: CPU: 17 PID: 17041 at fs/btrfs/transaction.c:1576 create_pending_snapshot+0xbc4/0xd10 [btrfs] RIP: 0010:create_pending_snapshot+0xbc4/0xd10 [btrfs] Call Trace: create_pending_snapshots+0x82/0xa0 [btrfs] btrfs_commit_transaction+0x275/0x8c0 [btrfs] btrfs_mksubvol+0x4b9/0x500 [btrfs] btrfs_ioctl_snap_create_transid+0x174/0x180 [btrfs] btrfs_ioctl_snap_create_v2+0x11c/0x180 [btrfs] btrfs_ioctl+0x11a4/0x2da0 [btrfs] do_vfs_ioctl+0xa9/0x640 ksys_ioctl+0x67/0x90 __x64_sys_ioctl+0x1a/0x20 do_syscall_64+0x5a/0x110 entry_SYSCALL_64_after_hwframe+0x44/0xa9 ---[ end trace 33f2f83f3d5250e9 ]--- BTRFS: error (device sda1) in create_pending_snapshot:1576: errno=-24 unknown BTRFS info (device sda1): forced readonly BTRFS warning (device sda1): Skipping commit of aborted transaction.
BTRFS: error (device sda1) in cleanup_transaction:1831: errno=-24 unknown [CAUSE] The anonymous device pool is shared and its size is 1M. It's possible to hit that limit if the subvolume deletion is not fast enough and the subvolumes to be cleaned keep the ids allocated. [WORKAROUND] We can't avoid the anon device pool exhaustion but we can shorten the time the id is attached to the subvolume root once the subvolume becomes invisible to the user. Reported-by: Greed Rong Link: https://lore.kernel.org/linux-btrfs/CA+UqX+NTrZ6boGnWHhSeZmEY5J76CTqmYjO2S+=tHJX7nb9DPw@mail.gmail.com/ CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5d2ce8092531..f066cad2d039 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4026,6 +4026,8 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) } } + free_anon_bdev(dest->anon_dev); + dest->anon_dev = 0; out_end_trans: trans->block_rsv = NULL; trans->bytes_reserved = 0; From 2dfb1e43f57dd3aeaa66f7cf05d068db2d4c8788 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 16 Jun 2020 10:17:36 +0800 Subject: [PATCH 098/151] btrfs: preallocate anon block device at first phase of snapshot creation [BUG] When the anonymous block device pool is exhausted, subvolume/snapshot creation fails with EMFILE (Too many files open). This has been reported by a user. The allocation happens in the second phase during transaction commit, where the only way out is to abort the transaction: BTRFS: Transaction aborted (error -24) WARNING: CPU: 17 PID: 17041 at fs/btrfs/transaction.c:1576 create_pending_snapshot+0xbc4/0xd10 [btrfs] RIP: 0010:create_pending_snapshot+0xbc4/0xd10 [btrfs] Call Trace: create_pending_snapshots+0x82/0xa0 [btrfs] btrfs_commit_transaction+0x275/0x8c0 [btrfs] btrfs_mksubvol+0x4b9/0x500 [btrfs] btrfs_ioctl_snap_create_transid+0x174/0x180 [btrfs] btrfs_ioctl_snap_create_v2+0x11c/0x180 [btrfs] btrfs_ioctl+0x11a4/0x2da0 [btrfs] do_vfs_ioctl+0xa9/0x640 ksys_ioctl+0x67/0x90 __x64_sys_ioctl+0x1a/0x20 do_syscall_64+0x5a/0x110 entry_SYSCALL_64_after_hwframe+0x44/0xa9 ---[ end trace 33f2f83f3d5250e9 ]--- BTRFS: error (device sda1) in create_pending_snapshot:1576: errno=-24 unknown BTRFS info (device sda1): forced readonly BTRFS warning (device sda1): Skipping commit of aborted transaction. BTRFS: error (device sda1) in cleanup_transaction:1831: errno=-24 unknown [CAUSE] When the global anonymous block device pool is exhausted, the following call chain will fail, and lead to transaction abort: btrfs_ioctl_snap_create_v2() |- btrfs_ioctl_snap_create_transid() |- btrfs_mksubvol() |- btrfs_commit_transaction() |- create_pending_snapshot() |- btrfs_get_fs_root() |- btrfs_init_fs_root() |- get_anon_bdev() [FIX] Although we can't enlarge the anonymous block device pool, at least we can preallocate anon_dev for subvolume/snapshot in the first phase, outside of transaction context and exactly at the moment the user calls the creation ioctl.
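Sketched, the resulting two-phase flow looks roughly like this (error paths trimmed for brevity; the full diff follows):

    dev_t anon_dev = 0;

    /* phase one: ioctl context, no transaction held */
    ret = get_anon_bdev(&anon_dev);
    if (ret < 0)
            goto fail_free;

    /* phase two: hand the preallocated id to the new root */
    new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
    /* on success the id is owned by new_root and freed in btrfs_put_root() */
    anon_dev = 0;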
Reported-by: Greed Rong Link: https://lore.kernel.org/linux-btrfs/CA+UqX+NTrZ6boGnWHhSeZmEY5J76CTqmYjO2S+=tHJX7nb9DPw@mail.gmail.com/ CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 71 +++++++++++++++++++++++++++++++++++++----- fs/btrfs/disk-io.h | 2 ++ fs/btrfs/ioctl.c | 21 ++++++++++++- fs/btrfs/transaction.c | 2 +- fs/btrfs/transaction.h | 2 ++ 5 files changed, 89 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c90edf04a9db..9b307034d32c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1391,7 +1391,12 @@ alloc_fail: goto out; } -static int btrfs_init_fs_root(struct btrfs_root *root) +/* + * Initialize subvolume root in-memory structure + * + * @anon_dev: anonymous device to attach to the root, if zero, allocate new + */ +static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) { int ret; unsigned int nofs_flag; @@ -1430,9 +1435,13 @@ static int btrfs_init_fs_root(struct btrfs_root *root) */ if (is_fstree(root->root_key.objectid) && btrfs_root_refs(&root->root_item) > 0) { - ret = get_anon_bdev(&root->anon_dev); - if (ret) - goto fail; + if (!anon_dev) { + ret = get_anon_bdev(&root->anon_dev); + if (ret) + goto fail; + } else { + root->anon_dev = anon_dev; + } } mutex_lock(&root->objectid_mutex); @@ -1537,8 +1546,27 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) } -struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, - u64 objectid, bool check_ref) +/* + * Get an in-memory reference of a root structure. + * + * For essential trees like root/extent tree, we grab it from fs_info directly. + * For subvolume trees, we check the cached filesystem roots first. If not + * found, then read it from disk and add it to cached fs roots. + * + * Caller should release the root by calling btrfs_put_root() after the usage. + * + * NOTE: Reloc and log trees can't be read by this function as they share the + * same root objectid. + * + * @objectid: root id + * @anon_dev: preallocated anonymous block device number for new roots, + * pass 0 for new allocation. 
+ * @check_ref: whether to check root item references, If true, return -ENOENT + * for orphan roots + */ +static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info, + u64 objectid, dev_t anon_dev, + bool check_ref) { struct btrfs_root *root; struct btrfs_path *path; @@ -1567,6 +1595,8 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, again: root = btrfs_lookup_fs_root(fs_info, objectid); if (root) { + /* Shouldn't get preallocated anon_dev for cached roots */ + ASSERT(!anon_dev); if (check_ref && btrfs_root_refs(&root->root_item) == 0) { btrfs_put_root(root); return ERR_PTR(-ENOENT); @@ -1586,7 +1616,7 @@ again: goto fail; } - ret = btrfs_init_fs_root(root); + ret = btrfs_init_fs_root(root, anon_dev); if (ret) goto fail; @@ -1619,6 +1649,33 @@ fail: return ERR_PTR(ret); } +/* + * Get in-memory reference of a root structure + * + * @objectid: tree objectid + * @check_ref: if set, verify that the tree exists and the item has at least + * one reference + */ +struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, + u64 objectid, bool check_ref) +{ + return btrfs_get_root_ref(fs_info, objectid, 0, check_ref); +} + +/* + * Get in-memory reference of a root structure, created as new, optionally pass + * the anonymous block device id + * + * @objectid: tree objectid + * @anon_dev: if zero, allocate a new anonymous block device or use the + * parameter value + */ +struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, + u64 objectid, dev_t anon_dev) +{ + return btrfs_get_root_ref(fs_info, objectid, anon_dev, true); +} + static int btrfs_congested_fn(void *congested_data, int bdi_bits) { struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index bf43245406c4..00dc39d47ed3 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -67,6 +67,8 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, u64 objectid, bool check_ref); +struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, + u64 objectid, dev_t anon_dev); void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4adfdfa28e53..ab34179d7cbc 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -566,6 +566,7 @@ static noinline int create_subvol(struct inode *dir, struct inode *inode; int ret; int err; + dev_t anon_dev = 0; u64 objectid; u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; u64 index = 0; @@ -578,6 +579,10 @@ static noinline int create_subvol(struct inode *dir, if (ret) goto fail_free; + ret = get_anon_bdev(&anon_dev); + if (ret < 0) + goto fail_free; + /* * Don't create subvolume whose level is not zero. Or qgroup will be * screwed up since it assumes subvolume qgroup's level to be 0. 
@@ -660,12 +665,15 @@ static noinline int create_subvol(struct inode *dir, goto fail; key.offset = (u64)-1; - new_root = btrfs_get_fs_root(fs_info, objectid, true); + new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev); if (IS_ERR(new_root)) { + free_anon_bdev(anon_dev); ret = PTR_ERR(new_root); btrfs_abort_transaction(trans, ret); goto fail; } + /* Freeing will be done in btrfs_put_root() of new_root */ + anon_dev = 0; btrfs_record_root_in_trans(trans, new_root); @@ -735,6 +743,8 @@ fail: return ret; fail_free: + if (anon_dev) + free_anon_bdev(anon_dev); kfree(root_item); return ret; } @@ -762,6 +772,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (!pending_snapshot) return -ENOMEM; + ret = get_anon_bdev(&pending_snapshot->anon_dev); + if (ret < 0) + goto free_pending; pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item), GFP_KERNEL); pending_snapshot->path = btrfs_alloc_path(); @@ -823,10 +836,16 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, d_instantiate(dentry, inode); ret = 0; + pending_snapshot->anon_dev = 0; fail: + /* Prevent double freeing of anon_dev */ + if (ret && pending_snapshot->snap) + pending_snapshot->snap->anon_dev = 0; btrfs_put_root(pending_snapshot->snap); btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv); free_pending: + if (pending_snapshot->anon_dev) + free_anon_bdev(pending_snapshot->anon_dev); kfree(pending_snapshot->root_item); btrfs_free_path(pending_snapshot->path); kfree(pending_snapshot); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b359d4b17658..3a7c392fbcc9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1630,7 +1630,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } key.offset = (u64)-1; - pending->snap = btrfs_get_fs_root(fs_info, objectid, true); + pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev); if (IS_ERR(pending->snap)) { ret = PTR_ERR(pending->snap); btrfs_abort_transaction(trans, ret); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 6f65fff6cf50..98f7860af5e9 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -151,6 +151,8 @@ struct btrfs_pending_snapshot { struct btrfs_block_rsv block_rsv; /* extra metadata reservation for relocation */ int error; + /* Preallocated anonymous block device number */ + dev_t anon_dev; bool readonly; struct list_head list; }; From 8c8648dd1f6d62aeb912deeb788b6ac33cb782e7 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 2 Jul 2020 12:31:59 +0100 Subject: [PATCH 099/151] btrfs: only commit the delayed inode when doing a full fsync Commit 2c2c452b0cafdc ("Btrfs: fix fsync when extend references are added to an inode") forced a commit of the delayed inode when logging an inode in order to ensure we would end up logging the inode item during a full fsync. By committing the delayed inode, we updated the inode item in the fs/subvolume tree and then later when copying items from leafs modified in the current transaction into the log tree (with copy_inode_items_to_log()) we ended up copying the inode item from the fs/subvolume tree into the log tree. Logging an up to date version of the inode item is required to make sure at log replay time we get the link count fixup triggered among other things (replay xattr deletes, etc). The test case generic/040 from fstests exercises the bug which that commit fixed. 
However for a fast fsync we don't need to commit the delayed inode because we always log an up to date version of the inode item based on the struct btrfs_inode we have in-memory. We started doing this for fast fsyncs since commit e4545de5b035c7 ("Btrfs: fix fsync data loss after append write"). So just stop committing the delayed inode if we are doing a fast fsync; we are only wasting time and adding contention on the fs/subvolume tree. This patch is part of a series that has the following patches: 1/4 btrfs: only commit the delayed inode when doing a full fsync 2/4 btrfs: only commit delayed items at fsync if we are logging a directory 3/4 btrfs: stop incrementing log_batch for the log root tree when syncing log 4/4 btrfs: remove no longer needed use of log_writers for the log root tree After the entire patchset is applied I saw about a 12% decrease in the max latency reported by dbench. The test was done on a qemu vm, with 8 cores, 16Gb of ram, using kvm and using a raw NVMe device directly (no intermediary fs on the host). The test was invoked like the following: mkfs.btrfs -f /dev/sdk mount -o ssd -o nospace_cache /dev/sdk /mnt/sdk dbench -D /mnt/sdk -t 300 8 umount /mnt/sdk CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index df6d4e3e40b1..44bbf8919883 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5130,7 +5130,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key max_key; struct btrfs_root *log = root->log_root; int err = 0; - int ret; + int ret = 0; bool fast_search = false; u64 ino = btrfs_ino(inode); struct extent_map_tree *em_tree = &inode->extent_tree; @@ -5167,14 +5167,16 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, /* * Only run delayed items if we are a dir or a new file. - * Otherwise commit the delayed inode only, which is needed in - * order for the log replay code to mark inodes for link count - * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). + * Otherwise commit the delayed inode only if the full sync flag is set, + * as we want to make sure an up to date version is in the subvolume + * tree so copy_inode_items_to_log() / copy_items() can find it and copy + * it to the log tree. For a non full sync, we always log the inode item + * based on the in-memory struct btrfs_inode which is always up to date. */ if (S_ISDIR(inode->vfs_inode.i_mode) || inode->generation > fs_info->last_trans_committed) ret = btrfs_commit_inode_delayed_items(trans, inode); - else + else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) ret = btrfs_commit_inode_delayed_inode(inode); if (ret) { From 5aa7d1a7f4a2f8ca6be1f32415e9365d026e8fa7 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 2 Jul 2020 12:32:20 +0100 Subject: [PATCH 100/151] btrfs: only commit delayed items at fsync if we are logging a directory When logging an inode we are committing its delayed items if either the inode is a directory or if it is a new inode, created in the current transaction. We need to do it for directories, since new directory indexes are stored as delayed items of the inode and when logging a directory we need to be able to access all indexes from the fs/subvolume tree in order to figure out which index ranges need to be logged.
However for new inodes that are not directories, we do not need to do it because the only type of delayed item they can have is the inode item, and we are guaranteed to always log an up to date version of the inode item: *) for a full fsync we do it by committing the delayed inode and then copying the item from the fs/subvolume tree with copy_inode_items_to_log(); *) for a fast fsync we always log the inode item based on the contents of the in-memory struct btrfs_inode. We guarantee this is always done since commit e4545de5b035c7 ("Btrfs: fix fsync data loss after append write"). So stop running delayed items for new inodes that are not directories, since that forces committing the delayed inode into the fs/subvolume tree, wasting time and adding contention to the tree when a full fsync is not required. We will only do it in case a full fsync is needed. This patch is part of a series that has the following patches: 1/4 btrfs: only commit the delayed inode when doing a full fsync 2/4 btrfs: only commit delayed items at fsync if we are logging a directory 3/4 btrfs: stop incrementing log_batch for the log root tree when syncing log 4/4 btrfs: remove no longer needed use of log_writers for the log root tree After the entire patchset is applied I saw about a 12% decrease in the max latency reported by dbench. The test was done on a qemu vm, with 8 cores, 16Gb of ram, using kvm and using a raw NVMe device directly (no intermediary fs on the host). The test was invoked like the following: mkfs.btrfs -f /dev/sdk mount -o ssd -o nospace_cache /dev/sdk /mnt/sdk dbench -D /mnt/sdk -t 300 8 umount /mnt/sdk CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 44bbf8919883..7c325451d47f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5123,7 +5123,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, const loff_t end, struct btrfs_log_ctx *ctx) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct btrfs_path *dst_path; struct btrfs_key min_key; @@ -5166,15 +5165,17 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, max_key.offset = (u64)-1; /* - * Only run delayed items if we are a dir or a new file. + * Only run delayed items if we are a directory. We want to make sure + * all directory indexes hit the fs/subvolume tree so we can find them + * and figure out which index ranges have to be logged. + * * Otherwise commit the delayed inode only if the full sync flag is set, * as we want to make sure an up to date version is in the subvolume * tree so copy_inode_items_to_log() / copy_items() can find it and copy * it to the log tree. For a non full sync, we always log the inode item * based on the in-memory struct btrfs_inode which is always up to date. 
*/ - if (S_ISDIR(inode->vfs_inode.i_mode) || - inode->generation > fs_info->last_trans_committed) + if (S_ISDIR(inode->vfs_inode.i_mode)) ret = btrfs_commit_inode_delayed_items(trans, inode); else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) ret = btrfs_commit_inode_delayed_inode(inode); From 28a9579561bcb9082715e720eac93012e708ab94 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 2 Jul 2020 12:32:31 +0100 Subject: [PATCH 101/151] btrfs: stop incrementing log_batch for the log root tree when syncing log We are incrementing the log_batch atomic counter of the root log tree but we never use that counter, it's used only for the log trees of subvolume roots. We started doing it when we moved the log_batch and log_write counters from the global, per fs, btrfs_fs_info structure, into the btrfs_root structure in commit 7237f1833601dc ("Btrfs: fix tree logs parallel sync"). So just stop doing it for the log root tree and add a comment over the field declaration to inform that it's used only for log trees of subvolume roots. This patch is part of a series that has the following patches: 1/4 btrfs: only commit the delayed inode when doing a full fsync 2/4 btrfs: only commit delayed items at fsync if we are logging a directory 3/4 btrfs: stop incrementing log_batch for the log root tree when syncing log 4/4 btrfs: remove no longer needed use of log_writers for the log root tree After the entire patchset is applied I saw about a 12% decrease in the max latency reported by dbench. The test was done on a qemu vm, with 8 cores, 16Gb of ram, using kvm and using a raw NVMe device directly (no intermediary fs on the host). The test was invoked like the following: mkfs.btrfs -f /dev/sdk mount -o ssd -o nospace_cache /dev/sdk /mnt/sdk dbench -D /mnt/sdk -t 300 8 umount /mnt/sdk CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/tree-log.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2b0d95a4b0da..726e2de685ce 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1062,6 +1062,7 @@ struct btrfs_root { struct list_head log_ctxs[2]; atomic_t log_writers; atomic_t log_commit[2]; + /* Used only for log trees of subvolumes, not for the log root tree */ atomic_t log_batch; int log_transid; /* No matter the commit succeeds or not*/ diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7c325451d47f..ccc6b59f6729 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3116,7 +3116,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_init_log_ctx(&root_log_ctx, NULL); mutex_lock(&log_root_tree->log_mutex); - atomic_inc(&log_root_tree->log_batch); atomic_inc(&log_root_tree->log_writers); index2 = log_root_tree->log_transid % 2; From a93e01682e283f6de09d6ce8f805dc52a2e942fb Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 2 Jul 2020 12:32:40 +0100 Subject: [PATCH 102/151] btrfs: remove no longer needed use of log_writers for the log root tree When syncing the log, we used to update the log root tree without holding either the log_mutex of the subvolume root or the log_mutex of the log root tree. We used to have two critical sections delimited by the log_mutex of the log root tree, so in the first one we incremented the log_writers of the log root tree and in the second one we decremented it and waited for the log_writers counter to go down to zero. 
This was because the update of the log root tree happened between the two critical sections. The use of two critical sections allowed a little bit more parallelism and required the use of the log_writers counter, necessary to make sure we didn't miss any log root tree update when we have multiple tasks trying to sync the log in parallel. However after commit 06989c799f0481 ("Btrfs: fix race updating log root item during fsync") the log root tree update was moved into a critical section delimited by the subvolume's log_mutex. Later another commit moved the log tree update from that critical section into the second critical section delimited by the log_mutex of the log root tree. Both commits addressed different bugs. The end result is that the first critical section delimited by the log_mutex of the log root tree became pointless, since there's nothing done between it and the second critical section; we just have an unlock of the log_mutex followed by a lock operation. This means we can merge both critical sections, as the first one does almost nothing now, and we can stop using the log_writers counter of the log root tree, which was incremented in the first critical section and decremented in the second critical section, used to make sure no one in the second critical section started writeback of the log root tree before some other task updated it. So just remove the mutex_unlock() followed by mutex_lock() of the log root tree, as well as the use of the log_writers counter for the log root tree. This patch is part of a series that has the following patches: 1/4 btrfs: only commit the delayed inode when doing a full fsync 2/4 btrfs: only commit delayed items at fsync if we are logging a directory 3/4 btrfs: stop incrementing log_batch for the log root tree when syncing log 4/4 btrfs: remove no longer needed use of log_writers for the log root tree After the entire patchset is applied I saw about a 12% decrease in the max latency reported by dbench. The test was done on a qemu vm, with 8 cores, 16Gb of ram, using kvm and using a raw NVMe device directly (no intermediary fs on the host). 
The test was invoked like the following: mkfs.btrfs -f /dev/sdk mount -o ssd -o nospace_cache /dev/sdk /mnt/sdk dbench -D /mnt/sdk -t 300 8 umount /mnt/sdk CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/tree-log.c | 13 ------------- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 726e2de685ce..c083628e96e8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1060,6 +1060,7 @@ struct btrfs_root { wait_queue_head_t log_writer_wait; wait_queue_head_t log_commit_wait[2]; struct list_head log_ctxs[2]; + /* Used only for log trees of subvolumes, not for the log root tree */ atomic_t log_writers; atomic_t log_commit[2]; /* Used only for log trees of subvolumes, not for the log root tree */ diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ccc6b59f6729..aaa449153d9c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3116,28 +3116,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_init_log_ctx(&root_log_ctx, NULL); mutex_lock(&log_root_tree->log_mutex); - atomic_inc(&log_root_tree->log_writers); index2 = log_root_tree->log_transid % 2; list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); root_log_ctx.log_transid = log_root_tree->log_transid; - mutex_unlock(&log_root_tree->log_mutex); - - mutex_lock(&log_root_tree->log_mutex); /* * Now we are safe to update the log_root_tree because we're under the * log_mutex, and we're a current writer so we're holding the commit * open until we drop the log_mutex. */ ret = update_log_root(trans, log, &new_root_item); - - if (atomic_dec_and_test(&log_root_tree->log_writers)) { - /* atomic_dec_and_test implies a barrier */ - cond_wake_up_nomb(&log_root_tree->log_writer_wait); - } - if (ret) { if (!list_empty(&root_log_ctx.list)) list_del_init(&root_log_ctx.list); @@ -3183,8 +3172,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, root_log_ctx.log_transid - 1); } - wait_for_writer(log_root_tree); - /* * now that we've moved on to the tree of log tree roots, * check the full commit flag again From d801e7a3557ecc995bdfd6f142a36f0bb774c737 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 1 Jul 2020 17:02:34 +0200 Subject: [PATCH 103/151] btrfs: remove deprecated mount option alloc_start The mount option alloc_start has no effect since 0d0c71b31720 ("btrfs: obsolete and remove mount option alloc_start"), which has the details of why it was deprecated. We can remove it. 
Signed-off-by: David Sterba --- fs/btrfs/super.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index dac04e28643e..01bad704ff81 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -347,7 +347,6 @@ enum { Opt_nologreplay, /* Deprecated options */ - Opt_alloc_start, Opt_recovery, Opt_subvolrootid, @@ -420,7 +419,6 @@ static const match_table_t tokens = { {Opt_usebackuproot, "usebackuproot"}, /* Deprecated options */ - {Opt_alloc_start, "alloc_start=%s"}, {Opt_recovery, "recovery"}, {Opt_subvolrootid, "subvolrootid=%d"}, @@ -720,10 +718,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, goto out; } break; - case Opt_alloc_start: - btrfs_info(info, - "option alloc_start is obsolete, ignored"); - break; case Opt_acl: #ifdef CONFIG_BTRFS_FS_POSIX_ACL info->sb->s_flags |= SB_POSIXACL; From b90a4ab6ba9cce79a1ac06a250d5fc8c3dff382b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 1 Jul 2020 17:08:43 +0200 Subject: [PATCH 104/151] btrfs: remove deprecated mount option subvolrootid The option subvolrootid used to be a workaround for mounting subvolumes and has been ineffective since 5e2a4b25da23 ("btrfs: deprecate subvolrootid mount option"). We have subvol= that works and we don't need to keep the cruft, let's remove it. Signed-off-by: David Sterba --- fs/btrfs/super.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 01bad704ff81..7c052a542412 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -348,7 +348,6 @@ enum { /* Deprecated options */ Opt_recovery, - Opt_subvolrootid, /* Debugging options */ Opt_check_integrity, @@ -420,7 +419,6 @@ static const match_table_t tokens = { /* Deprecated options */ {Opt_recovery, "recovery"}, - {Opt_subvolrootid, "subvolrootid=%d"}, /* Debugging options */ {Opt_check_integrity, "check_int"}, @@ -534,7 +532,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_subvol: case Opt_subvol_empty: case Opt_subvolid: - case Opt_subvolrootid: case Opt_device: /* * These are parsed by btrfs_parse_subvol_options or @@ -1081,9 +1078,6 @@ static int btrfs_parse_subvol_options(const char *options, char **subvol_name, *subvol_objectid = subvolid; break; - case Opt_subvolrootid: - pr_warn("BTRFS: 'subvolrootid' mount option is deprecated and has no effect\n"); - break; default: break; } From 2279a270534c678e78f961fed8e09c9c69611ceb Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 2 Jul 2020 15:23:28 +0300 Subject: [PATCH 105/151] btrfs: make get_state_failrec return failrec directly The only failure get_state_failrec can return is that there is no failure record for the given address. There is no reason why the function should return a status code and use a separate parameter for returning the actual failure rec (if one is found). Simplify it by making the return type a pointer and returning an ERR_PTR value in case of errors. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.h | 3 +-- fs/btrfs/extent_io.c | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index b6561455b3c4..b6db2b1f9b06 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -233,8 +233,7 @@ bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, struct extent_state **cached_state); /* This should be reworked in the future and put elsewhere. 
*/ -int get_state_failrec(struct extent_io_tree *tree, u64 start, - struct io_failure_record **failrec); +struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start); int set_state_failrec(struct extent_io_tree *tree, u64 start, struct io_failure_record *failrec); void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index abd36d92c077..59ec53464746 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2122,12 +2122,11 @@ out: return ret; } -int get_state_failrec(struct extent_io_tree *tree, u64 start, - struct io_failure_record **failrec) +struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start) { struct rb_node *node; struct extent_state *state; - int ret = 0; + struct io_failure_record *failrec; spin_lock(&tree->lock); /* @@ -2136,18 +2135,19 @@ int get_state_failrec(struct extent_io_tree *tree, u64 start, */ node = tree_search(tree, start); if (!node) { - ret = -ENOENT; + failrec = ERR_PTR(-ENOENT); goto out; } state = rb_entry(node, struct extent_state, rb_node); if (state->start != start) { - ret = -ENOENT; + failrec = ERR_PTR(-ENOENT); goto out; } - *failrec = state->failrec; + + failrec = state->failrec; out: spin_unlock(&tree->lock); - return ret; + return failrec; } /* @@ -2377,8 +2377,8 @@ int clean_io_failure(struct btrfs_fs_info *fs_info, if (!ret) return 0; - ret = get_state_failrec(failure_tree, start, &failrec); - if (ret) + failrec = get_state_failrec(failure_tree, start); + if (IS_ERR(failrec)) return 0; BUG_ON(!failrec->this_mirror); @@ -2462,8 +2462,8 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, int ret; u64 logical; - ret = get_state_failrec(failure_tree, start, &failrec); - if (ret) { + failrec = get_state_failrec(failure_tree, start); + if (IS_ERR(failrec)) { failrec = kzalloc(sizeof(*failrec), GFP_NOFS); if (!failrec) return -ENOMEM; From 3526302f26162ae738926d44eb39e2e0780b11bc Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 2 Jul 2020 15:23:29 +0300 Subject: [PATCH 106/151] btrfs: streamline btrfs_get_io_failure_record logic Make the function directly return a pointer to a failure record and adjust callers to handle it. Also refactor the logic inside so that the case which allocates the failure record for the first time is not handled in an 'if' arm, saving us a level of indentation. Finally make the function static as it's not used outside of extent_io.c . 
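The shape of the refactor, sketched with hypothetical names (get_or_alloc_foo/lookup_foo are placeholders, not btrfs functions): the "found" case returns early, so the allocation path no longer sits inside an 'else' arm and loses a level of indentation.

        static struct foo *get_or_alloc_foo(u64 key)
        {
                struct foo *f = lookup_foo(key);   /* returns ERR_PTR(-ENOENT) on miss */

                if (!IS_ERR(f))
                        return f;                  /* found: done, no 'else' needed */

                f = kzalloc(sizeof(*f), GFP_NOFS); /* allocation path, unindented */
                if (!f)
                        return ERR_PTR(-ENOMEM);
                /* ... initialize and insert f ... */
                return f;
        }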
Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.h | 2 - fs/btrfs/extent_io.c | 129 +++++++++++++++++++------------------- 2 files changed, 63 insertions(+), 68 deletions(-) diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index b6db2b1f9b06..f39d47a2d01a 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -238,8 +238,6 @@ int set_state_failrec(struct extent_io_tree *tree, u64 start, struct io_failure_record *failrec); void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end); -int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, - struct io_failure_record **failrec_ret); int free_io_failure(struct extent_io_tree *failure_tree, struct extent_io_tree *io_tree, struct io_failure_record *rec); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 59ec53464746..e37950a576fc 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2450,8 +2450,8 @@ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) spin_unlock(&failure_tree->lock); } -int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, - struct io_failure_record **failrec_ret) +static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, + u64 start, u64 end) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct io_failure_record *failrec; @@ -2463,64 +2463,7 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, u64 logical; failrec = get_state_failrec(failure_tree, start); - if (IS_ERR(failrec)) { - failrec = kzalloc(sizeof(*failrec), GFP_NOFS); - if (!failrec) - return -ENOMEM; - - failrec->start = start; - failrec->len = end - start + 1; - failrec->this_mirror = 0; - failrec->bio_flags = 0; - failrec->in_validation = 0; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, failrec->len); - if (!em) { - read_unlock(&em_tree->lock); - kfree(failrec); - return -EIO; - } - - if (em->start > start || em->start + em->len <= start) { - free_extent_map(em); - em = NULL; - } - read_unlock(&em_tree->lock); - if (!em) { - kfree(failrec); - return -EIO; - } - - logical = start - em->start; - logical = em->block_start + logical; - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - logical = em->block_start; - failrec->bio_flags = EXTENT_BIO_COMPRESSED; - extent_set_compress_type(&failrec->bio_flags, - em->compress_type); - } - - btrfs_debug(fs_info, - "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu", - logical, start, failrec->len); - - failrec->logical = logical; - free_extent_map(em); - - /* set the bits in the private failure tree */ - ret = set_extent_bits(failure_tree, start, end, - EXTENT_LOCKED | EXTENT_DIRTY); - if (ret >= 0) - ret = set_state_failrec(failure_tree, start, failrec); - /* set the bits in the inode's tree */ - if (ret >= 0) - ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED); - if (ret < 0) { - kfree(failrec); - return ret; - } - } else { + if (!IS_ERR(failrec)) { btrfs_debug(fs_info, "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d", failrec->logical, failrec->start, failrec->len, @@ -2530,11 +2473,66 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, * (e.g. with a list for failed_mirror) to make * clean_io_failure() clean all those errors at once. 
*/ + + return failrec; } - *failrec_ret = failrec; + failrec = kzalloc(sizeof(*failrec), GFP_NOFS); + if (!failrec) + return ERR_PTR(-ENOMEM); - return 0; + failrec->start = start; + failrec->len = end - start + 1; + failrec->this_mirror = 0; + failrec->bio_flags = 0; + failrec->in_validation = 0; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, failrec->len); + if (!em) { + read_unlock(&em_tree->lock); + kfree(failrec); + return ERR_PTR(-EIO); + } + + if (em->start > start || em->start + em->len <= start) { + free_extent_map(em); + em = NULL; + } + read_unlock(&em_tree->lock); + if (!em) { + kfree(failrec); + return ERR_PTR(-EIO); + } + + logical = start - em->start; + logical = em->block_start + logical; + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + logical = em->block_start; + failrec->bio_flags = EXTENT_BIO_COMPRESSED; + extent_set_compress_type(&failrec->bio_flags, em->compress_type); + } + + btrfs_debug(fs_info, + "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu", + logical, start, failrec->len); + + failrec->logical = logical; + free_extent_map(em); + + /* Set the bits in the private failure tree */ + ret = set_extent_bits(failure_tree, start, end, + EXTENT_LOCKED | EXTENT_DIRTY); + if (ret >= 0) { + ret = set_state_failrec(failure_tree, start, failrec); + /* Set the bits in the inode's tree */ + ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED); + } else if (ret < 0) { + kfree(failrec); + return ERR_PTR(ret); + } + + return failrec; } static bool btrfs_check_repairable(struct inode *inode, bool needs_validation, @@ -2659,16 +2657,15 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode, struct bio *repair_bio; struct btrfs_io_bio *repair_io_bio; blk_status_t status; - int ret; btrfs_debug(fs_info, "repair read error: read error at %llu", start); BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); - ret = btrfs_get_io_failure_record(inode, start, end, &failrec); - if (ret) - return errno_to_blk_status(ret); + failrec = btrfs_get_io_failure_record(inode, start, end); + if (IS_ERR(failrec)) + return errno_to_blk_status(PTR_ERR(failrec)); need_validation = btrfs_io_needs_validation(inode, failed_bio); From c31efbdf23a09468c9352f05443d5624dbbfd3c7 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 3 Jul 2020 11:14:27 +0300 Subject: [PATCH 107/151] btrfs: record btrfs_device directly in btrfs_io_bio Instead of recording stripe_index and using that to access correct btrfs_device from btrfs_bio::stripes record the btrfs_device in btrfs_io_bio. This will enable endio handlers to increment device error counters on checksum errors. 
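A sketch of the kind of consumer this enables (illustrative handler name; btrfs_io_bio() and the device stat helpers are the real ones used elsewhere in this series): an endio handler can reach the device straight from the btrfs_io_bio instead of indexing bbio->stripes.

        static void example_read_end_io(struct bio *bio)
        {
                struct btrfs_device *dev = btrfs_io_bio(bio)->device;

                /* no more bbio->stripes[stripe_index] indirection */
                if (bio->bi_status)
                        btrfs_dev_stat_inc_and_print(dev,
                                        BTRFS_DEV_STAT_READ_ERRS);
        }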
Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 1 + fs/btrfs/volumes.c | 14 ++++---------- fs/btrfs/volumes.h | 2 +- 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index c870ef70f817..570f6bf51c6e 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1117,6 +1117,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, /* put a new bio on the list */ bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); + btrfs_io_bio(bio)->device = stripe->dev; bio->bi_iter.bi_size = 0; bio_set_dev(bio, stripe->dev->bdev); bio->bi_iter.bi_sector = disk_start >> 9; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 62ae89b078f4..fb3502166e29 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6261,12 +6261,8 @@ static void btrfs_end_bio(struct bio *bio) atomic_inc(&bbio->error); if (bio->bi_status == BLK_STS_IOERR || bio->bi_status == BLK_STS_TARGET) { - unsigned int stripe_index = - btrfs_io_bio(bio)->stripe_index; - struct btrfs_device *dev; + struct btrfs_device *dev = btrfs_io_bio(bio)->device; - BUG_ON(stripe_index >= bbio->num_stripes); - dev = bbio->stripes[stripe_index].dev; if (dev->bdev) { if (bio_op(bio) == REQ_OP_WRITE) btrfs_dev_stat_inc_and_print(dev, @@ -6313,13 +6309,12 @@ static void btrfs_end_bio(struct bio *bio) } static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, - u64 physical, int dev_nr) + u64 physical, struct btrfs_device *dev) { - struct btrfs_device *dev = bbio->stripes[dev_nr].dev; struct btrfs_fs_info *fs_info = bbio->fs_info; bio->bi_private = bbio; - btrfs_io_bio(bio)->stripe_index = dev_nr; + btrfs_io_bio(bio)->device = dev; bio->bi_end_io = btrfs_end_bio; bio->bi_iter.bi_sector = physical >> 9; btrfs_debug_in_rcu(fs_info, @@ -6420,8 +6415,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, else bio = first_bio; - submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, - dev_nr); + submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev); } btrfs_bio_counter_dec(fs_info); return BLK_STS_OK; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 75af2334b2e3..5eea93916fbf 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -288,7 +288,7 @@ struct btrfs_fs_devices { */ struct btrfs_io_bio { unsigned int mirror_num; - unsigned int stripe_index; + struct btrfs_device *device; u64 logical; u8 *csum; u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; From 3eee86c8fd9a9cdeba7afa943ae00bd166e2899a Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 2 Jul 2020 15:23:31 +0300 Subject: [PATCH 108/151] btrfs: don't check for btrfs_device::bdev in btrfs_end_bio btrfs_map_bio ensures that all submitted bios to devices have valid btrfs_device::bdev so this check can be removed from btrfs_end_bio. This check was added in june 2012 597a60fadedf ("Btrfs: don't count I/O statistic read errors for missing devices") but then in October of the same year another commit de1ee92ac3bc ("Btrfs: recheck bio against block device when we map the bio") started checking for the presence of btrfs_device::bdev before actually issuing the bio. 
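The invariant that makes the check removable can be sketched like this (condensed from the btrfs_map_bio submit loop of this era, not verbatim): bios targeting a missing device are completed with an error right there and are never submitted, so everything reaching btrfs_end_bio has a valid bdev.

        for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
                dev = bbio->stripes[dev_nr].dev;
                if (!dev || !dev->bdev ||
                    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
                        bbio_error(bbio, first_bio, logical); /* ended here */
                        continue;
                }
                submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
        }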
Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index fb3502166e29..98f75f795ff2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6263,17 +6263,16 @@ static void btrfs_end_bio(struct bio *bio) bio->bi_status == BLK_STS_TARGET) { struct btrfs_device *dev = btrfs_io_bio(bio)->device; - if (dev->bdev) { - if (bio_op(bio) == REQ_OP_WRITE) - btrfs_dev_stat_inc_and_print(dev, + ASSERT(dev->bdev); + if (bio_op(bio) == REQ_OP_WRITE) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); - else if (!(bio->bi_opf & REQ_RAHEAD)) - btrfs_dev_stat_inc_and_print(dev, + else if (!(bio->bi_opf & REQ_RAHEAD)) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); - if (bio->bi_opf & REQ_PREFLUSH) - btrfs_dev_stat_inc_and_print(dev, + if (bio->bi_opf & REQ_PREFLUSH) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); - } } } From 814723e0a55a9576b1d17f4fa8811086a24dd3e8 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 2 Jul 2020 15:23:32 +0300 Subject: [PATCH 109/151] btrfs: increment device corruption error in case of checksum error Now that btrfs_io_bio has access to btrfs_device we can safely increment the device corruption counter on error. There is one notable exception - repair bios for raid. Since those don't go through the normal submit_stripe_bio call path but through raid56_parity_recover, repair bios won't have their device set. Scrub increments the corruption counter for checksum mismatch as well but does not call this function. Link: https://lore.kernel.org/linux-btrfs/4857863.FCrPRfMyHP@liv/ Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f066cad2d039..0fa4f7007ff9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2815,6 +2815,9 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, zeroit: btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, io_bio->mirror_num); + if (io_bio->device) + btrfs_dev_stat_inc_and_print(io_bio->device, + BTRFS_DEV_STAT_CORRUPTION_ERRS); memset(kaddr + pgoff, 1, len); flush_dcache_page(page); kunmap_atomic(kaddr); From 26056eab4bf7b7b1a690c0242bd1cc21e050349a Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 2 Jul 2020 15:23:33 +0300 Subject: [PATCH 110/151] btrfs: remove needless ASSERT check of orig_bio in end_compressed_bio_read compressed_bio::orig_bio is always set in btrfs_submit_compressed_read before any bio submission is performed. Since that function is always called with a valid bio it renders the ASSERT unnecessary. Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c2d5ca583dbf..db80c3fa6c08 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -244,7 +244,6 @@ static void end_compressed_bio_read(struct bio *bio) * Record the correct mirror_num in cb->orig_bio so that * read-repair can work properly.
*/ - ASSERT(btrfs_io_bio(cb->orig_bio)); btrfs_io_bio(cb->orig_bio)->mirror_num = mirror; cb->mirror_num = mirror; From 5a9472fe7ffecf7390ddcb379fc4d65d392e2373 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 2 Jul 2020 15:23:34 +0300 Subject: [PATCH 111/151] btrfs: increment corrupt device counter during compressed read If a compressed read fails due to a checksum error, only a line is printed to dmesg; the device corruption counter is not modified. Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index db80c3fa6c08..7b7aa7b2f872 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -172,8 +172,7 @@ static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, (DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * csum_size; } -static int check_compressed_csum(struct btrfs_inode *inode, - struct compressed_bio *cb, +static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, u64 disk_start) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -184,6 +183,7 @@ static int check_compressed_csum(struct btrfs_inode *inode, unsigned long i; char *kaddr; u8 csum[BTRFS_CSUM_SIZE]; + struct compressed_bio *cb = bio->bi_private; u8 *cb_sum = cb->sums; if (inode->flags & BTRFS_INODE_NODATASUM) @@ -201,6 +201,10 @@ static int check_compressed_csum(struct btrfs_inode *inode, if (memcmp(&csum, cb_sum, csum_size)) { btrfs_print_data_csum_error(inode, disk_start, csum, cb_sum, cb->mirror_num); + if (btrfs_io_bio(bio)->device) + btrfs_dev_stat_inc_and_print( + btrfs_io_bio(bio)->device, + BTRFS_DEV_STAT_CORRUPTION_ERRS); ret = -EIO; goto fail; } @@ -255,7 +259,7 @@ static void end_compressed_bio_read(struct bio *bio) goto csum_failed; inode = cb->inode; - ret = check_compressed_csum(BTRFS_I(inode), cb, + ret = check_compressed_csum(BTRFS_I(inode), bio, (u64)bio->bi_iter.bi_sector << 9); if (ret) goto csum_failed; From 3092c68fc58c7bd54a6dde8d42ef05beb41890f5 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 3 Jul 2020 11:13:15 +0300 Subject: [PATCH 112/151] btrfs: sysfs: add bdi link to the fsid directory Since BTRFS uses a private bdi it makes sense to create a link to this bdi under /sys/fs/btrfs/<UUID>/bdi. This allows the size of read ahead to be controlled. Without this patch it's not possible to uniquely identify which bdi pertains to which btrfs filesystem in the case of multiple btrfs filesystems. It's fine to simply call sysfs_remove_link without checking if the link indeed has been created. The call path sysfs_remove_link kernfs_remove_by_name kernfs_remove_by_name_ns will simply return -ENOENT in case it doesn't exist.
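A minimal userspace sketch of what the link enables (the <FSID> placeholder must be substituted with the real filesystem UUID; read_ahead_kb is the standard bdi attribute):

        #include <stdio.h>

        int main(void)
        {
                /* hypothetical path; replace <FSID> with the actual fsid */
                FILE *f = fopen("/sys/fs/btrfs/<FSID>/bdi/read_ahead_kb", "w");

                if (!f)
                        return 1;
                fputs("4096\n", f);     /* raise readahead to 4 MiB */
                return fclose(f) ? 1 : 0;
        }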
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 5885abe57c3e..38c0b95e0e7f 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -937,8 +937,12 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) { + struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj; + btrfs_reset_fs_info_ptr(fs_info); + sysfs_remove_link(fsid_kobj, "bdi"); + if (fs_info->space_info_kobj) { sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs); kobject_del(fs_info->space_info_kobj); @@ -958,8 +962,8 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) } #endif addrm_unknown_feature_attrs(fs_info, false); - sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group); - sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs); + sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group); + sysfs_remove_files(fsid_kobj, btrfs_attrs); btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, NULL); } @@ -1439,6 +1443,10 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) if (error) goto failure; + error = sysfs_create_link(fsid_kobj, &fs_info->sb->s_bdi->dev->kobj, "bdi"); + if (error) + goto failure; + fs_info->space_info_kobj = kobject_create_and_add("allocation", fsid_kobj); if (!fs_info->space_info_kobj) { From 608769a4e41cceca6908f1807ebe95e0a07a21d3 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 2 Jul 2020 16:46:41 +0300 Subject: [PATCH 113/151] btrfs: always initialize btrfs_bio::tgtdev_map/raid_map pointers Since btrfs_bio always contains the extra space for the tgtdev_map and raid_map arrays it's pointless to make the assignment only when specific conditions are met. Instead, always assign the pointers to their correct value at allocation time. To accommodate this change also move code a bit in __btrfs_map_block so that the btrfs_bio::stripes array is always initialized before the raid_map, subsequently move the call to sort_parity_stripes into the 'if' building the raid_map, retaining the old behavior. To better understand the change, there are 2 aspects to this: 1. The original code is harder to grasp because the calculations for initializing the raid_map/tgtdev pointers are apart from the initial allocation of memory. Having them predicated on 2 separate checks doesn't help either, so moving the initialisation into alloc_btrfs_bio puts everything together. 2. tgtdev/raid_maps are now always initialized even though sometimes they might be equal, i.e. __btrfs_map_block_for_discard calls alloc_btrfs_bio with tgtdev = 0, but their usage should be predicated on external checks, i.e. just because those pointers are non-null doesn't mean they are valid per se. And actually while taking another look at __btrfs_map_block I saw a discrepancy: the original code initialised tgtdev_map if the following check is true: if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) However, further down tgtdev_map is only used if the following check is true: if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && need_full_stripe(op)) i.e. the additional need_full_stripe(op) predicate is there.
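The single allocation the two pointers now always index into can be pictured like this (a sketch of the layout set up in alloc_btrfs_bio; the assignments are the ones added by this patch):

        /*
         * kzalloc'd as one block in alloc_btrfs_bio:
         *
         *   +------------------+--------------------------------+
         *   | struct btrfs_bio | stripes[total_stripes]         |
         *   +------------------+--------------------------------+
         *   | tgtdev_map[real_stripes] (int)                    |
         *   +---------------------------------------------------+
         *   | raid_map[real_stripes] (u64)                      |
         *   +---------------------------------------------------+
         */
        bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
        bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);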
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba [ copy more details from mail discussion ] Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 98f75f795ff2..19213347ec70 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5522,6 +5522,9 @@ static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) atomic_set(&bbio->error, 0); refcount_set(&bbio->refs, 1); + bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes); + bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes); + return bbio; } @@ -6144,8 +6147,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, ret = -ENOMEM; goto out; } - if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) - bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); + + for (i = 0; i < num_stripes; i++) { + bbio->stripes[i].physical = map->stripes[stripe_index].physical + + stripe_offset + stripe_nr * map->stripe_len; + bbio->stripes[i].dev = map->stripes[stripe_index].dev; + stripe_index++; + } /* build raid_map */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && @@ -6153,11 +6161,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, u64 tmp; unsigned rot; - bbio->raid_map = (u64 *)((void *)bbio->stripes + - sizeof(struct btrfs_bio_stripe) * - num_alloc_stripes + - sizeof(int) * tgtdev_indexes); - /* Work out the disk rotation on this stripe-set */ div_u64_rem(stripe_nr, num_stripes, &rot); @@ -6171,25 +6174,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (map->type & BTRFS_BLOCK_GROUP_RAID6) bbio->raid_map[(i+rot+1) % num_stripes] = RAID6_Q_STRIPE; - } - - for (i = 0; i < num_stripes; i++) { - bbio->stripes[i].physical = - map->stripes[stripe_index].physical + - stripe_offset + - stripe_nr * map->stripe_len; - bbio->stripes[i].dev = - map->stripes[stripe_index].dev; - stripe_index++; + sort_parity_stripes(bbio, num_stripes); } if (need_full_stripe(op)) max_errors = btrfs_chunk_max_errors(map); - if (bbio->raid_map) - sort_parity_stripes(bbio, num_stripes); - if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && need_full_stripe(op)) { handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes, From f90ae76a5c2c746c06f11a1709e9149ab2666ea4 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 2 Jul 2020 16:46:42 +0300 Subject: [PATCH 114/151] btrfs: raid56: remove redundant device check in rbio_add_io_page The merging logic is always executed if the current stripe's device is not missing. So there's no point in duplicating the check. Simply remove it, while at it reduce the scope of the 'last_end' variable. If the current stripe's device is missing we fail the stripe early on. 
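For context, the early failure referenced above sits at the top of rbio_add_io_page (slightly condensed from the kernel of this era), which is why the merge test below no longer needs its own bdev check:

        stripe = &rbio->bbio->stripes[stripe_nr];
        disk_start = stripe->physical + (page_index << PAGE_SHIFT);

        /* if the device is missing, just fail this stripe */
        if (!stripe->dev->bdev)
                return fail_rbio_index(rbio, stripe_nr);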
Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 570f6bf51c6e..e493596f1c0d 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1083,7 +1083,6 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, unsigned long bio_max_len) { struct bio *last = bio_list->tail; - u64 last_end = 0; int ret; struct bio *bio; struct btrfs_bio_stripe *stripe; @@ -1098,15 +1097,14 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, /* see if we can add this page onto our existing bio */ if (last) { - last_end = (u64)last->bi_iter.bi_sector << 9; + u64 last_end = (u64)last->bi_iter.bi_sector << 9; last_end += last->bi_iter.bi_size; /* * we can't merge these if they are from different * devices or if they are not contiguous */ - if (last_end == disk_start && stripe->dev->bdev && - !last->bi_status && + if (last_end == disk_start && !last->bi_status && last->bi_disk == stripe->dev->bdev->bd_disk && last->bi_partno == stripe->dev->bdev->bd_partno) { ret = bio_add_page(last, page, PAGE_SIZE, 0); From bf28a605e626a7c0169584f22f5dc96ce92ae813 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 2 Jul 2020 16:46:43 +0300 Subject: [PATCH 115/151] btrfs: raid56: assign bio in while() when using bio_list_pop Unify the style in the file such that return value of bio_list_pop is assigned directly in the while loop. This is in line with the rest of the kernel. Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 30 +++++------------------------- 1 file changed, 5 insertions(+), 25 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index e493596f1c0d..d91cca43989a 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1324,11 +1324,7 @@ write_data: atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); BUG_ON(atomic_read(&rbio->stripes_pending) == 0); - while (1) { - bio = bio_list_pop(&bio_list); - if (!bio) - break; - + while ((bio = bio_list_pop(&bio_list))) { bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; bio->bi_opf = REQ_OP_WRITE; @@ -1566,11 +1562,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) * not to touch it after that */ atomic_set(&rbio->stripes_pending, bios_to_read); - while (1) { - bio = bio_list_pop(&bio_list); - if (!bio) - break; - + while ((bio = bio_list_pop(&bio_list))) { bio->bi_private = rbio; bio->bi_end_io = raid_rmw_end_io; bio->bi_opf = REQ_OP_READ; @@ -2112,11 +2104,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) * not to touch it after that */ atomic_set(&rbio->stripes_pending, bios_to_read); - while (1) { - bio = bio_list_pop(&bio_list); - if (!bio) - break; - + while ((bio = bio_list_pop(&bio_list))) { bio->bi_private = rbio; bio->bi_end_io = raid_recover_end_io; bio->bi_opf = REQ_OP_READ; @@ -2481,11 +2469,7 @@ submit_write: atomic_set(&rbio->stripes_pending, nr_data); - while (1) { - bio = bio_list_pop(&bio_list); - if (!bio) - break; - + while ((bio = bio_list_pop(&bio_list))) { bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; bio->bi_opf = REQ_OP_WRITE; @@ -2663,11 +2647,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) * not to touch it after that */ atomic_set(&rbio->stripes_pending, bios_to_read); - while (1) { - bio = bio_list_pop(&bio_list); - if (!bio) - break; - + 
while ((bio = bio_list_pop(&bio_list))) { bio->bi_private = rbio; bio->bi_end_io = raid56_parity_scrub_end_io; bio->bi_opf = REQ_OP_READ; From 830258632757cad2ffb5cbc86f266ca475caf4af Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 2 Jul 2020 16:46:45 +0300 Subject: [PATCH 116/151] btrfs: raid56: use in_range where applicable While at it use the opportunity to simplify find_logical_bio_stripe by reducing the scope of 'stripe_start' variable and squash the sector-to-bytes conversion on one line. Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index d91cca43989a..c09b1c228372 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1349,7 +1349,6 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio) { u64 physical = bio->bi_iter.bi_sector; - u64 stripe_start; int i; struct btrfs_bio_stripe *stripe; @@ -1357,9 +1356,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, for (i = 0; i < rbio->bbio->num_stripes; i++) { stripe = &rbio->bbio->stripes[i]; - stripe_start = stripe->physical; - if (physical >= stripe_start && - physical < stripe_start + rbio->stripe_len && + if (in_range(physical, stripe->physical, rbio->stripe_len) && stripe->dev->bdev && bio->bi_disk == stripe->dev->bdev->bd_disk && bio->bi_partno == stripe->dev->bdev->bd_partno) { @@ -1377,18 +1374,14 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio) { - u64 logical = bio->bi_iter.bi_sector; - u64 stripe_start; + u64 logical = (u64)bio->bi_iter.bi_sector << 9; int i; - logical <<= 9; - for (i = 0; i < rbio->nr_data; i++) { - stripe_start = rbio->bbio->raid_map[i]; - if (logical >= stripe_start && - logical < stripe_start + rbio->stripe_len) { + u64 stripe_start = rbio->bbio->raid_map[i]; + + if (in_range(logical, stripe_start, rbio->stripe_len)) return i; - } } return -1; } From b7d2083a368332b40309c16bb2e7d48ecc920489 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 2 Jul 2020 16:46:46 +0300 Subject: [PATCH 117/151] btrfs: raid56: don't opencode swap() in __raid_recover_end_io Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index c09b1c228372..61ff77392357 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1862,11 +1862,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) } /* make sure our ps and qs are in order */ - if (faila > failb) { - int tmp = failb; - failb = faila; - faila = tmp; - } + if (faila > failb) + swap(faila, failb); /* if the q stripe is failed, do a pstripe reconstruction * from the xors. 
From 93c4c033ec66d1de90d03bc11b7b338a01188954 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 2 Jul 2020 16:46:47 +0300 Subject: [PATCH 118/151] btrfs: remove fail label in check_compressed_csum Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 7b7aa7b2f872..1ab56a734e70 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -178,7 +178,6 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, struct btrfs_fs_info *fs_info = inode->root->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); - int ret; struct page *page; unsigned long i; char *kaddr; @@ -205,15 +204,11 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, btrfs_dev_stat_inc_and_print( btrfs_io_bio(bio)->device, BTRFS_DEV_STAT_CORRUPTION_ERRS); - ret = -EIO; - goto fail; + return -EIO; } cb_sum += csum_size; - } - ret = 0; -fail: - return ret; + return 0; } /* when we finish reading compressed pages from the disk, we From 60f8667b618e1cb2ded148a7b767bbb70d6c191c Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Mon, 6 Jul 2020 11:59:36 -0300 Subject: [PATCH 119/151] btrfs: add multi-statement protection to btrfs_set/clear_and_info macros Multi-statement macros should be enclosed in do/while(0) block to make their use safe in single statement if conditions. All current uses of the macros are safe, so this change is for future protection. Reviewed-by: Anand Jain Signed-off-by: Marcos Paulo de Souza Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c083628e96e8..7a4ce885bd21 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1279,18 +1279,18 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) BTRFS_MOUNT_##opt) #define btrfs_set_and_info(fs_info, opt, fmt, args...) \ -{ \ +do { \ if (!btrfs_test_opt(fs_info, opt)) \ btrfs_info(fs_info, fmt, ##args); \ btrfs_set_opt(fs_info->mount_opt, opt); \ -} +} while (0) #define btrfs_clear_and_info(fs_info, opt, fmt, args...) \ -{ \ +do { \ if (btrfs_test_opt(fs_info, opt)) \ btrfs_info(fs_info, fmt, ##args); \ btrfs_clear_opt(fs_info->mount_opt, opt); \ -} +} while (0) /* * Requests for changes that need to be done during transaction commit. From 48aaeebe4e1f3c76a703dc40e9245af1753dcefe Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 6 Jul 2020 09:14:11 -0400 Subject: [PATCH 120/151] btrfs: convert block group refcount to refcount_t We have refcount_t now with the associated library to handle refcounts, which gives us extra debugging around reference count mistakes that may be made. For example it'll warn on any transition from 0->1 or 0->-1, which is handy for noticing cases where we've messed up reference counting. Convert the block group ref counting from an atomic_t to refcount_t and use the appropriate helpers. 
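A sketch of the semantics gained by the conversion: refcount_t saturates instead of wrapping and warns on suspicious transitions, such as incrementing from zero, which atomic_t would silently allow (the free call stands in for the real body of btrfs_put_block_group):

        refcount_set(&cache->refs, 1);          /* at block group creation */
        refcount_inc(&cache->refs);             /* WARNs once if refs was 0 */
        if (refcount_dec_and_test(&cache->refs))
                kfree(cache);                   /* last reference dropped */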
Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 8 ++++---- fs/btrfs/block-group.h | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 6ade9d345e66..884de28a41e4 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -118,12 +118,12 @@ u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) void btrfs_get_block_group(struct btrfs_block_group *cache) { - atomic_inc(&cache->count); + refcount_inc(&cache->refs); } void btrfs_put_block_group(struct btrfs_block_group *cache) { - if (atomic_dec_and_test(&cache->count)) { + if (refcount_dec_and_test(&cache->refs)) { WARN_ON(cache->pinned > 0); WARN_ON(cache->reserved > 0); @@ -1805,7 +1805,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED; - atomic_set(&cache->count, 1); + refcount_set(&cache->refs, 1); spin_lock_init(&cache->lock); init_rwsem(&cache->data_rwsem); INIT_LIST_HEAD(&cache->list); @@ -3380,7 +3380,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) ASSERT(list_empty(&block_group->dirty_list)); ASSERT(list_empty(&block_group->io_list)); ASSERT(list_empty(&block_group->bg_list)); - ASSERT(atomic_read(&block_group->count) == 1); + ASSERT(refcount_read(&block_group->refs) == 1); btrfs_put_block_group(block_group); spin_lock(&info->block_group_cache_lock); diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index b6ee70a039c7..adfd7583a17b 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -114,8 +114,7 @@ struct btrfs_block_group { /* For block groups in the same raid type */ struct list_head list; - /* Usage count */ - atomic_t count; + refcount_t refs; /* * List of struct btrfs_free_clusters for this block group. From 263da812e87bac4098a4778efaa32c54275641db Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 8 Jul 2020 14:24:45 +0800 Subject: [PATCH 121/151] btrfs: qgroup: allow to unreserve range without releasing other ranges [PROBLEM] Before this patch, when btrfs_qgroup_reserve_data() fails, we free all reserved space of the changeset. For example: ret = btrfs_qgroup_reserve_data(inode, changeset, 0, SZ_1M); ret = btrfs_qgroup_reserve_data(inode, changeset, SZ_1M, SZ_1M); ret = btrfs_qgroup_reserve_data(inode, changeset, SZ_2M, SZ_1M); If the last btrfs_qgroup_reserve_data() failed, it will release the entire [0, 3M) range. This behavior is kind of OK for now, as when we hit -EDQUOT, we normally go error handling and need to release all reserved ranges anyway. But this also means the following call is not possible: ret = btrfs_qgroup_reserve_data(); if (ret == -EDQUOT) { /* Do something to free some qgroup space */ ret = btrfs_qgroup_reserve_data(); } As if the first btrfs_qgroup_reserve_data() fails, it will free all reserved qgroup space. [CAUSE] This is because we release all reserved ranges when btrfs_qgroup_reserve_data() fails. [FIX] This patch will implement a new function, qgroup_unreserve_range(), to iterate through the ulist nodes, to find any nodes in the failure range, and remove the EXTENT_QGROUP_RESERVED bits from the io_tree, and decrease the extent_changeset::bytes_changed, so that we can revert to previous state. This allows later patches to retry btrfs_qgroup_reserve_data() if EDQUOT happens. 
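Spelled out, the caller pattern this enables is (from the changelog above, with the error handling sketched in):

        ret = btrfs_qgroup_reserve_data(inode, &reserved, start, len);
        if (ret == -EDQUOT) {
                /*
                 * Free some qgroup space (flush, commit, ...), then retry.
                 * Only [start, start + len) was unreserved by the failure,
                 * earlier reservations in @reserved are untouched.
                 */
                ret = btrfs_qgroup_reserve_data(inode, &reserved, start, len);
        }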
Suggested-by: Josef Bacik Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 92 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 77 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index ee0ad33b659c..054f36df60cf 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3447,6 +3447,73 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) } } +#define rbtree_iterate_from_safe(node, next, start) \ + for (node = start; node && ({ next = rb_next(node); 1;}); node = next) + +static int qgroup_unreserve_range(struct btrfs_inode *inode, + struct extent_changeset *reserved, u64 start, + u64 len) +{ + struct rb_node *node; + struct rb_node *next; + struct ulist_node *entry = NULL; + int ret = 0; + + node = reserved->range_changed.root.rb_node; + while (node) { + entry = rb_entry(node, struct ulist_node, rb_node); + if (entry->val < start) + node = node->rb_right; + else if (entry->val > start) + node = node->rb_left; + else + break; + } + + /* Empty changeset */ + if (!entry) + return 0; + + if (entry->val > start && rb_prev(&entry->rb_node)) + entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node, + rb_node); + + rbtree_iterate_from_safe(node, next, &entry->rb_node) { + u64 entry_start; + u64 entry_end; + u64 entry_len; + int clear_ret; + + entry = rb_entry(node, struct ulist_node, rb_node); + entry_start = entry->val; + entry_end = entry->aux; + entry_len = entry_end - entry_start + 1; + + if (entry_start >= start + len) + break; + if (entry_start + entry_len <= start) + continue; + /* + * Now the entry is in [start, start + len), revert the + * EXTENT_QGROUP_RESERVED bit. + */ + clear_ret = clear_extent_bits(&inode->io_tree, entry_start, + entry_end, EXTENT_QGROUP_RESERVED); + if (!ret && clear_ret < 0) + ret = clear_ret; + + ulist_del(&reserved->range_changed, entry->val, entry->aux); + if (likely(reserved->bytes_changed >= entry_len)) { + reserved->bytes_changed -= entry_len; + } else { + WARN_ON(1); + reserved->bytes_changed = 0; + } + } + + return ret; +} + /* * Reserve qgroup space for range [start, start + len). * * This function will either reserve space from related qgroups or doing * nothing if the range is already reserved. * * Return 0 for successful reserve * Return <0 for error (including -EQUOT) * * NOTE: this function may sleep for memory allocation. - * if btrfs_qgroup_reserve_data() is called multiple times with - * same @reserved, caller must ensure when error happens it's OK - * to free *ALL* reserved space.
*/ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, struct extent_changeset **reserved_ret, u64 start, u64 len) { struct btrfs_root *root = inode->root; - struct ulist_node *unode; - struct ulist_iterator uiter; struct extent_changeset *reserved; + bool new_reserved = false; u64 orig_reserved; u64 to_reserve; int ret; @@ -3481,6 +3544,7 @@ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, if (WARN_ON(!reserved_ret)) return -EINVAL; if (!*reserved_ret) { + new_reserved = true; *reserved_ret = extent_changeset_alloc(); if (!*reserved_ret) return -ENOMEM; @@ -3496,7 +3560,7 @@ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len, to_reserve, QGROUP_RESERVE); if (ret < 0) - goto cleanup; + goto out; ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA); if (ret < 0) goto cleanup; @@ -3504,15 +3568,13 @@ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, return ret; cleanup: - /* cleanup *ALL* already reserved ranges */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(&reserved->range_changed, &uiter))) - clear_extent_bit(&inode->io_tree, unode->val, - unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL); - /* Also free data bytes of already reserved one */ - btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, - orig_reserved, BTRFS_QGROUP_RSV_DATA); - extent_changeset_release(reserved); + qgroup_unreserve_range(inode, reserved, start, len); +out: + if (new_reserved) { + extent_changeset_release(reserved); + kfree(reserved); + *reserved_ret = NULL; + } return ret; } From c53e9653605dbf708f5be02902de51831be4b009 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 13 Jul 2020 18:50:48 +0800 Subject: [PATCH 122/151] btrfs: qgroup: try to flush qgroup space when we get -EDQUOT [PROBLEM] There are known problems related to how btrfs handles qgroup reserved space. One of the most obvious cases is the test case btrfs/153, which does fallocate, then writes into the preallocated range. btrfs/153 1s ... - output mismatch (see xfstests-dev/results//btrfs/153.out.bad) --- tests/btrfs/153.out 2019-10-22 15:18:14.068965341 +0800 +++ xfstests-dev/results//btrfs/153.out.bad 2020-07-01 20:24:40.730000089 +0800 @@ -1,2 +1,5 @@ QA output created by 153 +pwrite: Disk quota exceeded +/mnt/scratch/testfile2: Disk quota exceeded +/mnt/scratch/testfile2: Disk quota exceeded Silence is golden ... (Run 'diff -u xfstests-dev/tests/btrfs/153.out xfstests-dev/results//btrfs/153.out.bad' to see the entire diff) [CAUSE] Since commit c6887cd11149 ("Btrfs: don't do nocow check unless we have to"), we always reserve space no matter if it's COW or not. Such behavior change is mostly for performance, and reverting it is not a good idea anyway. For a preallocated extent, we reserve qgroup data space for it already, and since we also reserve data space for qgroup at buffered write time, it needs twice the space for us to write into preallocated space. This leads to the -EDQUOT in the buffered write routine. And we can't follow the same solution: unlike the data/meta space check, qgroup reserved space is shared between data and metadata. The EDQUOT can happen at the metadata reservation, so doing a NODATACOW check after qgroup reservation failure is not a solution. [FIX] To solve the problem, we don't return -EDQUOT directly, but every time we get -EDQUOT, we try to flush qgroup space: - Flush all inodes of the root NODATACOW writes will free the qgroup reserve at run_delalloc_range().
However we don't have the infrastructure to only flush NODATACOW inodes, here we flush all inodes anyway. - Wait for ordered extents This would convert the preallocated metadata space into per-trans metadata, which can be freed in later transaction commit. - Commit transaction This will free all per-trans metadata space. Also we don't want to trigger flush multiple times, so here we introduce a per-root wait list and a new root status, to ensure only one thread starts the flushing. Fixes: c6887cd11149 ("Btrfs: don't do nocow check unless we have to") Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 ++ fs/btrfs/disk-io.c | 1 + fs/btrfs/qgroup.c | 100 +++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 96 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7a4ce885bd21..13a0524cc048 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1012,6 +1012,8 @@ enum { BTRFS_ROOT_DEAD_TREE, /* The root has a log tree. Used only for subvolume roots. */ BTRFS_ROOT_HAS_LOG_TREE, + /* Qgroup flushing is in progress */ + BTRFS_ROOT_QGROUP_FLUSHING, }; /* @@ -1164,6 +1166,7 @@ struct btrfs_root { spinlock_t qgroup_meta_rsv_lock; u64 qgroup_meta_rsv_pertrans; u64 qgroup_meta_rsv_prealloc; + wait_queue_head_t qgroup_flush_wait; /* Number of active swapfiles */ atomic_t nr_swapfiles; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9b307034d32c..4360937acc2f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1116,6 +1116,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, mutex_init(&root->log_mutex); mutex_init(&root->ordered_extent_mutex); mutex_init(&root->delalloc_mutex); + init_waitqueue_head(&root->qgroup_flush_wait); init_waitqueue_head(&root->log_writer_wait); init_waitqueue_head(&root->log_commit_wait[0]); init_waitqueue_head(&root->log_commit_wait[1]); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 054f36df60cf..23579dc39fa6 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3515,17 +3515,58 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode, } /* - * Reserve qgroup space for range [start, start + len). + * Try to free some space for qgroup. * - * This function will either reserve space from related qgroups or doing - * nothing if the range is already reserved. + * For qgroup, there are only 3 ways to free qgroup space: + * - Flush nodatacow write + * Any nodatacow write will free its reserved data space at run_delalloc_range(). + * In theory, we should only flush nodatacow inodes, but it's not yet + * possible, so we need to flush the whole root. * - * Return 0 for successful reserve - * Return <0 for error (including -EQUOT) + * - Wait for ordered extents + * When ordered extents are finished, their reserved metadata is finally + * converted to per_trans status, which can be freed by later commit + * transaction. * - * NOTE: this function may sleep for memory allocation. + * - Commit transaction + * This would free the meta_per_trans space. + * In theory this shouldn't provide much space, but any more qgroup space + * is needed. */ -int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, +static int try_flush_qgroup(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + int ret; + + /* + * We don't want to run flush again and again, so if there is a running + * one, we won't try to start a new flush, but exit directly. 
+ */ + if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) { + wait_event(root->qgroup_flush_wait, + !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)); + return 0; + } + + ret = btrfs_start_delalloc_snapshot(root); + if (ret < 0) + goto out; + btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + ret = btrfs_commit_transaction(trans); +out: + clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); + wake_up(&root->qgroup_flush_wait); + return ret; +} + +static int qgroup_reserve_data(struct btrfs_inode *inode, struct extent_changeset **reserved_ret, u64 start, u64 len) { @@ -3578,6 +3619,34 @@ out: return ret; } +/* + * Reserve qgroup space for range [start, start + len). + * + * This function will either reserve space from related qgroups or do nothing + * if the range is already reserved. + * + * Return 0 for successful reservation + * Return <0 for error (including -EQUOT) + * + * NOTE: This function may sleep for memory allocation, dirty page flushing and + * commit transaction. So caller should not hold any dirty page locked. + */ +int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, + struct extent_changeset **reserved_ret, u64 start, + u64 len) +{ + int ret; + + ret = qgroup_reserve_data(inode, reserved_ret, start, len); + if (ret <= 0 && ret != -EDQUOT) + return ret; + + ret = try_flush_qgroup(inode->root); + if (ret < 0) + return ret; + return qgroup_reserve_data(inode, reserved_ret, start, len); +} + /* Free ranges specified by @reserved, normally in error path */ static int qgroup_free_reserved_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len) @@ -3746,7 +3815,7 @@ static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes, return num_bytes; } -int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, +static int qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type, bool enforce) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -3773,6 +3842,21 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, return ret; } +int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type, bool enforce) +{ + int ret; + + ret = qgroup_reserve_meta(root, num_bytes, type, enforce); + if (ret <= 0 && ret != -EDQUOT) + return ret; + + ret = try_flush_qgroup(root); + if (ret < 0) + return ret; + return qgroup_reserve_meta(root, num_bytes, type, enforce); +} + void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; From adca4d945c8dca28a85df45c5b117e6dac2e77f1 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 13 Jul 2020 18:50:49 +0800 Subject: [PATCH 123/151] btrfs: qgroup: remove ASYNC_COMMIT mechanism in favor of reserve retry-after-EDQUOT commit a514d63882c3 ("btrfs: qgroup: Commit transaction in advance to reduce early EDQUOT") tries to reduce the early EDQUOT problems by checking the free qgroup space against a threshold and trying to wake up the commit kthread to free some space. The problem with that mechanism is that it can only free qgroup per-trans metadata space; it can't do anything for data, nor for prealloc qgroup space. Now that we have the ability to flush qgroup space and have implemented the retry-after-EDQUOT behavior, such a mechanism can be completely replaced. So this patch cleans it up in favor of retry-after-EDQUOT.
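Condensed, the replacement looks like this (the retry wrapper is the one added by the previous patch, shortened here; the removed helper is shown for contrast):

        /* Removed: fire-and-forget hint to the transaction kthread */
        set_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags);
        wake_up_process(fs_info->transaction_kthread);

        /* Replacement: flush synchronously, then retry the reservation */
        ret = qgroup_reserve_meta(root, num_bytes, type, enforce);
        if (ret == -EDQUOT && try_flush_qgroup(root) >= 0)
                ret = qgroup_reserve_meta(root, num_bytes, type, enforce);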
Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 5 ----- fs/btrfs/disk-io.c | 1 - fs/btrfs/qgroup.c | 43 ++---------------------------------------- fs/btrfs/transaction.c | 1 - fs/btrfs/transaction.h | 14 -------------- 5 files changed, 2 insertions(+), 62 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 13a0524cc048..050760dfc56d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -545,11 +545,6 @@ enum { * (device replace, resize, device add/delete, balance) */ BTRFS_FS_EXCL_OP, - /* - * To info transaction_kthread we need an immediate commit so it - * doesn't need to wait for commit_interval - */ - BTRFS_FS_NEED_ASYNC_COMMIT, /* * Indicate that balance has been set up from the ioctl and is in the * main phase. The fs_info::balance_ctl is initialized. diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4360937acc2f..c850d7f44fbe 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1810,7 +1810,6 @@ static int transaction_kthread(void *arg) now = ktime_get_seconds(); if (cur->state < TRANS_STATE_COMMIT_START && - !test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) && (now < cur->start_time || now - cur->start_time < fs_info->commit_interval)) { spin_unlock(&fs_info->trans_lock); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 23579dc39fa6..c0f350c3a0cf 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -11,7 +11,6 @@ #include #include #include -#include #include "ctree.h" #include "transaction.h" @@ -2895,20 +2894,8 @@ out: return ret; } -/* - * Two limits to commit transaction in advance. - * - * For RATIO, it will be 1/RATIO of the remaining limit as threshold. - * For SIZE, it will be in byte unit as threshold. - */ -#define QGROUP_FREE_RATIO 32 -#define QGROUP_FREE_SIZE SZ_32M -static bool qgroup_check_limits(struct btrfs_fs_info *fs_info, - const struct btrfs_qgroup *qg, u64 num_bytes) +static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes) { - u64 free; - u64 threshold; - if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer) return false; @@ -2917,32 +2904,6 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info, qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl) return false; - /* - * Even if we passed the check, it's better to check if reservation - * for meta_pertrans is pushing us near limit. - * If there is too much pertrans reservation or it's near the limit, - * let's try commit transaction to free some, using transaction_kthread - */ - if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER | - BTRFS_QGROUP_LIMIT_MAX_EXCL))) { - if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) { - free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl; - threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO, - QGROUP_FREE_SIZE); - } else { - free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer; - threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO, - QGROUP_FREE_SIZE); - } - - /* - * Use transaction_kthread to commit transaction, so we no - * longer need to bother nested transaction nor lock context. 
- */ - if (free < threshold) - btrfs_commit_transaction_locksafe(fs_info); - } - return true; } @@ -2990,7 +2951,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, qg = unode_aux_to_qgroup(unode); - if (enforce && !qgroup_check_limits(fs_info, qg, num_bytes)) { + if (enforce && !qgroup_check_limits(qg, num_bytes)) { ret = -EDQUOT; goto out; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3a7c392fbcc9..efafc286323c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2351,7 +2351,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) */ cur_trans->state = TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); - clear_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags); spin_lock(&fs_info->trans_lock); list_del_init(&cur_trans->list); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 98f7860af5e9..d60b055b8695 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -210,20 +210,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans); int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, int wait_for_unblock); - -/* - * Try to commit transaction asynchronously, so this is safe to call - * even holding a spinlock. - * - * It's done by informing transaction_kthread to commit transaction without - * waiting for commit interval. - */ -static inline void btrfs_commit_transaction_locksafe( - struct btrfs_fs_info *fs_info) -{ - set_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags); - wake_up_process(fs_info->transaction_kthread); -} int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans); int btrfs_should_end_transaction(struct btrfs_trans_handle *trans); void btrfs_throttle(struct btrfs_fs_info *fs_info); From 137c541821a83debb63b3fa8abdd1cbc41bdf3a1 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jul 2020 21:28:58 +0900 Subject: [PATCH 124/151] btrfs: pass checksum type via BTRFS_IOC_FS_INFO ioctl With the recent addition of filesystem checksum types other than CRC32c, it is no longer hard-coded which checksum type a btrfs filesystem uses. Up to now there is no good way to read the filesystem checksum type, apart from reading the filesystem UUID and then querying sysfs for the checksum type. Add new csum_type and csum_size fields to the BTRFS_IOC_FS_INFO ioctl command which usually is used to query filesystem features. Also add a flags member indicating that the kernel responded with set csum_type and csum_size fields. For compatibility reasons, only return the csum_type and csum_size if the BTRFS_FS_INFO_FLAG_CSUM_INFO flag was passed to the kernel. Also clear any unknown flags so we don't pass false positives to user-space newer than the kernel. To simplify further additions to the ioctl, also switch the padding to a u8 array. Pahole was used to verify the result of this switch: The csum members are added before flags, which might look odd, but this is to keep the alignment requirements and not to introduce holes in the structure.
$ pahole -C btrfs_ioctl_fs_info_args fs/btrfs/btrfs.ko struct btrfs_ioctl_fs_info_args { __u64 max_id; /* 0 8 */ __u64 num_devices; /* 8 8 */ __u8 fsid[16]; /* 16 16 */ __u32 nodesize; /* 32 4 */ __u32 sectorsize; /* 36 4 */ __u32 clone_alignment; /* 40 4 */ __u16 csum_type; /* 44 2 */ __u16 csum_size; /* 46 2 */ __u64 flags; /* 48 8 */ __u8 reserved[968]; /* 56 968 */ /* size: 1024, cachelines: 16, members: 10 */ }; Fixes: 3951e7f050ac ("btrfs: add xxhash64 to checksumming algorithms") Fixes: 3831bf0094ab ("btrfs: add sha256 to checksumming algorithm") CC: stable@vger.kernel.org # 5.5+ Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 16 +++++++++++++--- include/uapi/linux/btrfs.h | 14 ++++++++++++-- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ab34179d7cbc..2854f4a40787 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3217,11 +3217,15 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_fs_info_args *fi_args; struct btrfs_device *device; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + u64 flags_in; int ret = 0; - fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL); - if (!fi_args) - return -ENOMEM; + fi_args = memdup_user(arg, sizeof(*fi_args)); + if (IS_ERR(fi_args)) + return PTR_ERR(fi_args); + + flags_in = fi_args->flags; + memset(fi_args, 0, sizeof(*fi_args)); rcu_read_lock(); fi_args->num_devices = fs_devices->num_devices; @@ -3237,6 +3241,12 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, fi_args->sectorsize = fs_info->sectorsize; fi_args->clone_alignment = fs_info->sectorsize; + if (flags_in & BTRFS_FS_INFO_FLAG_CSUM_INFO) { + fi_args->csum_type = btrfs_super_csum_type(fs_info->super_copy); + fi_args->csum_size = btrfs_super_csum_size(fs_info->super_copy); + fi_args->flags |= BTRFS_FS_INFO_FLAG_CSUM_INFO; + } + if (copy_to_user(arg, fi_args, sizeof(*fi_args))) ret = -EFAULT; diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index e6b6cb0f8bc6..24f6848ad78e 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -243,6 +243,13 @@ struct btrfs_ioctl_dev_info_args { __u8 path[BTRFS_DEVICE_PATH_NAME_MAX]; /* out */ }; +/* + * Retrieve information about the filesystem + */ + +/* Request information about checksum type and size */ +#define BTRFS_FS_INFO_FLAG_CSUM_INFO (1 << 0) + struct btrfs_ioctl_fs_info_args { __u64 max_id; /* out */ __u64 num_devices; /* out */ @@ -250,8 +257,11 @@ struct btrfs_ioctl_fs_info_args { __u32 nodesize; /* out */ __u32 sectorsize; /* out */ __u32 clone_alignment; /* out */ - __u32 reserved32; - __u64 reserved[122]; /* pad to 1k */ + /* See BTRFS_FS_INFO_FLAG_* */ + __u16 csum_type; /* out */ + __u16 csum_size; /* out */ + __u64 flags; /* in/out */ + __u8 reserved[968]; /* pad to 1k */ }; /* From 0fb408a558aadbaa58beb75b02c95741e1fbb514 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jul 2020 21:28:59 +0900 Subject: [PATCH 125/151] btrfs: add filesystem generation to FS_INFO ioctl Add retrieval of the filesystem's generation to the fsinfo ioctl. This is driven by setting the BTRFS_FS_INFO_FLAG_GENERATION flag in btrfs_ioctl_fs_info_args::flags. 
Reviewed-by: Nikolay Borisov Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 5 +++++ include/uapi/linux/btrfs.h | 6 +++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2854f4a40787..55dd20d0f9cb 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3247,6 +3247,11 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, fi_args->flags |= BTRFS_FS_INFO_FLAG_CSUM_INFO; } + if (flags_in & BTRFS_FS_INFO_FLAG_GENERATION) { + fi_args->generation = fs_info->generation; + fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION; + } + if (copy_to_user(arg, fi_args, sizeof(*fi_args))) ret = -EFAULT; diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 24f6848ad78e..9b82e01c191d 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -250,6 +250,9 @@ struct btrfs_ioctl_dev_info_args { /* Request information about checksum type and size */ #define BTRFS_FS_INFO_FLAG_CSUM_INFO (1 << 0) +/* Request information about filesystem generation */ +#define BTRFS_FS_INFO_FLAG_GENERATION (1 << 1) + struct btrfs_ioctl_fs_info_args { __u64 max_id; /* out */ __u64 num_devices; /* out */ @@ -261,7 +264,8 @@ struct btrfs_ioctl_fs_info_args { __u16 csum_type; /* out */ __u16 csum_size; /* out */ __u64 flags; /* in/out */ - __u8 reserved[968]; /* pad to 1k */ + __u64 generation; /* out */ + __u8 reserved[960]; /* pad to 1k */ }; /* From 49bac897683340457a0ab1f76b924a1220bdb604 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 13 Jul 2020 21:29:00 +0900 Subject: [PATCH 126/151] btrfs: add metadata_uuid to FS_INFO ioctl Add retrieval of the filesystem's metadata UUID to the fsinfo ioctl. This is driven by setting the BTRFS_FS_INFO_FLAG_METADATA_UUID flag in btrfs_ioctl_fs_info_args::flags. 
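To illustrate how the extended ioctl is meant to be consumed, here is a minimal user-space sketch (not part of this series; error handling trimmed, and it assumes a kernel plus UAPI headers carrying these flag patches):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/btrfs.h>

	int main(int argc, char **argv)
	{
		struct btrfs_ioctl_fs_info_args args = { 0 };
		int fd = open(argv[1], O_RDONLY);

		if (fd < 0)
			return 1;
		/* in: request the optional fields */
		args.flags = BTRFS_FS_INFO_FLAG_CSUM_INFO |
			     BTRFS_FS_INFO_FLAG_GENERATION |
			     BTRFS_FS_INFO_FLAG_METADATA_UUID;
		if (ioctl(fd, BTRFS_IOC_FS_INFO, &args) < 0)
			return 1;
		/* out: the kernel echoes back only the flags it filled */
		if (args.flags & BTRFS_FS_INFO_FLAG_CSUM_INFO)
			printf("csum type %u, size %u\n", args.csum_type,
			       args.csum_size);
		if (args.flags & BTRFS_FS_INFO_FLAG_GENERATION)
			printf("generation %llu\n",
			       (unsigned long long)args.generation);
		/* args.metadata_uuid is valid only when
		 * BTRFS_FS_INFO_FLAG_METADATA_UUID is echoed back */
		return 0;
	}

On kernels without these patches the flags position is part of the zeroed padding, so it should read back as 0 and user space can treat an absent response flag as "not supported".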
Reviewed-by: Nikolay Borisov Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 6 ++++++ include/uapi/linux/btrfs.h | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 55dd20d0f9cb..b4ddf51ae377 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3252,6 +3252,12 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION; } + if (flags_in & BTRFS_FS_INFO_FLAG_METADATA_UUID) { + memcpy(&fi_args->metadata_uuid, fs_devices->metadata_uuid, + sizeof(fi_args->metadata_uuid)); + fi_args->flags |= BTRFS_FS_INFO_FLAG_METADATA_UUID; + } + if (copy_to_user(arg, fi_args, sizeof(*fi_args))) ret = -EFAULT; diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 9b82e01c191d..2c39d15a2beb 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -252,6 +252,8 @@ struct btrfs_ioctl_dev_info_args { /* Request information about filesystem generation */ #define BTRFS_FS_INFO_FLAG_GENERATION (1 << 1) +/* Request information about filesystem metadata UUID */ +#define BTRFS_FS_INFO_FLAG_METADATA_UUID (1 << 2) struct btrfs_ioctl_fs_info_args { __u64 max_id; /* out */ @@ -265,7 +267,8 @@ struct btrfs_ioctl_fs_info_args { __u16 csum_size; /* out */ __u64 flags; /* in/out */ __u64 generation; /* out */ - __u8 reserved[960]; /* pad to 1k */ + __u8 metadata_uuid[BTRFS_FSID_SIZE]; /* out */ + __u8 reserved[944]; /* pad to 1k */ }; /* From d85327b1d8b74ec168ed34c798a424de82fdc001 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 8 Jul 2020 22:55:14 +0200 Subject: [PATCH 127/151] btrfs: prefetch chunk tree leaves at mount The whole chunk tree is read at mount time so we can utilize readahead to get the tree blocks to memory before we read the items. The idea is from Robbie, but instead of updating search slot readahead, this patch implements the chunk tree readahead manually from nodes on level 1. We've decided to do specific readahead optimizations and then unify them under a common API so we don't break everything by changing the search slot readahead logic. Higher chunk trees grow on large filesystems (many terabytes), and prefetching just level 1 seems to be sufficient. Provided example was from a 200TiB filesystem with chunk tree level 2. 
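As a rough back-of-the-envelope check (illustrative numbers only, assuming the default 16KiB nodesize; the on-disk btrfs header is 101 bytes and a key pointer is 33 bytes):

	items per node:  (16384 - 101) / 33 ~= 493
	level 2 tree:    up to ~493 level 1 nodes, i.e. up to ~243000 leaves

So each new level 1 node crossed by the search loop triggers one batch of up to ~493 leaf readaheads, and remembering the node's start (last_ra_node below) is enough to issue each batch only once.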
CC: Robbie Ko Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 19213347ec70..c1a4dab8e811 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7013,6 +7013,19 @@ out: return ret; } +static void readahead_tree_node_children(struct extent_buffer *node) +{ + int i; + const int nr_items = btrfs_header_nritems(node); + + for (i = 0; i < nr_items; i++) { + u64 start; + + start = btrfs_node_blockptr(node, i); + readahead_tree_block(node->fs_info, start); + } +} + int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) { struct btrfs_root *root = fs_info->chunk_root; @@ -7023,6 +7036,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) int ret; int slot; u64 total_dev = 0; + u64 last_ra_node = 0; path = btrfs_alloc_path(); if (!path) @@ -7056,6 +7070,8 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) if (ret < 0) goto error; while (1) { + struct extent_buffer *node; + leaf = path->nodes[0]; slot = path->slots[0]; if (slot >= btrfs_header_nritems(leaf)) { @@ -7066,6 +7082,17 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) goto error; break; } + /* + * The nodes on level 1 are not locked but we don't need to do + * that during mount time as nothing else can access the tree + */ + node = path->nodes[1]; + if (node) { + if (last_ra_node != node->start) { + readahead_tree_node_children(node); + last_ra_node = node->start; + } + } btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.type == BTRFS_DEV_ITEM_KEY) { struct btrfs_dev_item *dev_item; From d60ba8de1164e1b42e296ff270c622a070ef8fe7 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Tue, 7 Jul 2020 06:29:08 -0700 Subject: [PATCH 128/151] btrfs: ref-verify: fix memory leak in add_block_entry clang static analysis flags this error fs/btrfs/ref-verify.c:290:3: warning: Potential leak of memory pointed to by 're' [unix.Malloc] kfree(be); ^~~~~ The problem is in this block of code: if (root_objectid) { struct root_entry *exist_re; exist_re = insert_root_entry(&exist->roots, re); if (exist_re) kfree(re); } There is no 'else' block freeing when root_objectid is 0. Add the missing kfree to the else branch. 
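Reduced to a standalone sketch (hypothetical names, not the btrfs code itself), the leak class and the fix look like this:

	/* hypothetical reduction of the flagged pattern */
	static int store_entry(struct tree *tree, bool want_insert)
	{
		struct entry *e = kmalloc(sizeof(*e), GFP_KERNEL);

		if (!e)
			return -ENOMEM;
		if (want_insert) {
			/* insert() keeps 'e' on success, returns nonzero on duplicate */
			if (insert(tree, e))
				kfree(e);
		} else {
			kfree(e);	/* the branch that was missing */
		}
		return 0;
	}

Every path that does not transfer ownership of the allocation has to free it, which is exactly what the added else branch restores below.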
Fixes: fd708b81d972 ("Btrfs: add a extent ref verify tool") CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Tom Rix Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ref-verify.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index af92525dbb16..7f03dbe5b609 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -286,6 +286,8 @@ static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info, exist_re = insert_root_entry(&exist->roots, re); if (exist_re) kfree(re); + } else { + kfree(re); } kfree(be); return exist; From a3cf0e4342b6af9e6b34a4b913c630fbd03a82ea Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 14 Jul 2020 09:12:20 +0800 Subject: [PATCH 129/151] btrfs: qgroup: free per-trans reserved space when a subvolume gets dropped [BUG] Sometimes fsstress could lead to a qgroup warning for a case like generic/013: BTRFS warning (device dm-3): qgroup 0/259 has unreleased space, type 1 rsv 81920 ------------[ cut here ]------------ WARNING: CPU: 9 PID: 24535 at fs/btrfs/disk-io.c:4142 close_ctree+0x1dc/0x323 [btrfs] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:close_ctree+0x1dc/0x323 [btrfs] Call Trace: btrfs_put_super+0x15/0x17 [btrfs] generic_shutdown_super+0x72/0x110 kill_anon_super+0x18/0x30 btrfs_kill_super+0x17/0x30 [btrfs] deactivate_locked_super+0x3b/0xa0 deactivate_super+0x40/0x50 cleanup_mnt+0x135/0x190 __cleanup_mnt+0x12/0x20 task_work_run+0x64/0xb0 __prepare_exit_to_usermode+0x1bc/0x1c0 __syscall_return_slowpath+0x47/0x230 do_syscall_64+0x64/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 ---[ end trace 6c341cdf9b6cc3c1 ]--- BTRFS error (device dm-3): qgroup reserved space leaked Yet subvolume 259 no longer exists in that filesystem. [CAUSE] Normally per-trans qgroup reserved space is freed when a transaction is committed, in commit_fs_roots(). However for a completely dropped subvolume, that subvolume is completely gone, thus is no longer in the fs_roots_radix, and its per-trans qgroup reserved space will never be freed. Since the subvolume is already gone, leaked per-trans space won't cause any trouble for end users. [FIX] Just call btrfs_qgroup_free_meta_all_pertrans() before a subvolume is completely dropped. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c0bc35f932bf..96223813b618 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5466,6 +5466,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) } } + /* + * This subvolume is going to be completely dropped, and won't be + * recorded as dirty roots, thus pertrans meta rsv will not be freed at + * commit transaction time. So free it here manually. + */ + btrfs_qgroup_convert_reserved_meta(root, INT_MAX); + btrfs_qgroup_free_meta_all_pertrans(root); + if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) btrfs_add_dropped_root(trans, root); else From 4faf55b03823e96c44dc4e364520000ed3b12fdb Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 10 Jul 2020 14:37:38 +0800 Subject: [PATCH 130/151] btrfs: don't traverse into the seed devices in show_devname ->show_devname currently shows the lowest devid in the list. As the seed devices have the lowest devid in the sprouted filesystem, userland tools such as findmnt end up seeing the seed device instead of the device from the read-writable sprouted filesystem. As shown below.
mount /dev/sda /btrfs mount: /btrfs: WARNING: device write-protected, mounted read-only. findmnt --output SOURCE,TARGET,UUID /btrfs SOURCE TARGET UUID /dev/sda /btrfs 899f7027-3e46-4626-93e7-7d4c9ad19111 btrfs dev add -f /dev/sdb /btrfs umount /btrfs mount /dev/sdb /btrfs findmnt --output SOURCE,TARGET,UUID /btrfs SOURCE TARGET UUID /dev/sda /btrfs 899f7027-3e46-4626-93e7-7d4c9ad19111 All sprouts from a single seed will show the same seed device and the same fsid. That's confusing. This is causing problems in our prototype as there isn't any reference to the sprout file-system(s) which is being used for actual read and write. This was added in the patch which implemented the show_devname in btrfs commit 9c5085c14798 ("Btrfs: implement ->show_devname"). I tried to look for any particular reason that we need to show the seed device, there isn't any. So instead, do not traverse through the seed devices, just show the lowest devid in the sprouted fsid. After the patch: mount /dev/sda /btrfs mount: /btrfs: WARNING: device write-protected, mounted read-only. findmnt --output SOURCE,TARGET,UUID /btrfs SOURCE TARGET UUID /dev/sda /btrfs 899f7027-3e46-4626-93e7-7d4c9ad19111 btrfs dev add -f /dev/sdb /btrfs mount -o rw,remount /dev/sdb /btrfs findmnt --output SOURCE,TARGET,UUID /btrfs SOURCE TARGET UUID /dev/sdb /btrfs 595ca0e6-b82e-46b5-b9e2-c72a6928be48 mount /dev/sda /btrfs1 mount: /btrfs1: WARNING: device write-protected, mounted read-only. btrfs dev add -f /dev/sdc /btrfs1 findmnt --output SOURCE,TARGET,UUID /btrfs1 SOURCE TARGET UUID /dev/sdc /btrfs1 ca1dbb7a-8446-4f95-853c-a20f3f82bdbb cat /proc/self/mounts | grep btrfs /dev/sdb /btrfs btrfs rw,relatime,noacl,space_cache,subvolid=5,subvol=/ 0 0 /dev/sdc /btrfs1 btrfs ro,relatime,noacl,space_cache,subvolid=5,subvol=/ 0 0 Reported-by: Martin K. Petersen CC: stable@vger.kernel.org # 4.19+ Tested-by: Martin K. Petersen Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7c052a542412..58f890f73650 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2351,9 +2351,7 @@ static int btrfs_unfreeze(struct super_block *sb) static int btrfs_show_devname(struct seq_file *m, struct dentry *root) { struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); - struct btrfs_fs_devices *cur_devices; struct btrfs_device *dev, *first_dev = NULL; - struct list_head *head; /* * Lightweight locking of the devices. We should not need @@ -2363,18 +2361,13 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) * least until the rcu_read_unlock. 
*/ rcu_read_lock(); - cur_devices = fs_info->fs_devices; - while (cur_devices) { - head = &cur_devices->devices; - list_for_each_entry_rcu(dev, head, dev_list) { - if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) - continue; - if (!dev->name) - continue; - if (!first_dev || dev->devid < first_dev->devid) - first_dev = dev; - } - cur_devices = cur_devices->seed; + list_for_each_entry_rcu(dev, &fs_info->fs_devices->devices, dev_list) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) + continue; + if (!dev->name) + continue; + if (!first_dev || dev->devid < first_dev->devid) + first_dev = dev; } if (first_dev) From f37c563bab4297024c300b05c8f48430e323809d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 10 Jul 2020 09:49:56 +0200 Subject: [PATCH 131/151] btrfs: add missing check for nocow and compression inode flags User Forza reported on IRC that some invalid combinations of file attributes are accepted by chattr. The NODATACOW and compression file flags/attributes are mutually exclusive, but they could be set by 'chattr +c +C' on an empty file. The nodatacow will be in effect because it's checked first in btrfs_run_delalloc_range. Extend the flag validation to catch the following cases: - input flags are conflicting - old and new flags are conflicting - initialize the local variable with inode flags after inode is locked Inode attributes take precedence over mount options and are an independent setting. Nocompress would be a no-op with nodatacow, but we don't want to mix any compression-related options with nodatacow. CC: stable@vger.kernel.org # 4.4+ Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b4ddf51ae377..bd3511c5ca81 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -164,8 +164,11 @@ static int btrfs_ioctl_getflags(struct file *file, void __user *arg) return 0; } -/* Check if @flags are a supported and valid set of FS_*_FL flags */ -static int check_fsflags(unsigned int flags) +/* + * Check if @flags are a supported and valid set of FS_*_FL flags and that + * the old and new flags are not conflicting + */ +static int check_fsflags(unsigned int old_flags, unsigned int flags) { if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ FS_NOATIME_FL | FS_NODUMP_FL | \ @@ -174,9 +177,19 @@ static int check_fsflags(unsigned int flags) FS_NOCOW_FL)) return -EOPNOTSUPP; + /* COMPR and NOCOMP on new/old are valid */ if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL)) return -EINVAL; + if ((flags & FS_COMPR_FL) && (flags & FS_NOCOW_FL)) + return -EINVAL; + + /* NOCOW and compression options are mutually exclusive */ + if ((old_flags & FS_NOCOW_FL) && (flags & (FS_COMPR_FL | FS_NOCOMP_FL))) + return -EINVAL; + if ((flags & FS_NOCOW_FL) && (old_flags & (FS_COMPR_FL | FS_NOCOMP_FL))) + return -EINVAL; + return 0; } @@ -190,7 +203,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) unsigned int fsflags, old_fsflags; int ret; const char *comp = NULL; - u32 binode_flags = binode->flags; + u32 binode_flags; if (!inode_owner_or_capable(inode)) return -EPERM; @@ -201,22 +214,23 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (copy_from_user(&fsflags, arg, sizeof(fsflags))) return -EFAULT; - ret = check_fsflags(fsflags); - if (ret) - return ret; - ret = mnt_want_write_file(file); if (ret) return ret; inode_lock(inode); fsflags = btrfs_mask_fsflags_for_type(inode, fsflags); old_fsflags =
btrfs_inode_flags_to_fsflags(binode->flags); + ret = vfs_ioc_setflags_prepare(inode, old_fsflags, fsflags); if (ret) goto out_unlock; + ret = check_fsflags(old_fsflags, fsflags); + if (ret) + goto out_unlock; + + binode_flags = binode->flags; if (fsflags & FS_SYNC_FL) binode_flags |= BTRFS_INODE_SYNC; else From 813f8a0e268bbc9bee3667f68ab9a3d8ea9df3a1 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 15 Jul 2020 14:02:17 +0300 Subject: [PATCH 132/151] btrfs: raid56: remove out label in __raid56_parity_recover There's no cleanup that occurs so we can simply return 0 directly. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 61ff77392357..255490f42b5d 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2083,7 +2083,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) */ if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) { __raid_recover_end_io(rbio); - goto out; + return 0; } else { goto cleanup; } @@ -2103,7 +2103,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) submit_bio(bio); } -out: + return 0; cleanup: From 5cb502f4ab6469c3d02bd787a3ff58a7b624ba67 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 13 Jul 2020 09:03:19 +0800 Subject: [PATCH 133/151] btrfs: relocation: allow signal to cancel balance Although btrfs balance can be canceled with the "btrfs balance cancel" command, it's still almost muscle memory to press Ctrl-C to cancel a long running btrfs balance. So allow btrfs balance to check for a fatal signal to determine if it should exit. The cancellation points are in known locations and we're only adding one more reason, so this should be safe.
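For context, the helper is polled from the long-running relocation loops, so after this change a pending fatal signal surfaces through the same path as an explicit cancel request. Roughly (a sketch of the existing usage pattern, with illustrative loop names, not an exact call site):

	while (more_extents_to_relocate) {
		if (btrfs_should_cancel_balance(fs_info)) {
			ret = -ECANCELED;
			break;	/* unwinds like "btrfs balance cancel" */
		}
		/* ... relocate the next extent ... */
	}

Since fatal_signal_pending(current) becomes part of that predicate, Ctrl-C behaves like the dedicated cancel ioctl at those same points.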
Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 523d2e5fab8f..2b869fb2e62c 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2656,7 +2656,8 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, */ int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) { - return atomic_read(&fs_info->balance_cancel_req); + return atomic_read(&fs_info->balance_cancel_req) || + fatal_signal_pending(current); } ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE); From f3e3d9cc35252a70a2fd698762c9687718268ec6 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 13 Jul 2020 09:03:20 +0800 Subject: [PATCH 134/151] btrfs: avoid possible signal interruption of btrfs_drop_snapshot() on relocation tree [BUG] There is a bug report that bad signal timing could lead to a read-only fs during balance: BTRFS info (device xvdb): balance: start -d -m -s BTRFS info (device xvdb): relocating block group 73001861120 flags metadata BTRFS info (device xvdb): found 12236 extents, stage: move data extents BTRFS info (device xvdb): relocating block group 71928119296 flags data BTRFS info (device xvdb): found 3 extents, stage: move data extents BTRFS info (device xvdb): found 3 extents, stage: update data pointers BTRFS info (device xvdb): relocating block group 60922265600 flags metadata BTRFS: error (device xvdb) in btrfs_drop_snapshot:5505: errno=-4 unknown BTRFS info (device xvdb): forced readonly BTRFS info (device xvdb): balance: ended with status: -4 [CAUSE] The direct cause is the -EINTR from the following call chain when a fatal signal is pending: relocate_block_group() |- clean_dirty_subvols() |- btrfs_drop_snapshot() |- btrfs_start_transaction() |- btrfs_delayed_refs_rsv_refill() |- btrfs_reserve_metadata_bytes() |- __reserve_metadata_bytes() |- wait_reserve_ticket() |- prepare_to_wait_event(); |- ticket->error = -EINTR; Normally this behavior is fine for most btrfs_start_transaction() callers, as they need to catch any other error, same for the signal, and exit ASAP. However for balance, especially for the clean_dirty_subvols() case, we're already doing cleanup work, and getting -EINTR from btrfs_drop_snapshot() could cause a lot of unexpected problems, from the mentioned forced read-only report to a later balance error due to half dropped reloc trees. [FIX] Fix this problem by using btrfs_join_transaction() if btrfs_drop_snapshot() is called from relocation context. Since btrfs_join_transaction() won't get interrupted by signal, we can continue the cleanup. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 96223813b618..61ede335f6c3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5298,7 +5298,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) goto out; } - trans = btrfs_start_transaction(tree_root, 0); + /* + * Use join to avoid potential EINTR from transaction start. See + * wait_reserve_ticket and the whole reservation callchain.
+ */ + if (for_reloc) + trans = btrfs_join_transaction(tree_root); + else + trans = btrfs_start_transaction(tree_root, 0); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out_free; From 44d354abf33e92a5e73b965c84caf5a5d5e58a0b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 13 Jul 2020 09:03:21 +0800 Subject: [PATCH 135/151] btrfs: relocation: review the call sites which can be interrupted by signal Since most metadata reservation calls can return -EINTR when interrupted by a fatal signal, we need to review all the metadata reservation call sites. In relocation code, the metadata reservation happens in the following sites: - btrfs_block_rsv_refill() in merge_reloc_root() merge_reloc_root() is a pretty critical section, we don't want to be interrupted by signal, so change the flush status to BTRFS_RESERVE_FLUSH_LIMIT, so it won't get interrupted by signal. Since such a change can be ENOSPC-prone, also shrink the amount of metadata to reserve to the least amount needed, to avoid deadly ENOSPC there. - btrfs_block_rsv_refill() in reserve_metadata_space() It calls with BTRFS_RESERVE_FLUSH_LIMIT, which won't get interrupted by signal. - btrfs_block_rsv_refill() in prepare_to_relocate() - btrfs_block_rsv_add() in prepare_to_relocate() - btrfs_block_rsv_refill() in relocate_block_group() - btrfs_delalloc_reserve_metadata() in relocate_file_extent_cluster() - btrfs_start_transaction() in relocate_block_group() - btrfs_start_transaction() in create_reloc_inode() Can be interrupted by fatal signal and we can handle it easily. For these call sites, just catch the -EINTR value in btrfs_balance() and count them as canceled. CC: stable@vger.kernel.org # 5.4+ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 12 ++++++++++-- fs/btrfs/volumes.c | 17 ++++++++++++++++- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 2b869fb2e62c..4ba1ab9cc76d 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1686,12 +1686,20 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, btrfs_unlock_up_safe(path, 0); } - min_reserved = fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; + /* + * In merge_reloc_root(), we modify the upper level pointer to swap the + * tree blocks between reloc tree and subvolume tree. Thus for tree + * block COW, we COW at most from level 1 to root level for each tree. + * + * Thus the needed metadata size is at most root_level * nodesize, + * and * 2 since we have two trees to COW.
+ */ + min_reserved = fs_info->nodesize * btrfs_root_level(root_item) * 2; memset(&next_key, 0, sizeof(next_key)); while (1) { ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved, - BTRFS_RESERVE_FLUSH_ALL); + BTRFS_RESERVE_FLUSH_LIMIT); if (ret) { err = ret; goto out; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c1a4dab8e811..537ccf66ee20 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4135,7 +4135,22 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, mutex_lock(&fs_info->balance_mutex); if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) btrfs_info(fs_info, "balance: paused"); - else if (ret == -ECANCELED && atomic_read(&fs_info->balance_cancel_req)) + /* + * Balance can be canceled by: + * + * - Regular cancel request + * Then ret == -ECANCELED and balance_cancel_req > 0 + * + * - Fatal signal to "btrfs" process + * Either the signal caught by wait_reserve_ticket() and callers + * got -EINTR, or caught by btrfs_should_cancel_balance() and + * got -ECANCELED. + * Either way, in this case balance_cancel_req = 0, and + * ret == -EINTR or ret == -ECANCELED. + * + * So here we only check the return value to catch canceled balance. + */ + else if (ret == -ECANCELED || ret == -EINTR) btrfs_info(fs_info, "balance: canceled"); else btrfs_info(fs_info, "balance: ended with status: %d", ret); From fd7fb634d69a7d8c91aa3166b77351b96954aae4 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 13 Jul 2020 09:03:22 +0800 Subject: [PATCH 136/151] btrfs: add comments for btrfs_reserve_flush_enum This enum is the interface exposed to developers. Although we have a detailed comment explaining the whole idea of space flushing at the beginning of space-info.c, the exposed enum interface doesn't have any comment. Some corner cases, like the fact that BTRFS_RESERVE_FLUSH_ALL and BTRFS_RESERVE_FLUSH_ALL_STEAL can be interrupted by fatal signals, are not explained at all. So add some simple comments for these enums as a quick reference. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 050760dfc56d..b70c2024296f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2560,16 +2560,46 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); +/* + * Different levels to flush space when doing space reservations. + * + * The higher the level, the more methods we try to reclaim space. + */ enum btrfs_reserve_flush_enum { /* If we are in the transaction, we can't flush anything. */ BTRFS_RESERVE_NO_FLUSH, + /* - * Flushing delalloc may cause deadlock somewhere, in this - * case, use FLUSH LIMIT + * Flush space by: + * - Running delayed inode items + * - Allocating a new chunk */ BTRFS_RESERVE_FLUSH_LIMIT, + + /* + * Flush space by: + * - Running delayed inode items + * - Running delayed refs + * - Running delalloc and waiting for ordered extents + * - Allocating a new chunk + */ BTRFS_RESERVE_FLUSH_EVICT, + + /* + * Flush space by above mentioned methods and by: + * - Running delayed iputs + * - Committing transaction + * + * Can be interrupted by fatal signal. + */ BTRFS_RESERVE_FLUSH_ALL, + + /* + * Pretty much the same as FLUSH_ALL, but can also steal space from + * global rsv. + * + * Can be interrupted by fatal signal.
+ */ BTRFS_RESERVE_FLUSH_ALL_STEAL, }; From b69d1ee923ec59533672ff4953ab45c57576bce0 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 16 Jul 2020 18:17:19 +0300 Subject: [PATCH 137/151] btrfs: remove done label in writepage_delalloc Since there is no common cleanup run after the label, it is somewhat redundant. Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e37950a576fc..73c9c59cd535 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3446,8 +3446,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, * started, so we don't want to return > 0 unless * things are going well. */ - ret = ret < 0 ? ret : -EIO; - goto done; + return ret < 0 ? ret : -EIO; } /* * delalloc_end is already one less than the total length, so @@ -3479,10 +3478,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, return 1; } - ret = 0; - -done: - return ret; + return 0; } /* From 3ebac17ce593490bff48d8eb0b4b97b97d8609fa Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 15 Jul 2020 12:30:43 +0100 Subject: [PATCH 138/151] btrfs: reduce contention on log trees when logging checksums The possibility of extents being shared (through clone and deduplication operations) requires special care when logging data checksums, to avoid having a log tree with different checksum items that cover ranges which overlap (which resulted in missing checksums after replaying a log tree). Such problems were fixed in the past by the following commits: commit 40e046acbd2f ("Btrfs: fix missing data checksums after replaying a log tree") commit e289f03ea79b ("btrfs: fix corrupt log due to concurrent fsync of inodes with shared extents") Test case generic/588 exercises the scenario solved by the first commit (purely sequential and deterministic) while test case generic/457 often triggered the case fixed by the second commit (not deterministic, requires specific timings under concurrency). The problems were addressed by deleting, from the log tree, any existing checksums before logging the new ones. And also by doing the deletion and logging of the checksums while locking the checksum range in an extent io tree (root->log_csum_range), to deal with the case where we have concurrent fsyncs against files with shared extents. That however causes more contention on the leaves of a log tree where we store checksums (and all the nodes in the paths leading to them), even when we do not have shared extents, or all the shared extents were created by past transactions. It also adds a bit of contention on the spin lock of the log_csum_range extent io tree of the log root. This change adds a 'last_reflink_trans' field to the inode to keep track of the last transaction where a new extent was shared between inodes (through clone and deduplication operations). It is updated for both the source and destination inodes of reflink operations whenever a new extent (created in the current transaction) becomes shared by the inodes. This field is kept in memory only, not persisted in the inode item, similar to other existing fields (last_unlink_trans, logged_trans).
When logging checksums for an extent, if the value of 'last_reflink_trans' is smaller than the current transaction's generation/id, we skip locking the extent range and deletion of checksums from the log tree, since we know we do not have new shared extents. This reduces contention on the log tree's leaves where checksums are stored. The following script, which uses fio, was used to measure the impact of this change: $ cat test-fsync.sh #!/bin/bash DEV=/dev/sdk MNT=/mnt/sdk MOUNT_OPTIONS="-o ssd" MKFS_OPTIONS="-d single -m single" if [ $# -ne 3 ]; then echo "Use $0 NUM_JOBS FILE_SIZE FSYNC_FREQ" exit 1 fi NUM_JOBS=$1 FILE_SIZE=$2 FSYNC_FREQ=$3 cat <<EOF > /tmp/fio-job.ini [writers] rw=write fsync=$FSYNC_FREQ fallocate=none group_reporting=1 direct=0 bs=64k ioengine=sync size=$FILE_SIZE directory=$MNT numjobs=$NUM_JOBS EOF echo "Using config:" echo cat /tmp/fio-job.ini echo mkfs.btrfs -f $MKFS_OPTIONS $DEV mount $MOUNT_OPTIONS $DEV $MNT fio /tmp/fio-job.ini umount $MNT The tests were performed for different numbers of jobs, file sizes and fsync frequency. A qemu VM using kvm was used, with 8 cores (the host has 12 cores, with cpu governance set to performance mode on all cores), 16GiB of ram (the host has 64GiB) and using a NVMe device directly (without an intermediary filesystem in the host). While running the tests, the host was not used for anything else, to avoid disturbing the tests. The obtained results were the following (the last line of fio's output was pasted). Starting with 16 jobs is where a significant difference is observable in this particular setup and hardware (differences highlighted below). The very small differences for tests with less than 16 jobs are possibly just noise and random. **** 1 job, file size 1G, fsync frequency 1 **** before this change: WRITE: bw=23.8MiB/s (24.9MB/s), 23.8MiB/s-23.8MiB/s (24.9MB/s-24.9MB/s), io=1024MiB (1074MB), run=43075-43075msec after this change: WRITE: bw=24.4MiB/s (25.6MB/s), 24.4MiB/s-24.4MiB/s (25.6MB/s-25.6MB/s), io=1024MiB (1074MB), run=41938-41938msec **** 2 jobs, file size 1G, fsync frequency 1 **** before this change: WRITE: bw=37.7MiB/s (39.5MB/s), 37.7MiB/s-37.7MiB/s (39.5MB/s-39.5MB/s), io=2048MiB (2147MB), run=54351-54351msec after this change: WRITE: bw=37.7MiB/s (39.5MB/s), 37.6MiB/s-37.6MiB/s (39.5MB/s-39.5MB/s), io=2048MiB (2147MB), run=54428-54428msec **** 4 jobs, file size 1G, fsync frequency 1 **** before this change: WRITE: bw=67.5MiB/s (70.8MB/s), 67.5MiB/s-67.5MiB/s (70.8MB/s-70.8MB/s), io=4096MiB (4295MB), run=60669-60669msec after this change: WRITE: bw=68.6MiB/s (71.0MB/s), 68.6MiB/s-68.6MiB/s (71.0MB/s-71.0MB/s), io=4096MiB (4295MB), run=59678-59678msec **** 8 jobs, file size 1G, fsync frequency 1 **** before this change: WRITE: bw=128MiB/s (134MB/s), 128MiB/s-128MiB/s (134MB/s-134MB/s), io=8192MiB (8590MB), run=64048-64048msec after this change: WRITE: bw=129MiB/s (135MB/s), 129MiB/s-129MiB/s (135MB/s-135MB/s), io=8192MiB (8590MB), run=63405-63405msec **** 16 jobs, file size 1G, fsync frequency 1 **** before this change: WRITE: bw=78.5MiB/s (82.3MB/s), 78.5MiB/s-78.5MiB/s (82.3MB/s-82.3MB/s), io=16.0GiB (17.2GB), run=208676-208676msec after this change: WRITE: bw=110MiB/s (115MB/s), 110MiB/s-110MiB/s (115MB/s-115MB/s), io=16.0GiB (17.2GB), run=149295-149295msec (+40.1% throughput, -28.5% runtime) **** 32 jobs, file size 1G, fsync frequency 1 **** before this change: WRITE: bw=58.8MiB/s (61.7MB/s), 58.8MiB/s-58.8MiB/s (61.7MB/s-61.7MB/s), io=32.0GiB (34.4GB), run=557134-557134msec after this change: WRITE:
bw=76.1MiB/s (79.8MB/s), 76.1MiB/s-76.1MiB/s (79.8MB/s-79.8MB/s), io=32.0GiB (34.4GB), run=430550-430550msec (+29.4% throughput, -22.7% runtime) **** 64 jobs, file size 512M, fsync frequency 1 **** before this change: WRITE: bw=65.8MiB/s (68.0MB/s), 65.8MiB/s-65.8MiB/s (68.0MB/s-68.0MB/s), io=32.0GiB (34.4GB), run=498055-498055msec after this change: WRITE: bw=85.1MiB/s (89.2MB/s), 85.1MiB/s-85.1MiB/s (89.2MB/s-89.2MB/s), io=32.0GiB (34.4GB), run=385116-385116msec (+29.3% throughput, -22.7% runtime) **** 128 jobs, file size 256M, fsync frequency 1 **** before this change: WRITE: bw=54.7MiB/s (57.3MB/s), 54.7MiB/s-54.7MiB/s (57.3MB/s-57.3MB/s), io=32.0GiB (34.4GB), run=599373-599373msec after this change: WRITE: bw=121MiB/s (126MB/s), 121MiB/s-121MiB/s (126MB/s-126MB/s), io=32.0GiB (34.4GB), run=271907-271907msec (+121.2% throughput, -54.6% runtime) **** 256 jobs, file size 256M, fsync frequency 1 **** before this change: WRITE: bw=69.2MiB/s (72.5MB/s), 69.2MiB/s-69.2MiB/s (72.5MB/s-72.5MB/s), io=64.0GiB (68.7GB), run=947536-947536msec after this change: WRITE: bw=121MiB/s (127MB/s), 121MiB/s-121MiB/s (127MB/s-127MB/s), io=64.0GiB (68.7GB), run=541916-541916msec (+74.9% throughput, -42.8% runtime) **** 512 jobs, file size 128M, fsync frequency 1 **** before this change: WRITE: bw=85.4MiB/s (89.5MB/s), 85.4MiB/s-85.4MiB/s (89.5MB/s-89.5MB/s), io=64.0GiB (68.7GB), run=767734-767734msec after this change: WRITE: bw=141MiB/s (147MB/s), 141MiB/s-141MiB/s (147MB/s-147MB/s), io=64.0GiB (68.7GB), run=466022-466022msec (+65.1% throughput, -39.3% runtime) **** 1024 jobs, file size 128M, fsync frequency 1 **** before this change: WRITE: bw=115MiB/s (120MB/s), 115MiB/s-115MiB/s (120MB/s-120MB/s), io=128GiB (137GB), run=1143775-1143775msec after this change: WRITE: bw=171MiB/s (180MB/s), 171MiB/s-171MiB/s (180MB/s-180MB/s), io=128GiB (137GB), run=764843-764843msec (+48.7% throughput, -33.1% runtime) Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 11 +++++++++++ fs/btrfs/inode.c | 9 +++++++++ fs/btrfs/reflink.c | 15 +++++++++++++++ fs/btrfs/tree-log.c | 13 +++++++++++-- 4 files changed, 46 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index e7d709505cb1..c47b6c6fea9f 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -151,6 +151,17 @@ struct btrfs_inode { */ u64 last_unlink_trans; + /* + * The id/generation of the last transaction where this inode was + * either the source or the destination of a clone/dedupe operation. + * Used when logging an inode to know if there are shared extents that + * need special care when logging checksum items, to avoid duplicate + * checksum items in a log (which can lead to a corruption where we end + * up with missing checksum ranges after log replay). + * Protected by the vfs inode lock. + */ + u64 last_reflink_trans; + /* * Number of bytes outstanding that are going to need csums. This is * used in ENOSPC accounting. diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0fa4f7007ff9..611b3412fbfd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3336,6 +3336,14 @@ cache_index: */ BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; + /* + * Same logic as for last_unlink_trans. We don't persist the generation + * of the last transaction where this inode was used for a reflink + * operation, so after eviction and reloading the inode we must be + * pessimistic and assume the last transaction that modified the inode. 
+ */ + BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans; + path->slots[0]++; if (inode->i_nlink != 1 || path->slots[0] >= btrfs_header_nritems(leaf)) @@ -8550,6 +8558,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->index_cnt = (u64)-1; ei->dir_index = 0; ei->last_unlink_trans = 0; + ei->last_reflink_trans = 0; ei->last_log_commit = 0; spin_lock_init(&ei->lock); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 834eb6d98caa..5cd02514cf4d 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -337,6 +337,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, while (1) { u64 next_key_min_offset = key.offset + 1; struct btrfs_file_extent_item *extent; + u64 extent_gen; int type; u32 size; struct btrfs_key new_key; @@ -385,6 +386,7 @@ process_slot: extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + extent_gen = btrfs_file_extent_generation(leaf, extent); comp = btrfs_file_extent_compression(leaf, extent); type = btrfs_file_extent_type(leaf, extent); if (type == BTRFS_FILE_EXTENT_REG || @@ -489,6 +491,19 @@ process_slot: btrfs_release_path(path); + /* + * If this is a new extent update the last_reflink_trans of both + * inodes. This is used by fsync to make sure it does not log + * multiple checksum items with overlapping ranges. For older + * extents we don't need to do it since inode logging skips the + * checksums for older extents. Also ignore holes and inline + * extents because they don't have checksums in the csum tree. + */ + if (extent_gen == trans->transid && disko > 0) { + BTRFS_I(src)->last_reflink_trans = trans->transid; + BTRFS_I(inode)->last_reflink_trans = trans->transid; + } + last_dest_end = ALIGN(new_key.offset + datal, fs_info->sectorsize); ret = clone_finish_inode_update(trans, inode, last_dest_end, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index aaa449153d9c..ea8136dcf71f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3892,6 +3892,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans, } static int log_csums(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, struct btrfs_root *log_root, struct btrfs_ordered_sum *sums) { @@ -3899,6 +3900,14 @@ static int log_csums(struct btrfs_trans_handle *trans, struct extent_state *cached_state = NULL; int ret; + /* + * If this inode was not used for reflink operations in the current + * transaction with new extents, then do the fast path, no need to + * worry about logging checksum items with overlapping ranges. + */ + if (inode->last_reflink_trans < trans->transid) + return btrfs_csum_file_blocks(trans, log_root, sums); + /* * Serialize logging for checksums. 
This is to avoid racing with the * same checksum being logged by another task that is logging another @@ -4050,7 +4059,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum, list); if (!ret) - ret = log_csums(trans, log, sums); + ret = log_csums(trans, inode, log, sums); list_del(&sums->list); kfree(sums); } @@ -4109,7 +4118,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum, list); if (!ret) - ret = log_csums(trans, log_root, sums); + ret = log_csums(trans, inode, log_root, sums); list_del(&sums->list); kfree(sums); } From f95ebdbed46a4d8b9fdb7bff109fdbb6fc9a6dc8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 11:24:27 -0400 Subject: [PATCH 139/151] btrfs: don't WARN if we abort a transaction with EROFS If we get some sort of corruption via a read and call btrfs_handle_fs_error(), we'll set BTRFS_FS_STATE_ERROR on the fs and complain. If a subsequent trans handle trips over this it'll get EROFS and then abort. However at that point we're not aborting for the original reason, we're aborting because we've been flipped read only. We do not need to WARN_ON() here. CC: stable@vger.kernel.org # 5.4+ Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b70c2024296f..9c7e466f27a9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3279,7 +3279,7 @@ do { \ /* Report first abort since mount */ \ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ &((trans)->fs_info->fs_state))) { \ - if ((errno) != -EIO) { \ + if ((errno) != -EIO && (errno) != -EROFS) { \ WARN(1, KERN_DEBUG \ "BTRFS: Transaction aborted (error %d)\n", \ (errno)); \ From 59131393434b49a313cfeb11ae880e13149d13b6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 11:24:28 -0400 Subject: [PATCH 140/151] btrfs: document special case error codes for fs errors We've had some discussions about what to do in certain scenarios for error codes, specifically EUCLEAN and EROFS. Document these near the error handling code so it's clear what their intentions are. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 58f890f73650..a4f0bb29b8d6 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -67,6 +67,21 @@ static struct file_system_type btrfs_root_fs_type; static int btrfs_remount(struct super_block *sb, int *flags, char *data); +/* + * Generally the error codes correspond to their respective errors, but there + * are a few special cases. + * + * EUCLEAN: Any sort of corruption that we encounter. The tree-checker for + * instance will return EUCLEAN if any of the blocks are corrupted in + * a way that is problematic. We want to reserve EUCLEAN for these + * sorts of corruption. + * + * EROFS: If we check BTRFS_FS_STATE_ERROR and fail out with a return error, we + * need to use EROFS for this case. We will have no idea of the + * original failure, that will have been reported at the time we tripped + * over the error. Each subsequent error that doesn't have any context + * of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR.
+ */ const char * __attribute_const__ btrfs_decode_error(int errno) { char *errstr = "unknown"; From fbabd4a36faaf74c83142d0b3d950c11ec14fda1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:38:37 -0400 Subject: [PATCH 141/151] btrfs: return EROFS for BTRFS_FS_STATE_ERROR cases Eric reported seeing this message while running generic/475 BTRFS: error (device dm-3) in btrfs_sync_log:3084: errno=-117 Filesystem corrupted Full stack trace: BTRFS: error (device dm-0) in btrfs_commit_transaction:2323: errno=-5 IO failure (Error while writing out transaction) BTRFS info (device dm-0): forced readonly BTRFS warning (device dm-0): Skipping commit of aborted transaction. ------------[ cut here ]------------ BTRFS: error (device dm-0) in cleanup_transaction:1894: errno=-5 IO failure BTRFS: Transaction aborted (error -117) BTRFS warning (device dm-0): direct IO failed ino 3555 rw 0,0 sector 0x1c6480 len 4096 err no 10 BTRFS warning (device dm-0): direct IO failed ino 3555 rw 0,0 sector 0x1c6488 len 4096 err no 10 BTRFS warning (device dm-0): direct IO failed ino 3555 rw 0,0 sector 0x1c6490 len 4096 err no 10 BTRFS warning (device dm-0): direct IO failed ino 3555 rw 0,0 sector 0x1c6498 len 4096 err no 10 BTRFS warning (device dm-0): direct IO failed ino 3555 rw 0,0 sector 0x1c64a0 len 4096 err no 10 BTRFS warning (device dm-0): direct IO failed ino 3555 rw 0,0 sector 0x1c64a8 len 4096 err no 10 BTRFS warning (device dm-0): direct IO failed ino 3555 rw 0,0 sector 0x1c64b0 len 4096 err no 10 BTRFS warning (device dm-0): direct IO failed ino 3555 rw 0,0 sector 0x1c64b8 len 4096 err no 10 BTRFS warning (device dm-0): direct IO failed ino 3555 rw 0,0 sector 0x1c64c0 len 4096 err no 10 BTRFS warning (device dm-0): direct IO failed ino 3572 rw 0,0 sector 0x1b85e8 len 4096 err no 10 BTRFS warning (device dm-0): direct IO failed ino 3572 rw 0,0 sector 0x1b85f0 len 4096 err no 10 WARNING: CPU: 3 PID: 23985 at fs/btrfs/tree-log.c:3084 btrfs_sync_log+0xbc8/0xd60 [btrfs] BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d4288 len 4096 err no 10 BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d4290 len 4096 err no 10 BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d4298 len 4096 err no 10 BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d42a0 len 4096 err no 10 BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d42a8 len 4096 err no 10 BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d42b0 len 4096 err no 10 BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d42b8 len 4096 err no 10 BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d42c0 len 4096 err no 10 BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d42c8 len 4096 err no 10 BTRFS warning (device dm-0): direct IO failed ino 3548 rw 0,0 sector 0x1d42d0 len 4096 err no 10 CPU: 3 PID: 23985 Comm: fsstress Tainted: G W L 5.8.0-rc4-default+ #1181 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014 RIP: 0010:btrfs_sync_log+0xbc8/0xd60 [btrfs] RSP: 0018:ffff909a44d17bd0 EFLAGS: 00010286 RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000001 RDX: ffff8f3be41cb940 RSI: ffffffffb0108d2b RDI: ffffffffb0108ff7 RBP: ffff909a44d17e70 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000037988 R12: ffff8f3bd20e4000 R13: ffff8f3bd20e4428 R14: 00000000ffffff8b 
R15: ffff909a44d17c70 FS: 00007f6a6ed3fb80(0000) GS:ffff8f3c3dc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f6a6ed3e000 CR3: 00000000525c0003 CR4: 0000000000160ee0 Call Trace: ? finish_wait+0x90/0x90 ? __mutex_unlock_slowpath+0x45/0x2a0 ? lock_acquire+0xa3/0x440 ? lockref_put_or_lock+0x9/0x30 ? dput+0x20/0x4a0 ? dput+0x20/0x4a0 ? do_raw_spin_unlock+0x4b/0xc0 ? _raw_spin_unlock+0x1f/0x30 btrfs_sync_file+0x335/0x490 [btrfs] do_fsync+0x38/0x70 __x64_sys_fsync+0x10/0x20 do_syscall_64+0x50/0xe0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f6a6ef1b6e3 Code: Bad RIP value. RSP: 002b:00007ffd01e20038 EFLAGS: 00000246 ORIG_RAX: 000000000000004a RAX: ffffffffffffffda RBX: 000000000007a120 RCX: 00007f6a6ef1b6e3 RDX: 00007ffd01e1ffa0 RSI: 00007ffd01e1ffa0 RDI: 0000000000000003 RBP: 0000000000000003 R08: 0000000000000001 R09: 00007ffd01e2004c R10: 0000000000000000 R11: 0000000000000246 R12: 000000000000009f R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 irq event stamp: 0 hardirqs last enabled at (0): [<0000000000000000>] 0x0 hardirqs last disabled at (0): [] copy_process+0x67b/0x1b00 softirqs last enabled at (0): [] copy_process+0x67b/0x1b00 softirqs last disabled at (0): [<0000000000000000>] 0x0 ---[ end trace af146e0e38433456 ]--- BTRFS: error (device dm-0) in btrfs_sync_log:3084: errno=-117 Filesystem corrupted This ret came from btrfs_write_marked_extents(). If we get an aborted transaction via EIO before, we'll see it in btree_write_cache_pages() and return EUCLEAN, which gets printed as "Filesystem corrupted". Except we shouldn't be returning EUCLEAN here, we need to be returning EROFS because EUCLEAN is reserved for actual corruption, not IO errors. We are inconsistent about our handling of BTRFS_FS_STATE_ERROR elsewhere, but we want to use EROFS for this particular case. The original transaction abort has the real error code for why we ended up with an aborted transaction, all subsequent actions just need to return EROFS because they may not have a trans handle and have no idea about the original cause of the abort. After patch "btrfs: don't WARN if we abort a transaction with EROFS" the stacktrace will not be dumped either. Reported-by: Eric Sandeen CC: stable@vger.kernel.org # 5.4+ Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ add full test stacktrace ] Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- fs/btrfs/scrub.c | 2 +- fs/btrfs/transaction.c | 5 ++++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 73c9c59cd535..3fbc37692592 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4119,7 +4119,7 @@ retry: if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { ret = flush_write_bio(&epd); } else { - ret = -EUCLEAN; + ret = -EROFS; end_write_bio(&epd, ret); } return ret; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index d935ac06323f..5a6cb9db512e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3691,7 +3691,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, struct btrfs_fs_info *fs_info = sctx->fs_info; if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) - return -EIO; + return -EROFS; /* Seed devices of a new filesystem has their own generation. 
*/ if (scrub_dev->fs_devices != fs_info->fs_devices) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index efafc286323c..20c6ac1a5de7 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -937,7 +937,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (TRANS_ABORTED(trans) || test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) { wake_up_process(info->transaction_kthread); - err = -EIO; + if (TRANS_ABORTED(trans)) + err = trans->aborted; + else + err = -EROFS; } kmem_cache_free(btrfs_trans_handle_cachep, trans); From a47bd78d0c44621efb98b525d04d60dc4d1a79b0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:17:50 -0400 Subject: [PATCH 142/151] btrfs: sysfs: use NOFS for device creation Dave hit this splat during testing btrfs/078: ====================================================== WARNING: possible circular locking dependency detected 5.8.0-rc6-default+ #1191 Not tainted ------------------------------------------------------ kswapd0/75 is trying to acquire lock: ffffa040e9d04ff8 (&delayed_node->mutex){+.+.}-{3:3}, at: __btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs] but task is already holding lock: ffffffff8b0c8040 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (fs_reclaim){+.+.}-{0:0}: __lock_acquire+0x56f/0xaa0 lock_acquire+0xa3/0x440 fs_reclaim_acquire.part.0+0x25/0x30 __kmalloc_track_caller+0x49/0x330 kstrdup+0x2e/0x60 __kernfs_new_node.constprop.0+0x44/0x250 kernfs_new_node+0x25/0x50 kernfs_create_link+0x34/0xa0 sysfs_do_create_link_sd+0x5e/0xd0 btrfs_sysfs_add_devices_dir+0x65/0x100 [btrfs] btrfs_init_new_device+0x44c/0x12b0 [btrfs] btrfs_ioctl+0xc3c/0x25c0 [btrfs] ksys_ioctl+0x68/0xa0 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x50/0xe0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #1 (&fs_info->chunk_mutex){+.+.}-{3:3}: __lock_acquire+0x56f/0xaa0 lock_acquire+0xa3/0x440 __mutex_lock+0xa0/0xaf0 btrfs_chunk_alloc+0x137/0x3e0 [btrfs] find_free_extent+0xb44/0xfb0 [btrfs] btrfs_reserve_extent+0x9b/0x180 [btrfs] btrfs_alloc_tree_block+0xc1/0x350 [btrfs] alloc_tree_block_no_bg_flush+0x4a/0x60 [btrfs] __btrfs_cow_block+0x143/0x7a0 [btrfs] btrfs_cow_block+0x15f/0x310 [btrfs] push_leaf_right+0x150/0x240 [btrfs] split_leaf+0x3cd/0x6d0 [btrfs] btrfs_search_slot+0xd14/0xf70 [btrfs] btrfs_insert_empty_items+0x64/0xc0 [btrfs] __btrfs_commit_inode_delayed_items+0xb2/0x840 [btrfs] btrfs_async_run_delayed_root+0x10e/0x1d0 [btrfs] btrfs_work_helper+0x2f9/0x650 [btrfs] process_one_work+0x22c/0x600 worker_thread+0x50/0x3b0 kthread+0x137/0x150 ret_from_fork+0x1f/0x30 -> #0 (&delayed_node->mutex){+.+.}-{3:3}: check_prev_add+0x98/0xa20 validate_chain+0xa8c/0x2a00 __lock_acquire+0x56f/0xaa0 lock_acquire+0xa3/0x440 __mutex_lock+0xa0/0xaf0 __btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs] btrfs_evict_inode+0x3bf/0x560 [btrfs] evict+0xd6/0x1c0 dispose_list+0x48/0x70 prune_icache_sb+0x54/0x80 super_cache_scan+0x121/0x1a0 do_shrink_slab+0x175/0x420 shrink_slab+0xb1/0x2e0 shrink_node+0x192/0x600 balance_pgdat+0x31f/0x750 kswapd+0x206/0x510 kthread+0x137/0x150 ret_from_fork+0x1f/0x30 other info that might help us debug this: Chain exists of: &delayed_node->mutex --> &fs_info->chunk_mutex --> fs_reclaim Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(&fs_info->chunk_mutex); lock(fs_reclaim); lock(&delayed_node->mutex); *** DEADLOCK *** 3 locks held by kswapd0/75: #0: ffffffff8b0c8040 
(fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 #1: ffffffff8b0b50b8 (shrinker_rwsem){++++}-{3:3}, at: shrink_slab+0x54/0x2e0 #2: ffffa040e057c0e8 (&type->s_umount_key#26){++++}-{3:3}, at: trylock_super+0x16/0x50 stack backtrace: CPU: 2 PID: 75 Comm: kswapd0 Not tainted 5.8.0-rc6-default+ #1191 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014 Call Trace: dump_stack+0x78/0xa0 check_noncircular+0x16f/0x190 check_prev_add+0x98/0xa20 validate_chain+0xa8c/0x2a00 __lock_acquire+0x56f/0xaa0 lock_acquire+0xa3/0x440 ? __btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs] __mutex_lock+0xa0/0xaf0 ? __btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs] ? __lock_acquire+0x56f/0xaa0 ? __btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs] ? lock_acquire+0xa3/0x440 ? btrfs_evict_inode+0x138/0x560 [btrfs] ? btrfs_evict_inode+0x2fe/0x560 [btrfs] ? __btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs] __btrfs_release_delayed_node.part.0+0x3f/0x310 [btrfs] btrfs_evict_inode+0x3bf/0x560 [btrfs] evict+0xd6/0x1c0 dispose_list+0x48/0x70 prune_icache_sb+0x54/0x80 super_cache_scan+0x121/0x1a0 do_shrink_slab+0x175/0x420 shrink_slab+0xb1/0x2e0 shrink_node+0x192/0x600 balance_pgdat+0x31f/0x750 kswapd+0x206/0x510 ? _raw_spin_unlock_irqrestore+0x3e/0x50 ? finish_wait+0x90/0x90 ? balance_pgdat+0x750/0x750 kthread+0x137/0x150 ? kthread_stop+0x2a0/0x2a0 ret_from_fork+0x1f/0x30 This is because we're holding the chunk_mutex while adding this device and adding its sysfs entries. We actually hold different locks in different places when calling this function, for instance the dev_replace semaphore in dev replace, so instead of moving this call around, simply wrap its operations in NOFS. CC: stable@vger.kernel.org # 4.14+ Reported-by: David Sterba Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 38c0b95e0e7f..104c80caaa74 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1278,7 +1278,9 @@ int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, { int error = 0; struct btrfs_device *dev; + unsigned int nofs_flag; + nofs_flag = memalloc_nofs_save(); list_for_each_entry(dev, &fs_devices->devices, dev_list) { if (one_device && one_device != dev) @@ -1306,6 +1308,7 @@ int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, break; } } + memalloc_nofs_restore(nofs_flag); return error; } From 18c850fdc5a801bad4977b0f1723761d42267e45 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 17 Jul 2020 15:12:27 -0400 Subject: [PATCH 143/151] btrfs: open device without device_list_mutex There's long existed a lockdep splat because we open our bdevs under the ->device_list_mutex at mount time, which acquires the bd_mutex. Usually this goes unnoticed, but if you do loopback devices at all suddenly the bd_mutex comes with a whole host of other dependencies, which results in the splat when you mount a btrfs file system.
====================================================== WARNING: possible circular locking dependency detected 5.8.0-0.rc3.1.fc33.x86_64+debug #1 Not tainted ------------------------------------------------------ systemd-journal/509 is trying to acquire lock: ffff970831f84db0 (&fs_info->reloc_mutex){+.+.}-{3:3}, at: btrfs_record_root_in_trans+0x44/0x70 [btrfs] but task is already holding lock: ffff97083144d598 (sb_pagefaults){.+.+}-{0:0}, at: btrfs_page_mkwrite+0x59/0x560 [btrfs] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #6 (sb_pagefaults){.+.+}-{0:0}: __sb_start_write+0x13e/0x220 btrfs_page_mkwrite+0x59/0x560 [btrfs] do_page_mkwrite+0x4f/0x130 do_wp_page+0x3b0/0x4f0 handle_mm_fault+0xf47/0x1850 do_user_addr_fault+0x1fc/0x4b0 exc_page_fault+0x88/0x300 asm_exc_page_fault+0x1e/0x30 -> #5 (&mm->mmap_lock#2){++++}-{3:3}: __might_fault+0x60/0x80 _copy_from_user+0x20/0xb0 get_sg_io_hdr+0x9a/0xb0 scsi_cmd_ioctl+0x1ea/0x2f0 cdrom_ioctl+0x3c/0x12b4 sr_block_ioctl+0xa4/0xd0 block_ioctl+0x3f/0x50 ksys_ioctl+0x82/0xc0 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #4 (&cd->lock){+.+.}-{3:3}: __mutex_lock+0x7b/0x820 sr_block_open+0xa2/0x180 __blkdev_get+0xdd/0x550 blkdev_get+0x38/0x150 do_dentry_open+0x16b/0x3e0 path_openat+0x3c9/0xa00 do_filp_open+0x75/0x100 do_sys_openat2+0x8a/0x140 __x64_sys_openat+0x46/0x70 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #3 (&bdev->bd_mutex){+.+.}-{3:3}: __mutex_lock+0x7b/0x820 __blkdev_get+0x6a/0x550 blkdev_get+0x85/0x150 blkdev_get_by_path+0x2c/0x70 btrfs_get_bdev_and_sb+0x1b/0xb0 [btrfs] open_fs_devices+0x88/0x240 [btrfs] btrfs_open_devices+0x92/0xa0 [btrfs] btrfs_mount_root+0x250/0x490 [btrfs] legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 vfs_kern_mount.part.0+0x71/0xb0 btrfs_mount+0x119/0x380 [btrfs] legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 do_mount+0x8c6/0xca0 __x64_sys_mount+0x8e/0xd0 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #2 (&fs_devs->device_list_mutex){+.+.}-{3:3}: __mutex_lock+0x7b/0x820 btrfs_run_dev_stats+0x36/0x420 [btrfs] commit_cowonly_roots+0x91/0x2d0 [btrfs] btrfs_commit_transaction+0x4e6/0x9f0 [btrfs] btrfs_sync_file+0x38a/0x480 [btrfs] __x64_sys_fdatasync+0x47/0x80 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #1 (&fs_info->tree_log_mutex){+.+.}-{3:3}: __mutex_lock+0x7b/0x820 btrfs_commit_transaction+0x48e/0x9f0 [btrfs] btrfs_sync_file+0x38a/0x480 [btrfs] __x64_sys_fdatasync+0x47/0x80 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #0 (&fs_info->reloc_mutex){+.+.}-{3:3}: __lock_acquire+0x1241/0x20c0 lock_acquire+0xb0/0x400 __mutex_lock+0x7b/0x820 btrfs_record_root_in_trans+0x44/0x70 [btrfs] start_transaction+0xd2/0x500 [btrfs] btrfs_dirty_inode+0x44/0xd0 [btrfs] file_update_time+0xc6/0x120 btrfs_page_mkwrite+0xda/0x560 [btrfs] do_page_mkwrite+0x4f/0x130 do_wp_page+0x3b0/0x4f0 handle_mm_fault+0xf47/0x1850 do_user_addr_fault+0x1fc/0x4b0 exc_page_fault+0x88/0x300 asm_exc_page_fault+0x1e/0x30 other info that might help us debug this: Chain exists of: &fs_info->reloc_mutex --> &mm->mmap_lock#2 --> sb_pagefaults Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(sb_pagefaults); lock(&mm->mmap_lock#2); lock(sb_pagefaults); lock(&fs_info->reloc_mutex); *** DEADLOCK *** 3 locks held by systemd-journal/509: #0: ffff97083bdec8b8 (&mm->mmap_lock#2){++++}-{3:3}, at: do_user_addr_fault+0x12e/0x4b0 #1: ffff97083144d598 (sb_pagefaults){.+.+}-{0:0}, 
at: btrfs_page_mkwrite+0x59/0x560 [btrfs] #2: ffff97083144d6a8 (sb_internal){.+.+}-{0:0}, at: start_transaction+0x3f8/0x500 [btrfs] stack backtrace: CPU: 0 PID: 509 Comm: systemd-journal Not tainted 5.8.0-0.rc3.1.fc33.x86_64+debug #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 Call Trace: dump_stack+0x92/0xc8 check_noncircular+0x134/0x150 __lock_acquire+0x1241/0x20c0 lock_acquire+0xb0/0x400 ? btrfs_record_root_in_trans+0x44/0x70 [btrfs] ? lock_acquire+0xb0/0x400 ? btrfs_record_root_in_trans+0x44/0x70 [btrfs] __mutex_lock+0x7b/0x820 ? btrfs_record_root_in_trans+0x44/0x70 [btrfs] ? kvm_sched_clock_read+0x14/0x30 ? sched_clock+0x5/0x10 ? sched_clock_cpu+0xc/0xb0 btrfs_record_root_in_trans+0x44/0x70 [btrfs] start_transaction+0xd2/0x500 [btrfs] btrfs_dirty_inode+0x44/0xd0 [btrfs] file_update_time+0xc6/0x120 btrfs_page_mkwrite+0xda/0x560 [btrfs] ? sched_clock+0x5/0x10 do_page_mkwrite+0x4f/0x130 do_wp_page+0x3b0/0x4f0 handle_mm_fault+0xf47/0x1850 do_user_addr_fault+0x1fc/0x4b0 exc_page_fault+0x88/0x300 ? asm_exc_page_fault+0x8/0x30 asm_exc_page_fault+0x1e/0x30 RIP: 0033:0x7fa3972fdbfe Code: Bad RIP value. Fix this by not holding the ->device_list_mutex at this point. The device_list_mutex exists to protect us from modifying the device list while the file system is running. However it can also be modified by doing a scan on a device. But this action is specifically protected by the uuid_mutex, which we are holding here. We cannot race with opening at this point because we have the ->s_umount lock held during the mount. Not having the ->device_list_mutex here is perfectly safe as we're not going to change the devices at this point. CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ add some comments ] Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 537ccf66ee20..084b8227ea2c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -245,7 +245,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * * global::fs_devs - add, remove, updates to the global list * - * does not protect: manipulation of the fs_devices::devices list! + * does not protect: manipulation of the fs_devices::devices list in general + * but in mount context it could be used to exclude list modifications by eg. + * scan ioctl * * btrfs_device::name - renames (write side), read is RCU * @@ -258,6 +260,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * may be used to exclude some operations from running concurrently without any * modifications to the list (see write_all_supers) * + * Is not required at mount and close times, because our device list is + * protected by the uuid_mutex at that point. + * * balance_mutex * ------------- * protects balance structures (status, state) and context accessed from @@ -602,6 +607,11 @@ static int btrfs_free_stale_devices(const char *path, return ret; } +/* + * This is only used on mount, and we are protected from competing things + * messing with our fs_devices by the uuid_mutex, thus we do not need the + * fs_devices->device_list_mutex here.
+ */ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device, fmode_t flags, void *holder) @@ -1229,8 +1239,14 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int ret; lockdep_assert_held(&uuid_mutex); + /* + * The device_list_mutex cannot be taken here in case opening the + * underlying device takes further locks like bd_mutex. + * + * We also don't need the lock here as this is called during mount and + * exclusion is provided by uuid_mutex + */ - mutex_lock(&fs_devices->device_list_mutex); if (fs_devices->opened) { fs_devices->opened++; ret = 0; @@ -1238,7 +1254,6 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, list_sort(NULL, &fs_devices->devices, devid_cmp); ret = open_fs_devices(fs_devices, flags, holder); } - mutex_unlock(&fs_devices->device_list_mutex); return ret; } From 01d01caf19ff7c537527d352d169c4368375c0a1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 17 Jul 2020 15:12:28 -0400 Subject: [PATCH 144/151] btrfs: move the chunk_mutex in btrfs_read_chunk_tree We are currently getting this lockdep splat in btrfs/161: ====================================================== WARNING: possible circular locking dependency detected 5.8.0-rc5+ #20 Tainted: G E ------------------------------------------------------ mount/678048 is trying to acquire lock: ffff9b769f15b6e0 (&fs_devs->device_list_mutex){+.+.}-{3:3}, at: clone_fs_devices+0x4d/0x170 [btrfs] but task is already holding lock: ffff9b76abdb08d0 (&fs_info->chunk_mutex){+.+.}-{3:3}, at: btrfs_read_chunk_tree+0x6a/0x800 [btrfs] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&fs_info->chunk_mutex){+.+.}-{3:3}: __mutex_lock+0x8b/0x8f0 btrfs_init_new_device+0x2d2/0x1240 [btrfs] btrfs_ioctl+0x1de/0x2d20 [btrfs] ksys_ioctl+0x87/0xc0 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #0 (&fs_devs->device_list_mutex){+.+.}-{3:3}: __lock_acquire+0x1240/0x2460 lock_acquire+0xab/0x360 __mutex_lock+0x8b/0x8f0 clone_fs_devices+0x4d/0x170 [btrfs] btrfs_read_chunk_tree+0x330/0x800 [btrfs] open_ctree+0xb7c/0x18ce [btrfs] btrfs_mount_root.cold+0x13/0xfa [btrfs] legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 fc_mount+0xe/0x40 vfs_kern_mount.part.0+0x71/0x90 btrfs_mount+0x13b/0x3e0 [btrfs] legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 do_mount+0x7de/0xb30 __x64_sys_mount+0x8e/0xd0 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&fs_info->chunk_mutex); lock(&fs_devs->device_list_mutex); lock(&fs_info->chunk_mutex); lock(&fs_devs->device_list_mutex); *** DEADLOCK *** 3 locks held by mount/678048: #0: ffff9b75ff5fb0e0 (&type->s_umount_key#63/1){+.+.}-{3:3}, at: alloc_super+0xb5/0x380 #1: ffffffffc0c2fbc8 (uuid_mutex){+.+.}-{3:3}, at: btrfs_read_chunk_tree+0x54/0x800 [btrfs] #2: ffff9b76abdb08d0 (&fs_info->chunk_mutex){+.+.}-{3:3}, at: btrfs_read_chunk_tree+0x6a/0x800 [btrfs] stack backtrace: CPU: 2 PID: 678048 Comm: mount Tainted: G E 5.8.0-rc5+ #20 Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./890FX Deluxe5, BIOS P1.40 05/03/2011 Call Trace: dump_stack+0x96/0xd0 check_noncircular+0x162/0x180 __lock_acquire+0x1240/0x2460 ? asm_sysvec_apic_timer_interrupt+0x12/0x20 lock_acquire+0xab/0x360 ? clone_fs_devices+0x4d/0x170 [btrfs] __mutex_lock+0x8b/0x8f0 ? clone_fs_devices+0x4d/0x170 [btrfs] ? rcu_read_lock_sched_held+0x52/0x60 ? 
cpumask_next+0x16/0x20 ? module_assert_mutex_or_preempt+0x14/0x40 ? __module_address+0x28/0xf0 ? clone_fs_devices+0x4d/0x170 [btrfs] ? static_obj+0x4f/0x60 ? lockdep_init_map_waits+0x43/0x200 ? clone_fs_devices+0x4d/0x170 [btrfs] clone_fs_devices+0x4d/0x170 [btrfs] btrfs_read_chunk_tree+0x330/0x800 [btrfs] open_ctree+0xb7c/0x18ce [btrfs] ? super_setup_bdi_name+0x79/0xd0 btrfs_mount_root.cold+0x13/0xfa [btrfs] ? vfs_parse_fs_string+0x84/0xb0 ? rcu_read_lock_sched_held+0x52/0x60 ? kfree+0x2b5/0x310 legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 fc_mount+0xe/0x40 vfs_kern_mount.part.0+0x71/0x90 btrfs_mount+0x13b/0x3e0 [btrfs] ? cred_has_capability+0x7c/0x120 ? rcu_read_lock_sched_held+0x52/0x60 ? legacy_get_tree+0x30/0x50 legacy_get_tree+0x30/0x50 vfs_get_tree+0x28/0xc0 do_mount+0x7de/0xb30 ? memdup_user+0x4e/0x90 __x64_sys_mount+0x8e/0xd0 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This is because btrfs_read_chunk_tree() can come upon DEV_EXTENTs and then read the device, which takes the device_list_mutex. The device_list_mutex needs to be taken before the chunk_mutex, so this is a problem. We only really need the chunk mutex around adding the chunk, so move the mutex around read_one_chunk. An argument could be made that we don't even need the chunk_mutex here as it's during mount, and we are protected by various other locks. However we already have special rules for ->device_list_mutex, and I'd rather not have another special case for ->chunk_mutex. CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 084b8227ea2c..d7670e2a9f39 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7077,7 +7077,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) * otherwise we don't need it. */ mutex_lock(&uuid_mutex); - mutex_lock(&fs_info->chunk_mutex); /* * It is possible for mount and umount to race in such a way that @@ -7135,7 +7134,9 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { struct btrfs_chunk *chunk; chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + mutex_lock(&fs_info->chunk_mutex); ret = read_one_chunk(&found_key, leaf, chunk); + mutex_unlock(&fs_info->chunk_mutex); if (ret) goto error; } @@ -7165,7 +7166,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) } ret = 0; error: - mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&uuid_mutex); btrfs_free_path(path); From ab0db043c35da3477e57d4d516492b2d51a5ca0f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 17 Jul 2020 15:12:29 -0400 Subject: [PATCH 145/151] btrfs: fix lockdep splat from btrfs_dump_space_info When running with -o enospc_debug you can get the following splat if one of the dump_space_info calls trips ====================================================== WARNING: possible circular locking dependency detected 5.8.0-rc5+ #20 Tainted: G OE ------------------------------------------------------ dd/563090 is trying to acquire lock: ffff9e7dbf4f1e18 (&ctl->tree_lock){+.+.}-{2:2}, at: btrfs_dump_free_space+0x2b/0xa0 [btrfs] but task is already holding lock: ffff9e7e2284d428 (&cache->lock){+.+.}-{2:2}, at: btrfs_dump_space_info+0xaa/0x120 [btrfs] which lock already depends on the new lock.
the existing dependency chain (in reverse order) is: -> #3 (&cache->lock){+.+.}-{2:2}: _raw_spin_lock+0x25/0x30 btrfs_add_reserved_bytes+0x3c/0x3c0 [btrfs] find_free_extent+0x7ef/0x13b0 [btrfs] btrfs_reserve_extent+0x9b/0x180 [btrfs] btrfs_alloc_tree_block+0xc1/0x340 [btrfs] alloc_tree_block_no_bg_flush+0x4a/0x60 [btrfs] __btrfs_cow_block+0x122/0x530 [btrfs] btrfs_cow_block+0x106/0x210 [btrfs] commit_cowonly_roots+0x55/0x300 [btrfs] btrfs_commit_transaction+0x4ed/0xac0 [btrfs] sync_filesystem+0x74/0x90 generic_shutdown_super+0x22/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x36/0x70 cleanup_mnt+0x104/0x160 task_work_run+0x5f/0x90 __prepare_exit_to_usermode+0x1bd/0x1c0 do_syscall_64+0x5e/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #2 (&space_info->lock){+.+.}-{2:2}: _raw_spin_lock+0x25/0x30 btrfs_block_rsv_release+0x1a6/0x3f0 [btrfs] btrfs_inode_rsv_release+0x4f/0x170 [btrfs] btrfs_clear_delalloc_extent+0x155/0x480 [btrfs] clear_state_bit+0x81/0x1a0 [btrfs] __clear_extent_bit+0x25c/0x5d0 [btrfs] clear_extent_bit+0x15/0x20 [btrfs] btrfs_invalidatepage+0x2b7/0x3c0 [btrfs] truncate_cleanup_page+0x47/0xe0 truncate_inode_pages_range+0x238/0x840 truncate_pagecache+0x44/0x60 btrfs_setattr+0x202/0x5e0 [btrfs] notify_change+0x33b/0x490 do_truncate+0x76/0xd0 path_openat+0x687/0xa10 do_filp_open+0x91/0x100 do_sys_openat2+0x215/0x2d0 do_sys_open+0x44/0x80 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #1 (&tree->lock#2){+.+.}-{2:2}: _raw_spin_lock+0x25/0x30 find_first_extent_bit+0x32/0x150 [btrfs] write_pinned_extent_entries.isra.0+0xc5/0x100 [btrfs] __btrfs_write_out_cache+0x172/0x480 [btrfs] btrfs_write_out_cache+0x7a/0xf0 [btrfs] btrfs_write_dirty_block_groups+0x286/0x3b0 [btrfs] commit_cowonly_roots+0x245/0x300 [btrfs] btrfs_commit_transaction+0x4ed/0xac0 [btrfs] close_ctree+0xf9/0x2f5 [btrfs] generic_shutdown_super+0x6c/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x36/0x70 cleanup_mnt+0x104/0x160 task_work_run+0x5f/0x90 __prepare_exit_to_usermode+0x1bd/0x1c0 do_syscall_64+0x5e/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #0 (&ctl->tree_lock){+.+.}-{2:2}: __lock_acquire+0x1240/0x2460 lock_acquire+0xab/0x360 _raw_spin_lock+0x25/0x30 btrfs_dump_free_space+0x2b/0xa0 [btrfs] btrfs_dump_space_info+0xf4/0x120 [btrfs] btrfs_reserve_extent+0x176/0x180 [btrfs] __btrfs_prealloc_file_range+0x145/0x550 [btrfs] cache_save_setup+0x28d/0x3b0 [btrfs] btrfs_start_dirty_block_groups+0x1fc/0x4f0 [btrfs] btrfs_commit_transaction+0xcc/0xac0 [btrfs] btrfs_alloc_data_chunk_ondemand+0x162/0x4c0 [btrfs] btrfs_check_data_free_space+0x4c/0xa0 [btrfs] btrfs_buffered_write.isra.0+0x19b/0x740 [btrfs] btrfs_file_write_iter+0x3cf/0x610 [btrfs] new_sync_write+0x11e/0x1b0 vfs_write+0x1c9/0x200 ksys_write+0x68/0xe0 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 other info that might help us debug this: Chain exists of: &ctl->tree_lock --> &space_info->lock --> &cache->lock Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&cache->lock); lock(&space_info->lock); lock(&cache->lock); lock(&ctl->tree_lock); *** DEADLOCK *** 6 locks held by dd/563090: #0: ffff9e7e21d18448 (sb_writers#14){.+.+}-{0:0}, at: vfs_write+0x195/0x200 #1: ffff9e7dd0410ed8 (&sb->s_type->i_mutex_key#19){++++}-{3:3}, at: btrfs_file_write_iter+0x86/0x610 [btrfs] #2: ffff9e7e21d18638 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x40b/0x5b0 [btrfs] #3: ffff9e7e1f05d688 (&cur_trans->cache_write_mutex){+.+.}-{3:3}, 
at: btrfs_start_dirty_block_groups+0x158/0x4f0 [btrfs] #4: ffff9e7e2284ddb8 (&space_info->groups_sem){++++}-{3:3}, at: btrfs_dump_space_info+0x69/0x120 [btrfs] #5: ffff9e7e2284d428 (&cache->lock){+.+.}-{2:2}, at: btrfs_dump_space_info+0xaa/0x120 [btrfs] stack backtrace: CPU: 3 PID: 563090 Comm: dd Tainted: G OE 5.8.0-rc5+ #20 Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./890FX Deluxe5, BIOS P1.40 05/03/2011 Call Trace: dump_stack+0x96/0xd0 check_noncircular+0x162/0x180 __lock_acquire+0x1240/0x2460 ? wake_up_klogd.part.0+0x30/0x40 lock_acquire+0xab/0x360 ? btrfs_dump_free_space+0x2b/0xa0 [btrfs] _raw_spin_lock+0x25/0x30 ? btrfs_dump_free_space+0x2b/0xa0 [btrfs] btrfs_dump_free_space+0x2b/0xa0 [btrfs] btrfs_dump_space_info+0xf4/0x120 [btrfs] btrfs_reserve_extent+0x176/0x180 [btrfs] __btrfs_prealloc_file_range+0x145/0x550 [btrfs] ? btrfs_qgroup_reserve_data+0x1d/0x60 [btrfs] cache_save_setup+0x28d/0x3b0 [btrfs] btrfs_start_dirty_block_groups+0x1fc/0x4f0 [btrfs] btrfs_commit_transaction+0xcc/0xac0 [btrfs] ? start_transaction+0xe0/0x5b0 [btrfs] btrfs_alloc_data_chunk_ondemand+0x162/0x4c0 [btrfs] btrfs_check_data_free_space+0x4c/0xa0 [btrfs] btrfs_buffered_write.isra.0+0x19b/0x740 [btrfs] ? ktime_get_coarse_real_ts64+0xa8/0xd0 ? trace_hardirqs_on+0x1c/0xe0 btrfs_file_write_iter+0x3cf/0x610 [btrfs] new_sync_write+0x11e/0x1b0 vfs_write+0x1c9/0x200 ksys_write+0x68/0xe0 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This is because we're holding the block_group->lock while trying to dump the free space cache. However we don't need this lock; we only need it to read the values for the printk, so move the free space cache dumping outside of the block group lock. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index c7bd3fdd7792..475968ccbd1d 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -468,8 +468,8 @@ again: "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", cache->start, cache->length, cache->used, cache->pinned, cache->reserved, cache->ro ? "[readonly]" : ""); - btrfs_dump_free_space(cache, bytes); spin_unlock(&cache->lock); + btrfs_dump_free_space(cache, bytes); } if (++index < BTRFS_NR_RAID_TYPES) goto again; From 349e120ecebeb984376c8edb89bf0bfb85bc16f7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:48:45 -0400 Subject: [PATCH 146/151] btrfs: don't adjust bg flags and use default allocation profiles MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit btrfs/061 has been failing consistently for me recently with a transaction abort. We run out of space in the system chunk array, which means we've allocated far more system chunks than we need. Chris added this a long time ago for balance as a poor man's restriping. If you had a single disk and then added another disk and then did a balance, update_block_group_flags would then figure out which RAID level you needed. Fast forward to today and we have restriping behavior, so we can explicitly tell the fs that we're trying to change the raid level. This is accomplished through the normal get_alloc_profile path. Furthermore this code actually causes btrfs/061 to fail, because we do things like mkfs -m dup -d single with multiple devices.
This trips the following check in btrfs_inc_block_group_ro(): alloc_flags = update_block_group_flags(fs_info, cache->flags); if (alloc_flags != cache->flags) { ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); Because we're balancing and scrubbing, but not actually restriping, we keep forcing chunk allocation of RAID1 chunks. This eventually causes us to run out of system space and the file system aborts and flips read-only. We don't need this poor man's restriping any more; simply use the normal get_alloc_profile helper, which will get the correct alloc_flags and thus make the right decision for chunk allocation. This keeps us from allocating a billion system chunks and falling over. Tested-by: Holger Hoffstätte Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 52 ++---------------------------------------- 1 file changed, 2 insertions(+), 50 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 884de28a41e4..652b35d5a773 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2194,54 +2194,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, return 0; } -static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) -{ - u64 num_devices; - u64 stripped; - - /* - * if restripe for this chunk_type is on pick target profile and - * return, otherwise do the usual balance - */ - stripped = get_restripe_target(fs_info, flags); - if (stripped) - return extended_to_chunk(stripped); - - num_devices = fs_info->fs_devices->rw_devices; - - stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK | - BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10; - - if (num_devices == 1) { - stripped |= BTRFS_BLOCK_GROUP_DUP; - stripped = flags & ~stripped; - - /* turn raid0 into single device chunks */ - if (flags & BTRFS_BLOCK_GROUP_RAID0) - return stripped; - - /* turn mirroring into duplication */ - if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK | - BTRFS_BLOCK_GROUP_RAID10)) - return stripped | BTRFS_BLOCK_GROUP_DUP; - } else { - /* they already had raid on here, just return */ - if (flags & stripped) - return flags; - - stripped |= BTRFS_BLOCK_GROUP_DUP; - stripped = flags & ~stripped; - - /* switch duplicated blocks with raid1 */ - if (flags & BTRFS_BLOCK_GROUP_DUP) - return stripped | BTRFS_BLOCK_GROUP_RAID1; - - /* this is drive concat, leave it alone */ - } - - return flags; -} - /* * Mark one block group RO, can be called several times for the same block * group. @@ -2287,7 +2239,7 @@ again: * If we are changing raid levels, try to allocate a * corresponding block group with the new raid level.
*/ - alloc_flags = update_block_group_flags(fs_info, cache->flags); + alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags); if (alloc_flags != cache->flags) { ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); @@ -2314,7 +2266,7 @@ again: ret = inc_block_group_ro(cache, 0); out: if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { - alloc_flags = update_block_group_flags(fs_info, cache->flags); + alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags); mutex_lock(&fs_info->chunk_mutex); check_system_chunk(trans, alloc_flags); mutex_unlock(&fs_info->chunk_mutex); From 162e0a16b7d93629c8325ea32d99b38d17bc03eb Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:48:46 -0400 Subject: [PATCH 147/151] btrfs: if we're restriping, use the target restripe profile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously we depended on some weird behavior in our chunk allocator to force the allocation of new stripes, so by the time we got to doing the reduce we would usually already have a chunk with the proper target. However that behavior causes other problems and needs to be removed. First, however, we need to remove this check that only restripes if we already have those profiles available, because if we're allocating our first chunk it obviously will not be available. Simply use the target as specified, and if that fails it'll be because we're out of space. Tested-by: Holger Hoffstätte Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 652b35d5a773..613920c17ac1 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -65,11 +65,8 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags) spin_lock(&fs_info->balance_lock); target = get_restripe_target(fs_info, flags); if (target) { - /* Pick target profile only if it's already available */ - if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) { - spin_unlock(&fs_info->balance_lock); - return extended_to_chunk(target); - } + spin_unlock(&fs_info->balance_lock); + return extended_to_chunk(target); } spin_unlock(&fs_info->balance_lock); From 88c4703f00a9e8ab60133690cfdaccea6c07e560 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 23 Jul 2020 00:18:04 +0900 Subject: [PATCH 148/151] btrfs: open-code remount flag setting in btrfs_remount When we're (re)mounting a btrfs filesystem we set the BTRFS_FS_STATE_REMOUNTING state in fs_info to serialize against async reclaim or defrags. This flag is set in btrfs_remount_prepare() called by btrfs_remount(). As btrfs_remount_prepare() does nothing but setting this flag and doesn't have a second caller, we can just open-code the flag setting in btrfs_remount(). Similarly, open-code clearing of the flag by moving it out of btrfs_remount_cleanup() into btrfs_remount(), so the two operations stay symmetrical.
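[ As a condensed sketch of the result (editorial; it paraphrases the diff below, with option parsing, the restore path and the actual remount work elided), btrfs_remount() now brackets the whole operation directly:

	sync_filesystem(sb);
	set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);

	/* ... parse new options, transition ro<->rw, resize pools ... */

	btrfs_remount_cleanup(fs_info, old_opts);
	clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);

The clear_bit() is duplicated on the error path, so both exits stay symmetrical with the set_bit() at the top. ]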
Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a4f0bb29b8d6..5a9dc31d95c9 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1782,11 +1782,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, new_pool_size); } -static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info) -{ - set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); -} - static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info, unsigned long old_opts, int flags) { @@ -1820,8 +1815,6 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, else if (btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) && !btrfs_test_opt(fs_info, DISCARD_ASYNC)) btrfs_discard_cleanup(fs_info); - - clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); } static int btrfs_remount(struct super_block *sb, int *flags, char *data) @@ -1837,7 +1830,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) int ret; sync_filesystem(sb); - btrfs_remount_prepare(fs_info); + set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); if (data) { void *new_sec_opts = NULL; @@ -1959,6 +1952,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) out: wake_up_process(fs_info->transaction_kthread); btrfs_remount_cleanup(fs_info, old_opts); + clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + return 0; restore: @@ -1973,6 +1968,8 @@ restore: old_thread_pool_size, fs_info->thread_pool_size); fs_info->metadata_ratio = old_metadata_ratio; btrfs_remount_cleanup(fs_info, old_opts); + clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + return ret; } From 3d6448e631591756da36efb3ea6355ff6f383c3a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 22 Jul 2020 12:28:37 +0100 Subject: [PATCH 149/151] btrfs: fix race between page release and a fast fsync When releasing an extent map, done through the page release callback, we can race with an ongoing fast fsync and cause the fsync to miss a new extent and not log it. The steps for this to happen are the following: 1) A page is dirtied for some inode I; 2) Writeback for that page is triggered by a path other than fsync, for example by the system due to memory pressure; 3) When the ordered extent for the extent (a single 4K page) finishes, we unpin the corresponding extent map and set its generation to N, the current transaction's generation; 4) The btrfs_releasepage() callback is invoked by the system due to memory pressure for that no longer dirty page of inode I; 5) At the same time, some task calls fsync on inode I, joins transaction N, and at btrfs_log_inode() it sees that the inode does not have the full sync flag set, so we proceed with a fast fsync. But before we get into btrfs_log_changed_extents() and lock the inode's extent map tree: 6) Through btrfs_releasepage() we end up at try_release_extent_mapping() and we remove the extent map for the new 4Kb extent, because it is neither pinned anymore nor locked. By calling remove_extent_mapping(), we remove the extent map from the list of modified extents, since the extent map does not have the logging flag set. 
We unlock the inode's extent map tree; 7) The task doing the fast fsync now enters btrfs_log_changed_extents(), locks the inode's extent map tree and iterates its list of modified extents, which no longer has the 4Kb extent in it, so it does not log the extent; 8) The fsync finishes; 9) Before transaction N is committed, a power failure happens. After replaying the log, the 4K extent of inode I will be missing, since it was not logged due to the race with try_release_extent_mapping(). So fix this by teaching try_release_extent_mapping() to not remove an extent map if it's still in the list of modified extents. Fixes: ff44c6e36dc9dc ("Btrfs: do not hold the write_lock on the extent tree while logging") CC: stable@vger.kernel.org # 5.4+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3fbc37692592..c049a33ed0df 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4494,15 +4494,25 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) free_extent_map(em); break; } - if (!test_range_bit(tree, em->start, - extent_map_end(em) - 1, - EXTENT_LOCKED, 0, NULL)) { + if (test_range_bit(tree, em->start, + extent_map_end(em) - 1, + EXTENT_LOCKED, 0, NULL)) + goto next; + /* + * If it's not in the list of modified extents, used + * by a fast fsync, we can remove it. If it's being + * logged we can safely remove it since fsync took an + * extra reference on the em. + */ + if (list_empty(&em->list) || + test_bit(EXTENT_FLAG_LOGGING, &em->flags)) { set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &btrfs_inode->runtime_flags); remove_extent_mapping(map, em); /* once for the rb tree */ free_extent_map(em); } +next: start = extent_map_end(em); write_unlock(&map->lock); From fbc2bd7e7ab95704a96dade6a7d00ea99a68b015 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 22 Jul 2020 12:28:52 +0100 Subject: [PATCH 150/151] btrfs: release old extent maps during page release When removing an extent map at try_release_extent_mapping(), called through the page release callback (btrfs_releasepage()), we never release an extent map that is in the list of modified extents. This is to prevent races with a concurrent fsync using the fast path, which could lead to not logging an extent created in the current transaction. However we can safely remove an extent map created in a past transaction that is still in the list of modified extents (because no one has fsynced the inode since that transaction was committed), because such extents are skipped during an fsync as it is pointless to log them. This change does that. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c049a33ed0df..d619babede0f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4481,6 +4481,9 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) page->mapping->host->i_size > SZ_16M) { u64 len; while (start <= end) { + struct btrfs_fs_info *fs_info; + u64 cur_gen; + len = end - start + 1; write_lock(&map->lock); em = lookup_extent_mapping(map, start, len); @@ -4505,13 +4508,27 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) * extra reference on the em.
*/ if (list_empty(&em->list) || - test_bit(EXTENT_FLAG_LOGGING, &em->flags)) { - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &btrfs_inode->runtime_flags); - remove_extent_mapping(map, em); - /* once for the rb tree */ - free_extent_map(em); - } + test_bit(EXTENT_FLAG_LOGGING, &em->flags)) + goto remove_em; + /* + * If it's in the list of modified extents, remove it + * only if its generation is older then the current one, + * in which case we don't need it for a fast fsync. + * Otherwise don't remove it, we could be racing with an + * ongoing fast fsync that could miss the new extent. + */ + fs_info = btrfs_inode->root->fs_info; + spin_lock(&fs_info->trans_lock); + cur_gen = fs_info->generation; + spin_unlock(&fs_info->trans_lock); + if (em->generation >= cur_gen) + goto next; +remove_em: + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &btrfs_inode->runtime_flags); + remove_extent_mapping(map, em); + /* once for the rb tree */ + free_extent_map(em); next: start = extent_map_end(em); write_unlock(&map->lock); From 5e548b32018d96c377fda4bdac2bf511a448ca67 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 22 Jul 2020 12:29:01 +0100 Subject: [PATCH 151/151] btrfs: do not set the full sync flag on the inode during page release When removing an extent map at try_release_extent_mapping(), called through the page release callback (btrfs_releasepage()), we always set the full sync flag on the inode, which forces the next fsync to use a slower code path. This hurts performance for workloads that dirty an amount of data that exceeds or is very close to the system's RAM and do frequent fsync operations (as database servers often do, for example). In particular if there are concurrent fsyncs against different files, by falling back to a full fsync we do a lot more checksum lookups in the checksums btree, as we do it for all the extents created in the current transaction, instead of only the new ones since the last fsync. These checksum lookups not only take some time but, more importantly, they also cause contention on the checksums btree locks due to the concurrency with checksum insertions in the btree by ordered extents from other inodes. We actually don't need to set the full sync flag on the inode, because we only remove extent maps that are in the list of modified extents if they were created in a past transaction, in which case an fsync skips them as it's pointless to log them. So stop setting the full fsync flag on the inode whenever we remove an extent map. This patch is part of a patchset that consists of 3 patches, which have the following subjects: 1/3 btrfs: fix race between page release and a fast fsync 2/3 btrfs: release old extent maps during page release 3/3 btrfs: do not set the full sync flag on the inode during page release Performance tests were run against a branch (misc-next) containing the whole patchset. The test exercises a workload where there are multiple processes writing to files and fsyncing them (each writing and fsyncing its own file), and in total the amount of data dirtied ranges from 2x to 4x the system's RAM (16GiB), so that the page release callback is invoked frequently.
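[ Editorial sketch, tying the three patches together: after this series the policy in try_release_extent_mapping() is roughly the following (paraphrasing the diffs above; the surrounding loop, locking and reference counting are elided):

	if (test_range_bit(tree, em->start, extent_map_end(em) - 1,
			   EXTENT_LOCKED, 0, NULL))
		goto next;		/* range is locked, keep the em */
	if (list_empty(&em->list) ||
	    test_bit(EXTENT_FLAG_LOGGING, &em->flags))
		goto remove_em;		/* no fast fsync needs this em */
	if (em->generation >= cur_gen)
		goto next;		/* current transaction, keep it */
	goto remove_em;			/* old transaction, fsync skips it */
]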
The following script, using fio, was used to perform the tests:

$ cat test-fsync.sh
#!/bin/bash

DEV=/dev/sdk
MNT=/mnt/sdk
MOUNT_OPTIONS="-o ssd"
MKFS_OPTIONS="-d single -m single"

if [ $# -ne 3 ]; then
    echo "Use $0 NUM_JOBS FILE_SIZE FSYNC_FREQ"
    exit 1
fi

NUM_JOBS=$1
FILE_SIZE=$2
FSYNC_FREQ=$3

cat <<EOF > /tmp/fio-job.ini
[writers]
rw=write
fsync=$FSYNC_FREQ
fallocate=none
group_reporting=1
direct=0
bs=64k
ioengine=sync
size=$FILE_SIZE
directory=$MNT
numjobs=$NUM_JOBS
thread
EOF

echo "Using config:"
echo
cat /tmp/fio-job.ini
echo

mkfs.btrfs -f $MKFS_OPTIONS $DEV &> /dev/null
mount $MOUNT_OPTIONS $DEV $MNT
fio /tmp/fio-job.ini
umount $MNT

The tests were performed for different numbers of jobs, file sizes and fsync frequencies. A qemu VM using kvm was used, with 8 cores (the host has 12 cores, with the cpu governor set to performance mode on all cores), 16GiB of RAM (the host has 64GiB) and using an NVMe device directly (without an intermediary filesystem in the host). While running the tests, the host was not used for anything else, to avoid disturbing the tests. The obtained results were the following, and the last line printed by fio is pasted (includes aggregated throughput and test run time).

*****************************************************
**** 1 job, 32GiB file, fsync frequency 1 ****
*****************************************************

Before patchset: WRITE: bw=29.1MiB/s (30.5MB/s), 29.1MiB/s-29.1MiB/s (30.5MB/s-30.5MB/s), io=32.0GiB (34.4GB), run=1127557-1127557msec

After patchset: WRITE: bw=29.3MiB/s (30.7MB/s), 29.3MiB/s-29.3MiB/s (30.7MB/s-30.7MB/s), io=32.0GiB (34.4GB), run=1119042-1119042msec

(+0.7% throughput, -0.8% run time)

*****************************************************
**** 2 jobs, 16GiB files, fsync frequency 1 ****
*****************************************************

Before patchset: WRITE: bw=33.5MiB/s (35.1MB/s), 33.5MiB/s-33.5MiB/s (35.1MB/s-35.1MB/s), io=32.0GiB (34.4GB), run=979000-979000msec

After patchset: WRITE: bw=39.9MiB/s (41.8MB/s), 39.9MiB/s-39.9MiB/s (41.8MB/s-41.8MB/s), io=32.0GiB (34.4GB), run=821283-821283msec

(+19.1% throughput, -16.1% runtime)

*****************************************************
**** 4 jobs, 8GiB files, fsync frequency 1 ****
*****************************************************

Before patchset: WRITE: bw=52.1MiB/s (54.6MB/s), 52.1MiB/s-52.1MiB/s (54.6MB/s-54.6MB/s), io=32.0GiB (34.4GB), run=629130-629130msec

After patchset: WRITE: bw=71.8MiB/s (75.3MB/s), 71.8MiB/s-71.8MiB/s (75.3MB/s-75.3MB/s), io=32.0GiB (34.4GB), run=456357-456357msec

(+37.8% throughput, -27.5% runtime)

*****************************************************
**** 8 jobs, 4GiB files, fsync frequency 1 ****
*****************************************************

Before patchset: WRITE: bw=76.1MiB/s (79.8MB/s), 76.1MiB/s-76.1MiB/s (79.8MB/s-79.8MB/s), io=32.0GiB (34.4GB), run=430708-430708msec

After patchset: WRITE: bw=133MiB/s (140MB/s), 133MiB/s-133MiB/s (140MB/s-140MB/s), io=32.0GiB (34.4GB), run=245458-245458msec

(+74.7% throughput, -43.0% run time)

*****************************************************
**** 16 jobs, 2GiB files, fsync frequency 1 ****
*****************************************************

Before patchset: WRITE: bw=74.7MiB/s (78.3MB/s), 74.7MiB/s-74.7MiB/s (78.3MB/s-78.3MB/s), io=32.0GiB (34.4GB), run=438625-438625msec

After patchset: WRITE: bw=184MiB/s (193MB/s), 184MiB/s-184MiB/s (193MB/s-193MB/s), io=32.0GiB (34.4GB), run=177864-177864msec

(+146.3% throughput, -59.5% run time)
*****************************************************
**** 32 jobs, 2GiB files, fsync frequency 1 ****
*****************************************************

Before patchset: WRITE: bw=72.6MiB/s (76.1MB/s), 72.6MiB/s-72.6MiB/s (76.1MB/s-76.1MB/s), io=64.0GiB (68.7GB), run=902615-902615msec

After patchset: WRITE: bw=227MiB/s (238MB/s), 227MiB/s-227MiB/s (238MB/s-238MB/s), io=64.0GiB (68.7GB), run=288936-288936msec

(+212.7% throughput, -68.0% run time)

*****************************************************
**** 64 jobs, 1GiB files, fsync frequency 1 ****
*****************************************************

Before patchset: WRITE: bw=98.8MiB/s (104MB/s), 98.8MiB/s-98.8MiB/s (104MB/s-104MB/s), io=64.0GiB (68.7GB), run=663126-663126msec

After patchset: WRITE: bw=294MiB/s (308MB/s), 294MiB/s-294MiB/s (308MB/s-308MB/s), io=64.0GiB (68.7GB), run=222940-222940msec

(+197.6% throughput, -66.4% run time)

Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d619babede0f..617ea38e6fd7 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4524,8 +4524,14 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) if (em->generation >= cur_gen) goto next; remove_em: - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &btrfs_inode->runtime_flags); + /* + * We only remove extent maps that are not in the list of + * modified extents or that are in the list but with a + * generation lower then the current generation, so there + * is no need to set the full fsync flag on the inode (it + * hurts the fsync performance for workloads with a data + * size that exceeds or is close to the system's memory). + */ remove_extent_mapping(map, em); /* once for the rb tree */ free_extent_map(em);
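[ A closing editorial note on the scoped NOFS pattern used by patch 142 above: memalloc_nofs_save() makes every allocation in the bracketed region behave as if it were GFP_NOFS, so memory reclaim cannot recurse into the filesystem while filesystem locks are held. A minimal, self-contained sketch follows; do_work_under_fs_locks() is a hypothetical stand-in, not a btrfs function:

	#include <linux/sched/mm.h>
	#include <linux/slab.h>

	static int do_work_under_fs_locks(void)
	{
		unsigned int nofs_flag;
		void *buf;

		nofs_flag = memalloc_nofs_save();
		/* implicitly GFP_NOFS inside the save/restore bracket */
		buf = kmalloc(128, GFP_KERNEL);
		memalloc_nofs_restore(nofs_flag);

		if (!buf)
			return -ENOMEM;
		kfree(buf);
		return 0;
	}
]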