for-6.8-rc4-tag
-----BEGIN PGP SIGNATURE-----

iQIzBAABCgAdFiEE8rQSAMVO+zA4DBdWxWXV+ddtWDsFAmXMewsACgkQxWXV+ddt
WDtFUBAAkEU/hxB4YsLn2JEdp3wc80w5/qKkPaYHsI2ncvc3RFiG+tqSY7BakMgE
Kkdl8ouNX3p/S62ykIBQTKZnOTk7FgKlClAQtgKn1afexqABsP2mifnh40Dzf7eA
VvEl7chnRT6oeivtQkB+BtgOzaOUp4j/8oAivRN8NKNwTxojV4g9PErKSOWfVQSq
3zlrLJbe6era43SpnexkjZHn4Fy4CN+C7FMm+pT/yKzZi2oBZs9BvNZGhIkdnzcK
MftrY9dSGO3CDD2Kvrz3lEm7ZB83wCpm+GTDN7iJx2y+yeW+aHjshFkJr1ApEZQa
lsWTnj3hk3yHoOPUuLlchw5JcFb/dFZ1Ztdwkunf8nmt5a3O/5Zf+Csgze8c+Iii
MJQKi0B/bNQ7cSEwRt36s75kROBItZmHCZmSBlOpT1LXSDQMJ9lvEnv/fPQdcHHF
WMEmk5O5IoGYv5kx5wIoWv27HKE/bDwH6RjkxEd/n17XP+PcfHY4K0o0CGtfwS8g
hdy9RI9X8dbf3ZPrxtsgQ2T8btWs68A4S6nwcSuY5HK0WNmvRh47eLfCI6S6XGJs
hHkppLcc+WTXOskCA+ABdm9hgeAPZkCSpuQSmC2HBt8gRv8XqO7z4cZ/up2N+tES
ZOJSrJb97nusOcxY0pLexnD6eI3pQxzGMiPONlC1Re8CdjZ0l+4=
=RRGT
-----END PGP SIGNATURE-----

Merge tag 'for-6.8-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:
 "A few regular fixes and one fix for space reservation regression since
  6.7 that users have been reporting:

   - fix over-reservation of metadata chunks due to not keeping proper
     balance between global block reserve and delayed refs reserve; in
     practice this leaves behind empty metadata block groups, the
     workaround is to reclaim them by using the '-musage=1' balance filter

   - other space reservation fixes:
       - do not delete unused block group if it may be used soon
       - do not reserve space for checksums for NOCOW files

   - fix extent map assertion failure when writing out free space inode

   - reject encoded write if inode has nodatasum flag set

   - fix chunk map leak when loading block group zone info"

* tag 'for-6.8-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: don't refill whole delayed refs block reserve when starting transaction
  btrfs: zoned: fix chunk map leak when loading block group zone info
  btrfs: reject encoded write if inode has nodatasum flag set
  btrfs: don't reserve space for checksums when writing to nocow files
  btrfs: add new unused block groups to the list of unused block groups
  btrfs: do not delete unused block group if it may be used soon
  btrfs: add and use helper to check if block group is used
  btrfs: don't drop extent_map for free space inode on write error
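For context, the '-musage=1' workaround mentioned above is applied with the btrfs balance command against the mounted filesystem, for example `btrfs balance start -musage=1 /mnt` (the mount point here is a placeholder): it relocates metadata chunks whose usage is at most 1%, which reclaims the empty metadata block groups left behind by the over-reservation regression.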
commit 1f3a3e2aae
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
@@ -1455,6 +1455,7 @@ out:
  */
 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 {
+	LIST_HEAD(retry_list);
 	struct btrfs_block_group *block_group;
 	struct btrfs_space_info *space_info;
 	struct btrfs_trans_handle *trans;
@@ -1476,6 +1477,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 
 	spin_lock(&fs_info->unused_bgs_lock);
 	while (!list_empty(&fs_info->unused_bgs)) {
+		u64 used;
 		int trimming;
 
 		block_group = list_first_entry(&fs_info->unused_bgs,
@@ -1511,9 +1513,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 			goto next;
 		}
 
+		spin_lock(&space_info->lock);
 		spin_lock(&block_group->lock);
-		if (block_group->reserved || block_group->pinned ||
-		    block_group->used || block_group->ro ||
+		if (btrfs_is_block_group_used(block_group) || block_group->ro ||
 		    list_is_singular(&block_group->list)) {
 			/*
 			 * We want to bail if we made new allocations or have
@@ -1523,10 +1525,49 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 			 */
 			trace_btrfs_skip_unused_block_group(block_group);
 			spin_unlock(&block_group->lock);
+			spin_unlock(&space_info->lock);
 			up_write(&space_info->groups_sem);
 			goto next;
 		}
+
+		/*
+		 * The block group may be unused but there may be space reserved
+		 * accounting with the existence of that block group, that is,
+		 * space_info->bytes_may_use was incremented by a task but no
+		 * space was yet allocated from the block group by the task.
+		 * That space may or may not be allocated, as we are generally
+		 * pessimistic about space reservation for metadata as well as
+		 * for data when using compression (as we reserve space based on
+		 * the worst case, when data can't be compressed, and before
+		 * actually attempting compression, before starting writeback).
+		 *
+		 * So check if the total space of the space_info minus the size
+		 * of this block group is less than the used space of the
+		 * space_info - if that's the case, then it means we have tasks
+		 * that might be relying on the block group in order to allocate
+		 * extents, and add back the block group to the unused list when
+		 * we finish, so that we retry later in case no tasks ended up
+		 * needing to allocate extents from the block group.
+		 */
+		used = btrfs_space_info_used(space_info, true);
+		if (space_info->total_bytes - block_group->length < used) {
+			/*
+			 * Add a reference for the list, compensate for the ref
+			 * drop under the "next" label for the
+			 * fs_info->unused_bgs list.
+			 */
+			btrfs_get_block_group(block_group);
+			list_add_tail(&block_group->bg_list, &retry_list);
+
+			trace_btrfs_skip_unused_block_group(block_group);
+			spin_unlock(&block_group->lock);
+			spin_unlock(&space_info->lock);
+			up_write(&space_info->groups_sem);
+			goto next;
+		}
+
 		spin_unlock(&block_group->lock);
+		spin_unlock(&space_info->lock);
 
 		/* We don't want to force the issue, only flip if it's ok. */
 		ret = inc_block_group_ro(block_group, 0);
@@ -1650,12 +1691,16 @@ next:
 		btrfs_put_block_group(block_group);
 		spin_lock(&fs_info->unused_bgs_lock);
 	}
+	list_splice_tail(&retry_list, &fs_info->unused_bgs);
 	spin_unlock(&fs_info->unused_bgs_lock);
 	mutex_unlock(&fs_info->reclaim_bgs_lock);
 	return;
 
 flip_async:
 	btrfs_end_transaction(trans);
+	spin_lock(&fs_info->unused_bgs_lock);
+	list_splice_tail(&retry_list, &fs_info->unused_bgs);
+	spin_unlock(&fs_info->unused_bgs_lock);
 	mutex_unlock(&fs_info->reclaim_bgs_lock);
 	btrfs_put_block_group(block_group);
 	btrfs_discard_punt_unused_bgs_list(fs_info);
@@ -2684,6 +2729,37 @@ next:
 		btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
 		list_del_init(&block_group->bg_list);
 		clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
+
+		/*
+		 * If the block group is still unused, add it to the list of
+		 * unused block groups. The block group may have been created in
+		 * order to satisfy a space reservation, in which case the
+		 * extent allocation only happens later. But often we don't
+		 * actually need to allocate space that we previously reserved,
+		 * so the block group may become unused for a long time. For
+		 * example for metadata we generally reserve space for a worst
+		 * possible scenario, but then don't end up allocating all that
+		 * space or none at all (due to no need to COW, extent buffers
+		 * were already COWed in the current transaction and still
+		 * unwritten, tree heights lower than the maximum possible
+		 * height, etc). For data we generally reserve the exact amount
+		 * of space we are going to allocate later, the exception is
+		 * when using compression, as we must reserve space based on the
+		 * uncompressed data size, because the compression is only done
+		 * when writeback triggered and we don't know how much space we
+		 * are actually going to need, so we reserve the uncompressed
+		 * size because the data may be uncompressible in the worst case.
+		 */
+		if (ret == 0) {
+			bool used;
+
+			spin_lock(&block_group->lock);
+			used = btrfs_is_block_group_used(block_group);
+			spin_unlock(&block_group->lock);
+
+			if (!used)
+				btrfs_mark_bg_unused(block_group);
+		}
 	}
 	btrfs_trans_release_chunk_metadata(trans);
 }
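As an aside, the deferral test added above boils down to simple arithmetic on the space_info counters. Below is a minimal userspace sketch of that logic, with simplified stand-in structures and made-up sizes (an illustration of the condition, not the kernel code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for btrfs_space_info and btrfs_block_group. */
struct space_info  { uint64_t total_bytes; uint64_t used; };
struct block_group { uint64_t used, reserved, pinned, length; };

/* Same condition as the new btrfs_is_block_group_used() (bg->lock elided). */
static bool bg_is_used(const struct block_group *bg)
{
	return bg->used > 0 || bg->reserved > 0 || bg->pinned > 0;
}

/*
 * Same condition as the new deferral branch: if the space_info minus this
 * block group could no longer hold everything already used or reserved,
 * some task may still need the group, so deletion is deferred via retry_list.
 */
static bool defer_deletion(const struct space_info *si, const struct block_group *bg)
{
	return si->total_bytes - bg->length < si->used;
}

int main(void)
{
	/* 10 GiB of metadata space with 9.5 GiB used or reserved. */
	struct space_info si = { .total_bytes = 10ULL << 30, .used = (19ULL << 30) / 2 };
	/* An empty 1 GiB block group: a deletion candidate. */
	struct block_group bg = { .length = 1ULL << 30 };

	printf("is used: %d\n", bg_is_used(&bg));          /* 0 */
	printf("defer:   %d\n", defer_deletion(&si, &bg)); /* 1: 9 GiB < 9.5 GiB */
	return 0;
}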
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
@@ -257,6 +257,13 @@ static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
 	return (block_group->start + block_group->length);
 }
 
+static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg)
+{
+	lockdep_assert_held(&bg->lock);
+
+	return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0);
+}
+
 static inline bool btrfs_is_block_group_data_only(
 					struct btrfs_block_group *block_group)
 {
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
@@ -245,7 +245,6 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
 	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
 	u64 reserve_size = 0;
 	u64 qgroup_rsv_size = 0;
-	u64 csum_leaves;
 	unsigned outstanding_extents;
 
 	lockdep_assert_held(&inode->lock);
@@ -260,10 +259,12 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
 						 outstanding_extents);
 		reserve_size += btrfs_calc_metadata_size(fs_info, 1);
 	}
-	csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
-						 inode->csum_bytes);
-	reserve_size += btrfs_calc_insert_metadata_size(fs_info,
-							csum_leaves);
+	if (!(inode->flags & BTRFS_INODE_NODATASUM)) {
+		u64 csum_leaves;
+
+		csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
+		reserve_size += btrfs_calc_insert_metadata_size(fs_info, csum_leaves);
+	}
 	/*
 	 * For qgroup rsv, the calculation is very simple:
 	 * account one nodesize for each outstanding extent
@@ -278,14 +279,20 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
 	spin_unlock(&block_rsv->lock);
 }
 
-static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
+static void calc_inode_reservations(struct btrfs_inode *inode,
 				    u64 num_bytes, u64 disk_num_bytes,
 				    u64 *meta_reserve, u64 *qgroup_reserve)
 {
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	u64 nr_extents = count_max_extents(fs_info, num_bytes);
-	u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
+	u64 csum_leaves;
 	u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);
 
+	if (inode->flags & BTRFS_INODE_NODATASUM)
+		csum_leaves = 0;
+	else
+		csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
+
 	*meta_reserve = btrfs_calc_insert_metadata_size(fs_info,
 						nr_extents + csum_leaves);
 
@@ -337,7 +344,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
 	 * everything out and try again, which is bad. This way we just
 	 * over-reserve slightly, and clean up the mess when we are done.
 	 */
-	calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
+	calc_inode_reservations(inode, num_bytes, disk_num_bytes,
 				&meta_reserve, &qgroup_reserve);
 	ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true,
 					       noflush);
@@ -359,7 +366,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
 	nr_extents = count_max_extents(fs_info, num_bytes);
 	spin_lock(&inode->lock);
 	btrfs_mod_outstanding_extents(inode, nr_extents);
-	inode->csum_bytes += disk_num_bytes;
+	if (!(inode->flags & BTRFS_INODE_NODATASUM))
+		inode->csum_bytes += disk_num_bytes;
 	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
 	spin_unlock(&inode->lock);
 
@@ -393,7 +401,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
 
 	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
 	spin_lock(&inode->lock);
-	inode->csum_bytes -= num_bytes;
+	if (!(inode->flags & BTRFS_INODE_NODATASUM))
+		inode->csum_bytes -= num_bytes;
 	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
 	spin_unlock(&inode->lock);
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
@@ -3184,8 +3184,23 @@ out:
 		unwritten_start += logical_len;
 		clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
 
-		/* Drop extent maps for the part of the extent we didn't write. */
-		btrfs_drop_extent_map_range(inode, unwritten_start, end, false);
+		/*
+		 * Drop extent maps for the part of the extent we didn't write.
+		 *
+		 * We have an exception here for the free_space_inode, this is
+		 * because when we do btrfs_get_extent() on the free space inode
+		 * we will search the commit root. If this is a new block group
+		 * we won't find anything, and we will trip over the assert in
+		 * writepage where we do ASSERT(em->block_start !=
+		 * EXTENT_MAP_HOLE).
+		 *
+		 * Theoretically we could also skip this for any NOCOW extent as
+		 * we don't mess with the extent map tree in the NOCOW case, but
+		 * for now simply skip this if we are the free space inode.
+		 */
+		if (!btrfs_is_free_space_inode(inode))
+			btrfs_drop_extent_map_range(inode, unwritten_start,
+						    end, false);
 
 		/*
 		 * If the ordered extent had an IOERR or something else went
@@ -10273,6 +10288,13 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
 	if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
 		return -EINVAL;
 
+	/*
+	 * Compressed extents should always have checksums, so error out if we
+	 * have a NOCOW file or inode was created while mounted with NODATASUM.
+	 */
+	if (inode->flags & BTRFS_INODE_NODATASUM)
+		return -EINVAL;
+
 	orig_count = iov_iter_count(from);
 
 	/* The extent size must be sane. */
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
@@ -564,56 +564,22 @@ static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info,
 					u64 num_bytes,
 					u64 *delayed_refs_bytes)
 {
-	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
 	struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info;
-	u64 extra_delayed_refs_bytes = 0;
-	u64 bytes;
+	u64 bytes = num_bytes + *delayed_refs_bytes;
 	int ret;
 
-	/*
-	 * If there's a gap between the size of the delayed refs reserve and
-	 * its reserved space, than some tasks have added delayed refs or bumped
-	 * its size otherwise (due to block group creation or removal, or block
-	 * group item update). Also try to allocate that gap in order to prevent
-	 * using (and possibly abusing) the global reserve when committing the
-	 * transaction.
-	 */
-	if (flush == BTRFS_RESERVE_FLUSH_ALL &&
-	    !btrfs_block_rsv_full(delayed_refs_rsv)) {
-		spin_lock(&delayed_refs_rsv->lock);
-		if (delayed_refs_rsv->size > delayed_refs_rsv->reserved)
-			extra_delayed_refs_bytes = delayed_refs_rsv->size -
-						   delayed_refs_rsv->reserved;
-		spin_unlock(&delayed_refs_rsv->lock);
-	}
-
-	bytes = num_bytes + *delayed_refs_bytes + extra_delayed_refs_bytes;
-
 	/*
 	 * We want to reserve all the bytes we may need all at once, so we only
 	 * do 1 enospc flushing cycle per transaction start.
 	 */
 	ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
-	if (ret == 0) {
-		if (extra_delayed_refs_bytes > 0)
-			btrfs_migrate_to_delayed_refs_rsv(fs_info,
-							  extra_delayed_refs_bytes);
-		return 0;
-	}
-
-	if (extra_delayed_refs_bytes > 0) {
-		bytes -= extra_delayed_refs_bytes;
-		ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
-		if (ret == 0)
-			return 0;
-	}
-
+
 	/*
 	 * If we are an emergency flush, which can steal from the global block
 	 * reserve, then attempt to not reserve space for the delayed refs, as
 	 * we will consume space for them from the global block reserve.
 	 */
-	if (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
+	if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
 		bytes -= *delayed_refs_bytes;
 		*delayed_refs_bytes = 0;
 		ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
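The simplification above leaves a single reservation attempt plus one fallback. Here is a toy userspace sketch of that control flow, assuming a hypothetical reserve() with a made-up limit (not the kernel API):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FLUSH_ALL        0
#define FLUSH_ALL_STEAL  1  /* may later steal from the global reserve */

/* Toy stand-in for btrfs_reserve_metadata_bytes(): fail above a fake limit. */
static int reserve(uint64_t bytes)
{
	const uint64_t available = 96 * 1024;  /* made-up free metadata space */

	return bytes <= available ? 0 : -1 /* -ENOSPC */;
}

/*
 * Sketch of the simplified flow: try num_bytes plus the delayed-refs part in
 * one go; only for a "steal" flush, retry without the delayed-refs part and
 * let that portion come out of the global reserve at commit time.
 */
static int reserve_trans_metadata(int flush, uint64_t num_bytes,
				  uint64_t *delayed_refs_bytes)
{
	uint64_t bytes = num_bytes + *delayed_refs_bytes;
	int ret = reserve(bytes);

	if (ret && flush == FLUSH_ALL_STEAL) {
		bytes -= *delayed_refs_bytes;
		*delayed_refs_bytes = 0;
		ret = reserve(bytes);
	}
	return ret;
}

int main(void)
{
	uint64_t delayed = 64 * 1024;

	/* 64k + 64k > 96k fails, but the steal fallback retries with 64k. */
	printf("ret = %d\n", reserve_trans_metadata(FLUSH_ALL_STEAL, 64 * 1024, &delayed));
	printf("delayed_refs_bytes now %llu\n", (unsigned long long)delayed);
	return 0;
}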
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
@@ -1670,6 +1670,7 @@ out:
 	}
 	bitmap_free(active);
 	kfree(zone_info);
+	btrfs_free_chunk_map(map);
 
 	return ret;
 }