linux/fs/btrfs/extent-tree.h

168 lines
5.3 KiB
C
Raw Normal View History

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_EXTENT_TREE_H
#define BTRFS_EXTENT_TREE_H
#include <linux/types.h>
#include "misc.h"
#include "block-group.h"
#include "locking.h"
struct extent_buffer;
struct btrfs_free_cluster;
struct btrfs_fs_info;
struct btrfs_root;
struct btrfs_path;
struct btrfs_ref;
struct btrfs_disk_key;
struct btrfs_delayed_ref_head;
struct btrfs_delayed_ref_root;
struct btrfs_extent_inline_ref;
enum btrfs_extent_allocation_policy {
BTRFS_EXTENT_ALLOC_CLUSTERED,
BTRFS_EXTENT_ALLOC_ZONED,
};
struct find_free_extent_ctl {
/* Basic allocation info */
u64 ram_bytes;
u64 num_bytes;
u64 min_alloc_size;
u64 empty_size;
u64 flags;
int delalloc;
/* Where to start the search inside the bg */
u64 search_start;
/* For clustered allocation */
u64 empty_cluster;
struct btrfs_free_cluster *last_ptr;
bool use_cluster;
bool have_caching_bg;
bool orig_have_caching_bg;
/* Allocation is called for tree-log */
bool for_treelog;
/* Allocation is called for data relocation */
bool for_data_reloc;
/* RAID index, converted from flags */
int index;
/*
* Current loop number, check find_free_extent_update_loop() for details
*/
int loop;
/*
btrfs: wait on uncached block groups on every allocation loop My initial fix for the generic/475 hangs was related to metadata, but our CI testing uncovered another case where we hang for similar reasons. We again have a task with a plug that is holding an outstanding request that is keeping the dm device from finishing it's suspend, and that task is stuck in the allocator. This time it is stuck trying to allocate data, but we do not have a block group that matches the size class. The larger loop in the allocator looks like this (simplified of course) find_free_extent for_each_block_group { ffe_ctl->cached == btrfs_block_group_cache_done(bg) if (!ffe_ctl->cached) ffe_ctl->have_caching_bg = true; do_allocation() btrfs_wait_block_group_cache_progress(); } if (loop == LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg) go search again; In my earlier fix we were trying to allocate from the block group, but we weren't waiting for the progress because we were only waiting for the free space to be >= the amount of free space we wanted. My fix made it so we waited for forward progress to be made as well, so we would be sure to wait. This time however we did not have a block group that matched our size class, so what was happening was this find_free_extent for_each_block_group { ffe_ctl->cached == btrfs_block_group_cache_done(bg) if (!ffe_ctl->cached) ffe_ctl->have_caching_bg = true; if (size_class_doesn't_match()) goto loop; do_allocation() btrfs_wait_block_group_cache_progress(); loop: release_block_group(block_group); } if (loop == LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg) go search again; The size_class_doesn't_match() part was true, so we'd just skip this block group and never wait for caching, and then because we found a caching block group we'd just go back and do the loop again. We never sleep and thus never flush the plug and we have the same deadlock. Fix the logic for waiting on the block group caching to instead do it unconditionally when we goto loop. This takes the logic out of the allocation step, so now the loop looks more like this find_free_extent for_each_block_group { ffe_ctl->cached == btrfs_block_group_cache_done(bg) if (!ffe_ctl->cached) ffe_ctl->have_caching_bg = true; if (size_class_doesn't_match()) goto loop; do_allocation() btrfs_wait_block_group_cache_progress(); loop: if (loop > LOOP_CACHING_NOWAIT && !ffe_ctl->retry_uncached && !ffe_ctl->cached) { ffe_ctl->retry_uncached = true; btrfs_wait_block_group_cache_progress(); } release_block_group(block_group); } if (loop == LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg) go search again; This simplifies the logic a lot, and makes sure that if we're hitting uncached block groups we're always waiting on them at some point. I ran this through 100 iterations of generic/475, as this particular case was harder to hit than the previous one. Signed-off-by: Josef Bacik <josef@toxicpanda.com> Signed-off-by: David Sterba <dsterba@suse.com>
2023-08-01 04:28:43 +08:00
* Set to true if we're retrying the allocation on this block group
* after waiting for caching progress, this is so that we retry only
* once before moving on to another block group.
*/
btrfs: wait on uncached block groups on every allocation loop My initial fix for the generic/475 hangs was related to metadata, but our CI testing uncovered another case where we hang for similar reasons. We again have a task with a plug that is holding an outstanding request that is keeping the dm device from finishing it's suspend, and that task is stuck in the allocator. This time it is stuck trying to allocate data, but we do not have a block group that matches the size class. The larger loop in the allocator looks like this (simplified of course) find_free_extent for_each_block_group { ffe_ctl->cached == btrfs_block_group_cache_done(bg) if (!ffe_ctl->cached) ffe_ctl->have_caching_bg = true; do_allocation() btrfs_wait_block_group_cache_progress(); } if (loop == LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg) go search again; In my earlier fix we were trying to allocate from the block group, but we weren't waiting for the progress because we were only waiting for the free space to be >= the amount of free space we wanted. My fix made it so we waited for forward progress to be made as well, so we would be sure to wait. This time however we did not have a block group that matched our size class, so what was happening was this find_free_extent for_each_block_group { ffe_ctl->cached == btrfs_block_group_cache_done(bg) if (!ffe_ctl->cached) ffe_ctl->have_caching_bg = true; if (size_class_doesn't_match()) goto loop; do_allocation() btrfs_wait_block_group_cache_progress(); loop: release_block_group(block_group); } if (loop == LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg) go search again; The size_class_doesn't_match() part was true, so we'd just skip this block group and never wait for caching, and then because we found a caching block group we'd just go back and do the loop again. We never sleep and thus never flush the plug and we have the same deadlock. Fix the logic for waiting on the block group caching to instead do it unconditionally when we goto loop. This takes the logic out of the allocation step, so now the loop looks more like this find_free_extent for_each_block_group { ffe_ctl->cached == btrfs_block_group_cache_done(bg) if (!ffe_ctl->cached) ffe_ctl->have_caching_bg = true; if (size_class_doesn't_match()) goto loop; do_allocation() btrfs_wait_block_group_cache_progress(); loop: if (loop > LOOP_CACHING_NOWAIT && !ffe_ctl->retry_uncached && !ffe_ctl->cached) { ffe_ctl->retry_uncached = true; btrfs_wait_block_group_cache_progress(); } release_block_group(block_group); } if (loop == LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg) go search again; This simplifies the logic a lot, and makes sure that if we're hitting uncached block groups we're always waiting on them at some point. I ran this through 100 iterations of generic/475, as this particular case was harder to hit than the previous one. Signed-off-by: Josef Bacik <josef@toxicpanda.com> Signed-off-by: David Sterba <dsterba@suse.com>
2023-08-01 04:28:43 +08:00
bool retry_uncached;
/* If current block group is cached */
int cached;
/* Max contiguous hole found */
u64 max_extent_size;
/* Total free space from free space cache, not always contiguous */
u64 total_free_space;
/* Found result */
u64 found_offset;
/* Hint where to start looking for an empty space */
u64 hint_byte;
/* Allocation policy */
enum btrfs_extent_allocation_policy policy;
/* Whether or not the allocator is currently following a hint */
bool hinted;
btrfs: introduce size class to block group allocator The aim of this patch is to reduce the fragmentation of block groups under certain unhappy workloads. It is particularly effective when the size of extents correlates with their lifetime, which is something we have observed causing fragmentation in the fleet at Meta. This patch categorizes extents into size classes: - x < 128KiB: "small" - 128KiB < x < 8MiB: "medium" - x > 8MiB: "large" and as much as possible reduces allocations of extents into block groups that don't match the size class. This takes advantage of any (possible) correlation between size and lifetime and also leaves behind predictable re-usable gaps when extents are freed; small writes don't gum up bigger holes. Size classes are implemented in the following way: - Mark each new block group with a size class of the first allocation that goes into it. - Add two new passes to ffe: "unset size class" and "wrong size class". First, try only matching block groups, then try unset ones, then allow allocation of new ones, and finally allow mismatched block groups. - Filtering is done just by skipping inappropriate ones, there is no special size class indexing. Other solutions I considered were: - A best fit allocator with an rb-tree. This worked well, as small writes didn't leak big holes from large freed extents, but led to regressions in ffe and write performance due to lock contention on the rb-tree with every allocation possibly updating it in parallel. Perhaps something clever could be done to do the updates in the background while being "right enough". - A fixed size "working set". This prevents freeing an extent drastically changing where writes currently land, and seems like a good option too. Doesn't take advantage of size in any way. - The same size class idea, but implemented with xarray marks. This turned out to be slower than looping the linked list and skipping wrong block groups, and is also less flexible since we must have only 3 size classes (max #marks). With the current approach we can have as many as we like. Performance testing was done via: https://github.com/josefbacik/fsperf Of particular relevance are the new fragmentation specific tests. A brief summary of the testing results: - Neutral results on existing tests. There are some minor regressions and improvements here and there, but nothing that truly stands out as notable. - Improvement on new tests where size class and extent lifetime are correlated. Fragmentation in these cases is completely eliminated and write performance is generally a little better. There is also significant improvement where extent sizes are just a bit larger than the size class boundaries. - Regression on one new tests: where the allocations are sized intentionally a hair under the borders of the size classes. Results are neutral on the test that intentionally attacks this new scheme by mixing extent size and lifetime. The full dump of the performance results can be found here: https://bur.io/fsperf/size-class-2022-11-15.txt (there are ANSI escape codes, so best to curl and view in terminal) Here is a snippet from the full results for a new test which mixes buffered writes appending to a long lived set of files and large short lived fallocates: bufferedappendvsfallocate results metric baseline current stdev diff ====================================================================================== avg_commit_ms 31.13 29.20 2.67 -6.22% bg_count 14 15.60 0 11.43% commits 11.10 12.20 0.32 9.91% elapsed 27.30 26.40 2.98 -3.30% end_state_mount_ns 11122551.90 10635118.90 851143.04 -4.38% end_state_umount_ns 1.36e+09 1.35e+09 12248056.65 -1.07% find_free_extent_calls 116244.30 114354.30 964.56 -1.63% find_free_extent_ns_max 599507.20 1047168.20 103337.08 74.67% find_free_extent_ns_mean 3607.19 3672.11 101.20 1.80% find_free_extent_ns_min 500 512 6.67 2.40% find_free_extent_ns_p50 2848 2876 37.65 0.98% find_free_extent_ns_p95 4916 5000 75.45 1.71% find_free_extent_ns_p99 20734.49 20920.48 1670.93 0.90% frag_pct_max 61.67 0 8.05 -100.00% frag_pct_mean 43.59 0 6.10 -100.00% frag_pct_min 25.91 0 16.60 -100.00% frag_pct_p50 42.53 0 7.25 -100.00% frag_pct_p95 61.67 0 8.05 -100.00% frag_pct_p99 61.67 0 8.05 -100.00% fragmented_bg_count 6.10 0 1.45 -100.00% max_commit_ms 49.80 46 5.37 -7.63% sys_cpu 2.59 2.62 0.29 1.39% write_bw_bytes 1.62e+08 1.68e+08 17975843.50 3.23% write_clat_ns_mean 57426.39 54475.95 2292.72 -5.14% write_clat_ns_p50 46950.40 42905.60 2101.35 -8.62% write_clat_ns_p99 148070.40 143769.60 2115.17 -2.90% write_io_kbytes 4194304 4194304 0 0.00% write_iops 2476.15 2556.10 274.29 3.23% write_lat_ns_max 2101667.60 2251129.50 370556.59 7.11% write_lat_ns_mean 59374.91 55682.00 2523.09 -6.22% write_lat_ns_min 17353.10 16250 1646.08 -6.36% There are some mixed improvements/regressions in most metrics along with an elimination of fragmentation in this workload. On the balance, the drastic 1->0 improvement in the happy cases seems worth the mix of regressions and improvements we do observe. Some considerations for future work: - Experimenting with more size classes - More hinting/search ordering work to approximate a best-fit allocator Signed-off-by: Boris Burkov <boris@bur.io> Signed-off-by: David Sterba <dsterba@suse.com>
2022-12-16 08:06:33 +08:00
/* Size class of block groups to prefer in early loops */
enum btrfs_block_group_size_class size_class;
};
enum btrfs_inline_ref_type {
BTRFS_REF_TYPE_INVALID,
BTRFS_REF_TYPE_BLOCK,
BTRFS_REF_TYPE_DATA,
BTRFS_REF_TYPE_ANY,
};
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
struct btrfs_extent_inline_ref *iref,
enum btrfs_inline_ref_type is_data);
u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
btrfs: allow to run delayed refs by bytes to be released instead of count When running delayed references, through btrfs_run_delayed_refs(), we can specify how many to run, run all existing delayed references and keep running delayed references while we can find any. This is controlled with the value of the 'count' argument, where a value of 0 means to run all delayed references that exist by the time btrfs_run_delayed_refs() is called, (unsigned long)-1 means to keep running delayed references while we are able find any, and any other value to run that exact number of delayed references. Typically a specific value other than 0 or -1 is used when flushing space to try to release a certain amount of bytes for a ticket. In this case we just simply calculate how many delayed reference heads correspond to a specific amount of bytes, with calc_delayed_refs_nr(). However that only takes into account the space reserved for the reference heads themselves, and does not account for the space reserved for deleting checksums from the csum tree (see add_delayed_ref_head() and update_existing_head_ref()) in case we are going to delete a data extent. This means we may end up running more delayed references than necessary in case we process delayed references for deleting a data extent. So change the logic of btrfs_run_delayed_refs() to take a bytes argument to specify how many bytes of delayed references to run/release, using the special values of 0 to mean all existing delayed references and U64_MAX (or (u64)-1) to keep running delayed references while we can find any. This prevents running more delayed references than necessary, when we have delayed references for deleting data extents, but also makes the upcoming changes/patches simpler and it's preparatory work for them. Reviewed-by: Josef Bacik <josef@toxicpanda.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2023-09-09 01:20:34 +08:00
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes);
u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head);
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 offset, int metadata, u64 *refs, u64 *flags,
u64 *owner_root);
int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num,
int reserved);
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
const struct extent_buffer *eb);
int btrfs_exclude_logged_extents(struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_root *root,
u64 objectid, u64 offset, u64 bytenr, bool strict,
struct btrfs_path *path);
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
const struct btrfs_disk_key *key,
int level, u64 hint,
u64 empty_size,
btrfs: qgroup: track metadata relocation COW with simple quota Relocation COWs metadata blocks in two cases for the reloc root: - copying the subvolume root item when creating the reloc root - copying a btree node when there is a COW during relocation In both cases, the resulting btree node hits an abnormal code path with respect to the owner field in its btrfs_header. It first creates the root item for the new objectid, which populates the reloc root id, and it at this point that delayed refs are created. Later, it fully copies the old node into the new node (including the original owner field) which overwrites it. This results in a simple quotas mismatch where we run the delayed ref for the reloc root which has no simple quota effect (reloc root is not an fstree) but when we ultimately delete the node, the owner is the real original fstree and we do free the space. To work around this without tampering with the behavior of relocation, add a parameter to btrfs_add_tree_block that lets the relocation code path specify a different owning root than the "operating" root (in this case, owning root is the real root and the operating root is the reloc root). These can naturally be plumbed into delayed refs that have the same concept. Note that this is a double count in some sense, but a relatively natural one, as there are really two extents, and the old one will be deleted soon. This is consistent with how data relocation extents are accounted by simple quotas. Reviewed-by: Josef Bacik <josef@toxicpanda.com> Signed-off-by: Boris Burkov <boris@bur.io> Signed-off-by: David Sterba <dsterba@suse.com>
2023-06-22 08:37:23 +08:00
u64 reloc_src_root,
enum btrfs_lock_nesting nest);
int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
u64 root_id,
struct extent_buffer *buf,
u64 parent, int last_ref);
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 owner,
u64 offset, u64 ram_bytes,
struct btrfs_key *ins);
int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
u64 root_objectid, u64 owner, u64 offset,
struct btrfs_key *ins);
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes,
u64 min_alloc_size, u64 empty_size, u64 hint_byte,
struct btrfs_key *ins, int is_data, int delalloc);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref);
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, u64 flags);
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
struct extent_buffer *leaf, int slot);
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len, int delalloc);
int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
const struct extent_buffer *eb);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref);
int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref,
int for_reloc);
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *node,
struct extent_buffer *parent);
#endif