mirror of
https://mirrors.bfsu.edu.cn/git/linux.git
synced 2024-12-16 07:24:39 +08:00
adb86dbe42
Currently when reserving space for deleting the csum items for a data extent, when adding or updating a delayed ref head, we determine how many leaves of csum items we can have and then pass that number to the helper btrfs_calc_delayed_ref_bytes(). This helper is used for calculating space for all tree modifications we need when running delayed references, however the amount of space it computes is excessive for deleting csum items because:

1) It uses btrfs_calc_insert_metadata_size() which is excessive because we only need to delete csum items from the csum tree, we don't need to insert any items, so btrfs_calc_metadata_size() is all we need (as it computes space needed to delete an item);

2) If the free space tree is enabled, it doubles the amount of space, which is pointless for csum deletion since we don't need to touch the free space tree or any other tree other than the csum tree.

So improve on this by tracking how many csum deletions we have and using a new helper to calculate space for csum deletions (just a wrapper around btrfs_calc_metadata_size() with a comment). This reduces the amount of space we need to reserve for csum deletions by a factor of 4, and it helps reduce the number of times we have to block space reservations and have the reclaim task enter the space flushing algorithm (flush delayed items, flush delayed refs, etc) in order to satisfy tickets.

For example this results in a total time decrease when unlinking (or truncating) files with many extents, as we end up having to block on space metadata reservations less often. Example test:

  $ cat test.sh
  #!/bin/bash

  DEV=/dev/nullb0
  MNT=/mnt/test

  umount $DEV &> /dev/null
  mkfs.btrfs -f $DEV
  # Use compression to quickly create files with a lot of extents
  # (each with a size of 128K).
  mount -o compress=lzo $DEV $MNT

  # 120G gives at least 983040 extents with a size of 128K.
  xfs_io -f -c "pwrite -S 0xab -b 1M 0 120G" $MNT/foobar

  # Flush all delalloc and clear all metadata from memory.
  umount $MNT
  mount -o compress=lzo $DEV $MNT

  start=$(date +%s%N)
  rm -f $MNT/foobar
  end=$(date +%s%N)
  dur=$(( (end - start) / 1000000 ))
  echo "rm took $dur milliseconds"

  umount $MNT

Before this change rm took: 7504 milliseconds
After this change rm took:  6574 milliseconds (-12.4%)

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
274 lines
8.7 KiB
C
274 lines
8.7 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
/*
|
|
* Copyright (C) 2007 Oracle. All rights reserved.
|
|
*/
|
|
|
|
#ifndef BTRFS_TRANSACTION_H
|
|
#define BTRFS_TRANSACTION_H
|
|
|
|
#include <linux/refcount.h>
|
|
#include "btrfs_inode.h"
|
|
#include "delayed-ref.h"
|
|
#include "ctree.h"
|
|
#include "misc.h"
|
|
|
|
/*
 * Possible values of btrfs_transaction::state.
 *
 * The enumerators are numbered consecutively from zero in declaration order,
 * so by its position TRANS_STATE_MAX equals the number of states listed
 * before it.
 *
 * NOTE(review): the meaning of each state and the allowed transitions are
 * defined by the transaction commit code, not by this header - confirm there.
 */
enum btrfs_trans_state {
	TRANS_STATE_RUNNING,
	TRANS_STATE_COMMIT_PREP,
	TRANS_STATE_COMMIT_START,
	TRANS_STATE_COMMIT_DOING,
	TRANS_STATE_UNBLOCKED,
	TRANS_STATE_SUPER_COMMITTED,
	TRANS_STATE_COMPLETED,
	TRANS_STATE_MAX,
};
|
|
|
|
/*
 * Per-transaction flag numbers.
 *
 * NOTE(review): presumably these are bit numbers for btrfs_transaction::flags
 * (used with the set_bit()/test_bit() family) - confirm against the users.
 */
#define BTRFS_TRANS_HAVE_FREE_BGS	0
#define BTRFS_TRANS_DIRTY_BG_RUN	1
#define BTRFS_TRANS_CACHE_ENOSPC	2
|
|
|
|
struct btrfs_transaction {
|
|
u64 transid;
|
|
/*
|
|
* total external writers(USERSPACE/START/ATTACH) in this
|
|
* transaction, it must be zero before the transaction is
|
|
* being committed
|
|
*/
|
|
atomic_t num_extwriters;
|
|
/*
|
|
* total writers in this transaction, it must be zero before the
|
|
* transaction can end
|
|
*/
|
|
atomic_t num_writers;
|
|
refcount_t use_count;
|
|
|
|
unsigned long flags;
|
|
|
|
/* Be protected by fs_info->trans_lock when we want to change it. */
|
|
enum btrfs_trans_state state;
|
|
int aborted;
|
|
struct list_head list;
|
|
struct extent_io_tree dirty_pages;
|
|
time64_t start_time;
|
|
wait_queue_head_t writer_wait;
|
|
wait_queue_head_t commit_wait;
|
|
struct list_head pending_snapshots;
|
|
struct list_head dev_update_list;
|
|
struct list_head switch_commits;
|
|
struct list_head dirty_bgs;
|
|
|
|
/*
|
|
* There is no explicit lock which protects io_bgs, rather its
|
|
* consistency is implied by the fact that all the sites which modify
|
|
* it do so under some form of transaction critical section, namely:
|
|
*
|
|
* - btrfs_start_dirty_block_groups - This function can only ever be
|
|
* run by one of the transaction committers. Refer to
|
|
* BTRFS_TRANS_DIRTY_BG_RUN usage in btrfs_commit_transaction
|
|
*
|
|
* - btrfs_write_dirty_blockgroups - this is called by
|
|
* commit_cowonly_roots from transaction critical section
|
|
* (TRANS_STATE_COMMIT_DOING)
|
|
*
|
|
* - btrfs_cleanup_dirty_bgs - called on transaction abort
|
|
*/
|
|
struct list_head io_bgs;
|
|
struct list_head dropped_roots;
|
|
struct extent_io_tree pinned_extents;
|
|
|
|
/*
|
|
* we need to make sure block group deletion doesn't race with
|
|
* free space cache writeout. This mutex keeps them from stomping
|
|
* on each other
|
|
*/
|
|
struct mutex cache_write_mutex;
|
|
spinlock_t dirty_bgs_lock;
|
|
/* Protected by spin lock fs_info->unused_bgs_lock. */
|
|
struct list_head deleted_bgs;
|
|
spinlock_t dropped_roots_lock;
|
|
struct btrfs_delayed_ref_root delayed_refs;
|
|
struct btrfs_fs_info *fs_info;
|
|
|
|
/*
|
|
* Number of ordered extents the transaction must wait for before
|
|
* committing. These are ordered extents started by a fast fsync.
|
|
*/
|
|
atomic_t pending_ordered;
|
|
wait_queue_head_t pending_wait;
|
|
};
|
|
|
|
enum {
|
|
ENUM_BIT(__TRANS_FREEZABLE),
|
|
ENUM_BIT(__TRANS_START),
|
|
ENUM_BIT(__TRANS_ATTACH),
|
|
ENUM_BIT(__TRANS_JOIN),
|
|
ENUM_BIT(__TRANS_JOIN_NOLOCK),
|
|
ENUM_BIT(__TRANS_DUMMY),
|
|
ENUM_BIT(__TRANS_JOIN_NOSTART),
|
|
};
|
|
|
|
/*
 * Masks built from the __TRANS_* bits.
 *
 * TRANS_START and TRANS_JOIN additionally carry __TRANS_FREEZABLE, the other
 * masks are plain aliases of a single bit.
 */
#define TRANS_START		(__TRANS_START | __TRANS_FREEZABLE)
#define TRANS_ATTACH		(__TRANS_ATTACH)
#define TRANS_JOIN		(__TRANS_JOIN | __TRANS_FREEZABLE)
#define TRANS_JOIN_NOLOCK	(__TRANS_JOIN_NOLOCK)
#define TRANS_JOIN_NOSTART	(__TRANS_JOIN_NOSTART)

/* The two bits that TRANS_START and TRANS_ATTACH are built on. */
#define TRANS_EXTWRITERS	(__TRANS_START | __TRANS_ATTACH)
|
|
|
|
struct btrfs_trans_handle {
|
|
u64 transid;
|
|
u64 bytes_reserved;
|
|
u64 chunk_bytes_reserved;
|
|
unsigned long delayed_ref_updates;
|
|
unsigned long delayed_ref_csum_deletions;
|
|
struct btrfs_transaction *transaction;
|
|
struct btrfs_block_rsv *block_rsv;
|
|
struct btrfs_block_rsv *orig_rsv;
|
|
/* Set by a task that wants to create a snapshot. */
|
|
struct btrfs_pending_snapshot *pending_snapshot;
|
|
refcount_t use_count;
|
|
unsigned int type;
|
|
/*
|
|
* Error code of transaction abort, set outside of locks and must use
|
|
* the READ_ONCE/WRITE_ONCE access
|
|
*/
|
|
short aborted;
|
|
bool adding_csums;
|
|
bool allocating_chunk;
|
|
bool removing_chunk;
|
|
bool reloc_reserved;
|
|
bool in_fsync;
|
|
struct btrfs_fs_info *fs_info;
|
|
struct list_head new_bgs;
|
|
};
|
|
|
|
/*
 * Check whether a transaction or transaction handle has been aborted.
 *
 * The abort status can be changed between calls and is not protected by locks.
 * This accepts btrfs_transaction and btrfs_trans_handle as types. Once it's
 * set to a non-zero value it does not change, so the macro should be used in
 * checks but is not necessary for further reads of the value.
 */
#define TRANS_ABORTED(trans)		(unlikely(READ_ONCE((trans)->aborted)))
|
|
|
|
struct btrfs_pending_snapshot {
|
|
struct dentry *dentry;
|
|
struct inode *dir;
|
|
struct btrfs_root *root;
|
|
struct btrfs_root_item *root_item;
|
|
struct btrfs_root *snap;
|
|
struct btrfs_qgroup_inherit *inherit;
|
|
struct btrfs_path *path;
|
|
/* block reservation for the operation */
|
|
struct btrfs_block_rsv block_rsv;
|
|
/* extra metadata reservation for relocation */
|
|
int error;
|
|
/* Preallocated anonymous block device number */
|
|
dev_t anon_dev;
|
|
bool readonly;
|
|
struct list_head list;
|
|
};
|
|
|
|
static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
|
|
struct btrfs_inode *inode)
|
|
{
|
|
spin_lock(&inode->lock);
|
|
inode->last_trans = trans->transaction->transid;
|
|
inode->last_sub_trans = inode->root->log_transid;
|
|
inode->last_log_commit = inode->last_sub_trans - 1;
|
|
spin_unlock(&inode->lock);
|
|
}
|
|
|
|
/*
|
|
* Make qgroup codes to skip given qgroupid, means the old/new_roots for
|
|
* qgroup won't contain the qgroupid in it.
|
|
*/
|
|
static inline void btrfs_set_skip_qgroup(struct btrfs_trans_handle *trans,
|
|
u64 qgroupid)
|
|
{
|
|
struct btrfs_delayed_ref_root *delayed_refs;
|
|
|
|
delayed_refs = &trans->transaction->delayed_refs;
|
|
WARN_ON(delayed_refs->qgroup_to_skip);
|
|
delayed_refs->qgroup_to_skip = qgroupid;
|
|
}
|
|
|
|
static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
|
|
{
|
|
struct btrfs_delayed_ref_root *delayed_refs;
|
|
|
|
delayed_refs = &trans->transaction->delayed_refs;
|
|
WARN_ON(!delayed_refs->qgroup_to_skip);
|
|
delayed_refs->qgroup_to_skip = 0;
|
|
}
|
|
|
|
bool __cold abort_should_print_stack(int errno);
|
|
|
|
/*
 * Call btrfs_abort_transaction as early as possible when an error condition is
 * detected, that way the exact stack trace is reported for some errors.
 *
 * @trans: transaction handle; expanded more than once, so it must not have
 *         side effects
 * @error: error code that caused the abort; evaluated exactly once
 *
 * Only the first abort since mount (the one that sets
 * BTRFS_FS_STATE_TRANS_ABORTED) prints a message, either a WARN() with a stack
 * trace or a plain btrfs_err(), depending on abort_should_print_stack().
 * __btrfs_abort_transaction() is called every time.
 *
 * The locals carry a double underscore prefix so that they cannot shadow a
 * caller variable that appears in the arguments (e.g. one named 'first').
 */
#define btrfs_abort_transaction(trans, error)				\
do {									\
	const int __error = (error);					\
	bool __first = false;						\
	/* Report first abort since mount */				\
	if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,		\
			&((trans)->fs_info->fs_state))) {		\
		__first = true;						\
		if (WARN(abort_should_print_stack(__error),		\
			KERN_ERR					\
			"BTRFS: Transaction aborted (error %d)\n",	\
			__error)) {					\
			/* Stack trace printed. */			\
		} else {						\
			btrfs_err((trans)->fs_info,			\
				  "Transaction aborted (error %d)",	\
				  __error);				\
		}							\
	}								\
	__btrfs_abort_transaction((trans), __func__,			\
				  __LINE__, __error, __first);		\
} while (0)
|
|
|
|
int btrfs_end_transaction(struct btrfs_trans_handle *trans);
|
|
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
|
|
unsigned int num_items);
|
|
struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
|
|
struct btrfs_root *root,
|
|
unsigned int num_items);
|
|
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
|
|
struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root);
|
|
struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root);
|
|
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
|
|
struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
|
|
struct btrfs_root *root);
|
|
int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid);
|
|
|
|
void btrfs_add_dead_root(struct btrfs_root *root);
|
|
int btrfs_defrag_root(struct btrfs_root *root);
|
|
void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info);
|
|
int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info);
|
|
int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
|
|
void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans);
|
|
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
|
|
bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
|
|
void btrfs_throttle(struct btrfs_fs_info *fs_info);
|
|
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
|
|
struct btrfs_root *root);
|
|
int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
|
|
struct extent_io_tree *dirty_pages, int mark);
|
|
int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark);
|
|
int btrfs_transaction_blocked(struct btrfs_fs_info *info);
|
|
int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
|
|
void btrfs_put_transaction(struct btrfs_transaction *transaction);
|
|
void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
|
|
struct btrfs_root *root);
|
|
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
|
|
void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
|
|
const char *function,
|
|
unsigned int line, int errno, bool first_hit);
|
|
|
|
int __init btrfs_transaction_init(void);
|
|
void __cold btrfs_transaction_exit(void);
|
|
|
|
#endif
|