Mirror of https://github.com/edk2-porting/linux-next.git, synced 2024-12-17 09:43:59 +08:00
for-5.13-tag

-----BEGIN PGP SIGNATURE-----

iQIzBAABCgAdFiEE8rQSAMVO+zA4DBdWxWXV+ddtWDsFAmCHGMYACgkQxWXV+ddt
WDsFeA/+MZ+5UiYYucH5RVw/VExOQzSvlRVxxnOeR2s8V/gFj/Ip7d9E9UezA+lX
di2byKXPzfjL9+xoyqfEZLpPgDQrhHTIQCzuNSvzMBykJIL6Sf1OZTtUZWU3HDH7
S51UZtghgTPzeOhxsiBHqSFo9danT0w3KQhliE10Ur855ziKSvL2Tb7dvM6q5TS7
mFTAj/Y2aanDaFKQjjBzzA+GZ0LFIGuErg1PADmF5XjbyY06ho1xqQ1A0t2/XL9x
UpHdRP3E5XRAMl4uyYOUtbvUB1cROzoS6ySHJOJ9Bbz+IC0cLf5xTJkLE25bGkSi
GjFNvnQOha1s8oMIlqkw64hKQqwp+gu2iZ7m1o76Z31k7CpLAC+rg11gbpODRuoh
7B3EzowKyVihMHF8URAdC3A+9gbpPvyuGKDSy07yULh/2vas6dEzR3cPVEU0yXyJ
3DO1ds0lVY3B/T9LKPQ785hQ7VdpgZ8BdIOVRtjgV2QQEa9eFh9VzybQjU8yBXGd
vflBe8kQfASIZ5E0rcUGPUVIJoesM8U1pSlx9jvvTQVkOC/DQjtBx/5ePCL2iVfd
izY8uWlCdguF/P1CYFf1M0auASSzl3bip1NnSMVvZ90dgDEK4XaIyd16kMGDCbU2
UMOePMsLDApWcCVTqM/J+lFLa7rajRccdKby7F/zSpZIRgadPF8=
=J5jh
-----END PGP SIGNATURE-----

Merge tag 'for-5.13-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs updates from David Sterba:
 "The updates this time are mostly stabilization, preparation and minor
  improvements.

  User visible improvements:

   - readahead for send, improving run time of full send by 10% and for
     incremental by 25%

   - make reflinks respect O_SYNC, O_DSYNC and S_SYNC flags

   - export supported sectorsize values in sysfs (currently only page
     size, more once full subpage support lands)

   - more graceful errors and warnings on 32bit systems when logical
     addresses for metadata reach the limit posed by unsigned long in
     page::index
       - error: fail mount if there's a metadata block beyond the limit
       - error: new metadata block would be at unreachable address
       - warn when 5/8th of the limit is reached, for 4K page systems
         it's 10T, for 64K page it's 160T

   - zoned mode
       - relocated zones get reset at the end instead of discard
       - automatic background reclaim of zones that have 75%+ of
         unusable space, the threshold is tunable in sysfs

  Fixes:

   - fsync and tree mod log fixes

   - fix inefficient preemptive reclaim calculations

   - fix exhaustion of the system chunk array due to concurrent
     allocations

   - fix fallback to no compression when racing with remount

   - preemptive fix for dm-crypt on zoned device that does not properly
     advertise zoned support

  Core changes:

   - add inode lock to synchronize mmap and other block updates (eg.
     deduplication, fallocate, fsync)

   - kmap conversions to new kmap_local API

   - subpage support (continued)
       - new helpers for page state/extent buffer tracking
       - metadata changes now support read and write

   - error handling throughout relocation call paths

   - many other cleanups and code simplifications"

* tag 'for-5.13-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (112 commits)
  btrfs: zoned: automatically reclaim zones
  btrfs: rename delete_unused_bgs_mutex to reclaim_bgs_lock
  btrfs: zoned: reset zones of relocated block groups
  btrfs: more graceful errors/warnings on 32bit systems when reaching limits
  btrfs: zoned: fix unpaired block group unfreeze during device replace
  btrfs: fix race when picking most recent mod log operation for an old root
  btrfs: fix metadata extent leak after failure to create subvolume
  btrfs: handle remount to no compress during compression
  btrfs: zoned: fail mount if the device does not support zone append
  btrfs: fix race between transaction aborts and fsyncs leading to use-after-free
  btrfs: introduce submit_eb_subpage() to submit a subpage metadata page
  btrfs: make lock_extent_buffer_for_io() to be subpage compatible
  btrfs: introduce write_one_subpage_eb() function
  btrfs: introduce end_bio_subpage_eb_writepage() function
  btrfs: check return value of btrfs_commit_transaction in relocation
  btrfs: do proper error handling in merge_reloc_roots
  btrfs: handle extent corruption with select_one_root properly
  btrfs: cleanup error handling in prepare_to_merge
  btrfs: do not panic in __add_reloc_root
  btrfs: handle __add_reloc_root failures in btrfs_recover_relocation
  ...
commit 55ba0fe059
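A note on the 32bit limits quoted above: page::index is an unsigned long, so a 32bit kernel can address at most (ULONG_MAX + 1) << PAGE_SHIFT bytes of metadata space, and the early-warning threshold added by this series (see the BTRFS_32BIT_MAX_FILE_SIZE / BTRFS_32BIT_EARLY_WARN_THRESHOLD hunk further down) is 5/8 of that limit. The following stand-alone snippet is only an illustration added here to check the quoted numbers; it is not part of the kernel changes:

#include <stdint.h>
#include <stdio.h>

/*
 * On a 32bit kernel page->index is a 32-bit unsigned long, so at most
 * (ULONG_MAX + 1) pages of metadata can be addressed.  The early warning
 * threshold used by the series is 5/8 of that limit.
 */
int main(void)
{
	const uint64_t nr_pages = 1ULL << 32;      /* ULONG_MAX + 1 on 32bit */
	const int page_shift[] = { 12, 16 };       /* 4K and 64K pages */

	for (int i = 0; i < 2; i++) {
		uint64_t limit = nr_pages << page_shift[i];
		uint64_t warn = limit / 8 * 5;

		printf("%3u KiB pages: limit %4llu TiB, warn at %4llu TiB\n",
		       1U << (page_shift[i] - 10),
		       (unsigned long long)(limit >> 40),
		       (unsigned long long)(warn >> 40));
	}
	return 0;	/* prints 16/10 TiB for 4K pages, 256/160 TiB for 64K */
}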
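The "kmap conversions to new kmap_local API" item refers to replacing kmap()/kunmap() with kmap_local_page()/kunmap_local(), visible in several hunks below that note "Pages must be unmapped in reverse order". A minimal sketch of the pattern follows; it uses a made-up helper name and is not taken from this series. Local mappings are CPU-local and stack-ordered, so when several are held at once they must be released in reverse order of creation:

#include <linux/highmem.h>
#include <linux/string.h>

/*
 * Illustrative only (hypothetical helper, not from this series): copy two
 * pages into a buffer using local mappings.  kmap_local_page() mappings are
 * CPU-local and stack-like, so when more than one is held at a time they
 * must be released with kunmap_local() in the reverse order of mapping.
 */
static void copy_two_pages(struct page *a, struct page *b, char *dst)
{
	char *va = kmap_local_page(a);
	char *vb = kmap_local_page(b);

	memcpy(dst, va, PAGE_SIZE);
	memcpy(dst + PAGE_SIZE, vb, PAGE_SIZE);

	kunmap_local(vb);	/* reverse order: last mapped, first unmapped */
	kunmap_local(va);
}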
@@ -30,7 +30,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
 	   uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
 	   block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
-	   subpage.o
+	   subpage.o tree-mod-log.o

 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
@@ -14,6 +14,7 @@
 #include "delayed-ref.h"
 #include "locking.h"
 #include "misc.h"
+#include "tree-mod-log.h"

 /* Just an arbitrary number so we can be sure this happened */
 #define BACKREF_FOUND_SHARED 6
@@ -452,7 +453,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 		if (path->slots[0] >= btrfs_header_nritems(eb) ||
 		    is_shared_data_backref(preftrees, eb->start) ||
 		    ref->root_id != btrfs_header_owner(eb)) {
-			if (time_seq == SEQ_LAST)
+			if (time_seq == BTRFS_SEQ_LAST)
 				ret = btrfs_next_leaf(root, path);
 			else
 				ret = btrfs_next_old_leaf(root, path, time_seq);
@@ -476,7 +477,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 		if (slot == 0 &&
 		    (is_shared_data_backref(preftrees, eb->start) ||
 		     ref->root_id != btrfs_header_owner(eb))) {
-			if (time_seq == SEQ_LAST)
+			if (time_seq == BTRFS_SEQ_LAST)
 				ret = btrfs_next_leaf(root, path);
 			else
 				ret = btrfs_next_old_leaf(root, path, time_seq);
@@ -514,7 +515,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 			eie = NULL;
 		}
 next:
-		if (time_seq == SEQ_LAST)
+		if (time_seq == BTRFS_SEQ_LAST)
 			ret = btrfs_next_item(root, path);
 		else
 			ret = btrfs_next_old_item(root, path, time_seq);
@@ -574,7 +575,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,

 	if (path->search_commit_root)
 		root_level = btrfs_header_level(root->commit_root);
-	else if (time_seq == SEQ_LAST)
+	else if (time_seq == BTRFS_SEQ_LAST)
 		root_level = btrfs_header_level(root->node);
 	else
 		root_level = btrfs_old_root_level(root, time_seq);
@@ -605,7 +606,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 	    search_key.offset >= LLONG_MAX)
 		search_key.offset = 0;
 	path->lowest_level = level;
-	if (time_seq == SEQ_LAST)
+	if (time_seq == BTRFS_SEQ_LAST)
 		ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
 	else
 		ret = btrfs_search_old_slot(root, &search_key, path, time_seq);
@@ -1147,8 +1148,8 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
  * indirect refs to their parent bytenr.
  * When roots are found, they're added to the roots list
  *
- * If time_seq is set to SEQ_LAST, it will not search delayed_refs, and behave
- * much like trans == NULL case, the difference only lies in it will not
+ * If time_seq is set to BTRFS_SEQ_LAST, it will not search delayed_refs, and
+ * behave much like trans == NULL case, the difference only lies in it will not
  * commit root.
  * The special case is for qgroup to search roots in commit_transaction().
  *
@@ -1199,7 +1200,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 		path->skip_locking = 1;
 	}

-	if (time_seq == SEQ_LAST)
+	if (time_seq == BTRFS_SEQ_LAST)
 		path->skip_locking = 1;

 	/*
@@ -1217,9 +1218,9 @@ again:

 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 	if (trans && likely(trans->type != __TRANS_DUMMY) &&
-	    time_seq != SEQ_LAST) {
+	    time_seq != BTRFS_SEQ_LAST) {
 #else
-	if (trans && time_seq != SEQ_LAST) {
+	if (trans && time_seq != BTRFS_SEQ_LAST) {
 #endif
 		/*
 		 * look if there are updates for this ref queued and lock the
@@ -1527,7 +1528,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
 	struct btrfs_trans_handle *trans;
 	struct ulist_iterator uiter;
 	struct ulist_node *node;
-	struct seq_list elem = SEQ_LIST_INIT(elem);
+	struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
 	int ret = 0;
 	struct share_check shared = {
 		.root_objectid = root->root_key.objectid,
@@ -1953,7 +1954,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 	struct ulist *roots = NULL;
 	struct ulist_node *ref_node = NULL;
 	struct ulist_node *root_node = NULL;
-	struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
+	struct btrfs_seq_list seq_elem = BTRFS_SEQ_LIST_INIT(seq_elem);
 	struct ulist_iterator ref_uiter;
 	struct ulist_iterator root_uiter;

@@ -1971,12 +1972,12 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 	}

 	if (trans)
-		btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+		btrfs_get_tree_mod_seq(fs_info, &seq_elem);
 	else
 		down_read(&fs_info->commit_root_sem);

 	ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
-				   tree_mod_seq_elem.seq, &refs,
+				   seq_elem.seq, &refs,
 				   &extent_item_pos, ignore_offset);
 	if (ret)
 		goto out;
@@ -1984,7 +1985,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 	ULIST_ITER_INIT(&ref_uiter);
 	while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
 		ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val,
-						tree_mod_seq_elem.seq, &roots,
+						seq_elem.seq, &roots,
 						ignore_offset);
 		if (ret)
 			break;
@@ -2007,7 +2008,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 	free_leaf_list(refs);
 out:
 	if (trans) {
-		btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+		btrfs_put_tree_mod_seq(fs_info, &seq_elem);
 		btrfs_end_transaction(trans);
 	} else {
 		up_read(&fs_info->commit_root_sem);
@@ -1289,7 +1289,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 	 * Long running balances can keep us blocked here for eternity, so
 	 * simply skip deletion if we're unable to get the mutex.
 	 */
-	if (!mutex_trylock(&fs_info->delete_unused_bgs_mutex))
+	if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
 		return;

 	spin_lock(&fs_info->unused_bgs_lock);
@@ -1462,12 +1462,12 @@ next:
 		spin_lock(&fs_info->unused_bgs_lock);
 	}
 	spin_unlock(&fs_info->unused_bgs_lock);
-	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+	mutex_unlock(&fs_info->reclaim_bgs_lock);
 	return;

 flip_async:
 	btrfs_end_transaction(trans);
-	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+	mutex_unlock(&fs_info->reclaim_bgs_lock);
 	btrfs_put_block_group(block_group);
 	btrfs_discard_punt_unused_bgs_list(fs_info);
 }
@ -1485,6 +1485,97 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
|
||||
spin_unlock(&fs_info->unused_bgs_lock);
|
||||
}
|
||||
|
||||
void btrfs_reclaim_bgs_work(struct work_struct *work)
|
||||
{
|
||||
struct btrfs_fs_info *fs_info =
|
||||
container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
|
||||
struct btrfs_block_group *bg;
|
||||
struct btrfs_space_info *space_info;
|
||||
int ret;
|
||||
|
||||
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
|
||||
return;
|
||||
|
||||
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
|
||||
return;
|
||||
|
||||
mutex_lock(&fs_info->reclaim_bgs_lock);
|
||||
spin_lock(&fs_info->unused_bgs_lock);
|
||||
while (!list_empty(&fs_info->reclaim_bgs)) {
|
||||
bg = list_first_entry(&fs_info->reclaim_bgs,
|
||||
struct btrfs_block_group,
|
||||
bg_list);
|
||||
list_del_init(&bg->bg_list);
|
||||
|
||||
space_info = bg->space_info;
|
||||
spin_unlock(&fs_info->unused_bgs_lock);
|
||||
|
||||
/* Don't race with allocators so take the groups_sem */
|
||||
down_write(&space_info->groups_sem);
|
||||
|
||||
spin_lock(&bg->lock);
|
||||
if (bg->reserved || bg->pinned || bg->ro) {
|
||||
/*
|
||||
* We want to bail if we made new allocations or have
|
||||
* outstanding allocations in this block group. We do
|
||||
* the ro check in case balance is currently acting on
|
||||
* this block group.
|
||||
*/
|
||||
spin_unlock(&bg->lock);
|
||||
up_write(&space_info->groups_sem);
|
||||
goto next;
|
||||
}
|
||||
spin_unlock(&bg->lock);
|
||||
|
||||
/* Get out fast, in case we're unmounting the filesystem */
|
||||
if (btrfs_fs_closing(fs_info)) {
|
||||
up_write(&space_info->groups_sem);
|
||||
goto next;
|
||||
}
|
||||
|
||||
ret = inc_block_group_ro(bg, 0);
|
||||
up_write(&space_info->groups_sem);
|
||||
if (ret < 0)
|
||||
goto next;
|
||||
|
||||
btrfs_info(fs_info, "reclaiming chunk %llu with %llu%% used",
|
||||
bg->start, div_u64(bg->used * 100, bg->length));
|
||||
trace_btrfs_reclaim_block_group(bg);
|
||||
ret = btrfs_relocate_chunk(fs_info, bg->start);
|
||||
if (ret)
|
||||
btrfs_err(fs_info, "error relocating chunk %llu",
|
||||
bg->start);
|
||||
|
||||
next:
|
||||
btrfs_put_block_group(bg);
|
||||
spin_lock(&fs_info->unused_bgs_lock);
|
||||
}
|
||||
spin_unlock(&fs_info->unused_bgs_lock);
|
||||
mutex_unlock(&fs_info->reclaim_bgs_lock);
|
||||
btrfs_exclop_finish(fs_info);
|
||||
}
|
||||
|
||||
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
|
||||
{
|
||||
spin_lock(&fs_info->unused_bgs_lock);
|
||||
if (!list_empty(&fs_info->reclaim_bgs))
|
||||
queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
|
||||
spin_unlock(&fs_info->unused_bgs_lock);
|
||||
}
|
||||
|
||||
void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
|
||||
{
|
||||
struct btrfs_fs_info *fs_info = bg->fs_info;
|
||||
|
||||
spin_lock(&fs_info->unused_bgs_lock);
|
||||
if (list_empty(&bg->bg_list)) {
|
||||
btrfs_get_block_group(bg);
|
||||
trace_btrfs_add_reclaim_block_group(bg);
|
||||
list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
|
||||
}
|
||||
spin_unlock(&fs_info->unused_bgs_lock);
|
||||
}
|
||||
|
||||
static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
|
||||
struct btrfs_path *path)
|
||||
{
|
||||
@ -2267,16 +2358,19 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
|
||||
struct btrfs_trans_handle *trans;
|
||||
u64 alloc_flags;
|
||||
int ret;
|
||||
bool dirty_bg_running;
|
||||
|
||||
again:
|
||||
do {
|
||||
trans = btrfs_join_transaction(fs_info->extent_root);
|
||||
if (IS_ERR(trans))
|
||||
return PTR_ERR(trans);
|
||||
|
||||
dirty_bg_running = false;
|
||||
|
||||
/*
|
||||
* we're not allowed to set block groups readonly after the dirty
|
||||
* block groups cache has started writing. If it already started,
|
||||
* back off and let this transaction commit
|
||||
* We're not allowed to set block groups readonly after the dirty
|
||||
* block group cache has started writing. If it already started,
|
||||
* back off and let this transaction commit.
|
||||
*/
|
||||
mutex_lock(&fs_info->ro_block_group_mutex);
|
||||
if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
|
||||
@ -2288,8 +2382,9 @@ again:
|
||||
ret = btrfs_wait_for_commit(fs_info, transid);
|
||||
if (ret)
|
||||
return ret;
|
||||
goto again;
|
||||
dirty_bg_running = true;
|
||||
}
|
||||
} while (dirty_bg_running);
|
||||
|
||||
if (do_chunk_alloc) {
|
||||
/*
|
||||
@ -3269,6 +3364,7 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
|
||||
*/
|
||||
void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
|
||||
{
|
||||
struct btrfs_transaction *cur_trans = trans->transaction;
|
||||
struct btrfs_fs_info *fs_info = trans->fs_info;
|
||||
struct btrfs_space_info *info;
|
||||
u64 left;
|
||||
@ -3283,6 +3379,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
|
||||
lockdep_assert_held(&fs_info->chunk_mutex);
|
||||
|
||||
info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
|
||||
again:
|
||||
spin_lock(&info->lock);
|
||||
left = info->total_bytes - btrfs_space_info_used(info, true);
|
||||
spin_unlock(&info->lock);
|
||||
@ -3301,6 +3398,58 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
|
||||
|
||||
if (left < thresh) {
|
||||
u64 flags = btrfs_system_alloc_profile(fs_info);
|
||||
u64 reserved = atomic64_read(&cur_trans->chunk_bytes_reserved);
|
||||
|
||||
/*
|
||||
* If there's not available space for the chunk tree (system
|
||||
* space) and there are other tasks that reserved space for
|
||||
* creating a new system block group, wait for them to complete
|
||||
* the creation of their system block group and release excess
|
||||
* reserved space. We do this because:
|
||||
*
|
||||
* *) We can end up allocating more system chunks than necessary
|
||||
* when there are multiple tasks that are concurrently
|
||||
* allocating block groups, which can lead to exhaustion of
|
||||
* the system array in the superblock;
|
||||
*
|
||||
* *) If we allocate extra and unnecessary system block groups,
|
||||
* despite being empty for a long time, and possibly forever,
|
||||
* they end not being added to the list of unused block groups
|
||||
* because that typically happens only when deallocating the
|
||||
* last extent from a block group - which never happens since
|
||||
* we never allocate from them in the first place. The few
|
||||
* exceptions are when mounting a filesystem or running scrub,
|
||||
* which add unused block groups to the list of unused block
|
||||
* groups, to be deleted by the cleaner kthread.
|
||||
* And even when they are added to the list of unused block
|
||||
* groups, it can take a long time until they get deleted,
|
||||
* since the cleaner kthread might be sleeping or busy with
|
||||
* other work (deleting subvolumes, running delayed iputs,
|
||||
* defrag scheduling, etc);
|
||||
*
|
||||
* This is rare in practice, but can happen when too many tasks
|
||||
* are allocating blocks groups in parallel (via fallocate())
|
||||
* and before the one that reserved space for a new system block
|
||||
* group finishes the block group creation and releases the space
|
||||
* reserved in excess (at btrfs_create_pending_block_groups()),
|
||||
* other tasks end up here and see free system space temporarily
|
||||
* not enough for updating the chunk tree.
|
||||
*
|
||||
* We unlock the chunk mutex before waiting for such tasks and
|
||||
* lock it again after the wait, otherwise we would deadlock.
|
||||
* It is safe to do so because allocating a system chunk is the
|
||||
* first thing done while allocating a new block group.
|
||||
*/
|
||||
if (reserved > trans->chunk_bytes_reserved) {
|
||||
const u64 min_needed = reserved - thresh;
|
||||
|
||||
mutex_unlock(&fs_info->chunk_mutex);
|
||||
wait_event(cur_trans->chunk_reserve_wait,
|
||||
atomic64_read(&cur_trans->chunk_bytes_reserved) <=
|
||||
min_needed);
|
||||
mutex_lock(&fs_info->chunk_mutex);
|
||||
goto again;
|
||||
}
|
||||
|
||||
/*
|
||||
* Ignore failure to create system chunk. We might end up not
|
||||
@ -3315,9 +3464,11 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
|
||||
ret = btrfs_block_rsv_add(fs_info->chunk_root,
|
||||
&fs_info->chunk_block_rsv,
|
||||
thresh, BTRFS_RESERVE_NO_FLUSH);
|
||||
if (!ret)
|
||||
if (!ret) {
|
||||
atomic64_add(thresh, &cur_trans->chunk_bytes_reserved);
|
||||
trans->chunk_bytes_reserved += thresh;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
|
||||
@ -3386,6 +3537,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
|
||||
}
|
||||
spin_unlock(&info->unused_bgs_lock);
|
||||
|
||||
spin_lock(&info->unused_bgs_lock);
|
||||
while (!list_empty(&info->reclaim_bgs)) {
|
||||
block_group = list_first_entry(&info->reclaim_bgs,
|
||||
struct btrfs_block_group,
|
||||
bg_list);
|
||||
list_del_init(&block_group->bg_list);
|
||||
btrfs_put_block_group(block_group);
|
||||
}
|
||||
spin_unlock(&info->unused_bgs_lock);
|
||||
|
||||
spin_lock(&info->block_group_cache_lock);
|
||||
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
|
||||
block_group = rb_entry(n, struct btrfs_block_group,
|
||||
|
@ -264,6 +264,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
|
||||
u64 group_start, struct extent_map *em);
|
||||
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
|
||||
void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
|
||||
void btrfs_reclaim_bgs_work(struct work_struct *work);
|
||||
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
|
||||
void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
|
||||
int btrfs_read_block_groups(struct btrfs_fs_info *info);
|
||||
int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
|
||||
u64 type, u64 chunk_offset, u64 size);
|
||||
|
@ -220,6 +220,7 @@ struct btrfs_inode {
|
||||
/* Hook into fs_info->delayed_iputs */
|
||||
struct list_head delayed_iput;
|
||||
|
||||
struct rw_semaphore i_mmap_lock;
|
||||
struct inode vfs_inode;
|
||||
};
|
||||
|
||||
@ -299,24 +300,30 @@ static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
|
||||
mod);
|
||||
}
|
||||
|
||||
static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
|
||||
/*
|
||||
* Called every time after doing a buffered, direct IO or memory mapped write.
|
||||
*
|
||||
* This is to ensure that if we write to a file that was previously fsynced in
|
||||
* the current transaction, then try to fsync it again in the same transaction,
|
||||
* we will know that there were changes in the file and that it needs to be
|
||||
* logged.
|
||||
*/
|
||||
static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode)
|
||||
{
|
||||
int ret = 0;
|
||||
spin_lock(&inode->lock);
|
||||
inode->last_sub_trans = inode->root->log_transid;
|
||||
spin_unlock(&inode->lock);
|
||||
}
|
||||
|
||||
static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
|
||||
{
|
||||
bool ret = false;
|
||||
|
||||
spin_lock(&inode->lock);
|
||||
if (inode->logged_trans == generation &&
|
||||
inode->last_sub_trans <= inode->last_log_commit &&
|
||||
inode->last_sub_trans <= inode->root->last_log_commit) {
|
||||
/*
|
||||
* After a ranged fsync we might have left some extent maps
|
||||
* (that fall outside the fsync's range). So return false
|
||||
* here if the list isn't empty, to make sure btrfs_log_inode()
|
||||
* will be called and process those extent maps.
|
||||
*/
|
||||
smp_mb();
|
||||
if (list_empty(&inode->extent_tree.modified_extents))
|
||||
ret = 1;
|
||||
}
|
||||
inode->last_sub_trans <= inode->root->last_log_commit)
|
||||
ret = true;
|
||||
spin_unlock(&inode->lock);
|
||||
return ret;
|
||||
}
|
||||
|
@ -1555,10 +1555,11 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
|
||||
BUG_ON(!block_ctx->pagev);
|
||||
num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
|
||||
PAGE_SHIFT;
|
||||
/* Pages must be unmapped in reverse order */
|
||||
while (num_pages > 0) {
|
||||
num_pages--;
|
||||
if (block_ctx->datav[num_pages]) {
|
||||
kunmap(block_ctx->pagev[num_pages]);
|
||||
kunmap_local(block_ctx->datav[num_pages]);
|
||||
block_ctx->datav[num_pages] = NULL;
|
||||
}
|
||||
if (block_ctx->pagev[num_pages]) {
|
||||
@ -1637,7 +1638,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
|
||||
i = j;
|
||||
}
|
||||
for (i = 0; i < num_pages; i++)
|
||||
block_ctx->datav[i] = kmap(block_ctx->pagev[i]);
|
||||
block_ctx->datav[i] = kmap_local_page(block_ctx->pagev[i]);
|
||||
|
||||
return block_ctx->len;
|
||||
}
|
||||
@ -2677,7 +2678,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
|
||||
dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev);
|
||||
if (NULL != dev_state &&
|
||||
(bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) {
|
||||
unsigned int i = 0;
|
||||
int i = 0;
|
||||
u64 dev_bytenr;
|
||||
u64 cur_bytenr;
|
||||
struct bio_vec bvec;
|
||||
@ -2702,7 +2703,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
|
||||
|
||||
bio_for_each_segment(bvec, bio, iter) {
|
||||
BUG_ON(bvec.bv_len != PAGE_SIZE);
|
||||
mapped_datav[i] = kmap(bvec.bv_page);
|
||||
mapped_datav[i] = kmap_local_page(bvec.bv_page);
|
||||
i++;
|
||||
|
||||
if (dev_state->state->print_mask &
|
||||
@ -2715,8 +2716,9 @@ static void __btrfsic_submit_bio(struct bio *bio)
|
||||
mapped_datav, segs,
|
||||
bio, &bio_is_patched,
|
||||
bio->bi_opf);
|
||||
bio_for_each_segment(bvec, bio, iter)
|
||||
kunmap(bvec.bv_page);
|
||||
/* Unmap in reverse order */
|
||||
for (--i; i >= 0; i--)
|
||||
kunmap_local(mapped_datav[i]);
|
||||
kfree(mapped_datav);
|
||||
} else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) {
|
||||
if (dev_state->state->print_mask &
|
||||
|
@ -80,10 +80,15 @@ static int compression_compress_pages(int type, struct list_head *ws,
|
||||
case BTRFS_COMPRESS_NONE:
|
||||
default:
|
||||
/*
|
||||
* This can't happen, the type is validated several times
|
||||
* before we get here. As a sane fallback, return what the
|
||||
* callers will understand as 'no compression happened'.
|
||||
* This can happen when compression races with remount setting
|
||||
* it to 'no compress', while caller doesn't call
|
||||
* inode_need_compress() to check if we really need to
|
||||
* compress.
|
||||
*
|
||||
* Not a big deal, just need to inform caller that we
|
||||
* haven't allocated any pages yet.
|
||||
*/
|
||||
*out_pages = 0;
|
||||
return -E2BIG;
|
||||
}
|
||||
}
|
||||
@ -1611,7 +1616,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
|
||||
curr_sample_pos = 0;
|
||||
while (index < index_end) {
|
||||
page = find_get_page(inode->i_mapping, index);
|
||||
in_data = kmap(page);
|
||||
in_data = kmap_local_page(page);
|
||||
/* Handle case where the start is not aligned to PAGE_SIZE */
|
||||
i = start % PAGE_SIZE;
|
||||
while (i < PAGE_SIZE - SAMPLING_READ_SIZE) {
|
||||
@ -1624,7 +1629,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
|
||||
start += SAMPLING_INTERVAL;
|
||||
curr_sample_pos += SAMPLING_READ_SIZE;
|
||||
}
|
||||
kunmap(page);
|
||||
kunmap_local(in_data);
|
||||
put_page(page);
|
||||
|
||||
index++;
|
||||
|
fs/btrfs/ctree.c — 982 changed lines, diff suppressed because it is too large
@ -342,6 +342,27 @@ struct btrfs_node {
|
||||
struct btrfs_key_ptr ptrs[];
|
||||
} __attribute__ ((__packed__));
|
||||
|
||||
/* Read ahead values for struct btrfs_path.reada */
|
||||
enum {
|
||||
READA_NONE,
|
||||
READA_BACK,
|
||||
READA_FORWARD,
|
||||
/*
|
||||
* Similar to READA_FORWARD but unlike it:
|
||||
*
|
||||
* 1) It will trigger readahead even for leaves that are not close to
|
||||
* each other on disk;
|
||||
* 2) It also triggers readahead for nodes;
|
||||
* 3) During a search, even when a node or leaf is already in memory, it
|
||||
* will still trigger readahead for other nodes and leaves that follow
|
||||
* it.
|
||||
*
|
||||
* This is meant to be used only when we know we are iterating over the
|
||||
* entire tree or a very large part of it.
|
||||
*/
|
||||
READA_FORWARD_ALWAYS,
|
||||
};
|
||||
|
||||
/*
|
||||
* btrfs_paths remember the path taken from the root down to the leaf.
|
||||
* level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point
|
||||
@ -350,7 +371,6 @@ struct btrfs_node {
|
||||
* The slots array records the index of the item or block pointer
|
||||
* used while walking the tree.
|
||||
*/
|
||||
enum { READA_NONE, READA_BACK, READA_FORWARD };
|
||||
struct btrfs_path {
|
||||
struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
|
||||
int slots[BTRFS_MAX_LEVEL];
|
||||
@ -482,16 +502,6 @@ struct btrfs_discard_ctl {
|
||||
atomic64_t discard_bytes_saved;
|
||||
};
|
||||
|
||||
/* delayed seq elem */
|
||||
struct seq_list {
|
||||
struct list_head list;
|
||||
u64 seq;
|
||||
};
|
||||
|
||||
#define SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 }
|
||||
|
||||
#define SEQ_LAST ((u64)-1)
|
||||
|
||||
enum btrfs_orphan_cleanup_state {
|
||||
ORPHAN_CLEANUP_STARTED = 1,
|
||||
ORPHAN_CLEANUP_DONE = 2,
|
||||
@ -572,6 +582,15 @@ enum {
|
||||
|
||||
/* Indicate that we can't trust the free space tree for caching yet */
|
||||
BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED,
|
||||
|
||||
/* Indicate whether there are any tree modification log users */
|
||||
BTRFS_FS_TREE_MOD_LOG_USERS,
|
||||
|
||||
#if BITS_PER_LONG == 32
|
||||
/* Indicate if we have error/warn message printed on 32bit systems */
|
||||
BTRFS_FS_32BIT_ERROR,
|
||||
BTRFS_FS_32BIT_WARN,
|
||||
#endif
|
||||
};
|
||||
|
||||
/*
|
||||
@ -941,10 +960,16 @@ struct btrfs_fs_info {
|
||||
struct work_struct async_data_reclaim_work;
|
||||
struct work_struct preempt_reclaim_work;
|
||||
|
||||
/* Reclaim partially filled block groups in the background */
|
||||
struct work_struct reclaim_bgs_work;
|
||||
struct list_head reclaim_bgs;
|
||||
int bg_reclaim_threshold;
|
||||
|
||||
spinlock_t unused_bgs_lock;
|
||||
struct list_head unused_bgs;
|
||||
struct mutex unused_bg_unpin_mutex;
|
||||
struct mutex delete_unused_bgs_mutex;
|
||||
/* Protect block groups that are going to be deleted */
|
||||
struct mutex reclaim_bgs_lock;
|
||||
|
||||
/* Cached block sizes */
|
||||
u32 nodesize;
|
||||
@ -2691,7 +2716,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
|
||||
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
|
||||
struct btrfs_ref *generic_ref);
|
||||
|
||||
int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr);
|
||||
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
|
||||
|
||||
/*
|
||||
@ -2929,13 +2953,6 @@ static inline void btrfs_clear_sb_rdonly(struct super_block *sb)
|
||||
clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state);
|
||||
}
|
||||
|
||||
/* tree mod log functions from ctree.c */
|
||||
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
|
||||
struct seq_list *elem);
|
||||
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
|
||||
struct seq_list *elem);
|
||||
int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
|
||||
|
||||
/* root-item.c */
|
||||
int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
|
||||
u64 ref_id, u64 dirid, u64 sequence, const char *name,
|
||||
@ -3084,7 +3101,7 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path);
|
||||
blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
|
||||
int mirror_num, unsigned long bio_flags);
|
||||
int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
|
||||
struct page *page, u64 start, u64 end, int mirror);
|
||||
struct page *page, u64 start, u64 end);
|
||||
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
|
||||
u64 start, u64 len);
|
||||
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
|
||||
@ -3179,6 +3196,7 @@ extern const struct iomap_dio_ops btrfs_dio_ops;
|
||||
/* Inode locking type flags, by default the exclusive lock is taken */
|
||||
#define BTRFS_ILOCK_SHARED (1U << 0)
|
||||
#define BTRFS_ILOCK_TRY (1U << 1)
|
||||
#define BTRFS_ILOCK_MMAP (1U << 2)
|
||||
|
||||
int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags);
|
||||
void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags);
|
||||
@ -3217,8 +3235,9 @@ extern const struct file_operations btrfs_file_operations;
|
||||
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
|
||||
struct btrfs_root *root, struct btrfs_inode *inode,
|
||||
struct btrfs_drop_extents_args *args);
|
||||
int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
|
||||
const u64 start, const u64 end,
|
||||
int btrfs_replace_file_extents(struct btrfs_inode *inode,
|
||||
struct btrfs_path *path, const u64 start,
|
||||
const u64 end,
|
||||
struct btrfs_replace_extent_info *extent_info,
|
||||
struct btrfs_trans_handle **trans_out);
|
||||
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
|
||||
@ -3405,6 +3424,19 @@ static inline void assertfail(const char *expr, const char* file, int line) { }
|
||||
#define ASSERT(expr) (void)(expr)
|
||||
#endif
|
||||
|
||||
#if BITS_PER_LONG == 32
|
||||
#define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT)
|
||||
/*
|
||||
* The warning threshold is 5/8th of the MAX_LFS_FILESIZE that limits the logical
|
||||
* addresses of extents.
|
||||
*
|
||||
* For 4K page size it's about 10T, for 64K it's 160T.
|
||||
*/
|
||||
#define BTRFS_32BIT_EARLY_WARN_THRESHOLD (BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8)
|
||||
void btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info);
|
||||
void btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Get the correct offset inside the page of extent buffer.
|
||||
*
|
||||
@ -3732,8 +3764,6 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
|
||||
return signal_pending(current);
|
||||
}
|
||||
|
||||
#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
|
||||
|
||||
/* Sanity test specific functions */
|
||||
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
|
||||
void btrfs_test_destroy_inode(struct inode *inode);
|
||||
|
@ -602,7 +602,6 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
|
||||
static int btrfs_delayed_inode_reserve_metadata(
|
||||
struct btrfs_trans_handle *trans,
|
||||
struct btrfs_root *root,
|
||||
struct btrfs_inode *inode,
|
||||
struct btrfs_delayed_node *node)
|
||||
{
|
||||
struct btrfs_fs_info *fs_info = root->fs_info;
|
||||
@ -633,32 +632,17 @@ static int btrfs_delayed_inode_reserve_metadata(
|
||||
return ret;
|
||||
ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
|
||||
BTRFS_RESERVE_NO_FLUSH);
|
||||
/*
|
||||
* Since we're under a transaction reserve_metadata_bytes could
|
||||
* try to commit the transaction which will make it return
|
||||
* EAGAIN to make us stop the transaction we have, so return
|
||||
* ENOSPC instead so that btrfs_dirty_inode knows what to do.
|
||||
*/
|
||||
if (ret == -EAGAIN) {
|
||||
ret = -ENOSPC;
|
||||
/* NO_FLUSH could only fail with -ENOSPC */
|
||||
ASSERT(ret == 0 || ret == -ENOSPC);
|
||||
if (ret)
|
||||
btrfs_qgroup_free_meta_prealloc(root, num_bytes);
|
||||
}
|
||||
if (!ret) {
|
||||
node->bytes_reserved = num_bytes;
|
||||
trace_btrfs_space_reservation(fs_info,
|
||||
"delayed_inode",
|
||||
btrfs_ino(inode),
|
||||
num_bytes, 1);
|
||||
} else {
|
||||
btrfs_qgroup_free_meta_prealloc(root, num_bytes);
|
||||
}
|
||||
return ret;
|
||||
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
|
||||
}
|
||||
|
||||
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
|
||||
if (!ret) {
|
||||
trace_btrfs_space_reservation(fs_info, "delayed_inode",
|
||||
btrfs_ino(inode), num_bytes, 1);
|
||||
node->inode_id, num_bytes, 1);
|
||||
node->bytes_reserved = num_bytes;
|
||||
}
|
||||
|
||||
@ -1589,8 +1573,8 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode,
|
||||
* We can only do one readdir with delayed items at a time because of
|
||||
* item->readdir_list.
|
||||
*/
|
||||
inode_unlock_shared(inode);
|
||||
inode_lock(inode);
|
||||
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
|
||||
btrfs_inode_lock(inode, 0);
|
||||
|
||||
mutex_lock(&delayed_node->mutex);
|
||||
item = __btrfs_first_delayed_insertion_item(delayed_node);
|
||||
@ -1833,8 +1817,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
|
||||
goto release_node;
|
||||
}
|
||||
|
||||
ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode,
|
||||
delayed_node);
|
||||
ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
|
||||
if (ret)
|
||||
goto release_node;
|
||||
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include "transaction.h"
|
||||
#include "qgroup.h"
|
||||
#include "space-info.h"
|
||||
#include "tree-mod-log.h"
|
||||
|
||||
struct kmem_cache *btrfs_delayed_ref_head_cachep;
|
||||
struct kmem_cache *btrfs_delayed_tree_ref_cachep;
|
||||
@ -494,16 +495,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
|
||||
if (head->is_data)
|
||||
return;
|
||||
|
||||
read_lock(&fs_info->tree_mod_log_lock);
|
||||
if (!list_empty(&fs_info->tree_mod_seq_list)) {
|
||||
struct seq_list *elem;
|
||||
|
||||
elem = list_first_entry(&fs_info->tree_mod_seq_list,
|
||||
struct seq_list, list);
|
||||
seq = elem->seq;
|
||||
}
|
||||
read_unlock(&fs_info->tree_mod_log_lock);
|
||||
|
||||
seq = btrfs_tree_mod_log_lowest_seq(fs_info);
|
||||
again:
|
||||
for (node = rb_first_cached(&head->ref_tree); node;
|
||||
node = rb_next(node)) {
|
||||
@ -517,23 +509,16 @@ again:
|
||||
|
||||
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
|
||||
{
|
||||
struct seq_list *elem;
|
||||
int ret = 0;
|
||||
u64 min_seq = btrfs_tree_mod_log_lowest_seq(fs_info);
|
||||
|
||||
read_lock(&fs_info->tree_mod_log_lock);
|
||||
if (!list_empty(&fs_info->tree_mod_seq_list)) {
|
||||
elem = list_first_entry(&fs_info->tree_mod_seq_list,
|
||||
struct seq_list, list);
|
||||
if (seq >= elem->seq) {
|
||||
if (min_seq != 0 && seq >= min_seq) {
|
||||
btrfs_debug(fs_info,
|
||||
"holding back delayed_ref %#x.%x, lowest is %#x.%x",
|
||||
(u32)(seq >> 32), (u32)seq,
|
||||
(u32)(elem->seq >> 32), (u32)elem->seq);
|
||||
"holding back delayed_ref %llu, lowest is %llu",
|
||||
seq, min_seq);
|
||||
ret = 1;
|
||||
}
|
||||
}
|
||||
|
||||
read_unlock(&fs_info->tree_mod_log_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -42,6 +42,7 @@
|
||||
#include "discard.h"
|
||||
#include "space-info.h"
|
||||
#include "zoned.h"
|
||||
#include "subpage.h"
|
||||
|
||||
#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
|
||||
BTRFS_HEADER_FLAG_RELOC |\
|
||||
@ -440,6 +441,74 @@ static int btree_read_extent_buffer_pages(struct extent_buffer *eb,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int csum_one_extent_buffer(struct extent_buffer *eb)
|
||||
{
|
||||
struct btrfs_fs_info *fs_info = eb->fs_info;
|
||||
u8 result[BTRFS_CSUM_SIZE];
|
||||
int ret;
|
||||
|
||||
ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
|
||||
offsetof(struct btrfs_header, fsid),
|
||||
BTRFS_FSID_SIZE) == 0);
|
||||
csum_tree_block(eb, result);
|
||||
|
||||
if (btrfs_header_level(eb))
|
||||
ret = btrfs_check_node(eb);
|
||||
else
|
||||
ret = btrfs_check_leaf_full(eb);
|
||||
|
||||
if (ret < 0) {
|
||||
btrfs_print_tree(eb, 0);
|
||||
btrfs_err(fs_info,
|
||||
"block=%llu write time tree block corruption detected",
|
||||
eb->start);
|
||||
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
|
||||
return ret;
|
||||
}
|
||||
write_extent_buffer(eb, result, 0, fs_info->csum_size);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Checksum all dirty extent buffers in one bio_vec */
|
||||
static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info,
|
||||
struct bio_vec *bvec)
|
||||
{
|
||||
struct page *page = bvec->bv_page;
|
||||
u64 bvec_start = page_offset(page) + bvec->bv_offset;
|
||||
u64 cur;
|
||||
int ret = 0;
|
||||
|
||||
for (cur = bvec_start; cur < bvec_start + bvec->bv_len;
|
||||
cur += fs_info->nodesize) {
|
||||
struct extent_buffer *eb;
|
||||
bool uptodate;
|
||||
|
||||
eb = find_extent_buffer(fs_info, cur);
|
||||
uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur,
|
||||
fs_info->nodesize);
|
||||
|
||||
/* A dirty eb shouldn't disappear from buffer_radix */
|
||||
if (WARN_ON(!eb))
|
||||
return -EUCLEAN;
|
||||
|
||||
if (WARN_ON(cur != btrfs_header_bytenr(eb))) {
|
||||
free_extent_buffer(eb);
|
||||
return -EUCLEAN;
|
||||
}
|
||||
if (WARN_ON(!uptodate)) {
|
||||
free_extent_buffer(eb);
|
||||
return -EUCLEAN;
|
||||
}
|
||||
|
||||
ret = csum_one_extent_buffer(eb);
|
||||
free_extent_buffer(eb);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Checksum a dirty tree block before IO. This has extra checks to make sure
|
||||
* we only fill in the checksum field in the first page of a multi-page block.
|
||||
@ -450,9 +519,10 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec
|
||||
struct page *page = bvec->bv_page;
|
||||
u64 start = page_offset(page);
|
||||
u64 found_start;
|
||||
u8 result[BTRFS_CSUM_SIZE];
|
||||
struct extent_buffer *eb;
|
||||
int ret;
|
||||
|
||||
if (fs_info->sectorsize < PAGE_SIZE)
|
||||
return csum_dirty_subpage_buffers(fs_info, bvec);
|
||||
|
||||
eb = (struct extent_buffer *)page->private;
|
||||
if (page != eb->pages[0])
|
||||
@ -474,28 +544,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec
|
||||
if (WARN_ON(!PageUptodate(page)))
|
||||
return -EUCLEAN;
|
||||
|
||||
ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
|
||||
offsetof(struct btrfs_header, fsid),
|
||||
BTRFS_FSID_SIZE) == 0);
|
||||
|
||||
csum_tree_block(eb, result);
|
||||
|
||||
if (btrfs_header_level(eb))
|
||||
ret = btrfs_check_node(eb);
|
||||
else
|
||||
ret = btrfs_check_leaf_full(eb);
|
||||
|
||||
if (ret < 0) {
|
||||
btrfs_print_tree(eb, 0);
|
||||
btrfs_err(fs_info,
|
||||
"block=%llu write time tree block corruption detected",
|
||||
eb->start);
|
||||
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
|
||||
return ret;
|
||||
}
|
||||
write_extent_buffer(eb, result, 0, fs_info->csum_size);
|
||||
|
||||
return 0;
|
||||
return csum_one_extent_buffer(eb);
|
||||
}
|
||||
|
||||
static int check_tree_block_fsid(struct extent_buffer *eb)
|
||||
@ -992,14 +1041,48 @@ static void btree_invalidatepage(struct page *page, unsigned int offset,
|
||||
static int btree_set_page_dirty(struct page *page)
|
||||
{
|
||||
#ifdef DEBUG
|
||||
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
|
||||
struct btrfs_subpage *subpage;
|
||||
struct extent_buffer *eb;
|
||||
int cur_bit = 0;
|
||||
u64 page_start = page_offset(page);
|
||||
|
||||
if (fs_info->sectorsize == PAGE_SIZE) {
|
||||
BUG_ON(!PagePrivate(page));
|
||||
eb = (struct extent_buffer *)page->private;
|
||||
BUG_ON(!eb);
|
||||
BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
|
||||
BUG_ON(!atomic_read(&eb->refs));
|
||||
btrfs_assert_tree_locked(eb);
|
||||
return __set_page_dirty_nobuffers(page);
|
||||
}
|
||||
ASSERT(PagePrivate(page) && page->private);
|
||||
subpage = (struct btrfs_subpage *)page->private;
|
||||
|
||||
ASSERT(subpage->dirty_bitmap);
|
||||
while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) {
|
||||
unsigned long flags;
|
||||
u64 cur;
|
||||
u16 tmp = (1 << cur_bit);
|
||||
|
||||
spin_lock_irqsave(&subpage->lock, flags);
|
||||
if (!(tmp & subpage->dirty_bitmap)) {
|
||||
spin_unlock_irqrestore(&subpage->lock, flags);
|
||||
cur_bit++;
|
||||
continue;
|
||||
}
|
||||
spin_unlock_irqrestore(&subpage->lock, flags);
|
||||
cur = page_start + cur_bit * fs_info->sectorsize;
|
||||
|
||||
eb = find_extent_buffer(fs_info, cur);
|
||||
ASSERT(eb);
|
||||
ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
|
||||
ASSERT(atomic_read(&eb->refs));
|
||||
btrfs_assert_tree_locked(eb);
|
||||
free_extent_buffer(eb);
|
||||
|
||||
cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
|
||||
}
|
||||
#endif
|
||||
return __set_page_dirty_nobuffers(page);
|
||||
}
|
||||
@ -1807,14 +1890,21 @@ static int cleaner_kthread(void *arg)
|
||||
btrfs_run_defrag_inodes(fs_info);
|
||||
|
||||
/*
|
||||
* Acquires fs_info->delete_unused_bgs_mutex to avoid racing
|
||||
* Acquires fs_info->reclaim_bgs_lock to avoid racing
|
||||
* with relocation (btrfs_relocate_chunk) and relocation
|
||||
* acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
|
||||
* after acquiring fs_info->delete_unused_bgs_mutex. So we
|
||||
* after acquiring fs_info->reclaim_bgs_lock. So we
|
||||
* can't hold, nor need to, fs_info->cleaner_mutex when deleting
|
||||
* unused block groups.
|
||||
*/
|
||||
btrfs_delete_unused_bgs(fs_info);
|
||||
|
||||
/*
|
||||
* Reclaim block groups in the reclaim_bgs list after we deleted
|
||||
* all unused block_groups. This possibly gives us some more free
|
||||
* space.
|
||||
*/
|
||||
btrfs_reclaim_bgs(fs_info);
|
||||
sleep:
|
||||
clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
|
||||
if (kthread_should_park())
|
||||
@ -2793,7 +2883,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
|
||||
spin_lock_init(&fs_info->treelog_bg_lock);
|
||||
rwlock_init(&fs_info->tree_mod_log_lock);
|
||||
mutex_init(&fs_info->unused_bg_unpin_mutex);
|
||||
mutex_init(&fs_info->delete_unused_bgs_mutex);
|
||||
mutex_init(&fs_info->reclaim_bgs_lock);
|
||||
mutex_init(&fs_info->reloc_mutex);
|
||||
mutex_init(&fs_info->delalloc_root_mutex);
|
||||
mutex_init(&fs_info->zoned_meta_io_lock);
|
||||
@ -2803,6 +2893,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
|
||||
INIT_LIST_HEAD(&fs_info->space_info);
|
||||
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
|
||||
INIT_LIST_HEAD(&fs_info->unused_bgs);
|
||||
INIT_LIST_HEAD(&fs_info->reclaim_bgs);
|
||||
#ifdef CONFIG_BTRFS_DEBUG
|
||||
INIT_LIST_HEAD(&fs_info->allocated_roots);
|
||||
INIT_LIST_HEAD(&fs_info->allocated_ebs);
|
||||
@ -2891,6 +2982,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
|
||||
fs_info->swapfile_pins = RB_ROOT;
|
||||
|
||||
fs_info->send_in_progress = 0;
|
||||
|
||||
fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
|
||||
INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
|
||||
}
|
||||
|
||||
static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
|
||||
@ -4249,6 +4343,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
|
||||
cancel_work_sync(&fs_info->async_data_reclaim_work);
|
||||
cancel_work_sync(&fs_info->preempt_reclaim_work);
|
||||
|
||||
cancel_work_sync(&fs_info->reclaim_bgs_work);
|
||||
|
||||
/* Cancel or finish ongoing discard work */
|
||||
btrfs_discard_cleanup(fs_info);
|
||||
|
||||
|
@ -2490,19 +2490,6 @@ int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
|
||||
return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
|
||||
}
|
||||
|
||||
int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
|
||||
{
|
||||
struct btrfs_block_group *block_group;
|
||||
int readonly = 0;
|
||||
|
||||
block_group = btrfs_lookup_block_group(fs_info, bytenr);
|
||||
if (!block_group || block_group->ro)
|
||||
readonly = 1;
|
||||
if (block_group)
|
||||
btrfs_put_block_group(block_group);
|
||||
return readonly;
|
||||
}
|
||||
|
||||
static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
|
||||
{
|
||||
struct btrfs_fs_info *fs_info = root->fs_info;
|
||||
@ -3355,11 +3342,9 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
|
||||
* find a node pointing to this leaf and record operations that
|
||||
* point to this leaf.
|
||||
*/
|
||||
if (btrfs_header_level(buf) == 0) {
|
||||
read_lock(&fs_info->tree_mod_log_lock);
|
||||
must_pin = !list_empty(&fs_info->tree_mod_seq_list);
|
||||
read_unlock(&fs_info->tree_mod_log_lock);
|
||||
}
|
||||
if (btrfs_header_level(buf) == 0 &&
|
||||
test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
|
||||
must_pin = true;
|
||||
|
||||
if (must_pin || btrfs_is_zoned(fs_info)) {
|
||||
btrfs_redirty_list_add(trans->transaction, buf);
|
||||
|
@ -13,6 +13,7 @@
|
||||
#include <linux/pagevec.h>
|
||||
#include <linux/prefetch.h>
|
||||
#include <linux/cleancache.h>
|
||||
#include "misc.h"
|
||||
#include "extent_io.h"
|
||||
#include "extent-io-tree.h"
|
||||
#include "extent_map.h"
|
||||
@ -2983,8 +2984,7 @@ static void end_bio_extent_readpage(struct bio *bio)
|
||||
if (likely(uptodate)) {
|
||||
if (is_data_inode(inode))
|
||||
ret = btrfs_verify_data_csum(io_bio,
|
||||
bio_offset, page, start, end,
|
||||
mirror);
|
||||
bio_offset, page, start, end);
|
||||
else
|
||||
ret = btrfs_validate_metadata_buffer(io_bio,
|
||||
page, start, end, mirror);
|
||||
@ -3967,7 +3967,13 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
|
||||
|
||||
btrfs_tree_unlock(eb);
|
||||
|
||||
if (!ret)
|
||||
/*
|
||||
* Either we don't need to submit any tree block, or we're submitting
|
||||
* subpage eb.
|
||||
* Subpage metadata doesn't use page locking at all, so we can skip
|
||||
* the page locking.
|
||||
*/
|
||||
if (!ret || fs_info->sectorsize < PAGE_SIZE)
|
||||
return ret;
|
||||
|
||||
num_pages = num_extent_pages(eb);
|
||||
@ -4012,12 +4018,11 @@ err_unlock:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void set_btree_ioerr(struct page *page)
|
||||
static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)
|
||||
{
|
||||
struct extent_buffer *eb = (struct extent_buffer *)page->private;
|
||||
struct btrfs_fs_info *fs_info;
|
||||
struct btrfs_fs_info *fs_info = eb->fs_info;
|
||||
|
||||
SetPageError(page);
|
||||
btrfs_page_set_error(fs_info, page, eb->start, eb->len);
|
||||
if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
|
||||
return;
|
||||
|
||||
@ -4025,7 +4030,6 @@ static void set_btree_ioerr(struct page *page)
|
||||
* If we error out, we should add back the dirty_metadata_bytes
|
||||
* to make it consistent.
|
||||
*/
|
||||
fs_info = eb->fs_info;
|
||||
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
|
||||
eb->len, fs_info->dirty_metadata_batch);
|
||||
|
||||
@ -4069,26 +4073,111 @@ static void set_btree_ioerr(struct page *page)
|
||||
*/
|
||||
switch (eb->log_index) {
|
||||
case -1:
|
||||
set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags);
|
||||
set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags);
|
||||
break;
|
||||
case 0:
|
||||
set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags);
|
||||
set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
|
||||
break;
|
||||
case 1:
|
||||
set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags);
|
||||
set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
|
||||
break;
|
||||
default:
|
||||
BUG(); /* unexpected, logic error */
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* The endio specific version which won't touch any unsafe spinlock in endio
|
||||
* context.
|
||||
*/
|
||||
static struct extent_buffer *find_extent_buffer_nolock(
|
||||
struct btrfs_fs_info *fs_info, u64 start)
|
||||
{
|
||||
struct extent_buffer *eb;
|
||||
|
||||
rcu_read_lock();
|
||||
eb = radix_tree_lookup(&fs_info->buffer_radix,
|
||||
start >> fs_info->sectorsize_bits);
|
||||
if (eb && atomic_inc_not_zero(&eb->refs)) {
|
||||
rcu_read_unlock();
|
||||
return eb;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* The endio function for subpage extent buffer write.
|
||||
*
|
||||
* Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback()
|
||||
* after all extent buffers in the page has finished their writeback.
|
||||
*/
|
||||
static void end_bio_subpage_eb_writepage(struct btrfs_fs_info *fs_info,
|
||||
struct bio *bio)
|
||||
{
|
||||
struct bio_vec *bvec;
|
||||
struct bvec_iter_all iter_all;
|
||||
|
||||
ASSERT(!bio_flagged(bio, BIO_CLONED));
|
||||
bio_for_each_segment_all(bvec, bio, iter_all) {
|
||||
struct page *page = bvec->bv_page;
|
||||
u64 bvec_start = page_offset(page) + bvec->bv_offset;
|
||||
u64 bvec_end = bvec_start + bvec->bv_len - 1;
|
||||
u64 cur_bytenr = bvec_start;
|
||||
|
||||
ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize));
|
||||
|
||||
/* Iterate through all extent buffers in the range */
|
||||
while (cur_bytenr <= bvec_end) {
|
||||
struct extent_buffer *eb;
|
||||
int done;
|
||||
|
||||
/*
|
||||
* Here we can't use find_extent_buffer(), as it may
|
||||
* try to lock eb->refs_lock, which is not safe in endio
|
||||
* context.
|
||||
*/
|
||||
eb = find_extent_buffer_nolock(fs_info, cur_bytenr);
|
||||
ASSERT(eb);
|
||||
|
||||
cur_bytenr = eb->start + eb->len;
|
||||
|
||||
ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags));
|
||||
done = atomic_dec_and_test(&eb->io_pages);
|
||||
ASSERT(done);
|
||||
|
||||
if (bio->bi_status ||
|
||||
test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
|
||||
ClearPageUptodate(page);
|
||||
set_btree_ioerr(page, eb);
|
||||
}
|
||||
|
||||
btrfs_subpage_clear_writeback(fs_info, page, eb->start,
|
||||
eb->len);
|
||||
end_extent_buffer_writeback(eb);
|
||||
/*
|
||||
* free_extent_buffer() will grab spinlock which is not
|
||||
* safe in endio context. Thus here we manually dec
|
||||
* the ref.
|
||||
*/
|
||||
atomic_dec(&eb->refs);
|
||||
}
|
||||
}
|
||||
bio_put(bio);
|
||||
}
|
||||
|
||||
static void end_bio_extent_buffer_writepage(struct bio *bio)
|
||||
{
|
||||
struct btrfs_fs_info *fs_info;
|
||||
struct bio_vec *bvec;
|
||||
struct extent_buffer *eb;
|
||||
int done;
|
||||
struct bvec_iter_all iter_all;
|
||||
|
||||
fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);
|
||||
if (fs_info->sectorsize < PAGE_SIZE)
|
||||
return end_bio_subpage_eb_writepage(fs_info, bio);
|
||||
|
||||
ASSERT(!bio_flagged(bio, BIO_CLONED));
|
||||
bio_for_each_segment_all(bvec, bio, iter_all) {
|
||||
struct page *page = bvec->bv_page;
|
||||
@ -4100,7 +4189,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
|
||||
if (bio->bi_status ||
|
||||
test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
|
||||
ClearPageUptodate(page);
|
||||
set_btree_ioerr(page);
|
||||
set_btree_ioerr(page, eb);
|
||||
}
|
||||
|
||||
end_page_writeback(page);
|
||||
@ -4114,6 +4203,56 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
|
||||
bio_put(bio);
|
||||
}
|
||||
|
||||
/*
|
||||
* Unlike the work in write_one_eb(), we rely completely on extent locking.
|
||||
* Page locking is only utilized at minimum to keep the VMM code happy.
|
||||
*
|
||||
* Caller should still call write_one_eb() other than this function directly.
|
||||
* As write_one_eb() has extra preparation before submitting the extent buffer.
|
||||
*/
|
||||
static int write_one_subpage_eb(struct extent_buffer *eb,
|
||||
struct writeback_control *wbc,
|
||||
struct extent_page_data *epd)
|
||||
{
|
||||
struct btrfs_fs_info *fs_info = eb->fs_info;
|
||||
struct page *page = eb->pages[0];
|
||||
unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
|
||||
bool no_dirty_ebs = false;
|
||||
int ret;
|
||||
|
||||
/* clear_page_dirty_for_io() in subpage helper needs page locked */
|
||||
lock_page(page);
|
||||
btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len);
|
||||
|
||||
/* Check if this is the last dirty bit to update nr_written */
|
||||
no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page,
|
||||
eb->start, eb->len);
|
||||
if (no_dirty_ebs)
|
||||
clear_page_dirty_for_io(page);
|
||||
|
||||
ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, page,
|
||||
eb->start, eb->len, eb->start - page_offset(page),
|
||||
&epd->bio, end_bio_extent_buffer_writepage, 0, 0, 0,
|
||||
false);
|
||||
if (ret) {
|
||||
btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
|
||||
set_btree_ioerr(page, eb);
|
||||
unlock_page(page);
|
||||
|
||||
if (atomic_dec_and_test(&eb->io_pages))
|
||||
end_extent_buffer_writeback(eb);
|
||||
return -EIO;
|
||||
}
|
||||
unlock_page(page);
|
||||
/*
|
||||
* Submission finished without problem, if no range of the page is
|
||||
* dirty anymore, we have submitted a page. Update nr_written in wbc.
|
||||
*/
|
||||
if (no_dirty_ebs)
|
||||
update_nr_written(wbc, 1);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
|
||||
struct writeback_control *wbc,
|
||||
struct extent_page_data *epd)
|
||||
@ -4145,6 +4284,9 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
|
||||
memzero_extent_buffer(eb, start, end - start);
|
||||
}
|
||||
|
||||
if (eb->fs_info->sectorsize < PAGE_SIZE)
|
||||
return write_one_subpage_eb(eb, wbc, epd);
|
||||
|
||||
for (i = 0; i < num_pages; i++) {
|
||||
struct page *p = eb->pages[i];
|
||||
|
||||
@ -4156,7 +4298,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
|
||||
end_bio_extent_buffer_writepage,
|
||||
0, 0, 0, false);
|
||||
if (ret) {
|
||||
set_btree_ioerr(p);
|
||||
set_btree_ioerr(p, eb);
|
||||
if (PageWriteback(p))
|
||||
end_page_writeback(p);
|
||||
if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
|
||||
@ -4180,6 +4322,98 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
|
||||
return ret;
|
||||
}

/*
 * Submit one subpage btree page.
 *
 * The main differences to submit_eb_page() are:
 * - Page locking
 *   For subpage, we don't rely on page locking at all.
 *
 * - Flush write bio
 *   We only flush the bio if we may be unable to fit the current extent
 *   buffer into the current bio.
 *
 * Return >=0 for the number of submitted extent buffers.
 * Return <0 for fatal error.
 */
static int submit_eb_subpage(struct page *page,
			     struct writeback_control *wbc,
			     struct extent_page_data *epd)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
	int submitted = 0;
	u64 page_start = page_offset(page);
	int bit_start = 0;
	const int nbits = BTRFS_SUBPAGE_BITMAP_SIZE;
	int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
	int ret;

	/* Lock and write each dirty extent buffer in the range */
	while (bit_start < nbits) {
		struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
		struct extent_buffer *eb;
		unsigned long flags;
		u64 start;

		/*
		 * Take private lock to ensure the subpage won't be detached
		 * in the meantime.
		 */
		spin_lock(&page->mapping->private_lock);
		if (!PagePrivate(page)) {
			spin_unlock(&page->mapping->private_lock);
			break;
		}
		spin_lock_irqsave(&subpage->lock, flags);
		if (!((1 << bit_start) & subpage->dirty_bitmap)) {
			spin_unlock_irqrestore(&subpage->lock, flags);
			spin_unlock(&page->mapping->private_lock);
			bit_start++;
			continue;
		}

		start = page_start + bit_start * fs_info->sectorsize;
		bit_start += sectors_per_node;

		/*
		 * Here we just want to grab the eb without touching extra
		 * spin locks, so call find_extent_buffer_nolock().
		 */
		eb = find_extent_buffer_nolock(fs_info, start);
		spin_unlock_irqrestore(&subpage->lock, flags);
		spin_unlock(&page->mapping->private_lock);

		/*
		 * The eb has already reached 0 refs thus find_extent_buffer()
		 * doesn't return it. We don't need to write back such eb
		 * anyway.
		 */
		if (!eb)
			continue;

		ret = lock_extent_buffer_for_io(eb, epd);
		if (ret == 0) {
			free_extent_buffer(eb);
			continue;
		}
		if (ret < 0) {
			free_extent_buffer(eb);
			goto cleanup;
		}
		ret = write_one_eb(eb, wbc, epd);
		free_extent_buffer(eb);
		if (ret < 0)
			goto cleanup;
		submitted++;
	}
	return submitted;

cleanup:
	/* We hit error, end bio for the submitted extent buffers */
	end_write_bio(epd, ret);
	return ret;
}
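
The loop above walks a per-page dirty bitmap in nodesize-sized strides: each set bit maps back to a logical offset via page_start + bit * sectorsize, and bit_start then jumps by sectors_per_node so the next iteration lands on the next possible extent buffer. A small standalone illustration of that arithmetic, with made-up sizes (4K sectors, 16K nodes, 64K pages), not values read from a real filesystem:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t page_start = 1048576;	/* logical offset of the page */
	const uint64_t sectorsize = 4096;
	const uint64_t nodesize = 16384;
	const int nbits = 16;			/* 64K page / 4K sector */
	const int sectors_per_node = nodesize / sectorsize;
	uint16_t dirty_bitmap = 0x00f0;		/* second 16K eb in the page is dirty */
	int bit_start = 0;

	while (bit_start < nbits) {
		if (!((1u << bit_start) & dirty_bitmap)) {
			bit_start++;
			continue;
		}
		/* One extent buffer owns sectors_per_node consecutive bits. */
		printf("would submit eb at logical %llu\n",
		       (unsigned long long)(page_start + bit_start * sectorsize));
		bit_start += sectors_per_node;
	}
	return 0;
}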

/*
 * Submit all page(s) of one extent buffer.
 *
@@ -4212,6 +4446,9 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
	if (!PagePrivate(page))
		return 0;

	if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
		return submit_eb_subpage(page, wbc, epd);

	spin_lock(&mapping->private_lock);
	if (!PagePrivate(page)) {
		spin_unlock(&mapping->private_lock);
@@ -4652,10 +4889,8 @@ void extent_readahead(struct readahead_control *rac)
	int nr;

	while ((nr = readahead_page_batch(rac, pagepool))) {
-		u64 contig_start = page_offset(pagepool[0]);
-		u64 contig_end = page_offset(pagepool[nr - 1]) + PAGE_SIZE - 1;
-
-		ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);
+		u64 contig_start = readahead_pos(rac);
+		u64 contig_end = contig_start + readahead_batch_length(rac) - 1;

		contiguous_readpages(pagepool, nr, contig_start, contig_end,
				&em_cached, &bio, &bio_flags, &prev_em_start);

@@ -5469,25 +5704,21 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
{
	struct extent_buffer *eb;

-	rcu_read_lock();
-	eb = radix_tree_lookup(&fs_info->buffer_radix,
-			       start >> fs_info->sectorsize_bits);
-	if (eb && atomic_inc_not_zero(&eb->refs)) {
-		rcu_read_unlock();
+	eb = find_extent_buffer_nolock(fs_info, start);
+	if (!eb)
+		return NULL;
	/*
-	 * Lock our eb's refs_lock to avoid races with
-	 * free_extent_buffer. When we get our eb it might be flagged
-	 * with EXTENT_BUFFER_STALE and another task running
-	 * free_extent_buffer might have seen that flag set,
-	 * eb->refs == 2, that the buffer isn't under IO (dirty and
+	 * Lock our eb's refs_lock to avoid races with free_extent_buffer().
+	 * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and
+	 * another task running free_extent_buffer() might have seen that flag
+	 * set, eb->refs == 2, that the buffer isn't under IO (dirty and
	 * writeback flags not set) and it's still in the tree (flag
-	 * EXTENT_BUFFER_TREE_REF set), therefore being in the process
-	 * of decrementing the extent buffer's reference count twice.
-	 * So here we could race and increment the eb's reference count,
-	 * clear its stale flag, mark it as dirty and drop our reference
-	 * before the other task finishes executing free_extent_buffer,
-	 * which would later result in an attempt to free an extent
-	 * buffer that is dirty.
+	 * EXTENT_BUFFER_TREE_REF set), therefore being in the process of
+	 * decrementing the extent buffer's reference count twice. So here we
+	 * could race and increment the eb's reference count, clear its stale
+	 * flag, mark it as dirty and drop our reference before the other task
+	 * finishes executing free_extent_buffer, which would later result in
+	 * an attempt to free an extent buffer that is dirty.
	 */
	if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
		spin_lock(&eb->refs_lock);
@@ -5495,10 +5726,6 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
	}
	mark_extent_buffer_accessed(eb, NULL);
	return eb;
-	}
-	rcu_read_unlock();
-
-	return NULL;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -5594,6 +5821,17 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
		return ERR_PTR(-EINVAL);
	}

+#if BITS_PER_LONG == 32
+	if (start >= MAX_LFS_FILESIZE) {
+		btrfs_err_rl(fs_info,
+		"extent buffer %llu is beyond 32bit page cache limit", start);
+		btrfs_err_32bit_limit(fs_info);
+		return ERR_PTR(-EOVERFLOW);
+	}
+	if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD)
+		btrfs_warn_32bit_limit(fs_info);
+#endif
+
	if (fs_info->sectorsize < PAGE_SIZE &&
	    offset_in_page(start) + len > PAGE_SIZE) {
		btrfs_err(fs_info,
@@ -5665,7 +5903,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
	btrfs_page_inc_eb_refs(fs_info, p);
	spin_unlock(&mapping->private_lock);

-	WARN_ON(PageDirty(p));
+	WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
	eb->pages[i] = p;
	if (!PageUptodate(p))
		uptodate = 0;
@@ -5814,28 +6052,51 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
	release_extent_buffer(eb);
}

static void btree_clear_page_dirty(struct page *page)
{
	ASSERT(PageDirty(page));
	ASSERT(PageLocked(page));
	clear_page_dirty_for_io(page);
	xa_lock_irq(&page->mapping->i_pages);
	if (!PageDirty(page))
		__xa_clear_mark(&page->mapping->i_pages,
				page_index(page), PAGECACHE_TAG_DIRTY);
	xa_unlock_irq(&page->mapping->i_pages);
}

static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	struct page *page = eb->pages[0];
	bool last;

	/* btree_clear_page_dirty() needs page locked */
	lock_page(page);
	last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start,
						  eb->len);
	if (last)
		btree_clear_page_dirty(page);
	unlock_page(page);
	WARN_ON(atomic_read(&eb->refs) == 0);
}

void clear_extent_buffer_dirty(const struct extent_buffer *eb)
{
	int i;
	int num_pages;
	struct page *page;

	if (eb->fs_info->sectorsize < PAGE_SIZE)
		return clear_subpage_extent_buffer_dirty(eb);

	num_pages = num_extent_pages(eb);

	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];
		if (!PageDirty(page))
			continue;

		lock_page(page);
		WARN_ON(!PagePrivate(page));

-		clear_page_dirty_for_io(page);
-		xa_lock_irq(&page->mapping->i_pages);
-		if (!PageDirty(page))
-			__xa_clear_mark(&page->mapping->i_pages,
-					page_index(page), PAGECACHE_TAG_DIRTY);
-		xa_unlock_irq(&page->mapping->i_pages);
+		btree_clear_page_dirty(page);
		ClearPageError(page);
		unlock_page(page);
	}

@@ -5856,10 +6117,28 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb)
	WARN_ON(atomic_read(&eb->refs) == 0);
	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));

-	if (!was_dirty)
-		for (i = 0; i < num_pages; i++)
-			set_page_dirty(eb->pages[i]);
+	if (!was_dirty) {
+		bool subpage = eb->fs_info->sectorsize < PAGE_SIZE;
+
+		/*
+		 * For the subpage case, we can have other extent buffers in
+		 * the same page, and in clear_subpage_extent_buffer_dirty()
+		 * we have to clear page dirty without the subpage lock held.
+		 * This can cause a race where our page dirty bit is cleared
+		 * right after we set it.
+		 *
+		 * Thankfully, clear_subpage_extent_buffer_dirty() already
+		 * locks its page for other reasons, so we can use the page
+		 * lock to prevent the above race.
+		 */
+		if (subpage)
+			lock_page(eb->pages[0]);
+		for (i = 0; i < num_pages; i++)
+			btrfs_page_set_dirty(eb->fs_info, eb->pages[i],
+					     eb->start, eb->len);
+		if (subpage)
+			unlock_page(eb->pages[0]);
+	}
#ifdef CONFIG_BTRFS_DEBUG
	for (i = 0; i < num_pages; i++)
		ASSERT(PageDirty(eb->pages[i]));
@@ -6217,12 +6496,34 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
	return ret;
}

/*
 * Check that the extent buffer is uptodate.
 *
 * For regular sector size == PAGE_SIZE case, check if @page is uptodate.
 * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE.
 */
static void assert_eb_page_uptodate(const struct extent_buffer *eb,
				    struct page *page)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;

	if (fs_info->sectorsize < PAGE_SIZE) {
		bool uptodate;

		uptodate = btrfs_subpage_test_uptodate(fs_info, page,
						       eb->start, eb->len);
		WARN_ON(!uptodate);
	} else {
		WARN_ON(!PageUptodate(page));
	}
}

void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
					 const void *srcv)
{
	char *kaddr;

-	WARN_ON(!PageUptodate(eb->pages[0]));
+	assert_eb_page_uptodate(eb, eb->pages[0]);
	kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0);
	memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv,
	       BTRFS_FSID_SIZE);
@@ -6232,7 +6533,7 @@ void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
{
	char *kaddr;

-	WARN_ON(!PageUptodate(eb->pages[0]));
+	assert_eb_page_uptodate(eb, eb->pages[0]);
	kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0);
	memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv,
	       BTRFS_FSID_SIZE);
@@ -6257,7 +6558,7 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,

	while (len > 0) {
		page = eb->pages[i];
-		WARN_ON(!PageUptodate(page));
+		assert_eb_page_uptodate(eb, page);

		cur = min(len, PAGE_SIZE - offset);
		kaddr = page_address(page);
@@ -6286,7 +6587,7 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,

	while (len > 0) {
		page = eb->pages[i];
-		WARN_ON(!PageUptodate(page));
+		assert_eb_page_uptodate(eb, page);

		cur = min(len, PAGE_SIZE - offset);
		kaddr = page_address(page);
@@ -6344,7 +6645,7 @@ void copy_extent_buffer(const struct extent_buffer *dst,

	while (len > 0) {
		page = dst->pages[i];
-		WARN_ON(!PageUptodate(page));
+		assert_eb_page_uptodate(dst, page);

		cur = min(len, (unsigned long)(PAGE_SIZE - offset));

@@ -6406,7 +6707,7 @@ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,

	eb_bitmap_offset(eb, start, nr, &i, &offset);
	page = eb->pages[i];
-	WARN_ON(!PageUptodate(page));
+	assert_eb_page_uptodate(eb, page);
	kaddr = page_address(page);
	return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
}
@@ -6431,7 +6732,7 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star

	eb_bitmap_offset(eb, start, pos, &i, &offset);
	page = eb->pages[i];
-	WARN_ON(!PageUptodate(page));
+	assert_eb_page_uptodate(eb, page);
	kaddr = page_address(page);

	while (len >= bits_to_set) {
@@ -6442,7 +6743,7 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star
		if (++offset >= PAGE_SIZE && len > 0) {
			offset = 0;
			page = eb->pages[++i];
-			WARN_ON(!PageUptodate(page));
+			assert_eb_page_uptodate(eb, page);
			kaddr = page_address(page);
		}
	}
@@ -6474,7 +6775,7 @@ void extent_buffer_bitmap_clear(const struct extent_buffer *eb,

	eb_bitmap_offset(eb, start, pos, &i, &offset);
	page = eb->pages[i];
-	WARN_ON(!PageUptodate(page));
+	assert_eb_page_uptodate(eb, page);
	kaddr = page_address(page);

	while (len >= bits_to_clear) {
@@ -6485,7 +6786,7 @@ void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
		if (++offset >= PAGE_SIZE && len > 0) {
			offset = 0;
			page = eb->pages[++i];
-			WARN_ON(!PageUptodate(page));
+			assert_eb_page_uptodate(eb, page);
			kaddr = page_address(page);
		}
	}

@@ -66,6 +66,7 @@ enum {
struct btrfs_root;
struct btrfs_inode;
struct btrfs_io_bio;
+struct btrfs_fs_info;
struct io_failure_record;
struct extent_io_tree;

@@ -270,9 +271,6 @@ struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
struct bio *btrfs_bio_clone(struct bio *bio);
struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size);

-struct btrfs_fs_info;
-struct btrfs_inode;
-
int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
		      u64 length, u64 logical, struct page *page,
		      unsigned int pg_offset, int mirror_num);

@@ -9,6 +9,7 @@
#include <linux/highmem.h>
#include <linux/sched/mm.h>
#include <crypto/hash.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"

fs/btrfs/file.c | 114
@@ -2014,14 +2014,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
	else
		num_written = btrfs_buffered_write(iocb, from);

-	/*
-	 * We also have to set last_sub_trans to the current log transid,
-	 * otherwise subsequent syncs to a file that's been synced in this
-	 * transaction will appear to have already occurred.
-	 */
-	spin_lock(&inode->lock);
-	inode->last_sub_trans = inode->root->log_transid;
-	spin_unlock(&inode->lock);
+	btrfs_set_inode_last_sub_trans(inode);

	if (num_written > 0)
		num_written = generic_write_sync(iocb, num_written);
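
The hunk above folds the open-coded last_sub_trans update into a helper. Judging purely from the lines removed here, btrfs_set_inode_last_sub_trans() presumably wraps the same three statements; a sketch of that assumption (the real helper lives elsewhere in the btrfs headers and may differ in detail):

static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode)
{
	/* Same locking and assignment the removed open-coded block used. */
	spin_lock(&inode->lock);
	inode->last_sub_trans = inode->root->log_transid;
	spin_unlock(&inode->lock);
}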
|
||||
|
||||
@ -2122,7 +2116,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
inode_lock(inode);
|
||||
btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
|
||||
|
||||
atomic_inc(&root->log_batch);
|
||||
|
||||
@ -2135,11 +2129,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
|
||||
&BTRFS_I(inode)->runtime_flags);
|
||||
|
||||
/*
|
||||
* Before we acquired the inode's lock, someone may have dirtied more
|
||||
* pages in the target range. We need to make sure that writeback for
|
||||
* any such pages does not start while we are logging the inode, because
|
||||
* if it does, any of the following might happen when we are not doing a
|
||||
* full inode sync:
|
||||
* Before we acquired the inode's lock and the mmap lock, someone may
|
||||
* have dirtied more pages in the target range. We need to make sure
|
||||
* that writeback for any such pages does not start while we are logging
|
||||
* the inode, because if it does, any of the following might happen when
|
||||
* we are not doing a full inode sync:
|
||||
*
|
||||
* 1) We log an extent after its writeback finishes but before its
|
||||
* checksums are added to the csum tree, leading to -EIO errors
|
||||
@ -2154,7 +2148,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
|
||||
*/
|
||||
ret = start_ordered_ops(inode, start, end);
|
||||
if (ret) {
|
||||
inode_unlock(inode);
|
||||
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -2255,7 +2249,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
|
||||
* file again, but that will end up using the synchronization
|
||||
* inside btrfs_sync_log to keep things safe.
|
||||
*/
|
||||
inode_unlock(inode);
|
||||
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
|
||||
|
||||
if (ret != BTRFS_NO_LOG_SYNC) {
|
||||
if (!ret) {
|
||||
@ -2285,7 +2279,7 @@ out:
|
||||
|
||||
out_release_extents:
|
||||
btrfs_release_log_ctx_extents(&ctx);
|
||||
inode_unlock(inode);
|
||||
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -2605,16 +2599,17 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
|
||||
* extents without inserting a new one, so we must abort the transaction to avoid
|
||||
* a corruption.
|
||||
*/
|
||||
int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
|
||||
const u64 start, const u64 end,
|
||||
int btrfs_replace_file_extents(struct btrfs_inode *inode,
|
||||
struct btrfs_path *path, const u64 start,
|
||||
const u64 end,
|
||||
struct btrfs_replace_extent_info *extent_info,
|
||||
struct btrfs_trans_handle **trans_out)
|
||||
{
|
||||
struct btrfs_drop_extents_args drop_args = { 0 };
|
||||
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
||||
struct btrfs_root *root = inode->root;
|
||||
struct btrfs_fs_info *fs_info = root->fs_info;
|
||||
u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
|
||||
u64 ino_size = round_up(inode->i_size, fs_info->sectorsize);
|
||||
struct btrfs_root *root = BTRFS_I(inode)->root;
|
||||
u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
|
||||
struct btrfs_trans_handle *trans = NULL;
|
||||
struct btrfs_block_rsv *rsv;
|
||||
unsigned int rsv_count;
|
||||
@ -2662,10 +2657,10 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
|
||||
drop_args.drop_cache = true;
|
||||
while (cur_offset < end) {
|
||||
drop_args.start = cur_offset;
|
||||
ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
|
||||
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
|
||||
/* If we are punching a hole decrement the inode's byte count */
|
||||
if (!extent_info)
|
||||
btrfs_update_inode_bytes(BTRFS_I(inode), 0,
|
||||
btrfs_update_inode_bytes(inode, 0,
|
||||
drop_args.bytes_found);
|
||||
if (ret != -ENOSPC) {
|
||||
/*
|
||||
@ -2685,8 +2680,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
|
||||
|
||||
if (!extent_info && cur_offset < drop_args.drop_end &&
|
||||
cur_offset < ino_size) {
|
||||
ret = fill_holes(trans, BTRFS_I(inode), path,
|
||||
cur_offset, drop_args.drop_end);
|
||||
ret = fill_holes(trans, inode, path, cur_offset,
|
||||
drop_args.drop_end);
|
||||
if (ret) {
|
||||
/*
|
||||
* If we failed then we didn't insert our hole
|
||||
@ -2704,7 +2699,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
|
||||
* know to not set disk_i_size in this area until a new
|
||||
* file extent is inserted here.
|
||||
*/
|
||||
ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
|
||||
ret = btrfs_inode_clear_file_extent_range(inode,
|
||||
cur_offset,
|
||||
drop_args.drop_end - cur_offset);
|
||||
if (ret) {
|
||||
@ -2723,8 +2718,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
|
||||
u64 replace_len = drop_args.drop_end -
|
||||
extent_info->file_offset;
|
||||
|
||||
ret = btrfs_insert_replace_extent(trans, BTRFS_I(inode),
|
||||
path, extent_info, replace_len,
|
||||
ret = btrfs_insert_replace_extent(trans, inode, path,
|
||||
extent_info, replace_len,
|
||||
drop_args.bytes_found);
|
||||
if (ret) {
|
||||
btrfs_abort_transaction(trans, ret);
|
||||
@ -2735,9 +2730,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
|
||||
extent_info->file_offset += replace_len;
|
||||
}
|
||||
|
||||
cur_offset = drop_args.drop_end;
|
||||
|
||||
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
|
||||
ret = btrfs_update_inode(trans, root, inode);
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
@ -2756,9 +2749,10 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
|
||||
BUG_ON(ret); /* shouldn't happen */
|
||||
trans->block_rsv = rsv;
|
||||
|
||||
if (!extent_info) {
|
||||
ret = find_first_non_hole(BTRFS_I(inode), &cur_offset,
|
||||
&len);
|
||||
cur_offset = drop_args.drop_end;
|
||||
len = end - cur_offset;
|
||||
if (!extent_info && len) {
|
||||
ret = find_first_non_hole(inode, &cur_offset, &len);
|
||||
if (unlikely(ret < 0))
|
||||
break;
|
||||
if (ret && !len) {
|
||||
@ -2771,14 +2765,11 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
|
||||
/*
|
||||
* If we were cloning, force the next fsync to be a full one since we
|
||||
* we replaced (or just dropped in the case of cloning holes when
|
||||
* NO_HOLES is enabled) extents and extent maps.
|
||||
* This is for the sake of simplicity, and cloning into files larger
|
||||
* than 16Mb would force the full fsync any way (when
|
||||
* try_release_extent_mapping() is invoked during page cache truncation.
|
||||
* NO_HOLES is enabled) file extent items and did not setup new extent
|
||||
* maps for the replacement extents (or holes).
|
||||
*/
|
||||
if (extent_info && !extent_info->is_new_extent)
|
||||
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
|
||||
&BTRFS_I(inode)->runtime_flags);
|
||||
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
|
||||
|
||||
if (ret)
|
||||
goto out_trans;
|
||||
@ -2804,8 +2795,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
|
||||
*/
|
||||
if (!extent_info && cur_offset < ino_size &&
|
||||
cur_offset < drop_args.drop_end) {
|
||||
ret = fill_holes(trans, BTRFS_I(inode), path,
|
||||
cur_offset, drop_args.drop_end);
|
||||
ret = fill_holes(trans, inode, path, cur_offset,
|
||||
drop_args.drop_end);
|
||||
if (ret) {
|
||||
/* Same comment as above. */
|
||||
btrfs_abort_transaction(trans, ret);
|
||||
@ -2813,8 +2804,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
|
||||
}
|
||||
} else if (!extent_info && cur_offset < drop_args.drop_end) {
|
||||
/* See the comment in the loop above for the reasoning here. */
|
||||
ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
|
||||
cur_offset, drop_args.drop_end - cur_offset);
|
||||
ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
|
||||
drop_args.drop_end - cur_offset);
|
||||
if (ret) {
|
||||
btrfs_abort_transaction(trans, ret);
|
||||
goto out_trans;
|
||||
@ -2822,7 +2813,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
|
||||
|
||||
}
|
||||
if (extent_info) {
|
||||
ret = btrfs_insert_replace_extent(trans, BTRFS_I(inode), path,
|
||||
ret = btrfs_insert_replace_extent(trans, inode, path,
|
||||
extent_info, extent_info->data_len,
|
||||
drop_args.bytes_found);
|
||||
if (ret) {
|
||||
@ -2868,7 +2859,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
inode_lock(inode);
|
||||
btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
|
||||
ino_size = round_up(inode->i_size, fs_info->sectorsize);
|
||||
ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
|
||||
if (ret < 0)
|
||||
@ -2908,7 +2899,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
|
||||
truncated_block = true;
|
||||
ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
|
||||
if (ret) {
|
||||
inode_unlock(inode);
|
||||
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
@ -2967,8 +2958,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = btrfs_replace_file_extents(inode, path, lockstart, lockend, NULL,
|
||||
&trans);
|
||||
ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
|
||||
lockend, NULL, &trans);
|
||||
btrfs_free_path(path);
|
||||
if (ret)
|
||||
goto out;
|
||||
@ -3009,7 +3000,7 @@ out_only_mutex:
|
||||
ret = ret2;
|
||||
}
|
||||
}
|
||||
inode_unlock(inode);
|
||||
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -3335,7 +3326,7 @@ static long btrfs_fallocate(struct file *file, int mode,
|
||||
return ret;
|
||||
}
|
||||
|
||||
btrfs_inode_lock(inode, 0);
|
||||
btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
|
||||
|
||||
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
|
||||
ret = inode_newsize_ok(inode, offset + len);
|
||||
@ -3377,7 +3368,7 @@ static long btrfs_fallocate(struct file *file, int mode,
|
||||
|
||||
if (mode & FALLOC_FL_ZERO_RANGE) {
|
||||
ret = btrfs_zero_range(inode, offset, len, mode);
|
||||
inode_unlock(inode);
|
||||
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -3487,7 +3478,7 @@ out_unlock:
|
||||
unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
|
||||
&cached_state);
|
||||
out:
|
||||
inode_unlock(inode);
|
||||
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
|
||||
/* Let go of our reservation. */
|
||||
if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
|
||||
btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
|
||||
@ -3496,13 +3487,13 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static loff_t find_desired_extent(struct inode *inode, loff_t offset,
|
||||
static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
|
||||
int whence)
|
||||
{
|
||||
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
||||
struct btrfs_fs_info *fs_info = inode->root->fs_info;
|
||||
struct extent_map *em = NULL;
|
||||
struct extent_state *cached_state = NULL;
|
||||
loff_t i_size = inode->i_size;
|
||||
loff_t i_size = inode->vfs_inode.i_size;
|
||||
u64 lockstart;
|
||||
u64 lockend;
|
||||
u64 start;
|
||||
@ -3525,11 +3516,10 @@ static loff_t find_desired_extent(struct inode *inode, loff_t offset,
|
||||
lockend--;
|
||||
len = lockend - lockstart + 1;
|
||||
|
||||
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
|
||||
&cached_state);
|
||||
lock_extent_bits(&inode->io_tree, lockstart, lockend, &cached_state);
|
||||
|
||||
while (start < i_size) {
|
||||
em = btrfs_get_extent_fiemap(BTRFS_I(inode), start, len);
|
||||
em = btrfs_get_extent_fiemap(inode, start, len);
|
||||
if (IS_ERR(em)) {
|
||||
ret = PTR_ERR(em);
|
||||
em = NULL;
|
||||
@ -3551,7 +3541,7 @@ static loff_t find_desired_extent(struct inode *inode, loff_t offset,
|
||||
cond_resched();
|
||||
}
|
||||
free_extent_map(em);
|
||||
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
|
||||
unlock_extent_cached(&inode->io_tree, lockstart, lockend,
|
||||
&cached_state);
|
||||
if (ret) {
|
||||
offset = ret;
|
||||
@ -3575,7 +3565,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
|
||||
case SEEK_DATA:
|
||||
case SEEK_HOLE:
|
||||
btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
|
||||
offset = find_desired_extent(inode, offset, whence);
|
||||
offset = find_desired_extent(BTRFS_I(inode), offset, whence);
|
||||
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
|
||||
break;
|
||||
}
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include <linux/ratelimit.h>
|
||||
#include <linux/error-injection.h>
|
||||
#include <linux/sched/mm.h>
|
||||
#include "misc.h"
|
||||
#include "ctree.h"
|
||||
#include "free-space-cache.h"
|
||||
#include "transaction.h"
@@ -2539,6 +2540,7 @@ out:
static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
					u64 bytenr, u64 size, bool used)
{
+	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	u64 offset = bytenr - block_group->start;
	u64 to_free, to_unusable;
@@ -2569,8 +2571,13 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
	}

	/* All the region is now unusable. Mark it as unused and reclaim */
-	if (block_group->zone_unusable == block_group->length)
+	if (block_group->zone_unusable == block_group->length) {
		btrfs_mark_bg_unused(block_group);
+	} else if (block_group->zone_unusable >=
+		   div_factor_fine(block_group->length,
+				   fs_info->bg_reclaim_threshold)) {
+		btrfs_mark_bg_to_reclaim(block_group);
+	}

	return 0;
}
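
The new reclaim branch compares zone_unusable against a percentage of the block group length; with the 75% default threshold (tunable in sysfs), a 256M zone-sized block group gets queued for reclaim once 192M of it is unusable. A standalone check of that arithmetic, assuming div_factor_fine(num, f) computes num * f / 100 as its name suggests (the sizes below are made up for illustration):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool should_reclaim(uint64_t length, uint64_t zone_unusable, int threshold)
{
	if (zone_unusable == length)
		return false;	/* fully unusable: marked unused instead */
	return zone_unusable >= length * threshold / 100;
}

int main(void)
{
	const uint64_t length = 256ULL << 20;	/* 256 MiB block group */

	/* exactly 75% unusable: reclaim */
	printf("192M unusable -> %d\n", should_reclaim(length, 192ULL << 20, 75));
	/* 50% unusable: leave it alone */
	printf("128M unusable -> %d\n", should_reclaim(length, 128ULL << 20, 75));
	return 0;
}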

fs/btrfs/inode.c | 123
@@ -102,6 +102,7 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode,
 * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
 * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt
 *		     return -EAGAIN
+ * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
 */
int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
{
@@ -122,6 +123,8 @@ int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
		}
		inode_lock(inode);
	}
+	if (ilock_flags & BTRFS_ILOCK_MMAP)
+		down_write(&BTRFS_I(inode)->i_mmap_lock);
	return 0;
}

@@ -133,6 +136,8 @@ int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
 */
void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags)
{
+	if (ilock_flags & BTRFS_ILOCK_MMAP)
+		up_write(&BTRFS_I(inode)->i_mmap_lock);
	if (ilock_flags & BTRFS_ILOCK_SHARED)
		inode_unlock_shared(inode);
	else
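
The BTRFS_ILOCK_MMAP flag added above pairs the VFS inode lock with the inode's new i_mmap_lock, and the fsync, hole punching, fallocate and reflink call sites in this series switch to it. A typical caller then looks roughly like the sketch below (do_the_block_update() is a hypothetical placeholder and error handling is trimmed):

static int example_block_update(struct inode *inode)
{
	int ret;

	btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
	/* Work that must not race with page faults taking i_mmap_lock shared. */
	ret = do_the_block_update(inode);
	btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
	return ret;
}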
|
||||
@ -1516,7 +1521,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
|
||||
static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
|
||||
struct page *locked_page,
|
||||
const u64 start, const u64 end,
|
||||
int *page_started, int force,
|
||||
int *page_started,
|
||||
unsigned long *nr_written)
|
||||
{
|
||||
struct btrfs_fs_info *fs_info = inode->root->fs_info;
|
||||
@ -1530,6 +1535,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
|
||||
u64 ino = btrfs_ino(inode);
|
||||
bool nocow = false;
|
||||
u64 disk_bytenr = 0;
|
||||
const bool force = inode->flags & BTRFS_INODE_NODATACOW;
|
||||
|
||||
path = btrfs_alloc_path();
|
||||
if (!path) {
|
||||
@ -1863,23 +1869,16 @@ error:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline int need_force_cow(struct btrfs_inode *inode, u64 start, u64 end)
|
||||
static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
|
||||
{
|
||||
|
||||
if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
|
||||
!(inode->flags & BTRFS_INODE_PREALLOC))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* @defrag_bytes is a hint value, no spinlock held here,
|
||||
* if is not zero, it means the file is defragging.
|
||||
* Force cow if given extent needs to be defragged.
|
||||
*/
|
||||
if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
|
||||
if (inode->defrag_bytes &&
|
||||
test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG, 0, NULL))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG,
|
||||
0, NULL))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1891,17 +1890,12 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
|
||||
struct writeback_control *wbc)
|
||||
{
|
||||
int ret;
|
||||
int force_cow = need_force_cow(inode, start, end);
|
||||
const bool zoned = btrfs_is_zoned(inode->root->fs_info);
|
||||
|
||||
if (inode->flags & BTRFS_INODE_NODATACOW && !force_cow) {
|
||||
if (should_nocow(inode, start, end)) {
|
||||
ASSERT(!zoned);
|
||||
ret = run_delalloc_nocow(inode, locked_page, start, end,
|
||||
page_started, 1, nr_written);
|
||||
} else if (inode->flags & BTRFS_INODE_PREALLOC && !force_cow) {
|
||||
ASSERT(!zoned);
|
||||
ret = run_delalloc_nocow(inode, locked_page, start, end,
|
||||
page_started, 0, nr_written);
|
||||
page_started, nr_written);
|
||||
} else if (!inode_can_compress(inode) ||
|
||||
!inode_need_compress(inode, start, end)) {
|
||||
if (zoned)
|
||||
@ -3151,10 +3145,9 @@ zeroit:
|
||||
* @bio_offset: offset to the beginning of the bio (in bytes)
|
||||
* @start: file offset of the range start
|
||||
* @end: file offset of the range end (inclusive)
|
||||
* @mirror: mirror number
|
||||
*/
|
||||
int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
|
||||
struct page *page, u64 start, u64 end, int mirror)
|
||||
struct page *page, u64 start, u64 end)
|
||||
{
|
||||
struct inode *inode = page->mapping->host;
|
||||
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
||||
@ -3393,15 +3386,19 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
|
||||
int is_dead_root = 0;
|
||||
|
||||
/*
|
||||
* this is an orphan in the tree root. Currently these
|
||||
* This is an orphan in the tree root. Currently these
|
||||
* could come from 2 sources:
|
||||
* a) a snapshot deletion in progress
|
||||
* a) a root (snapshot/subvolume) deletion in progress
|
||||
* b) a free space cache inode
|
||||
* We need to distinguish those two, as the snapshot
|
||||
* orphan must not get deleted.
|
||||
* find_dead_roots already ran before us, so if this
|
||||
* is a snapshot deletion, we should find the root
|
||||
* in the fs_roots radix tree.
|
||||
* We need to distinguish those two, as the orphan item
|
||||
* for a root must not get deleted before the deletion
|
||||
* of the snapshot/subvolume's tree completes.
|
||||
*
|
||||
* btrfs_find_orphan_roots() ran before us, which has
|
||||
* found all deleted roots and loaded them into
|
||||
* fs_info->fs_roots_radix. So here we can find if an
|
||||
* orphan item corresponds to a deleted root by looking
|
||||
* up the root from that radix tree.
|
||||
*/
|
||||
|
||||
spin_lock(&fs_info->fs_roots_radix_lock);
|
||||
@ -4332,7 +4329,11 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
|
||||
goto out_end_trans;
|
||||
}
|
||||
|
||||
btrfs_record_root_in_trans(trans, dest);
|
||||
ret = btrfs_record_root_in_trans(trans, dest);
|
||||
if (ret) {
|
||||
btrfs_abort_transaction(trans, ret);
|
||||
goto out_end_trans;
|
||||
}
|
||||
|
||||
memset(&dest->root_item.drop_progress, 0,
|
||||
sizeof(dest->root_item.drop_progress));
|
||||
@ -7026,7 +7027,7 @@ next:
|
||||
if (ret)
|
||||
goto out;
|
||||
} else {
|
||||
map = kmap(page);
|
||||
map = kmap_local_page(page);
|
||||
read_extent_buffer(leaf, map + pg_offset, ptr,
|
||||
copy_size);
|
||||
if (pg_offset + copy_size < PAGE_SIZE) {
|
||||
@ -7034,7 +7035,7 @@ next:
|
||||
PAGE_SIZE - pg_offset -
|
||||
copy_size);
|
||||
}
|
||||
kunmap(page);
|
||||
kunmap_local(map);
|
||||
}
|
||||
flush_dcache_page(page);
|
||||
}
|
||||
@ -7262,6 +7263,19 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
|
||||
return em;
|
||||
}
|
||||
|
||||
static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
|
||||
{
|
||||
struct btrfs_block_group *block_group;
|
||||
bool readonly = false;
|
||||
|
||||
block_group = btrfs_lookup_block_group(fs_info, bytenr);
|
||||
if (!block_group || block_group->ro)
|
||||
readonly = true;
|
||||
if (block_group)
|
||||
btrfs_put_block_group(block_group);
|
||||
return readonly;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if we can do nocow write into the range [@offset, @offset + @len)
|
||||
*
|
||||
@ -8403,17 +8417,11 @@ again:
|
||||
* for the finish_ordered_io
|
||||
*/
|
||||
if (TestClearPagePrivate2(page)) {
|
||||
struct btrfs_ordered_inode_tree *tree;
|
||||
u64 new_len;
|
||||
|
||||
tree = &inode->ordered_tree;
|
||||
|
||||
spin_lock_irq(&tree->lock);
|
||||
spin_lock_irq(&inode->ordered_tree.lock);
|
||||
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
|
||||
new_len = start - ordered->file_offset;
|
||||
if (new_len < ordered->truncated_len)
|
||||
ordered->truncated_len = new_len;
|
||||
spin_unlock_irq(&tree->lock);
|
||||
ordered->truncated_len = min(ordered->truncated_len,
|
||||
start - ordered->file_offset);
|
||||
spin_unlock_irq(&inode->ordered_tree.lock);
|
||||
|
||||
if (btrfs_dec_test_ordered_pending(inode, &ordered,
|
||||
start,
|
||||
@ -8539,6 +8547,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
|
||||
|
||||
ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
|
||||
again:
|
||||
down_read(&BTRFS_I(inode)->i_mmap_lock);
|
||||
lock_page(page);
|
||||
size = i_size_read(inode);
|
||||
|
||||
@ -8567,6 +8576,7 @@ again:
|
||||
unlock_extent_cached(io_tree, page_start, page_end,
|
||||
&cached_state);
|
||||
unlock_page(page);
|
||||
up_read(&BTRFS_I(inode)->i_mmap_lock);
|
||||
btrfs_start_ordered_extent(ordered, 1);
|
||||
btrfs_put_ordered_extent(ordered);
|
||||
goto again;
|
||||
@ -8619,11 +8629,10 @@ again:
|
||||
set_page_dirty(page);
|
||||
SetPageUptodate(page);
|
||||
|
||||
BTRFS_I(inode)->last_trans = fs_info->generation;
|
||||
BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
|
||||
BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
|
||||
btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
|
||||
|
||||
unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
|
||||
up_read(&BTRFS_I(inode)->i_mmap_lock);
|
||||
|
||||
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
|
||||
sb_end_pagefault(inode->i_sb);
|
||||
@ -8632,6 +8641,7 @@ again:
|
||||
|
||||
out_unlock:
|
||||
unlock_page(page);
|
||||
up_read(&BTRFS_I(inode)->i_mmap_lock);
|
||||
out:
|
||||
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
|
||||
btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
|
||||
@ -8883,6 +8893,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
|
||||
INIT_LIST_HEAD(&ei->delalloc_inodes);
|
||||
INIT_LIST_HEAD(&ei->delayed_iput);
|
||||
RB_CLEAR_NODE(&ei->rb_node);
|
||||
init_rwsem(&ei->i_mmap_lock);
|
||||
|
||||
return inode;
|
||||
}
|
||||
@ -9101,8 +9112,11 @@ static int btrfs_rename_exchange(struct inode *old_dir,
|
||||
goto out_notrans;
|
||||
}
|
||||
|
||||
if (dest != root)
|
||||
btrfs_record_root_in_trans(trans, dest);
|
||||
if (dest != root) {
|
||||
ret = btrfs_record_root_in_trans(trans, dest);
|
||||
if (ret)
|
||||
goto out_fail;
|
||||
}
|
||||
|
||||
/*
|
||||
* We need to find a free sequence number both in the source and
|
||||
@ -9406,8 +9420,11 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
|
||||
goto out_notrans;
|
||||
}
|
||||
|
||||
if (dest != root)
|
||||
btrfs_record_root_in_trans(trans, dest);
|
||||
if (dest != root) {
|
||||
ret = btrfs_record_root_in_trans(trans, dest);
|
||||
if (ret)
|
||||
goto out_fail;
|
||||
}
|
||||
|
||||
ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
|
||||
if (ret)
|
||||
@ -9919,7 +9936,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
|
||||
goto free_qgroup;
|
||||
}
|
||||
|
||||
ret = btrfs_replace_file_extents(&inode->vfs_inode, path, file_offset,
|
||||
ret = btrfs_replace_file_extents(inode, path, file_offset,
|
||||
file_offset + len - 1, &extent_info,
|
||||
&trans);
|
||||
btrfs_free_path(path);
|
||||
|
@ -226,7 +226,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
inode_lock(inode);
|
||||
btrfs_inode_lock(inode, 0);
|
||||
fsflags = btrfs_mask_fsflags_for_type(inode, fsflags);
|
||||
old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags);
|
||||
|
||||
@ -353,7 +353,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
|
||||
out_end_trans:
|
||||
btrfs_end_transaction(trans);
|
||||
out_unlock:
|
||||
inode_unlock(inode);
|
||||
btrfs_inode_unlock(inode, 0);
|
||||
mnt_drop_write_file(file);
|
||||
return ret;
|
||||
}
|
||||
@ -449,7 +449,7 @@ static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg)
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
inode_lock(inode);
|
||||
btrfs_inode_lock(inode, 0);
|
||||
|
||||
old_flags = binode->flags;
|
||||
old_i_flags = inode->i_flags;
|
||||
@ -501,7 +501,7 @@ out_unlock:
|
||||
inode->i_flags = old_i_flags;
|
||||
}
|
||||
|
||||
inode_unlock(inode);
|
||||
btrfs_inode_unlock(inode, 0);
|
||||
mnt_drop_write_file(file);
|
||||
|
||||
return ret;
|
||||
@ -697,8 +697,6 @@ static noinline int create_subvol(struct inode *dir,
|
||||
btrfs_set_root_otransid(root_item, trans->transid);
|
||||
|
||||
btrfs_tree_unlock(leaf);
|
||||
free_extent_buffer(leaf);
|
||||
leaf = NULL;
|
||||
|
||||
btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID);
|
||||
|
||||
@ -707,8 +705,22 @@ static noinline int create_subvol(struct inode *dir,
|
||||
key.type = BTRFS_ROOT_ITEM_KEY;
|
||||
ret = btrfs_insert_root(trans, fs_info->tree_root, &key,
|
||||
root_item);
|
||||
if (ret)
|
||||
if (ret) {
|
||||
/*
|
||||
* Since we don't abort the transaction in this case, free the
|
||||
* tree block so that we don't leak space and leave the
|
||||
* filesystem in an inconsistent state (an extent item in the
|
||||
* extent tree without backreferences). Also no need to have
|
||||
* the tree block locked since it is not in any tree at this
|
||||
* point, so no other task can find it and use it.
|
||||
*/
|
||||
btrfs_free_tree_block(trans, root, leaf, 0, 1);
|
||||
free_extent_buffer(leaf);
|
||||
goto fail;
|
||||
}
|
||||
|
||||
free_extent_buffer(leaf);
|
||||
leaf = NULL;
|
||||
|
||||
key.offset = (u64)-1;
|
||||
new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
|
||||
@ -721,7 +733,12 @@ static noinline int create_subvol(struct inode *dir,
|
||||
/* Freeing will be done in btrfs_put_root() of new_root */
|
||||
anon_dev = 0;
|
||||
|
||||
btrfs_record_root_in_trans(trans, new_root);
|
||||
ret = btrfs_record_root_in_trans(trans, new_root);
|
||||
if (ret) {
|
||||
btrfs_put_root(new_root);
|
||||
btrfs_abort_transaction(trans, ret);
|
||||
goto fail;
|
||||
}
|
||||
|
||||
ret = btrfs_create_subvol_root(trans, new_root, root);
|
||||
btrfs_put_root(new_root);
|
||||
@ -1014,7 +1031,7 @@ out_up_read:
|
||||
out_dput:
|
||||
dput(dentry);
|
||||
out_unlock:
|
||||
inode_unlock(dir);
|
||||
btrfs_inode_unlock(dir, 0);
|
||||
return error;
|
||||
}
|
||||
|
||||
@ -1612,7 +1629,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
|
||||
ra_index += cluster;
|
||||
}
|
||||
|
||||
inode_lock(inode);
|
||||
btrfs_inode_lock(inode, 0);
|
||||
if (IS_SWAPFILE(inode)) {
|
||||
ret = -ETXTBSY;
|
||||
} else {
|
||||
@ -1621,13 +1638,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
|
||||
ret = cluster_pages_for_defrag(inode, pages, i, cluster);
|
||||
}
|
||||
if (ret < 0) {
|
||||
inode_unlock(inode);
|
||||
btrfs_inode_unlock(inode, 0);
|
||||
goto out_ra;
|
||||
}
|
||||
|
||||
defrag_count += ret;
|
||||
balance_dirty_pages_ratelimited(inode->i_mapping);
|
||||
inode_unlock(inode);
|
||||
btrfs_inode_unlock(inode, 0);
|
||||
|
||||
if (newer_than) {
|
||||
if (newer_off == (u64)-1)
|
||||
@ -1675,9 +1692,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
|
||||
|
||||
out_ra:
|
||||
if (do_compress) {
|
||||
inode_lock(inode);
|
||||
btrfs_inode_lock(inode, 0);
|
||||
BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
|
||||
inode_unlock(inode);
|
||||
btrfs_inode_unlock(inode, 0);
|
||||
}
|
||||
if (!file)
|
||||
kfree(ra);
|
||||
@ -3112,9 +3129,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
inode_lock(inode);
|
||||
btrfs_inode_lock(inode, 0);
|
||||
err = btrfs_delete_subvolume(dir, dentry);
|
||||
inode_unlock(inode);
|
||||
btrfs_inode_unlock(inode, 0);
|
||||
if (!err) {
|
||||
fsnotify_rmdir(dir, dentry);
|
||||
d_delete(dentry);
|
||||
@ -3123,7 +3140,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
|
||||
out_dput:
|
||||
dput(dentry);
|
||||
out_unlock_dir:
|
||||
inode_unlock(dir);
|
||||
btrfs_inode_unlock(dir, 0);
|
||||
free_subvol_name:
|
||||
kfree(subvol_name_ptr);
|
||||
free_parent:
|
||||
|
@@ -118,7 +118,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
	struct workspace *workspace = list_entry(ws, struct workspace, list);
	int ret = 0;
	char *data_in;
-	char *cpage_out;
+	char *cpage_out, *sizes_ptr;
	int nr_pages = 0;
	struct page *in_page = NULL;
	struct page *out_page = NULL;
@@ -258,10 +258,9 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
	}

	/* store the size of all chunks of compressed data */
-	cpage_out = kmap(pages[0]);
-	write_compress_length(cpage_out, tot_out);
-
-	kunmap(pages[0]);
+	sizes_ptr = kmap_local_page(pages[0]);
+	write_compress_length(sizes_ptr, tot_out);
+	kunmap_local(sizes_ptr);

	ret = 0;
	*total_out = tot_out;
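
write_compress_length() stores a size as a 4-byte value at the front of the output page (btrfs's LZO framing keeps the total compressed size followed by per-segment lengths). A standalone approximation of that store, under the assumption that the on-disk length really is a plain little-endian 32-bit integer:

#include <stdint.h>
#include <string.h>

static void toy_write_compress_length(uint8_t *out, uint32_t len)
{
	/* Store len as 4 little-endian bytes, byte 0 being least significant. */
	uint8_t le[4] = {
		len & 0xff, (len >> 8) & 0xff, (len >> 16) & 0xff, (len >> 24) & 0xff,
	};

	memcpy(out, le, sizeof(le));
}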

@@ -107,17 +107,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
	return NULL;
}

-/*
- * helper to check if a given offset is inside a given entry
- */
-static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
-{
-	if (file_offset < entry->file_offset ||
-	    entry->file_offset + entry->num_bytes <= file_offset)
-		return 0;
-	return 1;
-}
-
static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
			  u64 len)
{
@@ -142,7 +131,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
	if (tree->last) {
		entry = rb_entry(tree->last, struct btrfs_ordered_extent,
				 rb_node);
-		if (offset_in_entry(entry, file_offset))
+		if (in_range(file_offset, entry->file_offset, entry->num_bytes))
			return tree->last;
	}
	ret = __tree_search(root, file_offset, &prev);
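
The in_range() form is equivalent to the removed offset_in_entry() helper. A quick standalone check of that equivalence, with in_range() spelled out the way btrfs appears to define it (a half-open interval [first, first + len)):

#include <assert.h>
#include <stdint.h>

/* Assumed shape of the macro; btrfs carries an in_range() of this form. */
#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))

/* The helper removed above, reduced to plain integers for the comparison. */
static int offset_in_entry(uint64_t start, uint64_t len, uint64_t file_offset)
{
	if (file_offset < start || start + len <= file_offset)
		return 0;
	return 1;
}

int main(void)
{
	const uint64_t start = 8192, len = 4096;

	for (uint64_t off = start - 1; off <= start + len; off++)
		assert(offset_in_entry(start, len, off) == in_range(off, start, len));
	return 0;
}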
|
||||
@ -349,7 +338,7 @@ bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
|
||||
goto out;
|
||||
|
||||
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
|
||||
if (!offset_in_entry(entry, *file_offset))
|
||||
if (!in_range(*file_offset, entry->file_offset, entry->num_bytes))
|
||||
goto out;
|
||||
|
||||
dec_start = max(*file_offset, entry->file_offset);
|
||||
@ -428,7 +417,7 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
|
||||
|
||||
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
|
||||
have_entry:
|
||||
if (!offset_in_entry(entry, file_offset))
|
||||
if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
|
||||
goto out;
|
||||
|
||||
if (io_size > entry->bytes_left)
|
||||
@ -779,7 +768,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino
|
||||
goto out;
|
||||
|
||||
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
|
||||
if (!offset_in_entry(entry, file_offset))
|
||||
if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
|
||||
entry = NULL;
|
||||
if (entry)
|
||||
refcount_inc(&entry->refs);
|
||||
|
@@ -39,8 +39,8 @@ struct btrfs_ordered_sum {
 */
enum {
	/*
-	 * Different types for direct io, one and only one of the 4 type can
-	 * be set when creating ordered extent.
+	 * Different types for ordered extents, one and only one of the 4 types
+	 * needs to be set when creating an ordered extent.
	 *
	 * REGULAR: For regular non-compressed COW write
	 * NOCOW: For NOCOW write into existing non-hole extent
|
||||
|
@ -23,6 +23,7 @@
|
||||
#include "qgroup.h"
|
||||
#include "block-group.h"
|
||||
#include "sysfs.h"
|
||||
#include "tree-mod-log.h"
|
||||
|
||||
/* TODO XXX FIXME
|
||||
* - subvol delete -> delete when ref goes to 0? delete limits also?
|
||||
@ -2639,12 +2640,12 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
|
||||
record->data_rsv,
|
||||
BTRFS_QGROUP_RSV_DATA);
|
||||
/*
|
||||
* Use SEQ_LAST as time_seq to do special search, which
|
||||
* doesn't lock tree or delayed_refs and search current
|
||||
* root. It's safe inside commit_transaction().
|
||||
* Use BTRFS_SEQ_LAST as time_seq to do special search,
|
||||
* which doesn't lock tree or delayed_refs and search
|
||||
* current root. It's safe inside commit_transaction().
|
||||
*/
|
||||
ret = btrfs_find_all_roots(trans, fs_info,
|
||||
record->bytenr, SEQ_LAST, &new_roots, false);
|
||||
record->bytenr, BTRFS_SEQ_LAST, &new_roots, false);
|
||||
if (ret < 0)
|
||||
goto cleanup;
|
||||
if (qgroup_to_skip) {
|
||||
@ -3543,37 +3544,19 @@ static int try_flush_qgroup(struct btrfs_root *root)
|
||||
{
|
||||
struct btrfs_trans_handle *trans;
|
||||
int ret;
|
||||
bool can_commit = true;
|
||||
|
||||
/*
|
||||
* If current process holds a transaction, we shouldn't flush, as we
|
||||
* assume all space reservation happens before a transaction handle is
|
||||
* held.
|
||||
*
|
||||
* But there are cases like btrfs_delayed_item_reserve_metadata() where
|
||||
* we try to reserve space with one transction handle already held.
|
||||
* In that case we can't commit transaction, but at least try to end it
|
||||
* and hope the started data writes can free some space.
|
||||
*/
|
||||
if (current->journal_info &&
|
||||
current->journal_info != BTRFS_SEND_TRANS_STUB)
|
||||
can_commit = false;
|
||||
/* Can't hold an open transaction or we run the risk of deadlocking */
|
||||
ASSERT(current->journal_info == NULL ||
|
||||
current->journal_info == BTRFS_SEND_TRANS_STUB);
|
||||
if (WARN_ON(current->journal_info &&
|
||||
current->journal_info != BTRFS_SEND_TRANS_STUB))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* We don't want to run flush again and again, so if there is a running
|
||||
* one, we won't try to start a new flush, but exit directly.
|
||||
*/
|
||||
if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
|
||||
/*
|
||||
* We are already holding a transaction, thus we can block other
|
||||
* threads from flushing. So exit right now. This increases
|
||||
* the chance of EDQUOT for heavy load and near limit cases.
|
||||
* But we can argue that if we're already near limit, EDQUOT is
|
||||
* unavoidable anyway.
|
||||
*/
|
||||
if (!can_commit)
|
||||
return 0;
|
||||
|
||||
wait_event(root->qgroup_flush_wait,
|
||||
!test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
|
||||
return 0;
|
||||
@ -3590,10 +3573,7 @@ static int try_flush_qgroup(struct btrfs_root *root)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (can_commit)
|
||||
ret = btrfs_commit_transaction(trans);
|
||||
else
|
||||
ret = btrfs_end_transaction(trans);
|
||||
out:
|
||||
clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
|
||||
wake_up(&root->qgroup_flush_wait);
|
||||
@ -3646,8 +3626,7 @@ cleanup:
|
||||
qgroup_unreserve_range(inode, reserved, start, len);
|
||||
out:
|
||||
if (new_reserved) {
|
||||
extent_changeset_release(reserved);
|
||||
kfree(reserved);
|
||||
extent_changeset_free(reserved);
|
||||
*reserved_ret = NULL;
|
||||
}
|
||||
return ret;
|
||||
|
@ -13,6 +13,7 @@
|
||||
#include <linux/list_sort.h>
|
||||
#include <linux/raid/xor.h>
|
||||
#include <linux/mm.h>
|
||||
#include "misc.h"
|
||||
#include "ctree.h"
|
||||
#include "disk-io.h"
|
||||
#include "volumes.h"
|
||||
@@ -1231,13 +1232,13 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
		/* first collect one page from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			p = page_in_rbio(rbio, stripe, pagenr, 0);
-			pointers[stripe] = kmap(p);
+			pointers[stripe] = kmap_local_page(p);
		}

		/* then add the parity stripe */
		p = rbio_pstripe_page(rbio, pagenr);
		SetPageUptodate(p);
-		pointers[stripe++] = kmap(p);
+		pointers[stripe++] = kmap_local_page(p);

		if (has_qstripe) {

@@ -1247,7 +1248,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
			 */
			p = rbio_qstripe_page(rbio, pagenr);
			SetPageUptodate(p);
-			pointers[stripe++] = kmap(p);
+			pointers[stripe++] = kmap_local_page(p);

			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
						pointers);
@@ -1256,10 +1257,8 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
			copy_page(pointers[nr_data], pointers[0]);
			run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
		}

-
-		for (stripe = 0; stripe < rbio->real_stripes; stripe++)
-			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+		for (stripe = stripe - 1; stripe >= 0; stripe--)
+			kunmap_local(pointers[stripe]);
	}
||||
|
||||
/*
|
||||
@@ -1776,6 +1775,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
{
int pagenr, stripe;
void **pointers;
void **unmap_array;
int faila = -1, failb = -1;
struct page *page;
blk_status_t err;
@@ -1787,6 +1787,16 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
goto cleanup_io;
}

/*
* Store copy of pointers that does not get reordered during
* reconstruction so that kunmap_local works.
*/
unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
if (!unmap_array) {
err = BLK_STS_RESOURCE;
goto cleanup_pointers;
}

faila = rbio->faila;
failb = rbio->failb;

@@ -1808,8 +1818,11 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
!test_bit(pagenr, rbio->dbitmap))
continue;

/* setup our array of pointers with pages
* from each stripe
/*
* Setup our array of pointers with pages from each stripe
*
* NOTE: store a duplicate array of pointers to preserve the
* pointer order
*/
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
/*
@@ -1823,7 +1836,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
} else {
page = rbio_stripe_page(rbio, stripe, pagenr);
}
pointers[stripe] = kmap(page);
pointers[stripe] = kmap_local_page(page);
unmap_array[stripe] = pointers[stripe];
}

/* all raid6 handling here */
@@ -1916,24 +1930,14 @@ pstripe:
}
}
}
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
/*
* if we're rebuilding a read, we have to use
* pages from the bio list
*/
if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
(stripe == faila || stripe == failb)) {
page = page_in_rbio(rbio, stripe, pagenr, 0);
} else {
page = rbio_stripe_page(rbio, stripe, pagenr);
}
kunmap(page);
}
for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
kunmap_local(unmap_array[stripe]);
}

err = BLK_STS_OK;
cleanup:
kfree(unmap_array);
cleanup_pointers:
kfree(pointers);

cleanup_io:
@@ -2358,13 +2362,13 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
goto cleanup;
}
SetPageUptodate(q_page);
pointers[rbio->real_stripes - 1] = kmap(q_page);
pointers[rbio->real_stripes - 1] = kmap_local_page(q_page);
}

atomic_set(&rbio->error, 0);

/* Map the parity stripe just once */
pointers[nr_data] = kmap(p_page);
pointers[nr_data] = kmap_local_page(p_page);

for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
struct page *p;
@@ -2372,7 +2376,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
/* first collect one page from each data stripe */
for (stripe = 0; stripe < nr_data; stripe++) {
p = page_in_rbio(rbio, stripe, pagenr, 0);
pointers[stripe] = kmap(p);
pointers[stripe] = kmap_local_page(p);
}

if (has_qstripe) {
@@ -2387,22 +2391,22 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,

/* Check scrubbing parity and repair it */
p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
parity = kmap(p);
parity = kmap_local_page(p);
if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
copy_page(parity, pointers[rbio->scrubp]);
else
/* Parity is right, needn't writeback */
bitmap_clear(rbio->dbitmap, pagenr, 1);
kunmap(p);
kunmap_local(parity);

for (stripe = 0; stripe < nr_data; stripe++)
kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
for (stripe = nr_data - 1; stripe >= 0; stripe--)
kunmap_local(pointers[stripe]);
}

kunmap(p_page);
kunmap_local(pointers[nr_data]);
__free_page(p_page);
if (q_page) {
kunmap(q_page);
kunmap_local(pointers[rbio->real_stripes - 1]);
__free_page(q_page);
}
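
The hunks above convert the raid56 stripe mapping from the global kmap()/kunmap() API to kmap_local_page()/kunmap_local(). Local mappings are per-thread and must be released in the reverse order they were created, which is why the rewritten loops keep an unchanging copy of the mapped pointers (unmap_array) and walk the arrays backwards when unmapping. As a rough, hypothetical illustration of that LIFO discipline (not taken from the patch; the helper name and the fixed-size array are made up for the sketch):

#include <linux/highmem.h>
#include <linux/mm_types.h>

/* XOR up to 16 pages into dst, mapping and unmapping in stack order. */
static void xor_pages_example(struct page **pages, int nr, u8 *dst)
{
	void *vaddr[16];	/* assumption: nr <= 16 for this sketch */
	int i;

	/* Map in ascending order... */
	for (i = 0; i < nr; i++)
		vaddr[i] = kmap_local_page(pages[i]);

	for (i = 0; i < nr; i++) {
		const u8 *src = vaddr[i];
		size_t j;

		for (j = 0; j < PAGE_SIZE; j++)
			dst[j] ^= src[j];
	}

	/* ...and unmap strictly in reverse (LIFO) order. */
	for (i = nr - 1; i >= 0; i--)
		kunmap_local(vaddr[i]);
}
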
@@ -478,9 +478,9 @@ process_slot:
clone_info.file_offset = new_key.offset;
clone_info.extent_buf = buf;
clone_info.is_new_extent = false;
ret = btrfs_replace_file_extents(inode, path, drop_start,
new_key.offset + datal - 1, &clone_info,
&trans);
ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
drop_start, new_key.offset + datal - 1,
&clone_info, &trans);
if (ret)
goto out;
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
@@ -567,8 +567,8 @@ process_slot:
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);

ret = btrfs_replace_file_extents(inode, path, last_dest_end,
destoff + len - 1, NULL, &trans);
ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
last_dest_end, destoff + len - 1, NULL, &trans);
if (ret)
goto out;

@@ -604,6 +604,20 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
}

static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2)
{
if (inode1 < inode2)
swap(inode1, inode2);
down_write(&BTRFS_I(inode1)->i_mmap_lock);
down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING);
}

static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2)
{
up_write(&BTRFS_I(inode1)->i_mmap_lock);
up_write(&BTRFS_I(inode2)->i_mmap_lock);
}

static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
struct inode *dst, u64 dst_loff)
{
@@ -820,6 +834,16 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
len, remap_flags);
}

static bool file_sync_write(const struct file *file)
{
if (file->f_flags & (__O_SYNC | O_DSYNC))
return true;
if (IS_SYNC(file_inode(file)))
return true;

return false;
}

loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
struct file *dst_file, loff_t destoff, loff_t len,
unsigned int remap_flags)
@@ -832,10 +856,12 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;

if (same_inode)
inode_lock(src_inode);
else
if (same_inode) {
btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP);
} else {
lock_two_nondirectories(src_inode, dst_inode);
btrfs_double_mmap_lock(src_inode, dst_inode);
}

ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
&len, remap_flags);
@@ -848,10 +874,27 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);

out_unlock:
if (same_inode)
inode_unlock(src_inode);
else
if (same_inode) {
btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP);
} else {
btrfs_double_mmap_unlock(src_inode, dst_inode);
unlock_two_nondirectories(src_inode, dst_inode);
}

/*
* If either the source or the destination file was opened with O_SYNC,
* O_DSYNC or has the S_SYNC attribute, fsync both the destination and
* source files/ranges, so that after a successful return (0) followed
* by a power failure results in the reflinked data to be readable from
* both files/ranges.
*/
if (ret == 0 && len > 0 &&
(file_sync_write(src_file) || file_sync_write(dst_file))) {
ret = btrfs_sync_file(src_file, off, off + len - 1, 0);
if (ret == 0)
ret = btrfs_sync_file(dst_file, destoff,
destoff + len - 1, 0);
}

return ret < 0 ? ret : len;
}
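
The reflink hunks above make btrfs_remap_file_range() honor O_SYNC, O_DSYNC and the S_SYNC inode attribute by fsyncing both the source and destination ranges before returning. A minimal userspace sketch of how this behavior is exercised (file names and the lack of real error handling are illustrative only, not part of the patch) is a clone via the FICLONE ioctl into a descriptor opened with O_DSYNC:

#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	/* Placeholder file names for the example. */
	int src = open("src.dat", O_RDONLY);
	int dst = open("dst.dat", O_WRONLY | O_CREAT | O_DSYNC, 0644);

	if (src < 0 || dst < 0) {
		perror("open");
		return 1;
	}

	/*
	 * Clone (reflink) src into dst. With the change above, a successful
	 * return on btrfs also implies the cloned range has been synced,
	 * because dst was opened with O_DSYNC.
	 */
	if (ioctl(dst, FICLONE, src) < 0) {
		perror("FICLONE");
		return 1;
	}

	close(src);
	close(dst);
	return 0;
}
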
@@ -638,9 +638,10 @@ static int __must_check __add_reloc_root(struct btrfs_root *root)
node->bytenr, &node->rb_node);
spin_unlock(&rc->reloc_root_tree.lock);
if (rb_node) {
btrfs_panic(fs_info, -EEXIST,
btrfs_err(fs_info,
"Duplicate root found for start=%llu while inserting into relocation tree",
node->bytenr);
return -EEXIST;
}

list_add_tail(&root->root_list, &rc->reloc_roots);
@ -733,10 +734,12 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
|
||||
struct extent_buffer *eb;
|
||||
struct btrfs_root_item *root_item;
|
||||
struct btrfs_key root_key;
|
||||
int ret;
|
||||
int ret = 0;
|
||||
bool must_abort = false;
|
||||
|
||||
root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
|
||||
BUG_ON(!root_item);
|
||||
if (!root_item)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
|
||||
root_key.type = BTRFS_ROOT_ITEM_KEY;
|
||||
@ -748,7 +751,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
|
||||
/* called by btrfs_init_reloc_root */
|
||||
ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
|
||||
BTRFS_TREE_RELOC_OBJECTID);
|
||||
BUG_ON(ret);
|
||||
if (ret)
|
||||
goto fail;
|
||||
|
||||
/*
|
||||
* Set the last_snapshot field to the generation of the commit
|
||||
* root - like this ctree.c:btrfs_block_can_be_shared() behaves
|
||||
@ -769,9 +774,16 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
|
||||
*/
|
||||
ret = btrfs_copy_root(trans, root, root->node, &eb,
|
||||
BTRFS_TREE_RELOC_OBJECTID);
|
||||
BUG_ON(ret);
|
||||
if (ret)
|
||||
goto fail;
|
||||
}
|
||||
|
||||
/*
|
||||
* We have changed references at this point, we must abort the
|
||||
* transaction if anything fails.
|
||||
*/
|
||||
must_abort = true;
|
||||
|
||||
memcpy(root_item, &root->root_item, sizeof(*root_item));
|
||||
btrfs_set_root_bytenr(root_item, eb->start);
|
||||
btrfs_set_root_level(root_item, btrfs_header_level(eb));
|
||||
@ -789,14 +801,25 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
|
||||
|
||||
ret = btrfs_insert_root(trans, fs_info->tree_root,
|
||||
&root_key, root_item);
|
||||
BUG_ON(ret);
|
||||
if (ret)
|
||||
goto fail;
|
||||
|
||||
kfree(root_item);
|
||||
|
||||
reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
|
||||
BUG_ON(IS_ERR(reloc_root));
|
||||
if (IS_ERR(reloc_root)) {
|
||||
ret = PTR_ERR(reloc_root);
|
||||
goto abort;
|
||||
}
|
||||
set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
|
||||
reloc_root->last_trans = trans->transid;
|
||||
return reloc_root;
|
||||
fail:
|
||||
kfree(root_item);
|
||||
abort:
|
||||
if (must_abort)
|
||||
btrfs_abort_transaction(trans, ret);
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -856,9 +879,16 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
|
||||
reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
|
||||
if (clear_rsv)
|
||||
trans->block_rsv = rsv;
|
||||
if (IS_ERR(reloc_root))
|
||||
return PTR_ERR(reloc_root);
|
||||
|
||||
ret = __add_reloc_root(reloc_root);
|
||||
BUG_ON(ret < 0);
|
||||
ASSERT(ret != -EEXIST);
|
||||
if (ret) {
|
||||
/* Pairs with create_reloc_root */
|
||||
btrfs_put_root(reloc_root);
|
||||
return ret;
|
||||
}
|
||||
root->reloc_root = btrfs_grab_root(reloc_root);
|
||||
return 0;
|
||||
}
|
||||
@ -875,7 +905,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
|
||||
int ret;
|
||||
|
||||
if (!have_reloc_root(root))
|
||||
goto out;
|
||||
return 0;
|
||||
|
||||
reloc_root = root->reloc_root;
|
||||
root_item = &reloc_root->root_item;
|
||||
@ -908,10 +938,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
|
||||
|
||||
ret = btrfs_update_root(trans, fs_info->tree_root,
|
||||
&reloc_root->root_key, root_item);
|
||||
BUG_ON(ret);
|
||||
btrfs_put_root(reloc_root);
|
||||
out:
|
||||
return 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1185,8 +1213,8 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
|
||||
int ret;
|
||||
int slot;
|
||||
|
||||
BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
|
||||
BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
|
||||
ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
|
||||
ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
|
||||
|
||||
last_snapshot = btrfs_root_last_snapshot(&src->root_item);
|
||||
again:
|
||||
@ -1205,7 +1233,11 @@ again:
|
||||
if (cow) {
|
||||
ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
|
||||
BTRFS_NESTING_COW);
|
||||
BUG_ON(ret);
|
||||
if (ret) {
|
||||
btrfs_tree_unlock(eb);
|
||||
free_extent_buffer(eb);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
if (next_key) {
|
||||
@ -1217,7 +1249,7 @@ again:
|
||||
parent = eb;
|
||||
while (1) {
|
||||
level = btrfs_header_level(parent);
|
||||
BUG_ON(level < lowest_level);
|
||||
ASSERT(level >= lowest_level);
|
||||
|
||||
ret = btrfs_bin_search(parent, &key, &slot);
|
||||
if (ret < 0)
|
||||
@ -1265,7 +1297,11 @@ again:
|
||||
ret = btrfs_cow_block(trans, dest, eb, parent,
|
||||
slot, &eb,
|
||||
BTRFS_NESTING_COW);
|
||||
BUG_ON(ret);
|
||||
if (ret) {
|
||||
btrfs_tree_unlock(eb);
|
||||
free_extent_buffer(eb);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
btrfs_tree_unlock(parent);
|
||||
@ -1289,7 +1325,11 @@ again:
|
||||
path->lowest_level = level;
|
||||
ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
|
||||
path->lowest_level = 0;
|
||||
BUG_ON(ret);
|
||||
if (ret) {
|
||||
if (ret > 0)
|
||||
ret = -ENOENT;
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* Info qgroup to trace both subtrees.
|
||||
@ -1329,27 +1369,39 @@ again:
|
||||
ref.skip_qgroup = true;
|
||||
btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
|
||||
ret = btrfs_inc_extent_ref(trans, &ref);
|
||||
BUG_ON(ret);
|
||||
if (ret) {
|
||||
btrfs_abort_transaction(trans, ret);
|
||||
break;
|
||||
}
|
||||
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
|
||||
blocksize, 0);
|
||||
ref.skip_qgroup = true;
|
||||
btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
|
||||
ret = btrfs_inc_extent_ref(trans, &ref);
|
||||
BUG_ON(ret);
|
||||
if (ret) {
|
||||
btrfs_abort_transaction(trans, ret);
|
||||
break;
|
||||
}
|
||||
|
||||
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
|
||||
blocksize, path->nodes[level]->start);
|
||||
btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
|
||||
ref.skip_qgroup = true;
|
||||
ret = btrfs_free_extent(trans, &ref);
|
||||
BUG_ON(ret);
|
||||
if (ret) {
|
||||
btrfs_abort_transaction(trans, ret);
|
||||
break;
|
||||
}
|
||||
|
||||
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
|
||||
blocksize, 0);
|
||||
btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
|
||||
ref.skip_qgroup = true;
|
||||
ret = btrfs_free_extent(trans, &ref);
|
||||
BUG_ON(ret);
|
||||
if (ret) {
|
||||
btrfs_abort_transaction(trans, ret);
|
||||
break;
|
||||
}
|
||||
|
||||
btrfs_unlock_up_safe(path, 0);
|
||||
|
||||
@ -1537,12 +1589,13 @@ static int find_next_key(struct btrfs_path *path, int level,
|
||||
/*
|
||||
* Insert current subvolume into reloc_control::dirty_subvol_roots
|
||||
*/
|
||||
static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
|
||||
static int insert_dirty_subvol(struct btrfs_trans_handle *trans,
|
||||
struct reloc_control *rc,
|
||||
struct btrfs_root *root)
|
||||
{
|
||||
struct btrfs_root *reloc_root = root->reloc_root;
|
||||
struct btrfs_root_item *reloc_root_item;
|
||||
int ret;
|
||||
|
||||
/* @root must be a subvolume tree root with a valid reloc tree */
|
||||
ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
|
||||
@ -1553,12 +1606,16 @@ static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
|
||||
sizeof(reloc_root_item->drop_progress));
|
||||
btrfs_set_root_drop_level(reloc_root_item, 0);
|
||||
btrfs_set_root_refs(reloc_root_item, 0);
|
||||
btrfs_update_reloc_root(trans, root);
|
||||
ret = btrfs_update_reloc_root(trans, root);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (list_empty(&root->reloc_dirty_list)) {
|
||||
btrfs_grab_root(root);
|
||||
list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int clean_dirty_subvols(struct reloc_control *rc)
|
||||
@ -1760,8 +1817,11 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
|
||||
out:
|
||||
btrfs_free_path(path);
|
||||
|
||||
if (ret == 0)
|
||||
insert_dirty_subvol(trans, rc, root);
|
||||
if (ret == 0) {
|
||||
ret = insert_dirty_subvol(trans, rc, root);
|
||||
if (ret)
|
||||
btrfs_abort_transaction(trans, ret);
|
||||
}
|
||||
|
||||
if (trans)
|
||||
btrfs_end_transaction_throttle(trans);
|
||||
@ -1825,8 +1885,18 @@ again:
|
||||
|
||||
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
|
||||
false);
|
||||
BUG_ON(IS_ERR(root));
|
||||
BUG_ON(root->reloc_root != reloc_root);
|
||||
if (IS_ERR(root)) {
|
||||
/*
|
||||
* Even if we have an error we need this reloc root
|
||||
* back on our list so we can clean up properly.
|
||||
*/
|
||||
list_add(&reloc_root->root_list, &reloc_roots);
|
||||
btrfs_abort_transaction(trans, (int)PTR_ERR(root));
|
||||
if (!err)
|
||||
err = PTR_ERR(root);
|
||||
break;
|
||||
}
|
||||
ASSERT(root->reloc_root == reloc_root);
|
||||
|
||||
/*
|
||||
* set reference count to 1, so btrfs_recover_relocation
|
||||
@ -1834,16 +1904,27 @@ again:
|
||||
*/
|
||||
if (!err)
|
||||
btrfs_set_root_refs(&reloc_root->root_item, 1);
|
||||
btrfs_update_reloc_root(trans, root);
|
||||
ret = btrfs_update_reloc_root(trans, root);
|
||||
|
||||
/*
|
||||
* Even if we have an error we need this reloc root back on our
|
||||
* list so we can clean up properly.
|
||||
*/
|
||||
list_add(&reloc_root->root_list, &reloc_roots);
|
||||
btrfs_put_root(root);
|
||||
|
||||
if (ret) {
|
||||
btrfs_abort_transaction(trans, ret);
|
||||
if (!err)
|
||||
err = ret;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
list_splice(&reloc_roots, &rc->reloc_roots);
|
||||
|
||||
if (!err)
|
||||
btrfs_commit_transaction(trans);
|
||||
err = btrfs_commit_transaction(trans);
|
||||
else
|
||||
btrfs_end_transaction(trans);
|
||||
return err;
|
||||
@ -1888,8 +1969,29 @@ again:
|
||||
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
|
||||
false);
|
||||
if (btrfs_root_refs(&reloc_root->root_item) > 0) {
|
||||
BUG_ON(IS_ERR(root));
|
||||
BUG_ON(root->reloc_root != reloc_root);
|
||||
if (IS_ERR(root)) {
|
||||
/*
|
||||
* For recovery we read the fs roots on mount,
|
||||
* and if we didn't find the root then we marked
|
||||
* the reloc root as a garbage root. For normal
|
||||
* relocation obviously the root should exist in
|
||||
* memory. However there's no reason we can't
|
||||
* handle the error properly here just in case.
|
||||
*/
|
||||
ASSERT(0);
|
||||
ret = PTR_ERR(root);
|
||||
goto out;
|
||||
}
|
||||
if (root->reloc_root != reloc_root) {
|
||||
/*
|
||||
* This is actually impossible without something
|
||||
* going really wrong (like weird race condition
|
||||
* or cosmic rays).
|
||||
*/
|
||||
ASSERT(0);
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
ret = merge_reloc_root(rc, root);
|
||||
btrfs_put_root(root);
|
||||
if (ret) {
|
||||
@ -1971,8 +2073,27 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
|
||||
return 0;
|
||||
|
||||
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false);
|
||||
BUG_ON(IS_ERR(root));
|
||||
BUG_ON(root->reloc_root != reloc_root);
|
||||
|
||||
/*
|
||||
* This should succeed, since we can't have a reloc root without having
|
||||
* already looked up the actual root and created the reloc root for this
|
||||
* root.
|
||||
*
|
||||
* However if there's some sort of corruption where we have a ref to a
|
||||
* reloc root without a corresponding root this could return ENOENT.
|
||||
*/
|
||||
if (IS_ERR(root)) {
|
||||
ASSERT(0);
|
||||
return PTR_ERR(root);
|
||||
}
|
||||
if (root->reloc_root != reloc_root) {
|
||||
ASSERT(0);
|
||||
btrfs_err(fs_info,
|
||||
"root %llu has two reloc roots associated with it",
|
||||
reloc_root->root_key.offset);
|
||||
btrfs_put_root(root);
|
||||
return -EUCLEAN;
|
||||
}
|
||||
ret = btrfs_record_root_in_trans(trans, root);
|
||||
btrfs_put_root(root);
|
||||
|
||||
@ -1988,26 +2109,77 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
|
||||
struct btrfs_backref_node *next;
|
||||
struct btrfs_root *root;
|
||||
int index = 0;
|
||||
int ret;
|
||||
|
||||
next = node;
|
||||
while (1) {
|
||||
cond_resched();
|
||||
next = walk_up_backref(next, edges, &index);
|
||||
root = next->root;
|
||||
BUG_ON(!root);
|
||||
BUG_ON(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state));
|
||||
|
||||
/*
|
||||
* If there is no root, then our references for this block are
|
||||
* incomplete, as we should be able to walk all the way up to a
|
||||
* block that is owned by a root.
|
||||
*
|
||||
* This path is only for SHAREABLE roots, so if we come upon a
|
||||
* non-SHAREABLE root then we have backrefs that resolve
|
||||
* improperly.
|
||||
*
|
||||
* Both of these cases indicate file system corruption, or a bug
|
||||
* in the backref walking code.
|
||||
*/
|
||||
if (!root) {
|
||||
ASSERT(0);
|
||||
btrfs_err(trans->fs_info,
|
||||
"bytenr %llu doesn't have a backref path ending in a root",
|
||||
node->bytenr);
|
||||
return ERR_PTR(-EUCLEAN);
|
||||
}
|
||||
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
|
||||
ASSERT(0);
|
||||
btrfs_err(trans->fs_info,
|
||||
"bytenr %llu has multiple refs with one ending in a non-shareable root",
|
||||
node->bytenr);
|
||||
return ERR_PTR(-EUCLEAN);
|
||||
}
|
||||
|
||||
if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
|
||||
record_reloc_root_in_trans(trans, root);
|
||||
ret = record_reloc_root_in_trans(trans, root);
|
||||
if (ret)
|
||||
return ERR_PTR(ret);
|
||||
break;
|
||||
}
|
||||
|
||||
btrfs_record_root_in_trans(trans, root);
|
||||
ret = btrfs_record_root_in_trans(trans, root);
|
||||
if (ret)
|
||||
return ERR_PTR(ret);
|
||||
root = root->reloc_root;
|
||||
|
||||
/*
|
||||
* We could have raced with another thread which failed, so
|
||||
* root->reloc_root may not be set, return ENOENT in this case.
|
||||
*/
|
||||
if (!root)
|
||||
return ERR_PTR(-ENOENT);
|
||||
|
||||
if (next->new_bytenr != root->node->start) {
|
||||
BUG_ON(next->new_bytenr);
|
||||
BUG_ON(!list_empty(&next->list));
|
||||
/*
|
||||
* We just created the reloc root, so we shouldn't have
|
||||
* ->new_bytenr set and this shouldn't be in the changed
|
||||
* list. If it is then we have multiple roots pointing
|
||||
* at the same bytenr which indicates corruption, or
|
||||
* we've made a mistake in the backref walking code.
|
||||
*/
|
||||
ASSERT(next->new_bytenr == 0);
|
||||
ASSERT(list_empty(&next->list));
|
||||
if (next->new_bytenr || !list_empty(&next->list)) {
|
||||
btrfs_err(trans->fs_info,
|
||||
"bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
|
||||
node->bytenr, next->bytenr);
|
||||
return ERR_PTR(-EUCLEAN);
|
||||
}
|
||||
|
||||
next->new_bytenr = root->node->start;
|
||||
btrfs_put_root(next->root);
|
||||
next->root = btrfs_grab_root(root);
|
||||
@ -2024,8 +2196,14 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
|
||||
if (!next || next->level <= node->level)
|
||||
break;
|
||||
}
|
||||
if (!root)
|
||||
return NULL;
|
||||
if (!root) {
|
||||
/*
|
||||
* This can happen if there's fs corruption or if there's a bug
|
||||
* in the backref lookup code.
|
||||
*/
|
||||
ASSERT(0);
|
||||
return ERR_PTR(-ENOENT);
|
||||
}
|
||||
|
||||
next = node;
|
||||
/* setup backref node path for btrfs_reloc_cow_block */
|
||||
@ -2061,7 +2239,13 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
|
||||
cond_resched();
|
||||
next = walk_up_backref(next, edges, &index);
|
||||
root = next->root;
|
||||
BUG_ON(!root);
|
||||
|
||||
/*
|
||||
* This can occur if we have incomplete extent refs leading all
|
||||
* the way up a particular path, in this case return -EUCLEAN.
|
||||
*/
|
||||
if (!root)
|
||||
return ERR_PTR(-EUCLEAN);
|
||||
|
||||
/* No other choice for non-shareable tree */
|
||||
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
|
||||
@ -2181,7 +2365,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
|
||||
int slot;
|
||||
int ret = 0;
|
||||
|
||||
BUG_ON(lowest && node->eb);
|
||||
/*
|
||||
* If we are lowest then this is the first time we're processing this
|
||||
* block, and thus shouldn't have an eb associated with it yet.
|
||||
*/
|
||||
ASSERT(!lowest || !node->eb);
|
||||
|
||||
path->lowest_level = node->level + 1;
|
||||
rc->backref_cache.path[node->level] = node;
|
||||
@ -2192,7 +2380,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
|
||||
|
||||
upper = edge->node[UPPER];
|
||||
root = select_reloc_root(trans, rc, upper, edges);
|
||||
BUG_ON(!root);
|
||||
if (IS_ERR(root)) {
|
||||
ret = PTR_ERR(root);
|
||||
goto next;
|
||||
}
|
||||
|
||||
if (upper->eb && !upper->locked) {
|
||||
if (!lowest) {
|
||||
@ -2266,7 +2457,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
|
||||
free_extent_buffer(eb);
|
||||
if (ret < 0)
|
||||
goto next;
|
||||
BUG_ON(node->eb != eb);
|
||||
/*
|
||||
* We've just COWed this block, it should have updated
|
||||
* the correct backref node entry.
|
||||
*/
|
||||
ASSERT(node->eb == eb);
|
||||
} else {
|
||||
btrfs_set_node_blockptr(upper->eb, slot,
|
||||
node->eb->start);
|
||||
@ -2281,10 +2476,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
|
||||
btrfs_init_tree_ref(&ref, node->level,
|
||||
btrfs_header_owner(upper->eb));
|
||||
ret = btrfs_inc_extent_ref(trans, &ref);
|
||||
BUG_ON(ret);
|
||||
|
||||
ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
|
||||
BUG_ON(ret);
|
||||
if (!ret)
|
||||
ret = btrfs_drop_subtree(trans, root, eb,
|
||||
upper->eb);
|
||||
if (ret)
|
||||
btrfs_abort_transaction(trans, ret);
|
||||
}
|
||||
next:
|
||||
if (!upper->pending)
|
||||
@ -2302,7 +2498,12 @@ next:
|
||||
}
|
||||
|
||||
path->lowest_level = 0;
|
||||
BUG_ON(ret == -ENOSPC);
|
||||
|
||||
/*
|
||||
* We should have allocated all of our space in the block rsv and thus
|
||||
* shouldn't ENOSPC.
|
||||
*/
|
||||
ASSERT(ret != -ENOSPC);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -2434,16 +2635,53 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
|
||||
|
||||
BUG_ON(node->processed);
|
||||
root = select_one_root(node);
|
||||
if (root == ERR_PTR(-ENOENT)) {
|
||||
if (IS_ERR(root)) {
|
||||
ret = PTR_ERR(root);
|
||||
|
||||
/* See explanation in select_one_root for the -EUCLEAN case. */
|
||||
ASSERT(ret == -ENOENT);
|
||||
if (ret == -ENOENT) {
|
||||
ret = 0;
|
||||
update_processed_blocks(rc, node);
|
||||
}
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (root) {
|
||||
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
|
||||
BUG_ON(node->new_bytenr);
|
||||
BUG_ON(!list_empty(&node->list));
|
||||
btrfs_record_root_in_trans(trans, root);
|
||||
/*
|
||||
* This block was the root block of a root, and this is
|
||||
* the first time we're processing the block and thus it
|
||||
* should not have had the ->new_bytenr modified and
|
||||
* should have not been included on the changed list.
|
||||
*
|
||||
* However in the case of corruption we could have
|
||||
* multiple refs pointing to the same block improperly,
|
||||
* and thus we would trip over these checks. ASSERT()
|
||||
* for the developer case, because it could indicate a
|
||||
* bug in the backref code, however error out for a
|
||||
* normal user in the case of corruption.
|
||||
*/
|
||||
ASSERT(node->new_bytenr == 0);
|
||||
ASSERT(list_empty(&node->list));
|
||||
if (node->new_bytenr || !list_empty(&node->list)) {
|
||||
btrfs_err(root->fs_info,
|
||||
"bytenr %llu has improper references to it",
|
||||
node->bytenr);
|
||||
ret = -EUCLEAN;
|
||||
goto out;
|
||||
}
|
||||
ret = btrfs_record_root_in_trans(trans, root);
|
||||
if (ret)
|
||||
goto out;
|
||||
/*
|
||||
* Another thread could have failed, need to check if we
|
||||
* have reloc_root actually set.
|
||||
*/
|
||||
if (!root->reloc_root) {
|
||||
ret = -ENOENT;
|
||||
goto out;
|
||||
}
|
||||
root = root->reloc_root;
|
||||
node->new_bytenr = root->node->start;
|
||||
btrfs_put_root(node->root);
|
||||
@ -2578,7 +2816,7 @@ static noinline_for_stack int prealloc_file_extent_cluster(
|
||||
return btrfs_end_transaction(trans);
|
||||
}
|
||||
|
||||
inode_lock(&inode->vfs_inode);
|
||||
btrfs_inode_lock(&inode->vfs_inode, 0);
|
||||
for (nr = 0; nr < cluster->nr; nr++) {
|
||||
start = cluster->boundary[nr] - offset;
|
||||
if (nr + 1 < cluster->nr)
|
||||
@ -2596,7 +2834,7 @@ static noinline_for_stack int prealloc_file_extent_cluster(
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
inode_unlock(&inode->vfs_inode);
|
||||
btrfs_inode_unlock(&inode->vfs_inode, 0);
|
||||
|
||||
if (cur_offset < prealloc_end)
|
||||
btrfs_free_reserved_data_space_noquota(inode->root->fs_info,
|
||||
@ -3220,20 +3458,6 @@ static void unset_reloc_control(struct reloc_control *rc)
|
||||
mutex_unlock(&fs_info->reloc_mutex);
|
||||
}
|
||||
|
||||
static int check_extent_flags(u64 flags)
|
||||
{
|
||||
if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
|
||||
(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
|
||||
return 1;
|
||||
if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
|
||||
!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
|
||||
return 1;
|
||||
if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
|
||||
(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static noinline_for_stack
|
||||
int prepare_to_relocate(struct reloc_control *rc)
|
||||
{
|
||||
@ -3272,8 +3496,7 @@ int prepare_to_relocate(struct reloc_control *rc)
|
||||
*/
|
||||
return PTR_ERR(trans);
|
||||
}
|
||||
btrfs_commit_transaction(trans);
|
||||
return 0;
|
||||
return btrfs_commit_transaction(trans);
|
||||
}
|
||||
|
||||
static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
|
||||
@ -3285,7 +3508,6 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
|
||||
struct btrfs_path *path;
|
||||
struct btrfs_extent_item *ei;
|
||||
u64 flags;
|
||||
u32 item_size;
|
||||
int ret;
|
||||
int err = 0;
|
||||
int progress = 0;
|
||||
@ -3334,19 +3556,7 @@ restart:
|
||||
|
||||
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
|
||||
struct btrfs_extent_item);
|
||||
item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
|
||||
if (item_size >= sizeof(*ei)) {
|
||||
flags = btrfs_extent_flags(path->nodes[0], ei);
|
||||
ret = check_extent_flags(flags);
|
||||
BUG_ON(ret);
|
||||
} else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
|
||||
err = -EINVAL;
|
||||
btrfs_print_v0_err(trans->fs_info);
|
||||
btrfs_abort_transaction(trans, err);
|
||||
break;
|
||||
} else {
|
||||
BUG();
|
||||
}
|
||||
|
||||
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
|
||||
ret = add_tree_block(rc, &key, path, &blocks);
|
||||
@ -3445,7 +3655,9 @@ restart:
|
||||
err = PTR_ERR(trans);
|
||||
goto out_free;
|
||||
}
|
||||
btrfs_commit_transaction(trans);
|
||||
ret = btrfs_commit_transaction(trans);
|
||||
if (ret && !err)
|
||||
err = ret;
|
||||
out_free:
|
||||
ret = clean_dirty_subvols(rc);
|
||||
if (ret < 0 && !err)
|
||||
@ -3488,6 +3700,35 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void delete_orphan_inode(struct btrfs_trans_handle *trans,
|
||||
struct btrfs_root *root, u64 objectid)
|
||||
{
|
||||
struct btrfs_path *path;
|
||||
struct btrfs_key key;
|
||||
int ret = 0;
|
||||
|
||||
path = btrfs_alloc_path();
|
||||
if (!path) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
key.objectid = objectid;
|
||||
key.type = BTRFS_INODE_ITEM_KEY;
|
||||
key.offset = 0;
|
||||
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
|
||||
if (ret) {
|
||||
if (ret > 0)
|
||||
ret = -ENOENT;
|
||||
goto out;
|
||||
}
|
||||
ret = btrfs_del_item(trans, root, path);
|
||||
out:
|
||||
if (ret)
|
||||
btrfs_abort_transaction(trans, ret);
|
||||
btrfs_free_path(path);
|
||||
}
|
||||
|
||||
/*
|
||||
* helper to create inode for data relocation.
|
||||
* the inode is in data relocation tree and its link count is 0
|
||||
@ -3514,10 +3755,16 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
|
||||
goto out;
|
||||
|
||||
err = __insert_orphan_inode(trans, root, objectid);
|
||||
BUG_ON(err);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
inode = btrfs_iget(fs_info->sb, objectid, root);
|
||||
BUG_ON(IS_ERR(inode));
|
||||
if (IS_ERR(inode)) {
|
||||
delete_orphan_inode(trans, root, objectid);
|
||||
err = PTR_ERR(inode);
|
||||
inode = NULL;
|
||||
goto out;
|
||||
}
|
||||
BTRFS_I(inode)->index_cnt = group->start;
|
||||
|
||||
err = btrfs_orphan_add(trans, BTRFS_I(inode));
|
||||
@ -3859,7 +4106,13 @@ int btrfs_recover_relocation(struct btrfs_root *root)
|
||||
}
|
||||
|
||||
err = __add_reloc_root(reloc_root);
|
||||
BUG_ON(err < 0); /* -ENOMEM or logic error */
|
||||
ASSERT(err != -EEXIST);
|
||||
if (err) {
|
||||
list_add_tail(&reloc_root->root_list, &reloc_roots);
|
||||
btrfs_put_root(fs_root);
|
||||
btrfs_end_transaction(trans);
|
||||
goto out_unset;
|
||||
}
|
||||
fs_root->reloc_root = btrfs_grab_root(reloc_root);
|
||||
btrfs_put_root(fs_root);
|
||||
}
|
||||
@ -4074,7 +4327,12 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
|
||||
return PTR_ERR(reloc_root);
|
||||
|
||||
ret = __add_reloc_root(reloc_root);
|
||||
BUG_ON(ret < 0);
|
||||
ASSERT(ret != -EEXIST);
|
||||
if (ret) {
|
||||
/* Pairs with create_reloc_root */
|
||||
btrfs_put_root(reloc_root);
|
||||
return ret;
|
||||
}
|
||||
new_root->reloc_root = btrfs_grab_root(reloc_root);
|
||||
|
||||
if (rc->create_reloc_tree)
|
||||
|
@ -206,9 +206,6 @@ struct full_stripe_lock {
|
||||
struct mutex mutex;
|
||||
};
|
||||
|
||||
static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
|
||||
static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
|
||||
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
|
||||
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
|
||||
struct scrub_block *sblocks_for_recheck);
|
||||
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
|
||||
@ -226,14 +223,11 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
|
||||
static int scrub_checksum_data(struct scrub_block *sblock);
|
||||
static int scrub_checksum_tree_block(struct scrub_block *sblock);
|
||||
static int scrub_checksum_super(struct scrub_block *sblock);
|
||||
static void scrub_block_get(struct scrub_block *sblock);
|
||||
static void scrub_block_put(struct scrub_block *sblock);
|
||||
static void scrub_page_get(struct scrub_page *spage);
|
||||
static void scrub_page_put(struct scrub_page *spage);
|
||||
static void scrub_parity_get(struct scrub_parity *sparity);
|
||||
static void scrub_parity_put(struct scrub_parity *sparity);
|
||||
static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
|
||||
struct scrub_page *spage);
|
||||
static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
|
||||
u64 physical, struct btrfs_device *dev, u64 flags,
|
||||
u64 gen, int mirror_num, u8 *csum,
|
||||
@ -251,8 +245,6 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
|
||||
static void scrub_wr_submit(struct scrub_ctx *sctx);
|
||||
static void scrub_wr_bio_end_io(struct bio *bio);
|
||||
static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
|
||||
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
|
||||
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
|
||||
static void scrub_put_ctx(struct scrub_ctx *sctx);
|
||||
|
||||
static inline int scrub_is_page_on_raid56(struct scrub_page *spage)
|
||||
@ -3682,8 +3674,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
|
||||
spin_lock(&cache->lock);
|
||||
if (!cache->to_copy) {
|
||||
spin_unlock(&cache->lock);
|
||||
ro_set = 0;
|
||||
goto done;
|
||||
btrfs_put_block_group(cache);
|
||||
goto skip;
|
||||
}
|
||||
spin_unlock(&cache->lock);
|
||||
}
|
||||
@ -3841,7 +3833,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
|
||||
cache, found_key.offset))
|
||||
ro_set = 0;
|
||||
|
||||
done:
|
||||
down_write(&dev_replace->rwsem);
|
||||
dev_replace->cursor_left = dev_replace->cursor_right;
|
||||
dev_replace->item_needs_writeback = 1;
|
||||
|
@ -6650,6 +6650,7 @@ static int full_send_tree(struct send_ctx *sctx)
|
||||
path = alloc_path_for_send();
|
||||
if (!path)
|
||||
return -ENOMEM;
|
||||
path->reada = READA_FORWARD_ALWAYS;
|
||||
|
||||
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
|
||||
key.type = BTRFS_INODE_ITEM_KEY;
|
||||
@ -6688,15 +6689,35 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int tree_move_down(struct btrfs_path *path, int *level)
|
||||
static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen)
|
||||
{
|
||||
struct extent_buffer *eb;
|
||||
struct extent_buffer *parent = path->nodes[*level];
|
||||
int slot = path->slots[*level];
|
||||
const int nritems = btrfs_header_nritems(parent);
|
||||
u64 reada_max;
|
||||
u64 reada_done = 0;
|
||||
|
||||
BUG_ON(*level == 0);
|
||||
eb = btrfs_read_node_slot(path->nodes[*level], path->slots[*level]);
|
||||
eb = btrfs_read_node_slot(parent, slot);
|
||||
if (IS_ERR(eb))
|
||||
return PTR_ERR(eb);
|
||||
|
||||
/*
|
||||
* Trigger readahead for the next leaves we will process, so that it is
|
||||
* very likely that when we need them they are already in memory and we
|
||||
* will not block on disk IO. For nodes we only do readahead for one,
|
||||
* since the time window between processing nodes is typically larger.
|
||||
*/
|
||||
reada_max = (*level == 1 ? SZ_128K : eb->fs_info->nodesize);
|
||||
|
||||
for (slot++; slot < nritems && reada_done < reada_max; slot++) {
|
||||
if (btrfs_node_ptr_generation(parent, slot) > reada_min_gen) {
|
||||
btrfs_readahead_node_child(parent, slot);
|
||||
reada_done += eb->fs_info->nodesize;
|
||||
}
|
||||
}
|
||||
|
||||
path->nodes[*level - 1] = eb;
|
||||
path->slots[*level - 1] = 0;
|
||||
(*level)--;
|
||||
@ -6736,14 +6757,15 @@ static int tree_move_next_or_upnext(struct btrfs_path *path,
|
||||
static int tree_advance(struct btrfs_path *path,
|
||||
int *level, int root_level,
|
||||
int allow_down,
|
||||
struct btrfs_key *key)
|
||||
struct btrfs_key *key,
|
||||
u64 reada_min_gen)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (*level == 0 || !allow_down) {
|
||||
ret = tree_move_next_or_upnext(path, level, root_level);
|
||||
} else {
|
||||
ret = tree_move_down(path, level);
|
||||
ret = tree_move_down(path, level, reada_min_gen);
|
||||
}
|
||||
if (ret >= 0) {
|
||||
if (*level == 0)
|
||||
@ -6817,6 +6839,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
|
||||
u64 right_blockptr;
|
||||
u64 left_gen;
|
||||
u64 right_gen;
|
||||
u64 reada_min_gen;
|
||||
|
||||
left_path = btrfs_alloc_path();
|
||||
if (!left_path) {
|
||||
@ -6896,6 +6919,14 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
/*
|
||||
* Our right root is the parent root, while the left root is the "send"
|
||||
* root. We know that all new nodes/leaves in the left root must have
|
||||
* a generation greater than the right root's generation, so we trigger
|
||||
* readahead for those nodes and leaves of the left root, as we know we
|
||||
* will need to read them at some point.
|
||||
*/
|
||||
reada_min_gen = btrfs_header_generation(right_root->commit_root);
|
||||
up_read(&fs_info->commit_root_sem);
|
||||
|
||||
if (left_level == 0)
|
||||
@ -6920,7 +6951,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
|
||||
ret = tree_advance(left_path, &left_level,
|
||||
left_root_level,
|
||||
advance_left != ADVANCE_ONLY_NEXT,
|
||||
&left_key);
|
||||
&left_key, reada_min_gen);
|
||||
if (ret == -1)
|
||||
left_end_reached = ADVANCE;
|
||||
else if (ret < 0)
|
||||
@ -6931,7 +6962,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
|
||||
ret = tree_advance(right_path, &right_level,
|
||||
right_root_level,
|
||||
advance_right != ADVANCE_ONLY_NEXT,
|
||||
&right_key);
|
||||
&right_key, reada_min_gen);
|
||||
if (ret == -1)
|
||||
right_end_reached = ADVANCE;
|
||||
else if (ret < 0)
|
||||
|
@@ -861,8 +861,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
* of heavy DIO or ordered reservations, preemptive flushing will just
* waste time and cause us to slow down.
*/
ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes);
delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
ordered = percpu_counter_read_positive(&fs_info->ordered_bytes);
delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
if (ordered >= delalloc)
used += fs_info->delayed_refs_rsv.reserved +
fs_info->delayed_block_rsv.reserved;
@ -4,6 +4,64 @@
|
||||
#include "ctree.h"
|
||||
#include "subpage.h"
|
||||
|
||||
/*
|
||||
* Subpage (sectorsize < PAGE_SIZE) support overview:
|
||||
*
|
||||
* Limitations:
|
||||
*
|
||||
* - Only support 64K page size for now
|
||||
* This is to make metadata handling easier, as 64K page would ensure
|
||||
* all nodesize would fit inside one page, thus we don't need to handle
|
||||
* cases where a tree block crosses several pages.
|
||||
*
|
||||
* - Only metadata read-write for now
|
||||
* The data read-write part is in development.
|
||||
*
|
||||
* - Metadata can't cross 64K page boundary
|
||||
* btrfs-progs and kernel have done that for a while, thus only ancient
|
||||
* filesystems could have such problem. For such case, do a graceful
|
||||
* rejection.
|
||||
*
|
||||
* Special behavior:
|
||||
*
|
||||
* - Metadata
|
||||
* Metadata read is fully supported.
|
||||
* Meaning when reading one tree block will only trigger the read for the
|
||||
* needed range, other unrelated range in the same page will not be touched.
|
||||
*
|
||||
* Metadata write support is partial.
|
||||
* The writeback is still for the full page, but we will only submit
|
||||
* the dirty extent buffers in the page.
|
||||
*
|
||||
* This means, if we have a metadata page like this:
|
||||
*
|
||||
* Page offset
|
||||
* 0 16K 32K 48K 64K
|
||||
* |/////////| |///////////|
|
||||
* \- Tree block A \- Tree block B
|
||||
*
|
||||
* Even if we just want to writeback tree block A, we will also writeback
|
||||
* tree block B if it's also dirty.
|
||||
*
|
||||
* This may cause extra metadata writeback which results more COW.
|
||||
*
|
||||
* Implementation:
|
||||
*
|
||||
* - Common
|
||||
* Both metadata and data will use a new structure, btrfs_subpage, to
|
||||
* record the status of each sector inside a page. This provides the extra
|
||||
* granularity needed.
|
||||
*
|
||||
* - Metadata
|
||||
* Since we have multiple tree blocks inside one page, we can't rely on page
|
||||
* locking anymore, or we will have greatly reduced concurrency or even
|
||||
* deadlocks (hold one tree lock while trying to lock another tree lock in
|
||||
* the same page).
|
||||
*
|
||||
* Thus for metadata locking, subpage support relies on io_tree locking only.
|
||||
* This means a slightly higher tree locking latency.
|
||||
*/
|
||||
|
||||
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
|
||||
struct page *page, enum btrfs_subpage_type type)
|
||||
{
|
||||
@ -220,6 +278,82 @@ void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info,
|
||||
spin_unlock_irqrestore(&subpage->lock, flags);
|
||||
}
|
||||
|
||||
void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
|
||||
struct page *page, u64 start, u32 len)
|
||||
{
|
||||
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
|
||||
u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&subpage->lock, flags);
|
||||
subpage->dirty_bitmap |= tmp;
|
||||
spin_unlock_irqrestore(&subpage->lock, flags);
|
||||
set_page_dirty(page);
|
||||
}
|
||||
|
||||
/*
|
||||
* Extra clear_and_test function for subpage dirty bitmap.
|
||||
*
|
||||
* Return true if we're the last bits in the dirty_bitmap and clear the
|
||||
* dirty_bitmap.
|
||||
* Return false otherwise.
|
||||
*
|
||||
* NOTE: Callers should manually clear page dirty for true case, as we have
|
||||
* extra handling for tree blocks.
|
||||
*/
|
||||
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
|
||||
struct page *page, u64 start, u32 len)
|
||||
{
|
||||
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
|
||||
u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
|
||||
unsigned long flags;
|
||||
bool last = false;
|
||||
|
||||
spin_lock_irqsave(&subpage->lock, flags);
|
||||
subpage->dirty_bitmap &= ~tmp;
|
||||
if (subpage->dirty_bitmap == 0)
|
||||
last = true;
|
||||
spin_unlock_irqrestore(&subpage->lock, flags);
|
||||
return last;
|
||||
}
|
||||
|
||||
void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
|
||||
struct page *page, u64 start, u32 len)
|
||||
{
|
||||
bool last;
|
||||
|
||||
last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len);
|
||||
if (last)
|
||||
clear_page_dirty_for_io(page);
|
||||
}
|
||||
|
||||
void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
|
||||
struct page *page, u64 start, u32 len)
|
||||
{
|
||||
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
|
||||
u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&subpage->lock, flags);
|
||||
subpage->writeback_bitmap |= tmp;
|
||||
set_page_writeback(page);
|
||||
spin_unlock_irqrestore(&subpage->lock, flags);
|
||||
}
|
||||
|
||||
void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
|
||||
struct page *page, u64 start, u32 len)
|
||||
{
|
||||
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
|
||||
u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&subpage->lock, flags);
|
||||
subpage->writeback_bitmap &= ~tmp;
|
||||
if (subpage->writeback_bitmap == 0)
|
||||
end_page_writeback(page);
|
||||
spin_unlock_irqrestore(&subpage->lock, flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* Unlike set/clear which is dependent on each page status, for test all bits
|
||||
* are tested in the same way.
|
||||
@ -240,6 +374,8 @@ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
|
||||
}
|
||||
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
|
||||
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
|
||||
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
|
||||
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
|
||||
|
||||
/*
|
||||
* Note that, in selftests (extent-io-tests), we can have empty fs_info passed
|
||||
@ -276,3 +412,7 @@ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
|
||||
IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
|
||||
PageUptodate);
|
||||
IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError);
|
||||
IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
|
||||
PageDirty);
|
||||
IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
|
||||
PageWriteback);
|
||||
|
@@ -20,6 +20,8 @@ struct btrfs_subpage {
spinlock_t lock;
u16 uptodate_bitmap;
u16 error_bitmap;
u16 dirty_bitmap;
u16 writeback_bitmap;
union {
/*
* Structures only used by metadata
@@ -87,5 +89,10 @@ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \

DECLARE_BTRFS_SUBPAGE_OPS(uptodate);
DECLARE_BTRFS_SUBPAGE_OPS(error);
DECLARE_BTRFS_SUBPAGE_OPS(dirty);
DECLARE_BTRFS_SUBPAGE_OPS(writeback);

bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len);

#endif
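
The btrfs_subpage structure above tracks uptodate/error/dirty/writeback state with one u16 bitmap each, one bit per sector, which is enough for sixteen 4K sectors inside a 64K page. A rough sketch of the range-to-bitmap idea (the helper name and signature below are made up for illustration; in the actual code this is done by btrfs_subpage_calc_bitmap()):

#include <linux/types.h>

/*
 * Illustrative helper, not the kernel's implementation: compute the u16
 * bitmap covering the byte range [start, start + len) inside one 64K page,
 * where bit N stands for sector N of that page.
 */
static u16 subpage_range_to_bitmap_example(u64 page_start, u64 start, u32 len,
					   u32 sectorsize)
{
	const int first_bit = (start - page_start) / sectorsize;
	const int nbits = len / sectorsize;

	/* e.g. start at 8K, len 12K, 4K sectors -> bits 2..4 -> 0x001c */
	return (u16)(((1u << nbits) - 1) << first_bit);
}
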
@@ -252,6 +252,32 @@ void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
}
#endif

#if BITS_PER_LONG == 32
void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info)
{
if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) {
btrfs_warn(fs_info, "reaching 32bit limit for logical addresses");
btrfs_warn(fs_info,
"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT",
BTRFS_32BIT_MAX_FILE_SIZE >> 40);
btrfs_warn(fs_info,
"please consider upgrading to 64bit kernel/hardware");
}
}

void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
{
if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) {
btrfs_err(fs_info, "reached 32bit limit for logical addresses");
btrfs_err(fs_info,
"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed",
BTRFS_32BIT_MAX_FILE_SIZE >> 40);
btrfs_err(fs_info,
"please consider upgrading to 64bit kernel/hardware");
}
}
#endif

/*
* We only mark the transaction aborted and then set the file system read-only.
* This will prevent new transactions from starting or trying to join this
@@ -360,11 +360,26 @@ static ssize_t supported_rescue_options_show(struct kobject *kobj,
BTRFS_ATTR(static_feature, supported_rescue_options,
supported_rescue_options_show);

static ssize_t supported_sectorsizes_show(struct kobject *kobj,
struct kobj_attribute *a,
char *buf)
{
ssize_t ret = 0;

/* Only sectorsize == PAGE_SIZE is now supported */
ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%lu\n", PAGE_SIZE);

return ret;
}
BTRFS_ATTR(static_feature, supported_sectorsizes,
supported_sectorsizes_show);

static struct attribute *btrfs_supported_static_feature_attrs[] = {
BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
BTRFS_ATTR_PTR(static_feature, supported_checksums),
BTRFS_ATTR_PTR(static_feature, send_stream_version),
BTRFS_ATTR_PTR(static_feature, supported_rescue_options),
BTRFS_ATTR_PTR(static_feature, supported_sectorsizes),
NULL
};

@ -965,6 +980,40 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,
|
||||
}
|
||||
BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store);
|
||||
|
||||
static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj,
|
||||
struct kobj_attribute *a,
|
||||
char *buf)
|
||||
{
|
||||
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
|
||||
ssize_t ret;
|
||||
|
||||
ret = scnprintf(buf, PAGE_SIZE, "%d\n", fs_info->bg_reclaim_threshold);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
|
||||
struct kobj_attribute *a,
|
||||
const char *buf, size_t len)
|
||||
{
|
||||
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
|
||||
int thresh;
|
||||
int ret;
|
||||
|
||||
ret = kstrtoint(buf, 10, &thresh);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (thresh <= 50 || thresh > 100)
|
||||
return -EINVAL;
|
||||
|
||||
fs_info->bg_reclaim_threshold = thresh;
|
||||
|
||||
return len;
|
||||
}
|
||||
BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show,
|
||||
btrfs_bg_reclaim_threshold_store);
|
||||
|
||||
static const struct attribute *btrfs_attrs[] = {
|
||||
BTRFS_ATTR_PTR(, label),
|
||||
BTRFS_ATTR_PTR(, nodesize),
|
||||
@ -976,6 +1025,7 @@ static const struct attribute *btrfs_attrs[] = {
|
||||
BTRFS_ATTR_PTR(, exclusive_operation),
|
||||
BTRFS_ATTR_PTR(, generation),
|
||||
BTRFS_ATTR_PTR(, read_policy),
|
||||
BTRFS_ATTR_PTR(, bg_reclaim_threshold),
|
||||
NULL,
|
||||
};
|
||||
|
||||
|
@ -260,6 +260,7 @@ static inline int extwriter_counter_read(struct btrfs_transaction *trans)
|
||||
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
|
||||
{
|
||||
struct btrfs_fs_info *fs_info = trans->fs_info;
|
||||
struct btrfs_transaction *cur_trans = trans->transaction;
|
||||
|
||||
if (!trans->chunk_bytes_reserved)
|
||||
return;
|
||||
@ -268,6 +269,8 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
|
||||
|
||||
btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
|
||||
trans->chunk_bytes_reserved, NULL);
|
||||
atomic64_sub(trans->chunk_bytes_reserved, &cur_trans->chunk_bytes_reserved);
|
||||
cond_wake_up(&cur_trans->chunk_reserve_wait);
|
||||
trans->chunk_bytes_reserved = 0;
|
||||
}
|
||||
|
||||
@ -383,6 +386,8 @@ loop:
|
||||
spin_lock_init(&cur_trans->dropped_roots_lock);
|
||||
INIT_LIST_HEAD(&cur_trans->releasing_ebs);
|
||||
spin_lock_init(&cur_trans->releasing_ebs_lock);
|
||||
atomic64_set(&cur_trans->chunk_bytes_reserved, 0);
|
||||
init_waitqueue_head(&cur_trans->chunk_reserve_wait);
|
||||
list_add_tail(&cur_trans->list, &fs_info->trans_list);
|
||||
extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
|
||||
IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
|
||||
@ -408,6 +413,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
|
||||
int force)
|
||||
{
|
||||
struct btrfs_fs_info *fs_info = root->fs_info;
|
||||
int ret = 0;
|
||||
|
||||
if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
|
||||
root->last_trans < trans->transid) || force) {
|
||||
@ -456,11 +462,11 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
|
||||
* lock. smp_wmb() makes sure that all the writes above are
|
||||
* done before we pop in the zero below
|
||||
*/
|
||||
btrfs_init_reloc_root(trans, root);
|
||||
ret = btrfs_init_reloc_root(trans, root);
|
||||
smp_mb__before_atomic();
|
||||
clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
|
||||
}
|
||||
return 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@ -487,6 +493,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
|
||||
struct btrfs_root *root)
|
||||
{
|
||||
struct btrfs_fs_info *fs_info = root->fs_info;
|
||||
int ret;
|
||||
|
||||
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
|
||||
return 0;
|
||||
@ -501,10 +508,10 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
|
||||
return 0;
|
||||
|
||||
mutex_lock(&fs_info->reloc_mutex);
|
||||
record_root_in_trans(trans, root, 0);
|
||||
	ret = record_root_in_trans(trans, root, 0);
	mutex_unlock(&fs_info->reloc_mutex);

	return 0;
	return ret;
}

static inline int is_transaction_blocked(struct btrfs_transaction *trans)
@@ -741,7 +748,16 @@ got_it:
	 * Thus it need to be called after current->journal_info initialized,
	 * or we can deadlock.
	 */
	btrfs_record_root_in_trans(h, root);
	ret = btrfs_record_root_in_trans(h, root);
	if (ret) {
		/*
		 * The transaction handle is fully initialized and linked with
		 * other structures so it needs to be ended in case of errors,
		 * not just freed.
		 */
		btrfs_end_transaction(h);
		return ERR_PTR(ret);
	}

	return h;

@@ -1347,7 +1363,9 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
			spin_unlock(&fs_info->fs_roots_radix_lock);

			btrfs_free_log(trans, root);
			btrfs_update_reloc_root(trans, root);
			ret2 = btrfs_update_reloc_root(trans, root);
			if (ret2)
				return ret2;

			/* see comments in should_cow_block() */
			clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
@@ -1440,7 +1458,9 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
	 * recorded root will never be updated again, causing an outdated root
	 * item.
	 */
	record_root_in_trans(trans, src, 1);
	ret = record_root_in_trans(trans, src, 1);
	if (ret)
		return ret;

	/*
	 * btrfs_qgroup_inherit relies on a consistent view of the usage for the
@@ -1509,7 +1529,7 @@ out:
	 *    insert_dir_item()
	 */
	if (!ret)
		record_root_in_trans(trans, parent, 1);
		ret = record_root_in_trans(trans, parent, 1);
	return ret;
}

@@ -1586,8 +1606,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
	dentry = pending->dentry;
	parent_inode = pending->dir;
	parent_root = BTRFS_I(parent_inode)->root;
	record_root_in_trans(trans, parent_root, 0);
	ret = record_root_in_trans(trans, parent_root, 0);
	if (ret)
		goto fail;
	cur_time = current_time(parent_inode);

	/*
@@ -1623,7 +1644,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
		goto fail;
	}

	record_root_in_trans(trans, root, 0);
	ret = record_root_in_trans(trans, root, 0);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}
	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
	btrfs_check_and_init_root_item(new_root_item);
@@ -1961,7 +1986,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
	 */
	BUG_ON(list_empty(&cur_trans->list));

	list_del_init(&cur_trans->list);
	if (cur_trans == fs_info->running_transaction) {
		cur_trans->state = TRANS_STATE_COMMIT_DOING;
		spin_unlock(&fs_info->trans_lock);
@@ -1970,6 +1994,17 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)

		spin_lock(&fs_info->trans_lock);
	}

	/*
	 * Now that we know no one else is still using the transaction we can
	 * remove the transaction from the list of transactions. This avoids
	 * the transaction kthread from cleaning up the transaction while some
	 * other task is still using it, which could result in a use-after-free
	 * on things like log trees, as it forces the transaction kthread to
	 * wait for this transaction to be cleaned up by us.
	 */
	list_del_init(&cur_trans->list);

	spin_unlock(&fs_info->trans_lock);

	btrfs_cleanup_one_transaction(trans->transaction, fs_info);
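The new failure path in start_transaction() above ends the handle and reports the error through the returned pointer, so callers must check it with IS_ERR()/PTR_ERR() rather than for NULL. A minimal userspace model of that convention (the kernel's real macros live in include/linux/err.h; the lowercase names below are stand-ins for illustration only):

#include <stdio.h>
#include <errno.h>

/* Simplified stand-ins for the kernel's ERR_PTR()/IS_ERR()/PTR_ERR(). */
#define MAX_ERRNO 4095
static inline void *err_ptr(long error) { return (void *)error; }
static inline int is_err(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}
static inline long ptr_err(const void *ptr) { return (long)ptr; }

static void *start_something(int fail)
{
	static int object;

	if (fail)
		return err_ptr(-ENOSPC);	/* like "return ERR_PTR(ret);" above */
	return &object;
}

int main(void)
{
	void *h = start_something(1);

	if (is_err(h))
		printf("start failed: %ld\n", ptr_err(h));
	return 0;
}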
@@ -96,6 +96,13 @@ struct btrfs_transaction {
	spinlock_t releasing_ebs_lock;
	struct list_head releasing_ebs;

	/*
	 * The number of bytes currently reserved, by all transaction handles
	 * attached to this transaction, for metadata extents of the chunk tree.
	 */
	atomic64_t chunk_bytes_reserved;
	wait_queue_head_t chunk_reserve_wait;
};

#define __TRANS_FREEZABLE (1U << 0)
@@ -175,7 +182,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
	spin_lock(&inode->lock);
	inode->last_trans = trans->transaction->transid;
	inode->last_sub_trans = inode->root->log_transid;
	inode->last_log_commit = inode->root->last_log_commit;
	inode->last_log_commit = inode->last_sub_trans - 1;
	spin_unlock(&inode->lock);
}
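The change in btrfs_set_inode_last_trans() sets last_log_commit to one less than last_sub_trans instead of copying the root's value, which keeps the inode looking "not yet logged" for its current sub-transaction. A rough model of the comparison a skip-fsync check would make (the helper name and exact predicate here are an assumption for illustration, not the kernel's code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative only: an inode counts as already logged when its last
 * sub-transaction is not newer than the last log commit it took part in. */
static bool inode_already_logged(uint64_t last_sub_trans, uint64_t last_log_commit)
{
	return last_sub_trans <= last_log_commit;
}

int main(void)
{
	uint64_t last_sub_trans = 5;

	/* New scheme: last_log_commit = last_sub_trans - 1, so a following
	 * fsync still sees work to do for this inode. */
	printf("%d\n", inode_already_logged(last_sub_trans, last_sub_trans - 1));
	return 0;
}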
@@ -1290,6 +1290,11 @@ static int check_extent_item(struct extent_buffer *leaf,
				   key->offset, fs_info->sectorsize);
			return -EUCLEAN;
		}
		if (unlikely(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			extent_err(leaf, slot,
			"invalid extent flag, data has full backref set");
			return -EUCLEAN;
		}
	}
	ptr = (unsigned long)(struct btrfs_extent_item *)(ei + 1);
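The added tree-checker rule rejects data extent items that carry BTRFS_BLOCK_FLAG_FULL_BACKREF, a flag that is only meaningful for tree blocks. A standalone illustration of the flag test (the constants below are made up for the example, not the on-disk bit values):

#include <stdint.h>
#include <stdio.h>

/* Example flag values, not the real on-disk bits. */
#define EXAMPLE_FLAG_DATA		(1ULL << 0)
#define EXAMPLE_FLAG_FULL_BACKREF	(1ULL << 8)

static int check_extent_flags(uint64_t flags)
{
	/* A data extent must never have the full-backref bit set. */
	if ((flags & EXAMPLE_FLAG_DATA) && (flags & EXAMPLE_FLAG_FULL_BACKREF))
		return -1;	/* corresponds to returning -EUCLEAN */
	return 0;
}

int main(void)
{
	printf("%d\n", check_extent_flags(EXAMPLE_FLAG_DATA | EXAMPLE_FLAG_FULL_BACKREF));
	return 0;
}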
@@ -3165,20 +3165,22 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
	 */
	mutex_unlock(&root->log_mutex);

	btrfs_init_log_ctx(&root_log_ctx, NULL);

	mutex_lock(&log_root_tree->log_mutex);

	if (btrfs_is_zoned(fs_info)) {
		mutex_lock(&fs_info->tree_root->log_mutex);
		if (!log_root_tree->node) {
			ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
			if (ret) {
				mutex_unlock(&log_root_tree->log_mutex);
				mutex_unlock(&fs_info->tree_log_mutex);
				goto out;
			}
		}
		mutex_unlock(&fs_info->tree_root->log_mutex);
	}

	btrfs_init_log_ctx(&root_log_ctx, NULL);

	mutex_lock(&log_root_tree->log_mutex);

	index2 = log_root_tree->log_transid % 2;
	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
	root_log_ctx.log_transid = log_root_tree->log_transid;
@@ -6278,7 +6280,12 @@ again:
		}

		wc.replay_dest->log_root = log;
		btrfs_record_root_in_trans(trans, wc.replay_dest);
		ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
		if (ret)
			/* The loop needs to continue due to the root refs */
			btrfs_handle_fs_error(fs_info, ret,
				"failed to record the log root in transaction");
		else
			ret = walk_log_tree(trans, log, &wc);

		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
fs/btrfs/tree-mod-log.c (new file, 929 lines)
@@ -0,0 +1,929 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include "tree-mod-log.h"
|
||||
#include "disk-io.h"
|
||||
|
||||
struct tree_mod_root {
|
||||
u64 logical;
|
||||
u8 level;
|
||||
};
|
||||
|
||||
struct tree_mod_elem {
|
||||
struct rb_node node;
|
||||
u64 logical;
|
||||
u64 seq;
|
||||
enum btrfs_mod_log_op op;
|
||||
|
||||
/*
|
||||
* This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
|
||||
* operations.
|
||||
*/
|
||||
int slot;
|
||||
|
||||
/* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */
|
||||
u64 generation;
|
||||
|
||||
/* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
|
||||
struct btrfs_disk_key key;
|
||||
u64 blockptr;
|
||||
|
||||
/* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
|
||||
struct {
|
||||
int dst_slot;
|
||||
int nr_items;
|
||||
} move;
|
||||
|
||||
/* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
|
||||
struct tree_mod_root old_root;
|
||||
};
|
||||
|
||||
/*
|
||||
* Pull a new tree mod seq number for our operation.
|
||||
*/
|
||||
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
|
||||
{
|
||||
return atomic64_inc_return(&fs_info->tree_mod_seq);
|
||||
}
|
||||
|
||||
/*
|
||||
* This adds a new blocker to the tree mod log's blocker list if the @elem
|
||||
* passed does not already have a sequence number set. So when a caller expects
|
||||
* to record tree modifications, it should ensure to set elem->seq to zero
|
||||
* before calling btrfs_get_tree_mod_seq.
|
||||
* Returns a fresh, unused tree log modification sequence number, even if no new
|
||||
* blocker was added.
|
||||
*/
|
||||
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
|
||||
struct btrfs_seq_list *elem)
|
||||
{
|
||||
write_lock(&fs_info->tree_mod_log_lock);
|
||||
if (!elem->seq) {
|
||||
elem->seq = btrfs_inc_tree_mod_seq(fs_info);
|
||||
list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
|
||||
set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
|
||||
}
|
||||
write_unlock(&fs_info->tree_mod_log_lock);
|
||||
|
||||
return elem->seq;
|
||||
}
|
||||
|
||||
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
|
||||
struct btrfs_seq_list *elem)
|
||||
{
|
||||
struct rb_root *tm_root;
|
||||
struct rb_node *node;
|
||||
struct rb_node *next;
|
||||
struct tree_mod_elem *tm;
|
||||
u64 min_seq = BTRFS_SEQ_LAST;
|
||||
u64 seq_putting = elem->seq;
|
||||
|
||||
if (!seq_putting)
|
||||
return;
|
||||
|
||||
write_lock(&fs_info->tree_mod_log_lock);
|
||||
list_del(&elem->list);
|
||||
elem->seq = 0;
|
||||
|
||||
if (list_empty(&fs_info->tree_mod_seq_list)) {
|
||||
clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
|
||||
} else {
|
||||
struct btrfs_seq_list *first;
|
||||
|
||||
first = list_first_entry(&fs_info->tree_mod_seq_list,
|
||||
struct btrfs_seq_list, list);
|
||||
if (seq_putting > first->seq) {
|
||||
/*
|
||||
* Blocker with lower sequence number exists, we cannot
|
||||
* remove anything from the log.
|
||||
*/
|
||||
write_unlock(&fs_info->tree_mod_log_lock);
|
||||
return;
|
||||
}
|
||||
min_seq = first->seq;
|
||||
}
|
||||
|
||||
/*
|
||||
* Anything that's lower than the lowest existing (read: blocked)
|
||||
* sequence number can be removed from the tree.
|
||||
*/
|
||||
tm_root = &fs_info->tree_mod_log;
|
||||
for (node = rb_first(tm_root); node; node = next) {
|
||||
next = rb_next(node);
|
||||
tm = rb_entry(node, struct tree_mod_elem, node);
|
||||
if (tm->seq >= min_seq)
|
||||
continue;
|
||||
rb_erase(node, tm_root);
|
||||
kfree(tm);
|
||||
}
|
||||
write_unlock(&fs_info->tree_mod_log_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Key order of the log:
|
||||
* node/leaf start address -> sequence
|
||||
*
|
||||
* The 'start address' is the logical address of the *new* root node for root
|
||||
* replace operations, or the logical address of the affected block for all
|
||||
* other operations.
|
||||
*/
|
||||
static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
|
||||
struct tree_mod_elem *tm)
|
||||
{
|
||||
struct rb_root *tm_root;
|
||||
struct rb_node **new;
|
||||
struct rb_node *parent = NULL;
|
||||
struct tree_mod_elem *cur;
|
||||
|
||||
lockdep_assert_held_write(&fs_info->tree_mod_log_lock);
|
||||
|
||||
tm->seq = btrfs_inc_tree_mod_seq(fs_info);
|
||||
|
||||
tm_root = &fs_info->tree_mod_log;
|
||||
new = &tm_root->rb_node;
|
||||
while (*new) {
|
||||
cur = rb_entry(*new, struct tree_mod_elem, node);
|
||||
parent = *new;
|
||||
if (cur->logical < tm->logical)
|
||||
new = &((*new)->rb_left);
|
||||
else if (cur->logical > tm->logical)
|
||||
new = &((*new)->rb_right);
|
||||
else if (cur->seq < tm->seq)
|
||||
new = &((*new)->rb_left);
|
||||
else if (cur->seq > tm->seq)
|
||||
new = &((*new)->rb_right);
|
||||
else
|
||||
return -EEXIST;
|
||||
}
|
||||
|
||||
rb_link_node(&tm->node, parent, new);
|
||||
rb_insert_color(&tm->node, tm_root);
|
||||
return 0;
|
||||
}
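tree_mod_log_insert() keeps the log ordered by the block's logical address first and by the modification sequence number second, refusing exact duplicates. A standalone comparator expressing that two-level key order (sketch only, plain C, not the kernel rb-tree API):

#include <stdint.h>
#include <stdio.h>

/* Illustrative helper mirroring the (logical, seq) ordering of the log. */
struct mod_key {
	uint64_t logical;
	uint64_t seq;
};

static int mod_key_cmp(const struct mod_key *a, const struct mod_key *b)
{
	if (a->logical != b->logical)
		return a->logical < b->logical ? -1 : 1;
	if (a->seq != b->seq)
		return a->seq < b->seq ? -1 : 1;
	return 0;	/* duplicate (logical, seq): the real insert returns -EEXIST */
}

int main(void)
{
	struct mod_key a = { .logical = 4096, .seq = 10 };
	struct mod_key b = { .logical = 4096, .seq = 12 };

	printf("%d\n", mod_key_cmp(&a, &b));	/* -1: same block, older change */
	return 0;
}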
|
||||
|
||||
/*
|
||||
* Determines if logging can be omitted. Returns true if it can. Otherwise, it
|
||||
* returns false with the tree_mod_log_lock acquired. The caller must hold
|
||||
* this until all tree mod log insertions are recorded in the rb tree and then
|
||||
* write unlock fs_info::tree_mod_log_lock.
|
||||
*/
|
||||
static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info,
|
||||
struct extent_buffer *eb)
|
||||
{
|
||||
if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
|
||||
return true;
|
||||
if (eb && btrfs_header_level(eb) == 0)
|
||||
return true;
|
||||
|
||||
write_lock(&fs_info->tree_mod_log_lock);
|
||||
if (list_empty(&(fs_info)->tree_mod_seq_list)) {
|
||||
write_unlock(&fs_info->tree_mod_log_lock);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
|
||||
static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
|
||||
struct extent_buffer *eb)
|
||||
{
|
||||
if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
|
||||
return false;
|
||||
if (eb && btrfs_header_level(eb) == 0)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
|
||||
int slot,
|
||||
enum btrfs_mod_log_op op,
|
||||
gfp_t flags)
|
||||
{
|
||||
struct tree_mod_elem *tm;
|
||||
|
||||
tm = kzalloc(sizeof(*tm), flags);
|
||||
if (!tm)
|
||||
return NULL;
|
||||
|
||||
tm->logical = eb->start;
|
||||
if (op != BTRFS_MOD_LOG_KEY_ADD) {
|
||||
btrfs_node_key(eb, &tm->key, slot);
|
||||
tm->blockptr = btrfs_node_blockptr(eb, slot);
|
||||
}
|
||||
tm->op = op;
|
||||
tm->slot = slot;
|
||||
tm->generation = btrfs_node_ptr_generation(eb, slot);
|
||||
RB_CLEAR_NODE(&tm->node);
|
||||
|
||||
return tm;
|
||||
}
|
||||
|
||||
int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
|
||||
enum btrfs_mod_log_op op, gfp_t flags)
|
||||
{
|
||||
struct tree_mod_elem *tm;
|
||||
int ret;
|
||||
|
||||
if (!tree_mod_need_log(eb->fs_info, eb))
|
||||
return 0;
|
||||
|
||||
tm = alloc_tree_mod_elem(eb, slot, op, flags);
|
||||
if (!tm)
|
||||
return -ENOMEM;
|
||||
|
||||
if (tree_mod_dont_log(eb->fs_info, eb)) {
|
||||
kfree(tm);
|
||||
return 0;
|
||||
}
|
||||
|
||||
ret = tree_mod_log_insert(eb->fs_info, tm);
|
||||
write_unlock(&eb->fs_info->tree_mod_log_lock);
|
||||
if (ret)
|
||||
kfree(tm);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
|
||||
int dst_slot, int src_slot,
|
||||
int nr_items)
|
||||
{
|
||||
struct tree_mod_elem *tm = NULL;
|
||||
struct tree_mod_elem **tm_list = NULL;
|
||||
int ret = 0;
|
||||
int i;
|
||||
bool locked = false;
|
||||
|
||||
if (!tree_mod_need_log(eb->fs_info, eb))
|
||||
return 0;
|
||||
|
||||
tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
|
||||
if (!tm_list)
|
||||
return -ENOMEM;
|
||||
|
||||
tm = kzalloc(sizeof(*tm), GFP_NOFS);
|
||||
if (!tm) {
|
||||
ret = -ENOMEM;
|
||||
goto free_tms;
|
||||
}
|
||||
|
||||
tm->logical = eb->start;
|
||||
tm->slot = src_slot;
|
||||
tm->move.dst_slot = dst_slot;
|
||||
tm->move.nr_items = nr_items;
|
||||
tm->op = BTRFS_MOD_LOG_MOVE_KEYS;
|
||||
|
||||
for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
|
||||
tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
|
||||
BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
|
||||
if (!tm_list[i]) {
|
||||
ret = -ENOMEM;
|
||||
goto free_tms;
|
||||
}
|
||||
}
|
||||
|
||||
if (tree_mod_dont_log(eb->fs_info, eb))
|
||||
goto free_tms;
|
||||
locked = true;
|
||||
|
||||
/*
|
||||
* When we override something during the move, we log these removals.
|
||||
* This can only happen when we move towards the beginning of the
|
||||
* buffer, i.e. dst_slot < src_slot.
|
||||
*/
|
||||
for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
|
||||
ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
|
||||
if (ret)
|
||||
goto free_tms;
|
||||
}
|
||||
|
||||
ret = tree_mod_log_insert(eb->fs_info, tm);
|
||||
if (ret)
|
||||
goto free_tms;
|
||||
write_unlock(&eb->fs_info->tree_mod_log_lock);
|
||||
kfree(tm_list);
|
||||
|
||||
return 0;
|
||||
|
||||
free_tms:
|
||||
for (i = 0; i < nr_items; i++) {
|
||||
if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
|
||||
rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
|
||||
kfree(tm_list[i]);
|
||||
}
|
||||
if (locked)
|
||||
write_unlock(&eb->fs_info->tree_mod_log_lock);
|
||||
kfree(tm_list);
|
||||
kfree(tm);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
|
||||
struct tree_mod_elem **tm_list,
|
||||
int nritems)
|
||||
{
|
||||
int i, j;
|
||||
int ret;
|
||||
|
||||
for (i = nritems - 1; i >= 0; i--) {
|
||||
ret = tree_mod_log_insert(fs_info, tm_list[i]);
|
||||
if (ret) {
|
||||
for (j = nritems - 1; j > i; j--)
|
||||
rb_erase(&tm_list[j]->node,
|
||||
&fs_info->tree_mod_log);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
|
||||
struct extent_buffer *new_root,
|
||||
bool log_removal)
|
||||
{
|
||||
struct btrfs_fs_info *fs_info = old_root->fs_info;
|
||||
struct tree_mod_elem *tm = NULL;
|
||||
struct tree_mod_elem **tm_list = NULL;
|
||||
int nritems = 0;
|
||||
int ret = 0;
|
||||
int i;
|
||||
|
||||
if (!tree_mod_need_log(fs_info, NULL))
|
||||
return 0;
|
||||
|
||||
if (log_removal && btrfs_header_level(old_root) > 0) {
|
||||
nritems = btrfs_header_nritems(old_root);
|
||||
tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
|
||||
GFP_NOFS);
|
||||
if (!tm_list) {
|
||||
ret = -ENOMEM;
|
||||
goto free_tms;
|
||||
}
|
||||
for (i = 0; i < nritems; i++) {
|
||||
tm_list[i] = alloc_tree_mod_elem(old_root, i,
|
||||
BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
|
||||
if (!tm_list[i]) {
|
||||
ret = -ENOMEM;
|
||||
goto free_tms;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tm = kzalloc(sizeof(*tm), GFP_NOFS);
|
||||
if (!tm) {
|
||||
ret = -ENOMEM;
|
||||
goto free_tms;
|
||||
}
|
||||
|
||||
tm->logical = new_root->start;
|
||||
tm->old_root.logical = old_root->start;
|
||||
tm->old_root.level = btrfs_header_level(old_root);
|
||||
tm->generation = btrfs_header_generation(old_root);
|
||||
tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;
|
||||
|
||||
if (tree_mod_dont_log(fs_info, NULL))
|
||||
goto free_tms;
|
||||
|
||||
if (tm_list)
|
||||
ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
|
||||
if (!ret)
|
||||
ret = tree_mod_log_insert(fs_info, tm);
|
||||
|
||||
write_unlock(&fs_info->tree_mod_log_lock);
|
||||
if (ret)
|
||||
goto free_tms;
|
||||
kfree(tm_list);
|
||||
|
||||
return ret;
|
||||
|
||||
free_tms:
|
||||
if (tm_list) {
|
||||
for (i = 0; i < nritems; i++)
|
||||
kfree(tm_list[i]);
|
||||
kfree(tm_list);
|
||||
}
|
||||
kfree(tm);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
|
||||
u64 start, u64 min_seq,
|
||||
bool smallest)
|
||||
{
|
||||
struct rb_root *tm_root;
|
||||
struct rb_node *node;
|
||||
struct tree_mod_elem *cur = NULL;
|
||||
struct tree_mod_elem *found = NULL;
|
||||
|
||||
read_lock(&fs_info->tree_mod_log_lock);
|
||||
tm_root = &fs_info->tree_mod_log;
|
||||
node = tm_root->rb_node;
|
||||
while (node) {
|
||||
cur = rb_entry(node, struct tree_mod_elem, node);
|
||||
if (cur->logical < start) {
|
||||
node = node->rb_left;
|
||||
} else if (cur->logical > start) {
|
||||
node = node->rb_right;
|
||||
} else if (cur->seq < min_seq) {
|
||||
node = node->rb_left;
|
||||
} else if (!smallest) {
|
||||
/* We want the node with the highest seq */
|
||||
if (found)
|
||||
BUG_ON(found->seq > cur->seq);
|
||||
found = cur;
|
||||
node = node->rb_left;
|
||||
} else if (cur->seq > min_seq) {
|
||||
/* We want the node with the smallest seq */
|
||||
if (found)
|
||||
BUG_ON(found->seq < cur->seq);
|
||||
found = cur;
|
||||
node = node->rb_right;
|
||||
} else {
|
||||
found = cur;
|
||||
break;
|
||||
}
|
||||
}
|
||||
read_unlock(&fs_info->tree_mod_log_lock);
|
||||
|
||||
return found;
|
||||
}
|
||||
|
||||
/*
|
||||
* This returns the element from the log with the smallest time sequence
|
||||
* value that's in the log (the oldest log item). Any element with a time
|
||||
* sequence lower than min_seq will be ignored.
|
||||
*/
|
||||
static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
|
||||
u64 start, u64 min_seq)
|
||||
{
|
||||
return __tree_mod_log_search(fs_info, start, min_seq, true);
|
||||
}
|
||||
|
||||
/*
|
||||
* This returns the element from the log with the largest time sequence
|
||||
* value that's in the log (the most recent log item). Any element with
|
||||
* a time sequence lower than min_seq will be ignored.
|
||||
*/
|
||||
static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
|
||||
u64 start, u64 min_seq)
|
||||
{
|
||||
return __tree_mod_log_search(fs_info, start, min_seq, false);
|
||||
}
|
||||
|
||||
int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
|
||||
struct extent_buffer *src,
|
||||
unsigned long dst_offset,
|
||||
unsigned long src_offset,
|
||||
int nr_items)
|
||||
{
|
||||
struct btrfs_fs_info *fs_info = dst->fs_info;
|
||||
int ret = 0;
|
||||
struct tree_mod_elem **tm_list = NULL;
|
||||
struct tree_mod_elem **tm_list_add, **tm_list_rem;
|
||||
int i;
|
||||
bool locked = false;
|
||||
|
||||
if (!tree_mod_need_log(fs_info, NULL))
|
||||
return 0;
|
||||
|
||||
if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
|
||||
return 0;
|
||||
|
||||
tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
|
||||
GFP_NOFS);
|
||||
if (!tm_list)
|
||||
return -ENOMEM;
|
||||
|
||||
tm_list_add = tm_list;
|
||||
tm_list_rem = tm_list + nr_items;
|
||||
for (i = 0; i < nr_items; i++) {
|
||||
tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
|
||||
BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
|
||||
if (!tm_list_rem[i]) {
|
||||
ret = -ENOMEM;
|
||||
goto free_tms;
|
||||
}
|
||||
|
||||
tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
|
||||
BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
|
||||
if (!tm_list_add[i]) {
|
||||
ret = -ENOMEM;
|
||||
goto free_tms;
|
||||
}
|
||||
}
|
||||
|
||||
if (tree_mod_dont_log(fs_info, NULL))
|
||||
goto free_tms;
|
||||
locked = true;
|
||||
|
||||
for (i = 0; i < nr_items; i++) {
|
||||
ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
|
||||
if (ret)
|
||||
goto free_tms;
|
||||
ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
|
||||
if (ret)
|
||||
goto free_tms;
|
||||
}
|
||||
|
||||
write_unlock(&fs_info->tree_mod_log_lock);
|
||||
kfree(tm_list);
|
||||
|
||||
return 0;
|
||||
|
||||
free_tms:
|
||||
for (i = 0; i < nr_items * 2; i++) {
|
||||
if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
|
||||
rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
|
||||
kfree(tm_list[i]);
|
||||
}
|
||||
if (locked)
|
||||
write_unlock(&fs_info->tree_mod_log_lock);
|
||||
kfree(tm_list);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
|
||||
{
|
||||
struct tree_mod_elem **tm_list = NULL;
|
||||
int nritems = 0;
|
||||
int i;
|
||||
int ret = 0;
|
||||
|
||||
if (!tree_mod_need_log(eb->fs_info, eb))
|
||||
return 0;
|
||||
|
||||
nritems = btrfs_header_nritems(eb);
|
||||
tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
|
||||
if (!tm_list)
|
||||
return -ENOMEM;
|
||||
|
||||
for (i = 0; i < nritems; i++) {
|
||||
tm_list[i] = alloc_tree_mod_elem(eb, i,
|
||||
BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
|
||||
if (!tm_list[i]) {
|
||||
ret = -ENOMEM;
|
||||
goto free_tms;
|
||||
}
|
||||
}
|
||||
|
||||
if (tree_mod_dont_log(eb->fs_info, eb))
|
||||
goto free_tms;
|
||||
|
||||
ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
|
||||
write_unlock(&eb->fs_info->tree_mod_log_lock);
|
||||
if (ret)
|
||||
goto free_tms;
|
||||
kfree(tm_list);
|
||||
|
||||
return 0;
|
||||
|
||||
free_tms:
|
||||
for (i = 0; i < nritems; i++)
|
||||
kfree(tm_list[i]);
|
||||
kfree(tm_list);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns the logical address of the oldest predecessor of the given root.
|
||||
* Entries older than time_seq are ignored.
|
||||
*/
|
||||
static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
|
||||
u64 time_seq)
|
||||
{
|
||||
struct tree_mod_elem *tm;
|
||||
struct tree_mod_elem *found = NULL;
|
||||
u64 root_logical = eb_root->start;
|
||||
bool looped = false;
|
||||
|
||||
if (!time_seq)
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* The very last operation that's logged for a root is the replacement
|
||||
* operation (if it is replaced at all). This has the logical address
|
||||
* of the *new* root, making it the very first operation that's logged
|
||||
* for this root.
|
||||
*/
|
||||
while (1) {
|
||||
tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
|
||||
time_seq);
|
||||
if (!looped && !tm)
|
||||
return NULL;
|
||||
/*
|
||||
* If there are no tree operation for the oldest root, we simply
|
||||
* return it. This should only happen if that (old) root is at
|
||||
* level 0.
|
||||
*/
|
||||
if (!tm)
|
||||
break;
|
||||
|
||||
/*
|
||||
* If there's an operation that's not a root replacement, we
|
||||
* found the oldest version of our root. Normally, we'll find a
|
||||
* BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
|
||||
*/
|
||||
if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
|
||||
break;
|
||||
|
||||
found = tm;
|
||||
root_logical = tm->old_root.logical;
|
||||
looped = true;
|
||||
}
|
||||
|
||||
/* If there's no old root to return, return what we found instead */
|
||||
if (!found)
|
||||
found = tm;
|
||||
|
||||
return found;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* tm is a pointer to the first operation to rewind within eb. Then, all
|
||||
* previous operations will be rewound (until we reach something older than
|
||||
* time_seq).
|
||||
*/
|
||||
static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
|
||||
struct extent_buffer *eb,
|
||||
u64 time_seq,
|
||||
struct tree_mod_elem *first_tm)
|
||||
{
|
||||
u32 n;
|
||||
struct rb_node *next;
|
||||
struct tree_mod_elem *tm = first_tm;
|
||||
unsigned long o_dst;
|
||||
unsigned long o_src;
|
||||
unsigned long p_size = sizeof(struct btrfs_key_ptr);
|
||||
|
||||
n = btrfs_header_nritems(eb);
|
||||
read_lock(&fs_info->tree_mod_log_lock);
|
||||
while (tm && tm->seq >= time_seq) {
|
||||
/*
|
||||
* All the operations are recorded with the operator used for
|
||||
* the modification. As we're going backwards, we do the
|
||||
* opposite of each operation here.
|
||||
*/
|
||||
switch (tm->op) {
|
||||
case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
|
||||
BUG_ON(tm->slot < n);
|
||||
fallthrough;
|
||||
case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
|
||||
case BTRFS_MOD_LOG_KEY_REMOVE:
|
||||
btrfs_set_node_key(eb, &tm->key, tm->slot);
|
||||
btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
|
||||
btrfs_set_node_ptr_generation(eb, tm->slot,
|
||||
tm->generation);
|
||||
n++;
|
||||
break;
|
||||
case BTRFS_MOD_LOG_KEY_REPLACE:
|
||||
BUG_ON(tm->slot >= n);
|
||||
btrfs_set_node_key(eb, &tm->key, tm->slot);
|
||||
btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
|
||||
btrfs_set_node_ptr_generation(eb, tm->slot,
|
||||
tm->generation);
|
||||
break;
|
||||
case BTRFS_MOD_LOG_KEY_ADD:
|
||||
/* if a move operation is needed it's in the log */
|
||||
n--;
|
||||
break;
|
||||
case BTRFS_MOD_LOG_MOVE_KEYS:
|
||||
o_dst = btrfs_node_key_ptr_offset(tm->slot);
|
||||
o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
|
||||
memmove_extent_buffer(eb, o_dst, o_src,
|
||||
tm->move.nr_items * p_size);
|
||||
break;
|
||||
case BTRFS_MOD_LOG_ROOT_REPLACE:
|
||||
/*
|
||||
* This operation is special. For roots, this must be
|
||||
* handled explicitly before rewinding.
|
||||
* For non-roots, this operation may exist if the node
|
||||
* was a root: root A -> child B; then A gets empty and
|
||||
* B is promoted to the new root. In the mod log, we'll
|
||||
* have a root-replace operation for B, a tree block
|
||||
* that is no root. We simply ignore that operation.
|
||||
*/
|
||||
break;
|
||||
}
|
||||
next = rb_next(&tm->node);
|
||||
if (!next)
|
||||
break;
|
||||
tm = rb_entry(next, struct tree_mod_elem, node);
|
||||
if (tm->logical != first_tm->logical)
|
||||
break;
|
||||
}
|
||||
read_unlock(&fs_info->tree_mod_log_lock);
|
||||
btrfs_set_header_nritems(eb, n);
|
||||
}
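tree_mod_log_rewind() walks the recorded operations for one block and applies the inverse of each (an ADD shrinks the item count again, a REMOVE restores the saved key and pointer, a MOVE is replayed in the opposite direction) until it reaches entries older than time_seq. A toy model of that "apply inverses to recover an older version" idea on a plain array (sketch only, unrelated to the kernel structures):

#include <stdio.h>

enum toy_op { TOY_ADD, TOY_REMOVE };

struct toy_mod {
	enum toy_op op;
	int slot;
	int old_val;	/* only meaningful for TOY_REMOVE */
};

/* Undo the logged operations, newest first, to recover an older version. */
static void toy_rewind(int *items, int *nritems,
		       const struct toy_mod *log, int nr_mods)
{
	for (int i = nr_mods - 1; i >= 0; i--) {
		if (log[i].op == TOY_ADD) {
			(*nritems)--;				/* inverse of an add */
		} else {
			items[log[i].slot] = log[i].old_val;	/* restore removed item */
			(*nritems)++;
		}
	}
}

int main(void)
{
	int items[4] = { 10, 20, 30, 0 };
	int nritems = 3;
	/* One logged change: slot 3 was added with value 40. */
	struct toy_mod log[] = { { TOY_ADD, 3, 0 } };

	items[3] = 40;
	nritems = 4;
	toy_rewind(items, &nritems, log, 1);
	printf("nritems after rewind: %d\n", nritems);	/* back to 3 */
	return 0;
}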
|
||||
|
||||
/*
|
||||
* Called with eb read locked. If the buffer cannot be rewound, the same buffer
|
||||
* is returned. If rewind operations happen, a fresh buffer is returned. The
|
||||
* returned buffer is always read-locked. If the returned buffer is not the
|
||||
* input buffer, the lock on the input buffer is released and the input buffer
|
||||
* is freed (its refcount is decremented).
|
||||
*/
|
||||
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
|
||||
struct btrfs_path *path,
|
||||
struct extent_buffer *eb,
|
||||
u64 time_seq)
|
||||
{
|
||||
struct extent_buffer *eb_rewin;
|
||||
struct tree_mod_elem *tm;
|
||||
|
||||
if (!time_seq)
|
||||
return eb;
|
||||
|
||||
if (btrfs_header_level(eb) == 0)
|
||||
return eb;
|
||||
|
||||
tm = tree_mod_log_search(fs_info, eb->start, time_seq);
|
||||
if (!tm)
|
||||
return eb;
|
||||
|
||||
if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
|
||||
BUG_ON(tm->slot != 0);
|
||||
eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
|
||||
if (!eb_rewin) {
|
||||
btrfs_tree_read_unlock(eb);
|
||||
free_extent_buffer(eb);
|
||||
return NULL;
|
||||
}
|
||||
btrfs_set_header_bytenr(eb_rewin, eb->start);
|
||||
btrfs_set_header_backref_rev(eb_rewin,
|
||||
btrfs_header_backref_rev(eb));
|
||||
btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
|
||||
btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
|
||||
} else {
|
||||
eb_rewin = btrfs_clone_extent_buffer(eb);
|
||||
if (!eb_rewin) {
|
||||
btrfs_tree_read_unlock(eb);
|
||||
free_extent_buffer(eb);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
btrfs_tree_read_unlock(eb);
|
||||
free_extent_buffer(eb);
|
||||
|
||||
btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
|
||||
eb_rewin, btrfs_header_level(eb_rewin));
|
||||
btrfs_tree_read_lock(eb_rewin);
|
||||
tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
|
||||
WARN_ON(btrfs_header_nritems(eb_rewin) >
|
||||
BTRFS_NODEPTRS_PER_BLOCK(fs_info));
|
||||
|
||||
return eb_rewin;
|
||||
}
|
||||
|
||||
/*
|
||||
* Rewind the state of @root's root node to the given @time_seq value.
|
||||
* If there are no changes, the current root->root_node is returned. If anything
|
||||
* changed in between, there's a fresh buffer allocated on which the rewind
|
||||
* operations are done. In any case, the returned buffer is read locked.
|
||||
* Returns NULL on error (with no locks held).
|
||||
*/
|
||||
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
|
||||
{
|
||||
struct btrfs_fs_info *fs_info = root->fs_info;
|
||||
struct tree_mod_elem *tm;
|
||||
struct extent_buffer *eb = NULL;
|
||||
struct extent_buffer *eb_root;
|
||||
u64 eb_root_owner = 0;
|
||||
struct extent_buffer *old;
|
||||
struct tree_mod_root *old_root = NULL;
|
||||
u64 old_generation = 0;
|
||||
u64 logical;
|
||||
int level;
|
||||
|
||||
eb_root = btrfs_read_lock_root_node(root);
|
||||
tm = tree_mod_log_oldest_root(eb_root, time_seq);
|
||||
if (!tm)
|
||||
return eb_root;
|
||||
|
||||
if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
|
||||
old_root = &tm->old_root;
|
||||
old_generation = tm->generation;
|
||||
logical = old_root->logical;
|
||||
level = old_root->level;
|
||||
} else {
|
||||
logical = eb_root->start;
|
||||
level = btrfs_header_level(eb_root);
|
||||
}
|
||||
|
||||
tm = tree_mod_log_search(fs_info, logical, time_seq);
|
||||
if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
|
||||
btrfs_tree_read_unlock(eb_root);
|
||||
free_extent_buffer(eb_root);
|
||||
old = read_tree_block(fs_info, logical, root->root_key.objectid,
|
||||
0, level, NULL);
|
||||
if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
|
||||
if (!IS_ERR(old))
|
||||
free_extent_buffer(old);
|
||||
btrfs_warn(fs_info,
|
||||
"failed to read tree block %llu from get_old_root",
|
||||
logical);
|
||||
} else {
|
||||
struct tree_mod_elem *tm2;
|
||||
|
||||
btrfs_tree_read_lock(old);
|
||||
eb = btrfs_clone_extent_buffer(old);
|
||||
/*
|
||||
* After the lookup for the most recent tree mod operation
|
||||
* above and before we locked and cloned the extent buffer
|
||||
* 'old', a new tree mod log operation may have been added.
|
||||
* So lookup for a more recent one to make sure the number
|
||||
* of mod log operations we replay is consistent with the
|
||||
* number of items we have in the cloned extent buffer,
|
||||
* otherwise we can hit a BUG_ON when rewinding the extent
|
||||
* buffer.
|
||||
*/
|
||||
tm2 = tree_mod_log_search(fs_info, logical, time_seq);
|
||||
btrfs_tree_read_unlock(old);
|
||||
free_extent_buffer(old);
|
||||
ASSERT(tm2);
|
||||
ASSERT(tm2 == tm || tm2->seq > tm->seq);
|
||||
if (!tm2 || tm2->seq < tm->seq) {
|
||||
free_extent_buffer(eb);
|
||||
return NULL;
|
||||
}
|
||||
tm = tm2;
|
||||
}
|
||||
} else if (old_root) {
|
||||
eb_root_owner = btrfs_header_owner(eb_root);
|
||||
btrfs_tree_read_unlock(eb_root);
|
||||
free_extent_buffer(eb_root);
|
||||
eb = alloc_dummy_extent_buffer(fs_info, logical);
|
||||
} else {
|
||||
eb = btrfs_clone_extent_buffer(eb_root);
|
||||
btrfs_tree_read_unlock(eb_root);
|
||||
free_extent_buffer(eb_root);
|
||||
}
|
||||
|
||||
if (!eb)
|
||||
return NULL;
|
||||
if (old_root) {
|
||||
btrfs_set_header_bytenr(eb, eb->start);
|
||||
btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
|
||||
btrfs_set_header_owner(eb, eb_root_owner);
|
||||
btrfs_set_header_level(eb, old_root->level);
|
||||
btrfs_set_header_generation(eb, old_generation);
|
||||
}
|
||||
btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
|
||||
btrfs_header_level(eb));
|
||||
btrfs_tree_read_lock(eb);
|
||||
if (tm)
|
||||
tree_mod_log_rewind(fs_info, eb, time_seq, tm);
|
||||
else
|
||||
WARN_ON(btrfs_header_level(eb) != 0);
|
||||
WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
|
||||
|
||||
return eb;
|
||||
}
|
||||
|
||||
int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
|
||||
{
|
||||
struct tree_mod_elem *tm;
|
||||
int level;
|
||||
struct extent_buffer *eb_root = btrfs_root_node(root);
|
||||
|
||||
tm = tree_mod_log_oldest_root(eb_root, time_seq);
|
||||
if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
|
||||
level = tm->old_root.level;
|
||||
else
|
||||
level = btrfs_header_level(eb_root);
|
||||
|
||||
free_extent_buffer(eb_root);
|
||||
|
||||
return level;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the lowest sequence number in the tree modification log.
|
||||
*
|
||||
* Return the sequence number of the oldest tree modification log user, which
|
||||
* corresponds to the lowest sequence number of all existing users. If there are
|
||||
* no users it returns 0.
|
||||
*/
|
||||
u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
|
||||
{
|
||||
u64 ret = 0;
|
||||
|
||||
read_lock(&fs_info->tree_mod_log_lock);
|
||||
if (!list_empty(&fs_info->tree_mod_seq_list)) {
|
||||
struct btrfs_seq_list *elem;
|
||||
|
||||
elem = list_first_entry(&fs_info->tree_mod_seq_list,
|
||||
struct btrfs_seq_list, list);
|
||||
ret = elem->seq;
|
||||
}
|
||||
read_unlock(&fs_info->tree_mod_log_lock);
|
||||
|
||||
return ret;
|
||||
}
|
fs/btrfs/tree-mod-log.h (new file, 53 lines)
@@ -0,0 +1,53 @@
// SPDX-License-Identifier: GPL-2.0

#ifndef BTRFS_TREE_MOD_LOG_H
#define BTRFS_TREE_MOD_LOG_H

#include "ctree.h"

/* Represents a tree mod log user. */
struct btrfs_seq_list {
	struct list_head list;
	u64 seq;
};

#define BTRFS_SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 }
#define BTRFS_SEQ_LAST ((u64)-1)

enum btrfs_mod_log_op {
	BTRFS_MOD_LOG_KEY_REPLACE,
	BTRFS_MOD_LOG_KEY_ADD,
	BTRFS_MOD_LOG_KEY_REMOVE,
	BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	BTRFS_MOD_LOG_MOVE_KEYS,
	BTRFS_MOD_LOG_ROOT_REPLACE,
};

u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct btrfs_seq_list *elem);
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct btrfs_seq_list *elem);
int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
				   struct extent_buffer *new_root,
				   bool log_removal);
int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
				  enum btrfs_mod_log_op op, gfp_t flags);
int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb);
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
						struct btrfs_path *path,
						struct extent_buffer *eb,
						u64 time_seq);
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq);
int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
			       struct extent_buffer *src,
			       unsigned long dst_offset,
			       unsigned long src_offset,
			       int nr_items);
int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
				   int dst_slot, int src_slot,
				   int nr_items);
u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info);

#endif
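The header shows the whole life cycle a tree mod log user follows: register a sequence point, read an old version of a tree, then drop the sequence point so older log entries can be garbage collected. A hedged in-kernel usage sketch built only from the declarations above (not a complete function; error handling and surrounding locking omitted):

/*
 * Usage sketch: pin a point in time and read an old version of a root,
 * the way backref-walking style code would.
 */
static int example_walk_old_root(struct btrfs_fs_info *fs_info,
				 struct btrfs_root *root)
{
	struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
	struct extent_buffer *old_root_eb;
	u64 seq;

	/* Block removal of log entries newer than this point. */
	seq = btrfs_get_tree_mod_seq(fs_info, &elem);

	/* Returns a read-locked buffer rewound to the state at @seq. */
	old_root_eb = btrfs_get_old_root(root, seq);
	if (old_root_eb) {
		/* ... inspect the old tree here ... */
		btrfs_tree_read_unlock(old_root_eb);
		free_extent_buffer(old_root_eb);
	}

	/* Allow the log entries we pinned to be freed again. */
	btrfs_put_tree_mod_seq(fs_info, &elem);
	return 0;
}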
@ -1458,8 +1458,8 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
|
||||
/* Given hole range was invalid (outside of device) */
|
||||
if (ret == -ERANGE) {
|
||||
*hole_start += *hole_size;
|
||||
*hole_size = 0;
|
||||
return 1;
|
||||
*hole_size = false;
|
||||
return true;
|
||||
}
|
||||
|
||||
*hole_start += zone_size;
|
||||
@ -3098,11 +3098,12 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
|
||||
int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
|
||||
{
|
||||
struct btrfs_root *root = fs_info->chunk_root;
|
||||
struct btrfs_trans_handle *trans;
|
||||
struct btrfs_block_group *block_group;
|
||||
u64 length;
|
||||
int ret;
|
||||
|
||||
/*
|
||||
@ -3117,7 +3118,7 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
|
||||
* we release the path used to search the chunk/dev tree and before
|
||||
* the current task acquires this mutex and calls us.
|
||||
*/
|
||||
lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
|
||||
lockdep_assert_held(&fs_info->reclaim_bgs_lock);
|
||||
|
||||
/* step one, relocate all the extents inside this chunk */
|
||||
btrfs_scrub_pause(fs_info);
|
||||
@ -3130,8 +3131,23 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
|
||||
if (!block_group)
|
||||
return -ENOENT;
|
||||
btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
|
||||
length = block_group->length;
|
||||
btrfs_put_block_group(block_group);
|
||||
|
||||
/*
|
||||
* On a zoned file system, discard the whole block group, this will
|
||||
* trigger a REQ_OP_ZONE_RESET operation on the device zone. If
|
||||
* resetting the zone fails, don't treat it as a fatal problem from the
|
||||
* filesystem's point of view.
|
||||
*/
|
||||
if (btrfs_is_zoned(fs_info)) {
|
||||
ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
|
||||
if (ret)
|
||||
btrfs_info(fs_info,
|
||||
"failed to reset zone %llu after relocation",
|
||||
chunk_offset);
|
||||
}
|
||||
|
||||
trans = btrfs_start_trans_remove_block_group(root->fs_info,
|
||||
chunk_offset);
|
||||
if (IS_ERR(trans)) {
|
||||
@ -3172,10 +3188,10 @@ again:
|
||||
key.type = BTRFS_CHUNK_ITEM_KEY;
|
||||
|
||||
while (1) {
|
||||
mutex_lock(&fs_info->delete_unused_bgs_mutex);
|
||||
mutex_lock(&fs_info->reclaim_bgs_lock);
|
||||
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
|
||||
if (ret < 0) {
|
||||
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
|
||||
mutex_unlock(&fs_info->reclaim_bgs_lock);
|
||||
goto error;
|
||||
}
|
||||
BUG_ON(ret == 0); /* Corruption */
|
||||
@ -3183,7 +3199,7 @@ again:
|
||||
ret = btrfs_previous_item(chunk_root, path, key.objectid,
|
||||
key.type);
|
||||
if (ret)
|
||||
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
|
||||
mutex_unlock(&fs_info->reclaim_bgs_lock);
|
||||
if (ret < 0)
|
||||
goto error;
|
||||
if (ret > 0)
|
||||
@ -3204,7 +3220,7 @@ again:
|
||||
else
|
||||
BUG_ON(ret);
|
||||
}
|
||||
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
|
||||
mutex_unlock(&fs_info->reclaim_bgs_lock);
|
||||
|
||||
if (found_key.offset == 0)
|
||||
break;
|
||||
@ -3744,10 +3760,10 @@ again:
|
||||
goto error;
|
||||
}
|
||||
|
||||
mutex_lock(&fs_info->delete_unused_bgs_mutex);
|
||||
mutex_lock(&fs_info->reclaim_bgs_lock);
|
||||
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
|
||||
if (ret < 0) {
|
||||
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
|
||||
mutex_unlock(&fs_info->reclaim_bgs_lock);
|
||||
goto error;
|
||||
}
|
||||
|
||||
@ -3761,7 +3777,7 @@ again:
|
||||
ret = btrfs_previous_item(chunk_root, path, 0,
|
||||
BTRFS_CHUNK_ITEM_KEY);
|
||||
if (ret) {
|
||||
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
|
||||
mutex_unlock(&fs_info->reclaim_bgs_lock);
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
@ -3771,7 +3787,7 @@ again:
|
||||
btrfs_item_key_to_cpu(leaf, &found_key, slot);
|
||||
|
||||
if (found_key.objectid != key.objectid) {
|
||||
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
|
||||
mutex_unlock(&fs_info->reclaim_bgs_lock);
|
||||
break;
|
||||
}
|
||||
|
||||
@ -3788,12 +3804,12 @@ again:
|
||||
|
||||
btrfs_release_path(path);
|
||||
if (!ret) {
|
||||
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
|
||||
mutex_unlock(&fs_info->reclaim_bgs_lock);
|
||||
goto loop;
|
||||
}
|
||||
|
||||
if (counting) {
|
||||
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
|
||||
mutex_unlock(&fs_info->reclaim_bgs_lock);
|
||||
spin_lock(&fs_info->balance_lock);
|
||||
bctl->stat.expected++;
|
||||
spin_unlock(&fs_info->balance_lock);
|
||||
@ -3818,7 +3834,7 @@ again:
|
||||
count_meta < bctl->meta.limit_min)
|
||||
|| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
|
||||
count_sys < bctl->sys.limit_min)) {
|
||||
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
|
||||
mutex_unlock(&fs_info->reclaim_bgs_lock);
|
||||
goto loop;
|
||||
}
|
||||
|
||||
@ -3832,7 +3848,7 @@ again:
|
||||
ret = btrfs_may_alloc_data_chunk(fs_info,
|
||||
found_key.offset);
|
||||
if (ret < 0) {
|
||||
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
|
||||
mutex_unlock(&fs_info->reclaim_bgs_lock);
|
||||
goto error;
|
||||
} else if (ret == 1) {
|
||||
chunk_reserved = 1;
|
||||
@ -3840,7 +3856,7 @@ again:
|
||||
}
|
||||
|
||||
ret = btrfs_relocate_chunk(fs_info, found_key.offset);
|
||||
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
|
||||
mutex_unlock(&fs_info->reclaim_bgs_lock);
|
||||
if (ret == -ENOSPC) {
|
||||
enospc_errors++;
|
||||
} else if (ret == -ETXTBSY) {
|
||||
@ -4725,16 +4741,16 @@ again:
|
||||
key.type = BTRFS_DEV_EXTENT_KEY;
|
||||
|
||||
do {
|
||||
mutex_lock(&fs_info->delete_unused_bgs_mutex);
|
||||
mutex_lock(&fs_info->reclaim_bgs_lock);
|
||||
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
||||
if (ret < 0) {
|
||||
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
|
||||
mutex_unlock(&fs_info->reclaim_bgs_lock);
|
||||
goto done;
|
||||
}
|
||||
|
||||
ret = btrfs_previous_item(root, path, 0, key.type);
|
||||
if (ret) {
|
||||
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
|
||||
mutex_unlock(&fs_info->reclaim_bgs_lock);
|
||||
if (ret < 0)
|
||||
goto done;
|
||||
ret = 0;
|
||||
@ -4747,7 +4763,7 @@ again:
|
||||
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
|
||||
|
||||
if (key.objectid != device->devid) {
|
||||
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
|
||||
mutex_unlock(&fs_info->reclaim_bgs_lock);
|
||||
btrfs_release_path(path);
|
||||
break;
|
||||
}
|
||||
@ -4756,7 +4772,7 @@ again:
|
||||
length = btrfs_dev_extent_length(l, dev_extent);
|
||||
|
||||
if (key.offset + length <= new_size) {
|
||||
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
|
||||
mutex_unlock(&fs_info->reclaim_bgs_lock);
|
||||
btrfs_release_path(path);
|
||||
break;
|
||||
}
|
||||
@ -4772,12 +4788,12 @@ again:
|
||||
*/
|
||||
ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
|
||||
if (ret < 0) {
|
||||
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
|
||||
mutex_unlock(&fs_info->reclaim_bgs_lock);
|
||||
goto done;
|
||||
}
|
||||
|
||||
ret = btrfs_relocate_chunk(fs_info, chunk_offset);
|
||||
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
|
||||
mutex_unlock(&fs_info->reclaim_bgs_lock);
|
||||
if (ret == -ENOSPC) {
|
||||
failed++;
|
||||
} else if (ret) {
|
||||
@ -4989,6 +5005,8 @@ static void init_alloc_chunk_ctl_policy_zoned(
|
||||
ctl->max_chunk_size = 2 * ctl->max_stripe_size;
|
||||
ctl->devs_max = min_t(int, ctl->devs_max,
|
||||
BTRFS_MAX_DEVS_SYS_CHUNK);
|
||||
} else {
|
||||
BUG();
|
||||
}
|
||||
|
||||
/* We don't want a chunk larger than 10% of writable space */
|
||||
@ -6787,6 +6805,46 @@ static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
|
||||
return div_u64(chunk_len, data_stripes);
|
||||
}
|
||||
|
||||
#if BITS_PER_LONG == 32
|
||||
/*
|
||||
* Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
|
||||
* can't be accessed on 32bit systems.
|
||||
*
|
||||
* This function do mount time check to reject the fs if it already has
|
||||
* metadata chunk beyond that limit.
|
||||
*/
|
||||
static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
|
||||
u64 logical, u64 length, u64 type)
|
||||
{
|
||||
if (!(type & BTRFS_BLOCK_GROUP_METADATA))
|
||||
return 0;
|
||||
|
||||
if (logical + length < MAX_LFS_FILESIZE)
|
||||
return 0;
|
||||
|
||||
btrfs_err_32bit_limit(fs_info);
|
||||
return -EOVERFLOW;
|
||||
}
|
||||
|
||||
/*
|
||||
* This is to give early warning for any metadata chunk reaching
|
||||
* BTRFS_32BIT_EARLY_WARN_THRESHOLD.
|
||||
* Although we can still access the metadata, it's not going to be possible
|
||||
* once the limit is reached.
|
||||
*/
|
||||
static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
|
||||
u64 logical, u64 length, u64 type)
|
||||
{
|
||||
if (!(type & BTRFS_BLOCK_GROUP_METADATA))
|
||||
return;
|
||||
|
||||
if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD)
|
||||
return;
|
||||
|
||||
btrfs_warn_32bit_limit(fs_info);
|
||||
}
|
||||
#endif
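On 32-bit systems page->index is an unsigned long, so metadata placed at or beyond what the page cache can index cannot be accessed; the two helpers above fail the mount or print an early warning when a metadata chunk approaches that boundary. The rough arithmetic behind the numbers quoted in the merge summary (with 4K pages the hard limit is 16T and the 5/8 warning threshold is about 10T), as a small standalone illustration:

#include <stdio.h>

int main(void)
{
	/* Illustration for a 32-bit kernel with 4K pages. */
	unsigned long long page_size = 4096;
	unsigned long long ulong_max = 0xffffffffULL;	/* 32-bit unsigned long */

	unsigned long long hard_limit = (ulong_max + 1) * page_size;	/* ~16 TiB */
	unsigned long long warn_at = hard_limit / 8 * 5;		/* ~10 TiB */

	printf("hard limit: %llu TiB\n", hard_limit >> 40);
	printf("early warning: %llu TiB\n", warn_at >> 40);
	return 0;
}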
|
||||
|
||||
static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
|
||||
struct btrfs_chunk *chunk)
|
||||
{
|
||||
@ -6797,6 +6855,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
|
||||
u64 logical;
|
||||
u64 length;
|
||||
u64 devid;
|
||||
u64 type;
|
||||
u8 uuid[BTRFS_UUID_SIZE];
|
||||
int num_stripes;
|
||||
int ret;
|
||||
@ -6804,8 +6863,16 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
|
||||
|
||||
logical = key->offset;
|
||||
length = btrfs_chunk_length(leaf, chunk);
|
||||
type = btrfs_chunk_type(leaf, chunk);
|
||||
num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
|
||||
|
||||
#if BITS_PER_LONG == 32
|
||||
ret = check_32bit_meta_chunk(fs_info, logical, length, type);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
warn_32bit_meta_chunk(fs_info, logical, length, type);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Only need to verify chunk item if we're reading from sys chunk array,
|
||||
* as chunk item in tree block is already verified by tree-checker.
|
||||
@ -6849,10 +6916,10 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
|
||||
map->io_width = btrfs_chunk_io_width(leaf, chunk);
|
||||
map->io_align = btrfs_chunk_io_align(leaf, chunk);
|
||||
map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
|
||||
map->type = btrfs_chunk_type(leaf, chunk);
|
||||
map->type = type;
|
||||
map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
|
||||
map->verified_stripes = 0;
|
||||
em->orig_block_len = calc_stripe_length(map->type, em->len,
|
||||
em->orig_block_len = calc_stripe_length(type, em->len,
|
||||
map->num_stripes);
|
||||
for (i = 0; i < num_stripes; i++) {
|
||||
map->stripes[i].physical =
|
||||
@ -8001,7 +8068,7 @@ static int relocating_repair_kthread(void *data)
|
||||
return -EBUSY;
|
||||
}
|
||||
|
||||
mutex_lock(&fs_info->delete_unused_bgs_mutex);
|
||||
mutex_lock(&fs_info->reclaim_bgs_lock);
|
||||
|
||||
/* Ensure block group still exists */
|
||||
cache = btrfs_lookup_block_group(fs_info, target);
|
||||
@ -8023,7 +8090,7 @@ static int relocating_repair_kthread(void *data)
|
||||
out:
|
||||
if (cache)
|
||||
btrfs_put_block_group(cache);
|
||||
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
|
||||
mutex_unlock(&fs_info->reclaim_bgs_lock);
|
||||
btrfs_exclop_finish(fs_info);
|
||||
|
||||
return ret;
|
||||
|
@@ -484,6 +484,7 @@ void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf);
int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset);
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
int btrfs_uuid_scan_kthread(void *data);
@@ -342,6 +342,13 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
	if (!IS_ALIGNED(nr_sectors, zone_sectors))
		zone_info->nr_zones++;

	if (bdev_is_zoned(bdev) && zone_info->max_zone_append_size == 0) {
		btrfs_err(fs_info, "zoned: device %pg does not support zone append",
			  bdev);
		ret = -EINVAL;
		goto out;
	}

	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->seq_zones) {
		ret = -ENOMEM;
@@ -9,6 +9,12 @@
#include "disk-io.h"
#include "block-group.h"

/*
 * Block groups with more than this value (percents) of unusable space will be
 * scheduled for background reclaim.
 */
#define BTRFS_DEFAULT_RECLAIM_THRESH 75

struct btrfs_zoned_device_info {
	/*
	 * Number of zones, zone size and types of zones if bdev is a
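BTRFS_DEFAULT_RECLAIM_THRESH is the percentage of unusable (zone-wasted) space at which a block group becomes a candidate for the new automatic background reclaim; the sysfs knob mentioned in the merge summary tunes the same value. A minimal sketch of the threshold test (the field names are illustrative, not the kernel structs):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_RECLAIM_THRESH 75	/* percent, mirrors the default above */

/* Should this block group be queued for background reclaim? */
static bool should_reclaim_bg(uint64_t length, uint64_t zone_unusable)
{
	return zone_unusable * 100 >= length * EXAMPLE_RECLAIM_THRESH;
}

int main(void)
{
	/* 1 GiB block group with 800 MiB no longer writable in its zone. */
	uint64_t length = 1024ULL << 20;
	uint64_t unusable = 800ULL << 20;

	printf("reclaim: %d\n", should_reclaim_bg(length, unusable));	/* 1 */
	return 0;
}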
@@ -981,6 +981,15 @@ static inline unsigned int readahead_count(struct readahead_control *rac)
	return rac->_nr_pages;
}

/**
 * readahead_batch_length - The number of bytes in the current batch.
 * @rac: The readahead request.
 */
static inline loff_t readahead_batch_length(struct readahead_control *rac)
{
	return rac->_batch_count * PAGE_SIZE;
}

static inline unsigned long dir_pages(struct inode *inode)
{
	return (unsigned long)(inode->i_size + PAGE_SIZE - 1) >>
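readahead_batch_length() is the byte-sized counterpart of readahead_count() and is what the btrfs send readahead mentioned in the merge summary builds on. The arithmetic is simply pages-in-batch times page size; assuming 4K pages:

#include <stdio.h>

int main(void)
{
	/* Model of readahead_batch_length(): _batch_count * PAGE_SIZE. */
	unsigned long batch_count = 32;
	unsigned long page_size = 4096;	/* assuming 4K pages */

	printf("batch length: %lu bytes\n", batch_count * page_size);	/* 131072 */
	return 0;
}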
@@ -1903,6 +1903,18 @@ DEFINE_EVENT(btrfs__block_group, btrfs_add_unused_block_group,
	TP_ARGS(bg_cache)
);

DEFINE_EVENT(btrfs__block_group, btrfs_add_reclaim_block_group,
	TP_PROTO(const struct btrfs_block_group *bg_cache),

	TP_ARGS(bg_cache)
);

DEFINE_EVENT(btrfs__block_group, btrfs_reclaim_block_group,
	TP_PROTO(const struct btrfs_block_group *bg_cache),

	TP_ARGS(bg_cache)
);

DEFINE_EVENT(btrfs__block_group, btrfs_skip_unused_block_group,
	TP_PROTO(const struct btrfs_block_group *bg_cache),