2
0
mirror of https://github.com/edk2-porting/linux-next.git synced 2024-12-17 09:43:59 +08:00

for-5.13-tag

-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEE8rQSAMVO+zA4DBdWxWXV+ddtWDsFAmCHGMYACgkQxWXV+ddt
 WDsFeA/+MZ+5UiYYucH5RVw/VExOQzSvlRVxxnOeR2s8V/gFj/Ip7d9E9UezA+lX
 di2byKXPzfjL9+xoyqfEZLpPgDQrhHTIQCzuNSvzMBykJIL6Sf1OZTtUZWU3HDH7
 S51UZtghgTPzeOhxsiBHqSFo9danT0w3KQhliE10Ur855ziKSvL2Tb7dvM6q5TS7
 mFTAj/Y2aanDaFKQjjBzzA+GZ0LFIGuErg1PADmF5XjbyY06ho1xqQ1A0t2/XL9x
 UpHdRP3E5XRAMl4uyYOUtbvUB1cROzoS6ySHJOJ9Bbz+IC0cLf5xTJkLE25bGkSi
 GjFNvnQOha1s8oMIlqkw64hKQqwp+gu2iZ7m1o76Z31k7CpLAC+rg11gbpODRuoh
 7B3EzowKyVihMHF8URAdC3A+9gbpPvyuGKDSy07yULh/2vas6dEzR3cPVEU0yXyJ
 3DO1ds0lVY3B/T9LKPQ785hQ7VdpgZ8BdIOVRtjgV2QQEa9eFh9VzybQjU8yBXGd
 vflBe8kQfASIZ5E0rcUGPUVIJoesM8U1pSlx9jvvTQVkOC/DQjtBx/5ePCL2iVfd
 izY8uWlCdguF/P1CYFf1M0auASSzl3bip1NnSMVvZ90dgDEK4XaIyd16kMGDCbU2
 UMOePMsLDApWcCVTqM/J+lFLa7rajRccdKby7F/zSpZIRgadPF8=
 =J5jh
 -----END PGP SIGNATURE-----

Merge tag 'for-5.13-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs updates from David Sterba:
 "The updates this time are mostly stabilization, preparation and minor
  improvements.

  User visible improvements:

   - readahead for send, improving run time of full send by 10% and for
     incremental by 25%

   - make reflinks respect O_SYNC, O_DSYNC and S_SYNC flags

   - export supported sectorsize values in sysfs (currently only page
     size, more once full subpage support lands)

   - more graceful errors and warnings on 32bit systems when logical
     addresses for metadata reach the limit posed by unsigned long in
     page::index
      - error: fail mount if there's a metadata block beyond the limit
      - error: new metadata block would be at unreachable address
      - warn when 5/8th of the limit is reached, for 4K page systems
        it's 10T, for 64K page it's 160T

   - zoned mode
      - relocated zones get reset at the end instead of discard
      - automatic background reclaim of zones that have 75%+ of unusable
        space, the threshold is tunable in sysfs

  Fixes:

   - fsync and tree mod log fixes

   - fix inefficient preemptive reclaim calculations

   - fix exhaustion of the system chunk array due to concurrent
     allocations

   - fix fallback to no compression when racing with remount

   - preemptive fix for dm-crypt on zoned device that does not properly
     advertise zoned support

  Core changes:

   - add inode lock to synchronize mmap and other block updates (eg.
     deduplication, fallocate, fsync)

   - kmap conversions to new kmap_local API

   - subpage support (continued)
      - new helpers for page state/extent buffer tracking
      - metadata changes now support read and write

   - error handling through out relocation call paths

   - many other cleanups and code simplifications"

* tag 'for-5.13-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (112 commits)
  btrfs: zoned: automatically reclaim zones
  btrfs: rename delete_unused_bgs_mutex to reclaim_bgs_lock
  btrfs: zoned: reset zones of relocated block groups
  btrfs: more graceful errors/warnings on 32bit systems when reaching limits
  btrfs: zoned: fix unpaired block group unfreeze during device replace
  btrfs: fix race when picking most recent mod log operation for an old root
  btrfs: fix metadata extent leak after failure to create subvolume
  btrfs: handle remount to no compress during compression
  btrfs: zoned: fail mount if the device does not support zone append
  btrfs: fix race between transaction aborts and fsyncs leading to use-after-free
  btrfs: introduce submit_eb_subpage() to submit a subpage metadata page
  btrfs: make lock_extent_buffer_for_io() to be subpage compatible
  btrfs: introduce write_one_subpage_eb() function
  btrfs: introduce end_bio_subpage_eb_writepage() function
  btrfs: check return value of btrfs_commit_transaction in relocation
  btrfs: do proper error handling in merge_reloc_roots
  btrfs: handle extent corruption with select_one_root properly
  btrfs: cleanup error handling in prepare_to_merge
  btrfs: do not panic in __add_reloc_root
  btrfs: handle __add_reloc_root failures in btrfs_recover_relocation
  ...
This commit is contained in:
Linus Torvalds 2021-04-26 13:48:02 -07:00
commit 55ba0fe059
46 changed files with 2964 additions and 1582 deletions

View File

@ -30,7 +30,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
subpage.o
subpage.o tree-mod-log.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o

View File

@ -14,6 +14,7 @@
#include "delayed-ref.h"
#include "locking.h"
#include "misc.h"
#include "tree-mod-log.h"
/* Just an arbitrary number so we can be sure this happened */
#define BACKREF_FOUND_SHARED 6
@ -452,7 +453,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
if (path->slots[0] >= btrfs_header_nritems(eb) ||
is_shared_data_backref(preftrees, eb->start) ||
ref->root_id != btrfs_header_owner(eb)) {
if (time_seq == SEQ_LAST)
if (time_seq == BTRFS_SEQ_LAST)
ret = btrfs_next_leaf(root, path);
else
ret = btrfs_next_old_leaf(root, path, time_seq);
@ -476,7 +477,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
if (slot == 0 &&
(is_shared_data_backref(preftrees, eb->start) ||
ref->root_id != btrfs_header_owner(eb))) {
if (time_seq == SEQ_LAST)
if (time_seq == BTRFS_SEQ_LAST)
ret = btrfs_next_leaf(root, path);
else
ret = btrfs_next_old_leaf(root, path, time_seq);
@ -514,7 +515,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
eie = NULL;
}
next:
if (time_seq == SEQ_LAST)
if (time_seq == BTRFS_SEQ_LAST)
ret = btrfs_next_item(root, path);
else
ret = btrfs_next_old_item(root, path, time_seq);
@ -574,7 +575,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
if (path->search_commit_root)
root_level = btrfs_header_level(root->commit_root);
else if (time_seq == SEQ_LAST)
else if (time_seq == BTRFS_SEQ_LAST)
root_level = btrfs_header_level(root->node);
else
root_level = btrfs_old_root_level(root, time_seq);
@ -605,7 +606,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
search_key.offset >= LLONG_MAX)
search_key.offset = 0;
path->lowest_level = level;
if (time_seq == SEQ_LAST)
if (time_seq == BTRFS_SEQ_LAST)
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
else
ret = btrfs_search_old_slot(root, &search_key, path, time_seq);
@ -1147,8 +1148,8 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
* indirect refs to their parent bytenr.
* When roots are found, they're added to the roots list
*
* If time_seq is set to SEQ_LAST, it will not search delayed_refs, and behave
* much like trans == NULL case, the difference only lies in it will not
* If time_seq is set to BTRFS_SEQ_LAST, it will not search delayed_refs, and
* behave much like trans == NULL case, the difference only lies in it will not
* commit root.
* The special case is for qgroup to search roots in commit_transaction().
*
@ -1199,7 +1200,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
path->skip_locking = 1;
}
if (time_seq == SEQ_LAST)
if (time_seq == BTRFS_SEQ_LAST)
path->skip_locking = 1;
/*
@ -1217,9 +1218,9 @@ again:
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (trans && likely(trans->type != __TRANS_DUMMY) &&
time_seq != SEQ_LAST) {
time_seq != BTRFS_SEQ_LAST) {
#else
if (trans && time_seq != SEQ_LAST) {
if (trans && time_seq != BTRFS_SEQ_LAST) {
#endif
/*
* look if there are updates for this ref queued and lock the
@ -1527,7 +1528,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
struct btrfs_trans_handle *trans;
struct ulist_iterator uiter;
struct ulist_node *node;
struct seq_list elem = SEQ_LIST_INIT(elem);
struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
int ret = 0;
struct share_check shared = {
.root_objectid = root->root_key.objectid,
@ -1953,7 +1954,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
struct ulist *roots = NULL;
struct ulist_node *ref_node = NULL;
struct ulist_node *root_node = NULL;
struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
struct btrfs_seq_list seq_elem = BTRFS_SEQ_LIST_INIT(seq_elem);
struct ulist_iterator ref_uiter;
struct ulist_iterator root_uiter;
@ -1971,12 +1972,12 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
}
if (trans)
btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
btrfs_get_tree_mod_seq(fs_info, &seq_elem);
else
down_read(&fs_info->commit_root_sem);
ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
tree_mod_seq_elem.seq, &refs,
seq_elem.seq, &refs,
&extent_item_pos, ignore_offset);
if (ret)
goto out;
@ -1984,7 +1985,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
ULIST_ITER_INIT(&ref_uiter);
while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val,
tree_mod_seq_elem.seq, &roots,
seq_elem.seq, &roots,
ignore_offset);
if (ret)
break;
@ -2007,7 +2008,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
free_leaf_list(refs);
out:
if (trans) {
btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
btrfs_put_tree_mod_seq(fs_info, &seq_elem);
btrfs_end_transaction(trans);
} else {
up_read(&fs_info->commit_root_sem);

View File

@ -1289,7 +1289,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* Long running balances can keep us blocked here for eternity, so
* simply skip deletion if we're unable to get the mutex.
*/
if (!mutex_trylock(&fs_info->delete_unused_bgs_mutex))
if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
return;
spin_lock(&fs_info->unused_bgs_lock);
@ -1462,12 +1462,12 @@ next:
spin_lock(&fs_info->unused_bgs_lock);
}
spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
mutex_unlock(&fs_info->reclaim_bgs_lock);
return;
flip_async:
btrfs_end_transaction(trans);
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_put_block_group(block_group);
btrfs_discard_punt_unused_bgs_list(fs_info);
}
@ -1485,6 +1485,97 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
spin_unlock(&fs_info->unused_bgs_lock);
}
void btrfs_reclaim_bgs_work(struct work_struct *work)
{
struct btrfs_fs_info *fs_info =
container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
struct btrfs_block_group *bg;
struct btrfs_space_info *space_info;
int ret;
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
return;
mutex_lock(&fs_info->reclaim_bgs_lock);
spin_lock(&fs_info->unused_bgs_lock);
while (!list_empty(&fs_info->reclaim_bgs)) {
bg = list_first_entry(&fs_info->reclaim_bgs,
struct btrfs_block_group,
bg_list);
list_del_init(&bg->bg_list);
space_info = bg->space_info;
spin_unlock(&fs_info->unused_bgs_lock);
/* Don't race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
spin_lock(&bg->lock);
if (bg->reserved || bg->pinned || bg->ro) {
/*
* We want to bail if we made new allocations or have
* outstanding allocations in this block group. We do
* the ro check in case balance is currently acting on
* this block group.
*/
spin_unlock(&bg->lock);
up_write(&space_info->groups_sem);
goto next;
}
spin_unlock(&bg->lock);
/* Get out fast, in case we're unmounting the filesystem */
if (btrfs_fs_closing(fs_info)) {
up_write(&space_info->groups_sem);
goto next;
}
ret = inc_block_group_ro(bg, 0);
up_write(&space_info->groups_sem);
if (ret < 0)
goto next;
btrfs_info(fs_info, "reclaiming chunk %llu with %llu%% used",
bg->start, div_u64(bg->used * 100, bg->length));
trace_btrfs_reclaim_block_group(bg);
ret = btrfs_relocate_chunk(fs_info, bg->start);
if (ret)
btrfs_err(fs_info, "error relocating chunk %llu",
bg->start);
next:
btrfs_put_block_group(bg);
spin_lock(&fs_info->unused_bgs_lock);
}
spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_exclop_finish(fs_info);
}
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
{
spin_lock(&fs_info->unused_bgs_lock);
if (!list_empty(&fs_info->reclaim_bgs))
queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
spin_unlock(&fs_info->unused_bgs_lock);
}
void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
spin_lock(&fs_info->unused_bgs_lock);
if (list_empty(&bg->bg_list)) {
btrfs_get_block_group(bg);
trace_btrfs_add_reclaim_block_group(bg);
list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
}
spin_unlock(&fs_info->unused_bgs_lock);
}
static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
struct btrfs_path *path)
{
@ -2267,16 +2358,19 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
struct btrfs_trans_handle *trans;
u64 alloc_flags;
int ret;
bool dirty_bg_running;
again:
do {
trans = btrfs_join_transaction(fs_info->extent_root);
if (IS_ERR(trans))
return PTR_ERR(trans);
dirty_bg_running = false;
/*
* we're not allowed to set block groups readonly after the dirty
* block groups cache has started writing. If it already started,
* back off and let this transaction commit
* We're not allowed to set block groups readonly after the dirty
* block group cache has started writing. If it already started,
* back off and let this transaction commit.
*/
mutex_lock(&fs_info->ro_block_group_mutex);
if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
@ -2288,8 +2382,9 @@ again:
ret = btrfs_wait_for_commit(fs_info, transid);
if (ret)
return ret;
goto again;
dirty_bg_running = true;
}
} while (dirty_bg_running);
if (do_chunk_alloc) {
/*
@ -3269,6 +3364,7 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
*/
void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
{
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_space_info *info;
u64 left;
@ -3283,6 +3379,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
lockdep_assert_held(&fs_info->chunk_mutex);
info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
again:
spin_lock(&info->lock);
left = info->total_bytes - btrfs_space_info_used(info, true);
spin_unlock(&info->lock);
@ -3301,6 +3398,58 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
if (left < thresh) {
u64 flags = btrfs_system_alloc_profile(fs_info);
u64 reserved = atomic64_read(&cur_trans->chunk_bytes_reserved);
/*
* If there's not available space for the chunk tree (system
* space) and there are other tasks that reserved space for
* creating a new system block group, wait for them to complete
* the creation of their system block group and release excess
* reserved space. We do this because:
*
* *) We can end up allocating more system chunks than necessary
* when there are multiple tasks that are concurrently
* allocating block groups, which can lead to exhaustion of
* the system array in the superblock;
*
* *) If we allocate extra and unnecessary system block groups,
* despite being empty for a long time, and possibly forever,
* they end not being added to the list of unused block groups
* because that typically happens only when deallocating the
* last extent from a block group - which never happens since
* we never allocate from them in the first place. The few
* exceptions are when mounting a filesystem or running scrub,
* which add unused block groups to the list of unused block
* groups, to be deleted by the cleaner kthread.
* And even when they are added to the list of unused block
* groups, it can take a long time until they get deleted,
* since the cleaner kthread might be sleeping or busy with
* other work (deleting subvolumes, running delayed iputs,
* defrag scheduling, etc);
*
* This is rare in practice, but can happen when too many tasks
* are allocating blocks groups in parallel (via fallocate())
* and before the one that reserved space for a new system block
* group finishes the block group creation and releases the space
* reserved in excess (at btrfs_create_pending_block_groups()),
* other tasks end up here and see free system space temporarily
* not enough for updating the chunk tree.
*
* We unlock the chunk mutex before waiting for such tasks and
* lock it again after the wait, otherwise we would deadlock.
* It is safe to do so because allocating a system chunk is the
* first thing done while allocating a new block group.
*/
if (reserved > trans->chunk_bytes_reserved) {
const u64 min_needed = reserved - thresh;
mutex_unlock(&fs_info->chunk_mutex);
wait_event(cur_trans->chunk_reserve_wait,
atomic64_read(&cur_trans->chunk_bytes_reserved) <=
min_needed);
mutex_lock(&fs_info->chunk_mutex);
goto again;
}
/*
* Ignore failure to create system chunk. We might end up not
@ -3315,9 +3464,11 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
ret = btrfs_block_rsv_add(fs_info->chunk_root,
&fs_info->chunk_block_rsv,
thresh, BTRFS_RESERVE_NO_FLUSH);
if (!ret)
if (!ret) {
atomic64_add(thresh, &cur_trans->chunk_bytes_reserved);
trans->chunk_bytes_reserved += thresh;
}
}
}
void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
@ -3386,6 +3537,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
}
spin_unlock(&info->unused_bgs_lock);
spin_lock(&info->unused_bgs_lock);
while (!list_empty(&info->reclaim_bgs)) {
block_group = list_first_entry(&info->reclaim_bgs,
struct btrfs_block_group,
bg_list);
list_del_init(&block_group->bg_list);
btrfs_put_block_group(block_group);
}
spin_unlock(&info->unused_bgs_lock);
spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
block_group = rb_entry(n, struct btrfs_block_group,

View File

@ -264,6 +264,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
u64 group_start, struct extent_map *em);
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
void btrfs_reclaim_bgs_work(struct work_struct *work);
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
int btrfs_read_block_groups(struct btrfs_fs_info *info);
int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
u64 type, u64 chunk_offset, u64 size);

View File

@ -220,6 +220,7 @@ struct btrfs_inode {
/* Hook into fs_info->delayed_iputs */
struct list_head delayed_iput;
struct rw_semaphore i_mmap_lock;
struct inode vfs_inode;
};
@ -299,24 +300,30 @@ static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
mod);
}
static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
/*
* Called every time after doing a buffered, direct IO or memory mapped write.
*
* This is to ensure that if we write to a file that was previously fsynced in
* the current transaction, then try to fsync it again in the same transaction,
* we will know that there were changes in the file and that it needs to be
* logged.
*/
static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode)
{
int ret = 0;
spin_lock(&inode->lock);
inode->last_sub_trans = inode->root->log_transid;
spin_unlock(&inode->lock);
}
static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
{
bool ret = false;
spin_lock(&inode->lock);
if (inode->logged_trans == generation &&
inode->last_sub_trans <= inode->last_log_commit &&
inode->last_sub_trans <= inode->root->last_log_commit) {
/*
* After a ranged fsync we might have left some extent maps
* (that fall outside the fsync's range). So return false
* here if the list isn't empty, to make sure btrfs_log_inode()
* will be called and process those extent maps.
*/
smp_mb();
if (list_empty(&inode->extent_tree.modified_extents))
ret = 1;
}
inode->last_sub_trans <= inode->root->last_log_commit)
ret = true;
spin_unlock(&inode->lock);
return ret;
}

View File

@ -1555,10 +1555,11 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
BUG_ON(!block_ctx->pagev);
num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
PAGE_SHIFT;
/* Pages must be unmapped in reverse order */
while (num_pages > 0) {
num_pages--;
if (block_ctx->datav[num_pages]) {
kunmap(block_ctx->pagev[num_pages]);
kunmap_local(block_ctx->datav[num_pages]);
block_ctx->datav[num_pages] = NULL;
}
if (block_ctx->pagev[num_pages]) {
@ -1637,7 +1638,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
i = j;
}
for (i = 0; i < num_pages; i++)
block_ctx->datav[i] = kmap(block_ctx->pagev[i]);
block_ctx->datav[i] = kmap_local_page(block_ctx->pagev[i]);
return block_ctx->len;
}
@ -2677,7 +2678,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev);
if (NULL != dev_state &&
(bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) {
unsigned int i = 0;
int i = 0;
u64 dev_bytenr;
u64 cur_bytenr;
struct bio_vec bvec;
@ -2702,7 +2703,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
bio_for_each_segment(bvec, bio, iter) {
BUG_ON(bvec.bv_len != PAGE_SIZE);
mapped_datav[i] = kmap(bvec.bv_page);
mapped_datav[i] = kmap_local_page(bvec.bv_page);
i++;
if (dev_state->state->print_mask &
@ -2715,8 +2716,9 @@ static void __btrfsic_submit_bio(struct bio *bio)
mapped_datav, segs,
bio, &bio_is_patched,
bio->bi_opf);
bio_for_each_segment(bvec, bio, iter)
kunmap(bvec.bv_page);
/* Unmap in reverse order */
for (--i; i >= 0; i--)
kunmap_local(mapped_datav[i]);
kfree(mapped_datav);
} else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) {
if (dev_state->state->print_mask &

View File

@ -80,10 +80,15 @@ static int compression_compress_pages(int type, struct list_head *ws,
case BTRFS_COMPRESS_NONE:
default:
/*
* This can't happen, the type is validated several times
* before we get here. As a sane fallback, return what the
* callers will understand as 'no compression happened'.
* This can happen when compression races with remount setting
* it to 'no compress', while caller doesn't call
* inode_need_compress() to check if we really need to
* compress.
*
* Not a big deal, just need to inform caller that we
* haven't allocated any pages yet.
*/
*out_pages = 0;
return -E2BIG;
}
}
@ -1611,7 +1616,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
curr_sample_pos = 0;
while (index < index_end) {
page = find_get_page(inode->i_mapping, index);
in_data = kmap(page);
in_data = kmap_local_page(page);
/* Handle case where the start is not aligned to PAGE_SIZE */
i = start % PAGE_SIZE;
while (i < PAGE_SIZE - SAMPLING_READ_SIZE) {
@ -1624,7 +1629,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
start += SAMPLING_INTERVAL;
curr_sample_pos += SAMPLING_READ_SIZE;
}
kunmap(page);
kunmap_local(in_data);
put_page(page);
index++;

File diff suppressed because it is too large Load Diff

View File

@ -342,6 +342,27 @@ struct btrfs_node {
struct btrfs_key_ptr ptrs[];
} __attribute__ ((__packed__));
/* Read ahead values for struct btrfs_path.reada */
enum {
READA_NONE,
READA_BACK,
READA_FORWARD,
/*
* Similar to READA_FORWARD but unlike it:
*
* 1) It will trigger readahead even for leaves that are not close to
* each other on disk;
* 2) It also triggers readahead for nodes;
* 3) During a search, even when a node or leaf is already in memory, it
* will still trigger readahead for other nodes and leaves that follow
* it.
*
* This is meant to be used only when we know we are iterating over the
* entire tree or a very large part of it.
*/
READA_FORWARD_ALWAYS,
};
/*
* btrfs_paths remember the path taken from the root down to the leaf.
* level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point
@ -350,7 +371,6 @@ struct btrfs_node {
* The slots array records the index of the item or block pointer
* used while walking the tree.
*/
enum { READA_NONE, READA_BACK, READA_FORWARD };
struct btrfs_path {
struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
int slots[BTRFS_MAX_LEVEL];
@ -482,16 +502,6 @@ struct btrfs_discard_ctl {
atomic64_t discard_bytes_saved;
};
/* delayed seq elem */
struct seq_list {
struct list_head list;
u64 seq;
};
#define SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 }
#define SEQ_LAST ((u64)-1)
enum btrfs_orphan_cleanup_state {
ORPHAN_CLEANUP_STARTED = 1,
ORPHAN_CLEANUP_DONE = 2,
@ -572,6 +582,15 @@ enum {
/* Indicate that we can't trust the free space tree for caching yet */
BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED,
/* Indicate whether there are any tree modification log users */
BTRFS_FS_TREE_MOD_LOG_USERS,
#if BITS_PER_LONG == 32
/* Indicate if we have error/warn message printed on 32bit systems */
BTRFS_FS_32BIT_ERROR,
BTRFS_FS_32BIT_WARN,
#endif
};
/*
@ -941,10 +960,16 @@ struct btrfs_fs_info {
struct work_struct async_data_reclaim_work;
struct work_struct preempt_reclaim_work;
/* Reclaim partially filled block groups in the background */
struct work_struct reclaim_bgs_work;
struct list_head reclaim_bgs;
int bg_reclaim_threshold;
spinlock_t unused_bgs_lock;
struct list_head unused_bgs;
struct mutex unused_bg_unpin_mutex;
struct mutex delete_unused_bgs_mutex;
/* Protect block groups that are going to be deleted */
struct mutex reclaim_bgs_lock;
/* Cached block sizes */
u32 nodesize;
@ -2691,7 +2716,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref);
int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
/*
@ -2929,13 +2953,6 @@ static inline void btrfs_clear_sb_rdonly(struct super_block *sb)
clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state);
}
/* tree mod log functions from ctree.c */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct seq_list *elem);
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct seq_list *elem);
int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
/* root-item.c */
int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
u64 ref_id, u64 dirid, u64 sequence, const char *name,
@ -3084,7 +3101,7 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path);
blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
struct page *page, u64 start, u64 end, int mirror);
struct page *page, u64 start, u64 end);
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
u64 start, u64 len);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
@ -3179,6 +3196,7 @@ extern const struct iomap_dio_ops btrfs_dio_ops;
/* Inode locking type flags, by default the exclusive lock is taken */
#define BTRFS_ILOCK_SHARED (1U << 0)
#define BTRFS_ILOCK_TRY (1U << 1)
#define BTRFS_ILOCK_MMAP (1U << 2)
int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags);
void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags);
@ -3217,8 +3235,9 @@ extern const struct file_operations btrfs_file_operations;
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
struct btrfs_drop_extents_args *args);
int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
const u64 start, const u64 end,
int btrfs_replace_file_extents(struct btrfs_inode *inode,
struct btrfs_path *path, const u64 start,
const u64 end,
struct btrfs_replace_extent_info *extent_info,
struct btrfs_trans_handle **trans_out);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
@ -3405,6 +3424,19 @@ static inline void assertfail(const char *expr, const char* file, int line) { }
#define ASSERT(expr) (void)(expr)
#endif
#if BITS_PER_LONG == 32
#define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT)
/*
* The warning threshold is 5/8th of the MAX_LFS_FILESIZE that limits the logical
* addresses of extents.
*
* For 4K page size it's about 10T, for 64K it's 160T.
*/
#define BTRFS_32BIT_EARLY_WARN_THRESHOLD (BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8)
void btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info);
void btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info);
#endif
/*
* Get the correct offset inside the page of extent buffer.
*
@ -3732,8 +3764,6 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
return signal_pending(current);
}
#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
/* Sanity test specific functions */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_destroy_inode(struct inode *inode);

View File

@ -602,7 +602,6 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
static int btrfs_delayed_inode_reserve_metadata(
struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_delayed_node *node)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@ -633,32 +632,17 @@ static int btrfs_delayed_inode_reserve_metadata(
return ret;
ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
BTRFS_RESERVE_NO_FLUSH);
/*
* Since we're under a transaction reserve_metadata_bytes could
* try to commit the transaction which will make it return
* EAGAIN to make us stop the transaction we have, so return
* ENOSPC instead so that btrfs_dirty_inode knows what to do.
*/
if (ret == -EAGAIN) {
ret = -ENOSPC;
/* NO_FLUSH could only fail with -ENOSPC */
ASSERT(ret == 0 || ret == -ENOSPC);
if (ret)
btrfs_qgroup_free_meta_prealloc(root, num_bytes);
}
if (!ret) {
node->bytes_reserved = num_bytes;
trace_btrfs_space_reservation(fs_info,
"delayed_inode",
btrfs_ino(inode),
num_bytes, 1);
} else {
btrfs_qgroup_free_meta_prealloc(root, num_bytes);
}
return ret;
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
}
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
if (!ret) {
trace_btrfs_space_reservation(fs_info, "delayed_inode",
btrfs_ino(inode), num_bytes, 1);
node->inode_id, num_bytes, 1);
node->bytes_reserved = num_bytes;
}
@ -1589,8 +1573,8 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode,
* We can only do one readdir with delayed items at a time because of
* item->readdir_list.
*/
inode_unlock_shared(inode);
inode_lock(inode);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
btrfs_inode_lock(inode, 0);
mutex_lock(&delayed_node->mutex);
item = __btrfs_first_delayed_insertion_item(delayed_node);
@ -1833,8 +1817,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
goto release_node;
}
ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode,
delayed_node);
ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
if (ret)
goto release_node;

View File

@ -11,6 +11,7 @@
#include "transaction.h"
#include "qgroup.h"
#include "space-info.h"
#include "tree-mod-log.h"
struct kmem_cache *btrfs_delayed_ref_head_cachep;
struct kmem_cache *btrfs_delayed_tree_ref_cachep;
@ -494,16 +495,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
if (head->is_data)
return;
read_lock(&fs_info->tree_mod_log_lock);
if (!list_empty(&fs_info->tree_mod_seq_list)) {
struct seq_list *elem;
elem = list_first_entry(&fs_info->tree_mod_seq_list,
struct seq_list, list);
seq = elem->seq;
}
read_unlock(&fs_info->tree_mod_log_lock);
seq = btrfs_tree_mod_log_lowest_seq(fs_info);
again:
for (node = rb_first_cached(&head->ref_tree); node;
node = rb_next(node)) {
@ -517,23 +509,16 @@ again:
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
{
struct seq_list *elem;
int ret = 0;
u64 min_seq = btrfs_tree_mod_log_lowest_seq(fs_info);
read_lock(&fs_info->tree_mod_log_lock);
if (!list_empty(&fs_info->tree_mod_seq_list)) {
elem = list_first_entry(&fs_info->tree_mod_seq_list,
struct seq_list, list);
if (seq >= elem->seq) {
if (min_seq != 0 && seq >= min_seq) {
btrfs_debug(fs_info,
"holding back delayed_ref %#x.%x, lowest is %#x.%x",
(u32)(seq >> 32), (u32)seq,
(u32)(elem->seq >> 32), (u32)elem->seq);
"holding back delayed_ref %llu, lowest is %llu",
seq, min_seq);
ret = 1;
}
}
read_unlock(&fs_info->tree_mod_log_lock);
return ret;
}

View File

@ -42,6 +42,7 @@
#include "discard.h"
#include "space-info.h"
#include "zoned.h"
#include "subpage.h"
#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
BTRFS_HEADER_FLAG_RELOC |\
@ -440,6 +441,74 @@ static int btree_read_extent_buffer_pages(struct extent_buffer *eb,
return ret;
}
static int csum_one_extent_buffer(struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
u8 result[BTRFS_CSUM_SIZE];
int ret;
ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
offsetof(struct btrfs_header, fsid),
BTRFS_FSID_SIZE) == 0);
csum_tree_block(eb, result);
if (btrfs_header_level(eb))
ret = btrfs_check_node(eb);
else
ret = btrfs_check_leaf_full(eb);
if (ret < 0) {
btrfs_print_tree(eb, 0);
btrfs_err(fs_info,
"block=%llu write time tree block corruption detected",
eb->start);
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
return ret;
}
write_extent_buffer(eb, result, 0, fs_info->csum_size);
return 0;
}
/* Checksum all dirty extent buffers in one bio_vec */
static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info,
struct bio_vec *bvec)
{
struct page *page = bvec->bv_page;
u64 bvec_start = page_offset(page) + bvec->bv_offset;
u64 cur;
int ret = 0;
for (cur = bvec_start; cur < bvec_start + bvec->bv_len;
cur += fs_info->nodesize) {
struct extent_buffer *eb;
bool uptodate;
eb = find_extent_buffer(fs_info, cur);
uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur,
fs_info->nodesize);
/* A dirty eb shouldn't disappear from buffer_radix */
if (WARN_ON(!eb))
return -EUCLEAN;
if (WARN_ON(cur != btrfs_header_bytenr(eb))) {
free_extent_buffer(eb);
return -EUCLEAN;
}
if (WARN_ON(!uptodate)) {
free_extent_buffer(eb);
return -EUCLEAN;
}
ret = csum_one_extent_buffer(eb);
free_extent_buffer(eb);
if (ret < 0)
return ret;
}
return ret;
}
/*
* Checksum a dirty tree block before IO. This has extra checks to make sure
* we only fill in the checksum field in the first page of a multi-page block.
@ -450,9 +519,10 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec
struct page *page = bvec->bv_page;
u64 start = page_offset(page);
u64 found_start;
u8 result[BTRFS_CSUM_SIZE];
struct extent_buffer *eb;
int ret;
if (fs_info->sectorsize < PAGE_SIZE)
return csum_dirty_subpage_buffers(fs_info, bvec);
eb = (struct extent_buffer *)page->private;
if (page != eb->pages[0])
@ -474,28 +544,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec
if (WARN_ON(!PageUptodate(page)))
return -EUCLEAN;
ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
offsetof(struct btrfs_header, fsid),
BTRFS_FSID_SIZE) == 0);
csum_tree_block(eb, result);
if (btrfs_header_level(eb))
ret = btrfs_check_node(eb);
else
ret = btrfs_check_leaf_full(eb);
if (ret < 0) {
btrfs_print_tree(eb, 0);
btrfs_err(fs_info,
"block=%llu write time tree block corruption detected",
eb->start);
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
return ret;
}
write_extent_buffer(eb, result, 0, fs_info->csum_size);
return 0;
return csum_one_extent_buffer(eb);
}
static int check_tree_block_fsid(struct extent_buffer *eb)
@ -992,14 +1041,48 @@ static void btree_invalidatepage(struct page *page, unsigned int offset,
static int btree_set_page_dirty(struct page *page)
{
#ifdef DEBUG
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
struct btrfs_subpage *subpage;
struct extent_buffer *eb;
int cur_bit = 0;
u64 page_start = page_offset(page);
if (fs_info->sectorsize == PAGE_SIZE) {
BUG_ON(!PagePrivate(page));
eb = (struct extent_buffer *)page->private;
BUG_ON(!eb);
BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
BUG_ON(!atomic_read(&eb->refs));
btrfs_assert_tree_locked(eb);
return __set_page_dirty_nobuffers(page);
}
ASSERT(PagePrivate(page) && page->private);
subpage = (struct btrfs_subpage *)page->private;
ASSERT(subpage->dirty_bitmap);
while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) {
unsigned long flags;
u64 cur;
u16 tmp = (1 << cur_bit);
spin_lock_irqsave(&subpage->lock, flags);
if (!(tmp & subpage->dirty_bitmap)) {
spin_unlock_irqrestore(&subpage->lock, flags);
cur_bit++;
continue;
}
spin_unlock_irqrestore(&subpage->lock, flags);
cur = page_start + cur_bit * fs_info->sectorsize;
eb = find_extent_buffer(fs_info, cur);
ASSERT(eb);
ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
ASSERT(atomic_read(&eb->refs));
btrfs_assert_tree_locked(eb);
free_extent_buffer(eb);
cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
}
#endif
return __set_page_dirty_nobuffers(page);
}
@ -1807,14 +1890,21 @@ static int cleaner_kthread(void *arg)
btrfs_run_defrag_inodes(fs_info);
/*
* Acquires fs_info->delete_unused_bgs_mutex to avoid racing
* Acquires fs_info->reclaim_bgs_lock to avoid racing
* with relocation (btrfs_relocate_chunk) and relocation
* acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
* after acquiring fs_info->delete_unused_bgs_mutex. So we
* after acquiring fs_info->reclaim_bgs_lock. So we
* can't hold, nor need to, fs_info->cleaner_mutex when deleting
* unused block groups.
*/
btrfs_delete_unused_bgs(fs_info);
/*
* Reclaim block groups in the reclaim_bgs list after we deleted
* all unused block_groups. This possibly gives us some more free
* space.
*/
btrfs_reclaim_bgs(fs_info);
sleep:
clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
if (kthread_should_park())
@ -2793,7 +2883,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->treelog_bg_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->unused_bg_unpin_mutex);
mutex_init(&fs_info->delete_unused_bgs_mutex);
mutex_init(&fs_info->reclaim_bgs_lock);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
mutex_init(&fs_info->zoned_meta_io_lock);
@ -2803,6 +2893,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
INIT_LIST_HEAD(&fs_info->unused_bgs);
INIT_LIST_HEAD(&fs_info->reclaim_bgs);
#ifdef CONFIG_BTRFS_DEBUG
INIT_LIST_HEAD(&fs_info->allocated_roots);
INIT_LIST_HEAD(&fs_info->allocated_ebs);
@ -2891,6 +2982,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->swapfile_pins = RB_ROOT;
fs_info->send_in_progress = 0;
fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
}
static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
@ -4249,6 +4343,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
cancel_work_sync(&fs_info->async_data_reclaim_work);
cancel_work_sync(&fs_info->preempt_reclaim_work);
cancel_work_sync(&fs_info->reclaim_bgs_work);
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);

View File

@ -2490,19 +2490,6 @@ int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
}
int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
{
struct btrfs_block_group *block_group;
int readonly = 0;
block_group = btrfs_lookup_block_group(fs_info, bytenr);
if (!block_group || block_group->ro)
readonly = 1;
if (block_group)
btrfs_put_block_group(block_group);
return readonly;
}
static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@ -3355,11 +3342,9 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
* find a node pointing to this leaf and record operations that
* point to this leaf.
*/
if (btrfs_header_level(buf) == 0) {
read_lock(&fs_info->tree_mod_log_lock);
must_pin = !list_empty(&fs_info->tree_mod_seq_list);
read_unlock(&fs_info->tree_mod_log_lock);
}
if (btrfs_header_level(buf) == 0 &&
test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
must_pin = true;
if (must_pin || btrfs_is_zoned(fs_info)) {
btrfs_redirty_list_add(trans->transaction, buf);

View File

@ -13,6 +13,7 @@
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/cleancache.h>
#include "misc.h"
#include "extent_io.h"
#include "extent-io-tree.h"
#include "extent_map.h"
@ -2983,8 +2984,7 @@ static void end_bio_extent_readpage(struct bio *bio)
if (likely(uptodate)) {
if (is_data_inode(inode))
ret = btrfs_verify_data_csum(io_bio,
bio_offset, page, start, end,
mirror);
bio_offset, page, start, end);
else
ret = btrfs_validate_metadata_buffer(io_bio,
page, start, end, mirror);
@ -3967,7 +3967,13 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
btrfs_tree_unlock(eb);
if (!ret)
/*
* Either we don't need to submit any tree block, or we're submitting
* subpage eb.
* Subpage metadata doesn't use page locking at all, so we can skip
* the page locking.
*/
if (!ret || fs_info->sectorsize < PAGE_SIZE)
return ret;
num_pages = num_extent_pages(eb);
@ -4012,12 +4018,11 @@ err_unlock:
return ret;
}
static void set_btree_ioerr(struct page *page)
static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)
{
struct extent_buffer *eb = (struct extent_buffer *)page->private;
struct btrfs_fs_info *fs_info;
struct btrfs_fs_info *fs_info = eb->fs_info;
SetPageError(page);
btrfs_page_set_error(fs_info, page, eb->start, eb->len);
if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
return;
@ -4025,7 +4030,6 @@ static void set_btree_ioerr(struct page *page)
* If we error out, we should add back the dirty_metadata_bytes
* to make it consistent.
*/
fs_info = eb->fs_info;
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
eb->len, fs_info->dirty_metadata_batch);
@ -4069,26 +4073,111 @@ static void set_btree_ioerr(struct page *page)
*/
switch (eb->log_index) {
case -1:
set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags);
set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags);
break;
case 0:
set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags);
set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
break;
case 1:
set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags);
set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
break;
default:
BUG(); /* unexpected, logic error */
}
}
/*
* The endio specific version which won't touch any unsafe spinlock in endio
* context.
*/
static struct extent_buffer *find_extent_buffer_nolock(
struct btrfs_fs_info *fs_info, u64 start)
{
struct extent_buffer *eb;
rcu_read_lock();
eb = radix_tree_lookup(&fs_info->buffer_radix,
start >> fs_info->sectorsize_bits);
if (eb && atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
return eb;
}
rcu_read_unlock();
return NULL;
}
/*
* The endio function for subpage extent buffer write.
*
* Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback()
* after all extent buffers in the page has finished their writeback.
*/
static void end_bio_subpage_eb_writepage(struct btrfs_fs_info *fs_info,
struct bio *bio)
{
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
struct page *page = bvec->bv_page;
u64 bvec_start = page_offset(page) + bvec->bv_offset;
u64 bvec_end = bvec_start + bvec->bv_len - 1;
u64 cur_bytenr = bvec_start;
ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize));
/* Iterate through all extent buffers in the range */
while (cur_bytenr <= bvec_end) {
struct extent_buffer *eb;
int done;
/*
* Here we can't use find_extent_buffer(), as it may
* try to lock eb->refs_lock, which is not safe in endio
* context.
*/
eb = find_extent_buffer_nolock(fs_info, cur_bytenr);
ASSERT(eb);
cur_bytenr = eb->start + eb->len;
ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags));
done = atomic_dec_and_test(&eb->io_pages);
ASSERT(done);
if (bio->bi_status ||
test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
ClearPageUptodate(page);
set_btree_ioerr(page, eb);
}
btrfs_subpage_clear_writeback(fs_info, page, eb->start,
eb->len);
end_extent_buffer_writeback(eb);
/*
* free_extent_buffer() will grab spinlock which is not
* safe in endio context. Thus here we manually dec
* the ref.
*/
atomic_dec(&eb->refs);
}
}
bio_put(bio);
}
static void end_bio_extent_buffer_writepage(struct bio *bio)
{
struct btrfs_fs_info *fs_info;
struct bio_vec *bvec;
struct extent_buffer *eb;
int done;
struct bvec_iter_all iter_all;
fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);
if (fs_info->sectorsize < PAGE_SIZE)
return end_bio_subpage_eb_writepage(fs_info, bio);
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
struct page *page = bvec->bv_page;
@ -4100,7 +4189,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
if (bio->bi_status ||
test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
ClearPageUptodate(page);
set_btree_ioerr(page);
set_btree_ioerr(page, eb);
}
end_page_writeback(page);
@ -4114,6 +4203,56 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
bio_put(bio);
}
/*
* Unlike the work in write_one_eb(), we rely completely on extent locking.
* Page locking is only utilized at minimum to keep the VMM code happy.
*
* Caller should still call write_one_eb() other than this function directly.
* As write_one_eb() has extra preparation before submitting the extent buffer.
*/
static int write_one_subpage_eb(struct extent_buffer *eb,
struct writeback_control *wbc,
struct extent_page_data *epd)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct page *page = eb->pages[0];
unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
bool no_dirty_ebs = false;
int ret;
/* clear_page_dirty_for_io() in subpage helper needs page locked */
lock_page(page);
btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len);
/* Check if this is the last dirty bit to update nr_written */
no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page,
eb->start, eb->len);
if (no_dirty_ebs)
clear_page_dirty_for_io(page);
ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, page,
eb->start, eb->len, eb->start - page_offset(page),
&epd->bio, end_bio_extent_buffer_writepage, 0, 0, 0,
false);
if (ret) {
btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
set_btree_ioerr(page, eb);
unlock_page(page);
if (atomic_dec_and_test(&eb->io_pages))
end_extent_buffer_writeback(eb);
return -EIO;
}
unlock_page(page);
/*
* Submission finished without problem, if no range of the page is
* dirty anymore, we have submitted a page. Update nr_written in wbc.
*/
if (no_dirty_ebs)
update_nr_written(wbc, 1);
return ret;
}
static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
struct writeback_control *wbc,
struct extent_page_data *epd)
@ -4145,6 +4284,9 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
memzero_extent_buffer(eb, start, end - start);
}
if (eb->fs_info->sectorsize < PAGE_SIZE)
return write_one_subpage_eb(eb, wbc, epd);
for (i = 0; i < num_pages; i++) {
struct page *p = eb->pages[i];
@ -4156,7 +4298,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
end_bio_extent_buffer_writepage,
0, 0, 0, false);
if (ret) {
set_btree_ioerr(p);
set_btree_ioerr(p, eb);
if (PageWriteback(p))
end_page_writeback(p);
if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
@ -4180,6 +4322,98 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
return ret;
}
/*
* Submit one subpage btree page.
*
* The main difference to submit_eb_page() is:
* - Page locking
* For subpage, we don't rely on page locking at all.
*
* - Flush write bio
* We only flush bio if we may be unable to fit current extent buffers into
* current bio.
*
* Return >=0 for the number of submitted extent buffers.
* Return <0 for fatal error.
*/
static int submit_eb_subpage(struct page *page,
struct writeback_control *wbc,
struct extent_page_data *epd)
{
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
int submitted = 0;
u64 page_start = page_offset(page);
int bit_start = 0;
const int nbits = BTRFS_SUBPAGE_BITMAP_SIZE;
int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
int ret;
/* Lock and write each dirty extent buffers in the range */
while (bit_start < nbits) {
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
struct extent_buffer *eb;
unsigned long flags;
u64 start;
/*
* Take private lock to ensure the subpage won't be detached
* in the meantime.
*/
spin_lock(&page->mapping->private_lock);
if (!PagePrivate(page)) {
spin_unlock(&page->mapping->private_lock);
break;
}
spin_lock_irqsave(&subpage->lock, flags);
if (!((1 << bit_start) & subpage->dirty_bitmap)) {
spin_unlock_irqrestore(&subpage->lock, flags);
spin_unlock(&page->mapping->private_lock);
bit_start++;
continue;
}
start = page_start + bit_start * fs_info->sectorsize;
bit_start += sectors_per_node;
/*
* Here we just want to grab the eb without touching extra
* spin locks, so call find_extent_buffer_nolock().
*/
eb = find_extent_buffer_nolock(fs_info, start);
spin_unlock_irqrestore(&subpage->lock, flags);
spin_unlock(&page->mapping->private_lock);
/*
* The eb has already reached 0 refs thus find_extent_buffer()
* doesn't return it. We don't need to write back such eb
* anyway.
*/
if (!eb)
continue;
ret = lock_extent_buffer_for_io(eb, epd);
if (ret == 0) {
free_extent_buffer(eb);
continue;
}
if (ret < 0) {
free_extent_buffer(eb);
goto cleanup;
}
ret = write_one_eb(eb, wbc, epd);
free_extent_buffer(eb);
if (ret < 0)
goto cleanup;
submitted++;
}
return submitted;
cleanup:
/* We hit error, end bio for the submitted extent buffers */
end_write_bio(epd, ret);
return ret;
}
/*
* Submit all page(s) of one extent buffer.
*
@ -4212,6 +4446,9 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
if (!PagePrivate(page))
return 0;
if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
return submit_eb_subpage(page, wbc, epd);
spin_lock(&mapping->private_lock);
if (!PagePrivate(page)) {
spin_unlock(&mapping->private_lock);
@ -4652,10 +4889,8 @@ void extent_readahead(struct readahead_control *rac)
int nr;
while ((nr = readahead_page_batch(rac, pagepool))) {
u64 contig_start = page_offset(pagepool[0]);
u64 contig_end = page_offset(pagepool[nr - 1]) + PAGE_SIZE - 1;
ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);
u64 contig_start = readahead_pos(rac);
u64 contig_end = contig_start + readahead_batch_length(rac) - 1;
contiguous_readpages(pagepool, nr, contig_start, contig_end,
&em_cached, &bio, &bio_flags, &prev_em_start);
@ -5469,25 +5704,21 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
{
struct extent_buffer *eb;
rcu_read_lock();
eb = radix_tree_lookup(&fs_info->buffer_radix,
start >> fs_info->sectorsize_bits);
if (eb && atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
eb = find_extent_buffer_nolock(fs_info, start);
if (!eb)
return NULL;
/*
* Lock our eb's refs_lock to avoid races with
* free_extent_buffer. When we get our eb it might be flagged
* with EXTENT_BUFFER_STALE and another task running
* free_extent_buffer might have seen that flag set,
* eb->refs == 2, that the buffer isn't under IO (dirty and
* Lock our eb's refs_lock to avoid races with free_extent_buffer().
* When we get our eb it might be flagged with EXTENT_BUFFER_STALE and
* another task running free_extent_buffer() might have seen that flag
* set, eb->refs == 2, that the buffer isn't under IO (dirty and
* writeback flags not set) and it's still in the tree (flag
* EXTENT_BUFFER_TREE_REF set), therefore being in the process
* of decrementing the extent buffer's reference count twice.
* So here we could race and increment the eb's reference count,
* clear its stale flag, mark it as dirty and drop our reference
* before the other task finishes executing free_extent_buffer,
* which would later result in an attempt to free an extent
* buffer that is dirty.
* EXTENT_BUFFER_TREE_REF set), therefore being in the process of
* decrementing the extent buffer's reference count twice. So here we
* could race and increment the eb's reference count, clear its stale
* flag, mark it as dirty and drop our reference before the other task
* finishes executing free_extent_buffer, which would later result in
* an attempt to free an extent buffer that is dirty.
*/
if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
spin_lock(&eb->refs_lock);
@ -5495,10 +5726,6 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
}
mark_extent_buffer_accessed(eb, NULL);
return eb;
}
rcu_read_unlock();
return NULL;
}
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@ -5594,6 +5821,17 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
return ERR_PTR(-EINVAL);
}
#if BITS_PER_LONG == 32
if (start >= MAX_LFS_FILESIZE) {
btrfs_err_rl(fs_info,
"extent buffer %llu is beyond 32bit page cache limit", start);
btrfs_err_32bit_limit(fs_info);
return ERR_PTR(-EOVERFLOW);
}
if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD)
btrfs_warn_32bit_limit(fs_info);
#endif
if (fs_info->sectorsize < PAGE_SIZE &&
offset_in_page(start) + len > PAGE_SIZE) {
btrfs_err(fs_info,
@ -5665,7 +5903,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
btrfs_page_inc_eb_refs(fs_info, p);
spin_unlock(&mapping->private_lock);
WARN_ON(PageDirty(p));
WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
eb->pages[i] = p;
if (!PageUptodate(p))
uptodate = 0;
@ -5814,28 +6052,51 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
release_extent_buffer(eb);
}
static void btree_clear_page_dirty(struct page *page)
{
ASSERT(PageDirty(page));
ASSERT(PageLocked(page));
clear_page_dirty_for_io(page);
xa_lock_irq(&page->mapping->i_pages);
if (!PageDirty(page))
__xa_clear_mark(&page->mapping->i_pages,
page_index(page), PAGECACHE_TAG_DIRTY);
xa_unlock_irq(&page->mapping->i_pages);
}
static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct page *page = eb->pages[0];
bool last;
/* btree_clear_page_dirty() needs page locked */
lock_page(page);
last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start,
eb->len);
if (last)
btree_clear_page_dirty(page);
unlock_page(page);
WARN_ON(atomic_read(&eb->refs) == 0);
}
void clear_extent_buffer_dirty(const struct extent_buffer *eb)
{
int i;
int num_pages;
struct page *page;
if (eb->fs_info->sectorsize < PAGE_SIZE)
return clear_subpage_extent_buffer_dirty(eb);
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
if (!PageDirty(page))
continue;
lock_page(page);
WARN_ON(!PagePrivate(page));
clear_page_dirty_for_io(page);
xa_lock_irq(&page->mapping->i_pages);
if (!PageDirty(page))
__xa_clear_mark(&page->mapping->i_pages,
page_index(page), PAGECACHE_TAG_DIRTY);
xa_unlock_irq(&page->mapping->i_pages);
btree_clear_page_dirty(page);
ClearPageError(page);
unlock_page(page);
}
@ -5856,10 +6117,28 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb)
WARN_ON(atomic_read(&eb->refs) == 0);
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
if (!was_dirty)
for (i = 0; i < num_pages; i++)
set_page_dirty(eb->pages[i]);
if (!was_dirty) {
bool subpage = eb->fs_info->sectorsize < PAGE_SIZE;
/*
* For subpage case, we can have other extent buffers in the
* same page, and in clear_subpage_extent_buffer_dirty() we
* have to clear page dirty without subpage lock held.
* This can cause race where our page gets dirty cleared after
* we just set it.
*
* Thankfully, clear_subpage_extent_buffer_dirty() has locked
* its page for other reasons, we can use page lock to prevent
* the above race.
*/
if (subpage)
lock_page(eb->pages[0]);
for (i = 0; i < num_pages; i++)
btrfs_page_set_dirty(eb->fs_info, eb->pages[i],
eb->start, eb->len);
if (subpage)
unlock_page(eb->pages[0]);
}
#ifdef CONFIG_BTRFS_DEBUG
for (i = 0; i < num_pages; i++)
ASSERT(PageDirty(eb->pages[i]));
@ -6217,12 +6496,34 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
return ret;
}
/*
* Check that the extent buffer is uptodate.
*
* For regular sector size == PAGE_SIZE case, check if @page is uptodate.
* For subpage case, check if the range covered by the eb has EXTENT_UPTODATE.
*/
static void assert_eb_page_uptodate(const struct extent_buffer *eb,
struct page *page)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
if (fs_info->sectorsize < PAGE_SIZE) {
bool uptodate;
uptodate = btrfs_subpage_test_uptodate(fs_info, page,
eb->start, eb->len);
WARN_ON(!uptodate);
} else {
WARN_ON(!PageUptodate(page));
}
}
void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
const void *srcv)
{
char *kaddr;
WARN_ON(!PageUptodate(eb->pages[0]));
assert_eb_page_uptodate(eb, eb->pages[0]);
kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0);
memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv,
BTRFS_FSID_SIZE);
@ -6232,7 +6533,7 @@ void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
{
char *kaddr;
WARN_ON(!PageUptodate(eb->pages[0]));
assert_eb_page_uptodate(eb, eb->pages[0]);
kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0);
memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv,
BTRFS_FSID_SIZE);
@ -6257,7 +6558,7 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
while (len > 0) {
page = eb->pages[i];
WARN_ON(!PageUptodate(page));
assert_eb_page_uptodate(eb, page);
cur = min(len, PAGE_SIZE - offset);
kaddr = page_address(page);
@ -6286,7 +6587,7 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
while (len > 0) {
page = eb->pages[i];
WARN_ON(!PageUptodate(page));
assert_eb_page_uptodate(eb, page);
cur = min(len, PAGE_SIZE - offset);
kaddr = page_address(page);
@ -6344,7 +6645,7 @@ void copy_extent_buffer(const struct extent_buffer *dst,
while (len > 0) {
page = dst->pages[i];
WARN_ON(!PageUptodate(page));
assert_eb_page_uptodate(dst, page);
cur = min(len, (unsigned long)(PAGE_SIZE - offset));
@ -6406,7 +6707,7 @@ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
eb_bitmap_offset(eb, start, nr, &i, &offset);
page = eb->pages[i];
WARN_ON(!PageUptodate(page));
assert_eb_page_uptodate(eb, page);
kaddr = page_address(page);
return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
}
@ -6431,7 +6732,7 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star
eb_bitmap_offset(eb, start, pos, &i, &offset);
page = eb->pages[i];
WARN_ON(!PageUptodate(page));
assert_eb_page_uptodate(eb, page);
kaddr = page_address(page);
while (len >= bits_to_set) {
@ -6442,7 +6743,7 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star
if (++offset >= PAGE_SIZE && len > 0) {
offset = 0;
page = eb->pages[++i];
WARN_ON(!PageUptodate(page));
assert_eb_page_uptodate(eb, page);
kaddr = page_address(page);
}
}
@ -6474,7 +6775,7 @@ void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
eb_bitmap_offset(eb, start, pos, &i, &offset);
page = eb->pages[i];
WARN_ON(!PageUptodate(page));
assert_eb_page_uptodate(eb, page);
kaddr = page_address(page);
while (len >= bits_to_clear) {
@ -6485,7 +6786,7 @@ void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
if (++offset >= PAGE_SIZE && len > 0) {
offset = 0;
page = eb->pages[++i];
WARN_ON(!PageUptodate(page));
assert_eb_page_uptodate(eb, page);
kaddr = page_address(page);
}
}

View File

@ -66,6 +66,7 @@ enum {
struct btrfs_root;
struct btrfs_inode;
struct btrfs_io_bio;
struct btrfs_fs_info;
struct io_failure_record;
struct extent_io_tree;
@ -270,9 +271,6 @@ struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
struct bio *btrfs_bio_clone(struct bio *bio);
struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size);
struct btrfs_fs_info;
struct btrfs_inode;
int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
u64 length, u64 logical, struct page *page,
unsigned int pg_offset, int mirror_num);

View File

@ -9,6 +9,7 @@
#include <linux/highmem.h>
#include <linux/sched/mm.h>
#include <crypto/hash.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"

View File

@ -2014,14 +2014,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
else
num_written = btrfs_buffered_write(iocb, from);
/*
* We also have to set last_sub_trans to the current log transid,
* otherwise subsequent syncs to a file that's been synced in this
* transaction will appear to have already occurred.
*/
spin_lock(&inode->lock);
inode->last_sub_trans = inode->root->log_transid;
spin_unlock(&inode->lock);
btrfs_set_inode_last_sub_trans(inode);
if (num_written > 0)
num_written = generic_write_sync(iocb, num_written);
@ -2122,7 +2116,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (ret)
goto out;
inode_lock(inode);
btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
atomic_inc(&root->log_batch);
@ -2135,11 +2129,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
&BTRFS_I(inode)->runtime_flags);
/*
* Before we acquired the inode's lock, someone may have dirtied more
* pages in the target range. We need to make sure that writeback for
* any such pages does not start while we are logging the inode, because
* if it does, any of the following might happen when we are not doing a
* full inode sync:
* Before we acquired the inode's lock and the mmap lock, someone may
* have dirtied more pages in the target range. We need to make sure
* that writeback for any such pages does not start while we are logging
* the inode, because if it does, any of the following might happen when
* we are not doing a full inode sync:
*
* 1) We log an extent after its writeback finishes but before its
* checksums are added to the csum tree, leading to -EIO errors
@ -2154,7 +2148,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
ret = start_ordered_ops(inode, start, end);
if (ret) {
inode_unlock(inode);
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
goto out;
}
@ -2255,7 +2249,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
inode_unlock(inode);
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
if (ret != BTRFS_NO_LOG_SYNC) {
if (!ret) {
@ -2285,7 +2279,7 @@ out:
out_release_extents:
btrfs_release_log_ctx_extents(&ctx);
inode_unlock(inode);
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
goto out;
}
@ -2605,16 +2599,17 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
* extents without inserting a new one, so we must abort the transaction to avoid
* a corruption.
*/
int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
const u64 start, const u64 end,
int btrfs_replace_file_extents(struct btrfs_inode *inode,
struct btrfs_path *path, const u64 start,
const u64 end,
struct btrfs_replace_extent_info *extent_info,
struct btrfs_trans_handle **trans_out)
{
struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
u64 ino_size = round_up(inode->i_size, fs_info->sectorsize);
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
struct btrfs_trans_handle *trans = NULL;
struct btrfs_block_rsv *rsv;
unsigned int rsv_count;
@ -2662,10 +2657,10 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
drop_args.drop_cache = true;
while (cur_offset < end) {
drop_args.start = cur_offset;
ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
/* If we are punching a hole decrement the inode's byte count */
if (!extent_info)
btrfs_update_inode_bytes(BTRFS_I(inode), 0,
btrfs_update_inode_bytes(inode, 0,
drop_args.bytes_found);
if (ret != -ENOSPC) {
/*
@ -2685,8 +2680,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
if (!extent_info && cur_offset < drop_args.drop_end &&
cur_offset < ino_size) {
ret = fill_holes(trans, BTRFS_I(inode), path,
cur_offset, drop_args.drop_end);
ret = fill_holes(trans, inode, path, cur_offset,
drop_args.drop_end);
if (ret) {
/*
* If we failed then we didn't insert our hole
@ -2704,7 +2699,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
* know to not set disk_i_size in this area until a new
* file extent is inserted here.
*/
ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
ret = btrfs_inode_clear_file_extent_range(inode,
cur_offset,
drop_args.drop_end - cur_offset);
if (ret) {
@ -2723,8 +2718,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
u64 replace_len = drop_args.drop_end -
extent_info->file_offset;
ret = btrfs_insert_replace_extent(trans, BTRFS_I(inode),
path, extent_info, replace_len,
ret = btrfs_insert_replace_extent(trans, inode, path,
extent_info, replace_len,
drop_args.bytes_found);
if (ret) {
btrfs_abort_transaction(trans, ret);
@ -2735,9 +2730,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
extent_info->file_offset += replace_len;
}
cur_offset = drop_args.drop_end;
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
ret = btrfs_update_inode(trans, root, inode);
if (ret)
break;
@ -2756,9 +2749,10 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
if (!extent_info) {
ret = find_first_non_hole(BTRFS_I(inode), &cur_offset,
&len);
cur_offset = drop_args.drop_end;
len = end - cur_offset;
if (!extent_info && len) {
ret = find_first_non_hole(inode, &cur_offset, &len);
if (unlikely(ret < 0))
break;
if (ret && !len) {
@ -2771,14 +2765,11 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
/*
* If we were cloning, force the next fsync to be a full one since we
* we replaced (or just dropped in the case of cloning holes when
* NO_HOLES is enabled) extents and extent maps.
* This is for the sake of simplicity, and cloning into files larger
* than 16Mb would force the full fsync any way (when
* try_release_extent_mapping() is invoked during page cache truncation.
* NO_HOLES is enabled) file extent items and did not setup new extent
* maps for the replacement extents (or holes).
*/
if (extent_info && !extent_info->is_new_extent)
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
if (ret)
goto out_trans;
@ -2804,8 +2795,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
*/
if (!extent_info && cur_offset < ino_size &&
cur_offset < drop_args.drop_end) {
ret = fill_holes(trans, BTRFS_I(inode), path,
cur_offset, drop_args.drop_end);
ret = fill_holes(trans, inode, path, cur_offset,
drop_args.drop_end);
if (ret) {
/* Same comment as above. */
btrfs_abort_transaction(trans, ret);
@ -2813,8 +2804,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
}
} else if (!extent_info && cur_offset < drop_args.drop_end) {
/* See the comment in the loop above for the reasoning here. */
ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
cur_offset, drop_args.drop_end - cur_offset);
ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
drop_args.drop_end - cur_offset);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_trans;
@ -2822,7 +2813,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
}
if (extent_info) {
ret = btrfs_insert_replace_extent(trans, BTRFS_I(inode), path,
ret = btrfs_insert_replace_extent(trans, inode, path,
extent_info, extent_info->data_len,
drop_args.bytes_found);
if (ret) {
@ -2868,7 +2859,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
if (ret)
return ret;
inode_lock(inode);
btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
ino_size = round_up(inode->i_size, fs_info->sectorsize);
ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
if (ret < 0)
@ -2908,7 +2899,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
truncated_block = true;
ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
if (ret) {
inode_unlock(inode);
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
return ret;
}
}
@ -2967,8 +2958,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out;
}
ret = btrfs_replace_file_extents(inode, path, lockstart, lockend, NULL,
&trans);
ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
lockend, NULL, &trans);
btrfs_free_path(path);
if (ret)
goto out;
@ -3009,7 +3000,7 @@ out_only_mutex:
ret = ret2;
}
}
inode_unlock(inode);
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
return ret;
}
@ -3335,7 +3326,7 @@ static long btrfs_fallocate(struct file *file, int mode,
return ret;
}
btrfs_inode_lock(inode, 0);
btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
ret = inode_newsize_ok(inode, offset + len);
@ -3377,7 +3368,7 @@ static long btrfs_fallocate(struct file *file, int mode,
if (mode & FALLOC_FL_ZERO_RANGE) {
ret = btrfs_zero_range(inode, offset, len, mode);
inode_unlock(inode);
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
return ret;
}
@ -3487,7 +3478,7 @@ out_unlock:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
&cached_state);
out:
inode_unlock(inode);
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
/* Let go of our reservation. */
if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
@ -3496,13 +3487,13 @@ out:
return ret;
}
static loff_t find_desired_extent(struct inode *inode, loff_t offset,
static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
int whence)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
loff_t i_size = inode->i_size;
loff_t i_size = inode->vfs_inode.i_size;
u64 lockstart;
u64 lockend;
u64 start;
@ -3525,11 +3516,10 @@ static loff_t find_desired_extent(struct inode *inode, loff_t offset,
lockend--;
len = lockend - lockstart + 1;
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
lock_extent_bits(&inode->io_tree, lockstart, lockend, &cached_state);
while (start < i_size) {
em = btrfs_get_extent_fiemap(BTRFS_I(inode), start, len);
em = btrfs_get_extent_fiemap(inode, start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
em = NULL;
@ -3551,7 +3541,7 @@ static loff_t find_desired_extent(struct inode *inode, loff_t offset,
cond_resched();
}
free_extent_map(em);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
unlock_extent_cached(&inode->io_tree, lockstart, lockend,
&cached_state);
if (ret) {
offset = ret;
@ -3575,7 +3565,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
case SEEK_DATA:
case SEEK_HOLE:
btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
offset = find_desired_extent(inode, offset, whence);
offset = find_desired_extent(BTRFS_I(inode), offset, whence);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
break;
}

View File

@ -11,6 +11,7 @@
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
#include <linux/sched/mm.h>
#include "misc.h"
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"
@ -2539,6 +2540,7 @@ out:
static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
u64 bytenr, u64 size, bool used)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
u64 offset = bytenr - block_group->start;
u64 to_free, to_unusable;
@ -2569,8 +2571,13 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
}
/* All the region is now unusable. Mark it as unused and reclaim */
if (block_group->zone_unusable == block_group->length)
if (block_group->zone_unusable == block_group->length) {
btrfs_mark_bg_unused(block_group);
} else if (block_group->zone_unusable >=
div_factor_fine(block_group->length,
fs_info->bg_reclaim_threshold)) {
btrfs_mark_bg_to_reclaim(block_group);
}
return 0;
}

View File

@ -102,6 +102,7 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode,
* BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
* BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt
* return -EAGAIN
* BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
*/
int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
{
@ -122,6 +123,8 @@ int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
}
inode_lock(inode);
}
if (ilock_flags & BTRFS_ILOCK_MMAP)
down_write(&BTRFS_I(inode)->i_mmap_lock);
return 0;
}
@ -133,6 +136,8 @@ int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
*/
void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags)
{
if (ilock_flags & BTRFS_ILOCK_MMAP)
up_write(&BTRFS_I(inode)->i_mmap_lock);
if (ilock_flags & BTRFS_ILOCK_SHARED)
inode_unlock_shared(inode);
else
@ -1516,7 +1521,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
struct page *locked_page,
const u64 start, const u64 end,
int *page_started, int force,
int *page_started,
unsigned long *nr_written)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@ -1530,6 +1535,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
u64 ino = btrfs_ino(inode);
bool nocow = false;
u64 disk_bytenr = 0;
const bool force = inode->flags & BTRFS_INODE_NODATACOW;
path = btrfs_alloc_path();
if (!path) {
@ -1863,23 +1869,16 @@ error:
return ret;
}
static inline int need_force_cow(struct btrfs_inode *inode, u64 start, u64 end)
static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
{
if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
!(inode->flags & BTRFS_INODE_PREALLOC))
return 0;
/*
* @defrag_bytes is a hint value, no spinlock held here,
* if is not zero, it means the file is defragging.
* Force cow if given extent needs to be defragged.
*/
if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
if (inode->defrag_bytes &&
test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG, 0, NULL))
return 1;
return 0;
test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG,
0, NULL))
return false;
return true;
}
return false;
}
/*
@ -1891,17 +1890,12 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
struct writeback_control *wbc)
{
int ret;
int force_cow = need_force_cow(inode, start, end);
const bool zoned = btrfs_is_zoned(inode->root->fs_info);
if (inode->flags & BTRFS_INODE_NODATACOW && !force_cow) {
if (should_nocow(inode, start, end)) {
ASSERT(!zoned);
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 1, nr_written);
} else if (inode->flags & BTRFS_INODE_PREALLOC && !force_cow) {
ASSERT(!zoned);
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 0, nr_written);
page_started, nr_written);
} else if (!inode_can_compress(inode) ||
!inode_need_compress(inode, start, end)) {
if (zoned)
@ -3151,10 +3145,9 @@ zeroit:
* @bio_offset: offset to the beginning of the bio (in bytes)
* @start: file offset of the range start
* @end: file offset of the range end (inclusive)
* @mirror: mirror number
*/
int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
struct page *page, u64 start, u64 end, int mirror)
struct page *page, u64 start, u64 end)
{
struct inode *inode = page->mapping->host;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@ -3393,15 +3386,19 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
int is_dead_root = 0;
/*
* this is an orphan in the tree root. Currently these
* This is an orphan in the tree root. Currently these
* could come from 2 sources:
* a) a snapshot deletion in progress
* a) a root (snapshot/subvolume) deletion in progress
* b) a free space cache inode
* We need to distinguish those two, as the snapshot
* orphan must not get deleted.
* find_dead_roots already ran before us, so if this
* is a snapshot deletion, we should find the root
* in the fs_roots radix tree.
* We need to distinguish those two, as the orphan item
* for a root must not get deleted before the deletion
* of the snapshot/subvolume's tree completes.
*
* btrfs_find_orphan_roots() ran before us, which has
* found all deleted roots and loaded them into
* fs_info->fs_roots_radix. So here we can find if an
* orphan item corresponds to a deleted root by looking
* up the root from that radix tree.
*/
spin_lock(&fs_info->fs_roots_radix_lock);
@ -4332,7 +4329,11 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
goto out_end_trans;
}
btrfs_record_root_in_trans(trans, dest);
ret = btrfs_record_root_in_trans(trans, dest);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
memset(&dest->root_item.drop_progress, 0,
sizeof(dest->root_item.drop_progress));
@ -7026,7 +7027,7 @@ next:
if (ret)
goto out;
} else {
map = kmap(page);
map = kmap_local_page(page);
read_extent_buffer(leaf, map + pg_offset, ptr,
copy_size);
if (pg_offset + copy_size < PAGE_SIZE) {
@ -7034,7 +7035,7 @@ next:
PAGE_SIZE - pg_offset -
copy_size);
}
kunmap(page);
kunmap_local(map);
}
flush_dcache_page(page);
}
@ -7262,6 +7263,19 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
return em;
}
static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
{
struct btrfs_block_group *block_group;
bool readonly = false;
block_group = btrfs_lookup_block_group(fs_info, bytenr);
if (!block_group || block_group->ro)
readonly = true;
if (block_group)
btrfs_put_block_group(block_group);
return readonly;
}
/*
* Check if we can do nocow write into the range [@offset, @offset + @len)
*
@ -8403,17 +8417,11 @@ again:
* for the finish_ordered_io
*/
if (TestClearPagePrivate2(page)) {
struct btrfs_ordered_inode_tree *tree;
u64 new_len;
tree = &inode->ordered_tree;
spin_lock_irq(&tree->lock);
spin_lock_irq(&inode->ordered_tree.lock);
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
new_len = start - ordered->file_offset;
if (new_len < ordered->truncated_len)
ordered->truncated_len = new_len;
spin_unlock_irq(&tree->lock);
ordered->truncated_len = min(ordered->truncated_len,
start - ordered->file_offset);
spin_unlock_irq(&inode->ordered_tree.lock);
if (btrfs_dec_test_ordered_pending(inode, &ordered,
start,
@ -8539,6 +8547,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
again:
down_read(&BTRFS_I(inode)->i_mmap_lock);
lock_page(page);
size = i_size_read(inode);
@ -8567,6 +8576,7 @@ again:
unlock_extent_cached(io_tree, page_start, page_end,
&cached_state);
unlock_page(page);
up_read(&BTRFS_I(inode)->i_mmap_lock);
btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
@ -8619,11 +8629,10 @@ again:
set_page_dirty(page);
SetPageUptodate(page);
BTRFS_I(inode)->last_trans = fs_info->generation;
BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
up_read(&BTRFS_I(inode)->i_mmap_lock);
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
sb_end_pagefault(inode->i_sb);
@ -8632,6 +8641,7 @@ again:
out_unlock:
unlock_page(page);
up_read(&BTRFS_I(inode)->i_mmap_lock);
out:
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
@ -8883,6 +8893,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->delayed_iput);
RB_CLEAR_NODE(&ei->rb_node);
init_rwsem(&ei->i_mmap_lock);
return inode;
}
@ -9101,8 +9112,11 @@ static int btrfs_rename_exchange(struct inode *old_dir,
goto out_notrans;
}
if (dest != root)
btrfs_record_root_in_trans(trans, dest);
if (dest != root) {
ret = btrfs_record_root_in_trans(trans, dest);
if (ret)
goto out_fail;
}
/*
* We need to find a free sequence number both in the source and
@ -9406,8 +9420,11 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out_notrans;
}
if (dest != root)
btrfs_record_root_in_trans(trans, dest);
if (dest != root) {
ret = btrfs_record_root_in_trans(trans, dest);
if (ret)
goto out_fail;
}
ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
if (ret)
@ -9919,7 +9936,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
goto free_qgroup;
}
ret = btrfs_replace_file_extents(&inode->vfs_inode, path, file_offset,
ret = btrfs_replace_file_extents(inode, path, file_offset,
file_offset + len - 1, &extent_info,
&trans);
btrfs_free_path(path);

View File

@ -226,7 +226,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
if (ret)
return ret;
inode_lock(inode);
btrfs_inode_lock(inode, 0);
fsflags = btrfs_mask_fsflags_for_type(inode, fsflags);
old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags);
@ -353,7 +353,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
out_end_trans:
btrfs_end_transaction(trans);
out_unlock:
inode_unlock(inode);
btrfs_inode_unlock(inode, 0);
mnt_drop_write_file(file);
return ret;
}
@ -449,7 +449,7 @@ static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg)
if (ret)
return ret;
inode_lock(inode);
btrfs_inode_lock(inode, 0);
old_flags = binode->flags;
old_i_flags = inode->i_flags;
@ -501,7 +501,7 @@ out_unlock:
inode->i_flags = old_i_flags;
}
inode_unlock(inode);
btrfs_inode_unlock(inode, 0);
mnt_drop_write_file(file);
return ret;
@ -697,8 +697,6 @@ static noinline int create_subvol(struct inode *dir,
btrfs_set_root_otransid(root_item, trans->transid);
btrfs_tree_unlock(leaf);
free_extent_buffer(leaf);
leaf = NULL;
btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID);
@ -707,8 +705,22 @@ static noinline int create_subvol(struct inode *dir,
key.type = BTRFS_ROOT_ITEM_KEY;
ret = btrfs_insert_root(trans, fs_info->tree_root, &key,
root_item);
if (ret)
if (ret) {
/*
* Since we don't abort the transaction in this case, free the
* tree block so that we don't leak space and leave the
* filesystem in an inconsistent state (an extent item in the
* extent tree without backreferences). Also no need to have
* the tree block locked since it is not in any tree at this
* point, so no other task can find it and use it.
*/
btrfs_free_tree_block(trans, root, leaf, 0, 1);
free_extent_buffer(leaf);
goto fail;
}
free_extent_buffer(leaf);
leaf = NULL;
key.offset = (u64)-1;
new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
@ -721,7 +733,12 @@ static noinline int create_subvol(struct inode *dir,
/* Freeing will be done in btrfs_put_root() of new_root */
anon_dev = 0;
btrfs_record_root_in_trans(trans, new_root);
ret = btrfs_record_root_in_trans(trans, new_root);
if (ret) {
btrfs_put_root(new_root);
btrfs_abort_transaction(trans, ret);
goto fail;
}
ret = btrfs_create_subvol_root(trans, new_root, root);
btrfs_put_root(new_root);
@ -1014,7 +1031,7 @@ out_up_read:
out_dput:
dput(dentry);
out_unlock:
inode_unlock(dir);
btrfs_inode_unlock(dir, 0);
return error;
}
@ -1612,7 +1629,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
ra_index += cluster;
}
inode_lock(inode);
btrfs_inode_lock(inode, 0);
if (IS_SWAPFILE(inode)) {
ret = -ETXTBSY;
} else {
@ -1621,13 +1638,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
ret = cluster_pages_for_defrag(inode, pages, i, cluster);
}
if (ret < 0) {
inode_unlock(inode);
btrfs_inode_unlock(inode, 0);
goto out_ra;
}
defrag_count += ret;
balance_dirty_pages_ratelimited(inode->i_mapping);
inode_unlock(inode);
btrfs_inode_unlock(inode, 0);
if (newer_than) {
if (newer_off == (u64)-1)
@ -1675,9 +1692,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
out_ra:
if (do_compress) {
inode_lock(inode);
btrfs_inode_lock(inode, 0);
BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
inode_unlock(inode);
btrfs_inode_unlock(inode, 0);
}
if (!file)
kfree(ra);
@ -3112,9 +3129,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto out_dput;
}
inode_lock(inode);
btrfs_inode_lock(inode, 0);
err = btrfs_delete_subvolume(dir, dentry);
inode_unlock(inode);
btrfs_inode_unlock(inode, 0);
if (!err) {
fsnotify_rmdir(dir, dentry);
d_delete(dentry);
@ -3123,7 +3140,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
out_dput:
dput(dentry);
out_unlock_dir:
inode_unlock(dir);
btrfs_inode_unlock(dir, 0);
free_subvol_name:
kfree(subvol_name_ptr);
free_parent:

View File

@ -118,7 +118,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0;
char *data_in;
char *cpage_out;
char *cpage_out, *sizes_ptr;
int nr_pages = 0;
struct page *in_page = NULL;
struct page *out_page = NULL;
@ -258,10 +258,9 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
}
/* store the size of all chunks of compressed data */
cpage_out = kmap(pages[0]);
write_compress_length(cpage_out, tot_out);
kunmap(pages[0]);
sizes_ptr = kmap_local_page(pages[0]);
write_compress_length(sizes_ptr, tot_out);
kunmap_local(sizes_ptr);
ret = 0;
*total_out = tot_out;

View File

@ -107,17 +107,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
return NULL;
}
/*
* helper to check if a given offset is inside a given entry
*/
static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
{
if (file_offset < entry->file_offset ||
entry->file_offset + entry->num_bytes <= file_offset)
return 0;
return 1;
}
static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
u64 len)
{
@ -142,7 +131,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
if (tree->last) {
entry = rb_entry(tree->last, struct btrfs_ordered_extent,
rb_node);
if (offset_in_entry(entry, file_offset))
if (in_range(file_offset, entry->file_offset, entry->num_bytes))
return tree->last;
}
ret = __tree_search(root, file_offset, &prev);
@ -349,7 +338,7 @@ bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
goto out;
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
if (!offset_in_entry(entry, *file_offset))
if (!in_range(*file_offset, entry->file_offset, entry->num_bytes))
goto out;
dec_start = max(*file_offset, entry->file_offset);
@ -428,7 +417,7 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
have_entry:
if (!offset_in_entry(entry, file_offset))
if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
goto out;
if (io_size > entry->bytes_left)
@ -779,7 +768,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino
goto out;
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
if (!offset_in_entry(entry, file_offset))
if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
entry = NULL;
if (entry)
refcount_inc(&entry->refs);

View File

@ -39,8 +39,8 @@ struct btrfs_ordered_sum {
*/
enum {
/*
* Different types for direct io, one and only one of the 4 type can
* be set when creating ordered extent.
* Different types for ordered extents, one and only one of the 4 types
* need to be set when creating ordered extent.
*
* REGULAR: For regular non-compressed COW write
* NOCOW: For NOCOW write into existing non-hole extent

View File

@ -23,6 +23,7 @@
#include "qgroup.h"
#include "block-group.h"
#include "sysfs.h"
#include "tree-mod-log.h"
/* TODO XXX FIXME
* - subvol delete -> delete when ref goes to 0? delete limits also?
@ -2639,12 +2640,12 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
record->data_rsv,
BTRFS_QGROUP_RSV_DATA);
/*
* Use SEQ_LAST as time_seq to do special search, which
* doesn't lock tree or delayed_refs and search current
* root. It's safe inside commit_transaction().
* Use BTRFS_SEQ_LAST as time_seq to do special search,
* which doesn't lock tree or delayed_refs and search
* current root. It's safe inside commit_transaction().
*/
ret = btrfs_find_all_roots(trans, fs_info,
record->bytenr, SEQ_LAST, &new_roots, false);
record->bytenr, BTRFS_SEQ_LAST, &new_roots, false);
if (ret < 0)
goto cleanup;
if (qgroup_to_skip) {
@ -3543,37 +3544,19 @@ static int try_flush_qgroup(struct btrfs_root *root)
{
struct btrfs_trans_handle *trans;
int ret;
bool can_commit = true;
/*
* If current process holds a transaction, we shouldn't flush, as we
* assume all space reservation happens before a transaction handle is
* held.
*
* But there are cases like btrfs_delayed_item_reserve_metadata() where
* we try to reserve space with one transction handle already held.
* In that case we can't commit transaction, but at least try to end it
* and hope the started data writes can free some space.
*/
if (current->journal_info &&
current->journal_info != BTRFS_SEND_TRANS_STUB)
can_commit = false;
/* Can't hold an open transaction or we run the risk of deadlocking */
ASSERT(current->journal_info == NULL ||
current->journal_info == BTRFS_SEND_TRANS_STUB);
if (WARN_ON(current->journal_info &&
current->journal_info != BTRFS_SEND_TRANS_STUB))
return 0;
/*
* We don't want to run flush again and again, so if there is a running
* one, we won't try to start a new flush, but exit directly.
*/
if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
/*
* We are already holding a transaction, thus we can block other
* threads from flushing. So exit right now. This increases
* the chance of EDQUOT for heavy load and near limit cases.
* But we can argue that if we're already near limit, EDQUOT is
* unavoidable anyway.
*/
if (!can_commit)
return 0;
wait_event(root->qgroup_flush_wait,
!test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
return 0;
@ -3590,10 +3573,7 @@ static int try_flush_qgroup(struct btrfs_root *root)
goto out;
}
if (can_commit)
ret = btrfs_commit_transaction(trans);
else
ret = btrfs_end_transaction(trans);
out:
clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
wake_up(&root->qgroup_flush_wait);
@ -3646,8 +3626,7 @@ cleanup:
qgroup_unreserve_range(inode, reserved, start, len);
out:
if (new_reserved) {
extent_changeset_release(reserved);
kfree(reserved);
extent_changeset_free(reserved);
*reserved_ret = NULL;
}
return ret;

View File

@ -13,6 +13,7 @@
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
@ -1231,13 +1232,13 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
/* first collect one page from each data stripe */
for (stripe = 0; stripe < nr_data; stripe++) {
p = page_in_rbio(rbio, stripe, pagenr, 0);
pointers[stripe] = kmap(p);
pointers[stripe] = kmap_local_page(p);
}
/* then add the parity stripe */
p = rbio_pstripe_page(rbio, pagenr);
SetPageUptodate(p);
pointers[stripe++] = kmap(p);
pointers[stripe++] = kmap_local_page(p);
if (has_qstripe) {
@ -1247,7 +1248,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
*/
p = rbio_qstripe_page(rbio, pagenr);
SetPageUptodate(p);
pointers[stripe++] = kmap(p);
pointers[stripe++] = kmap_local_page(p);
raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
pointers);
@ -1256,10 +1257,8 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
copy_page(pointers[nr_data], pointers[0]);
run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
}
for (stripe = 0; stripe < rbio->real_stripes; stripe++)
kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
for (stripe = stripe - 1; stripe >= 0; stripe--)
kunmap_local(pointers[stripe]);
}
/*
@ -1776,6 +1775,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
{
int pagenr, stripe;
void **pointers;
void **unmap_array;
int faila = -1, failb = -1;
struct page *page;
blk_status_t err;
@ -1787,6 +1787,16 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
goto cleanup_io;
}
/*
* Store copy of pointers that does not get reordered during
* reconstruction so that kunmap_local works.
*/
unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
if (!unmap_array) {
err = BLK_STS_RESOURCE;
goto cleanup_pointers;
}
faila = rbio->faila;
failb = rbio->failb;
@ -1808,8 +1818,11 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
!test_bit(pagenr, rbio->dbitmap))
continue;
/* setup our array of pointers with pages
* from each stripe
/*
* Setup our array of pointers with pages from each stripe
*
* NOTE: store a duplicate array of pointers to preserve the
* pointer order
*/
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
/*
@ -1823,7 +1836,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
} else {
page = rbio_stripe_page(rbio, stripe, pagenr);
}
pointers[stripe] = kmap(page);
pointers[stripe] = kmap_local_page(page);
unmap_array[stripe] = pointers[stripe];
}
/* all raid6 handling here */
@ -1916,24 +1930,14 @@ pstripe:
}
}
}
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
/*
* if we're rebuilding a read, we have to use
* pages from the bio list
*/
if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
(stripe == faila || stripe == failb)) {
page = page_in_rbio(rbio, stripe, pagenr, 0);
} else {
page = rbio_stripe_page(rbio, stripe, pagenr);
}
kunmap(page);
}
for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
kunmap_local(unmap_array[stripe]);
}
err = BLK_STS_OK;
cleanup:
kfree(unmap_array);
cleanup_pointers:
kfree(pointers);
cleanup_io:
@ -2358,13 +2362,13 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
goto cleanup;
}
SetPageUptodate(q_page);
pointers[rbio->real_stripes - 1] = kmap(q_page);
pointers[rbio->real_stripes - 1] = kmap_local_page(q_page);
}
atomic_set(&rbio->error, 0);
/* Map the parity stripe just once */
pointers[nr_data] = kmap(p_page);
pointers[nr_data] = kmap_local_page(p_page);
for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
struct page *p;
@ -2372,7 +2376,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
/* first collect one page from each data stripe */
for (stripe = 0; stripe < nr_data; stripe++) {
p = page_in_rbio(rbio, stripe, pagenr, 0);
pointers[stripe] = kmap(p);
pointers[stripe] = kmap_local_page(p);
}
if (has_qstripe) {
@ -2387,22 +2391,22 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
/* Check scrubbing parity and repair it */
p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
parity = kmap(p);
parity = kmap_local_page(p);
if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
copy_page(parity, pointers[rbio->scrubp]);
else
/* Parity is right, needn't writeback */
bitmap_clear(rbio->dbitmap, pagenr, 1);
kunmap(p);
kunmap_local(parity);
for (stripe = 0; stripe < nr_data; stripe++)
kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
for (stripe = nr_data - 1; stripe >= 0; stripe--)
kunmap_local(pointers[stripe]);
}
kunmap(p_page);
kunmap_local(pointers[nr_data]);
__free_page(p_page);
if (q_page) {
kunmap(q_page);
kunmap_local(pointers[rbio->real_stripes - 1]);
__free_page(q_page);
}

View File

@ -478,9 +478,9 @@ process_slot:
clone_info.file_offset = new_key.offset;
clone_info.extent_buf = buf;
clone_info.is_new_extent = false;
ret = btrfs_replace_file_extents(inode, path, drop_start,
new_key.offset + datal - 1, &clone_info,
&trans);
ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
drop_start, new_key.offset + datal - 1,
&clone_info, &trans);
if (ret)
goto out;
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
@ -567,8 +567,8 @@ process_slot:
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
ret = btrfs_replace_file_extents(inode, path, last_dest_end,
destoff + len - 1, NULL, &trans);
ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
last_dest_end, destoff + len - 1, NULL, &trans);
if (ret)
goto out;
@ -604,6 +604,20 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
}
static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2)
{
if (inode1 < inode2)
swap(inode1, inode2);
down_write(&BTRFS_I(inode1)->i_mmap_lock);
down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING);
}
static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2)
{
up_write(&BTRFS_I(inode1)->i_mmap_lock);
up_write(&BTRFS_I(inode2)->i_mmap_lock);
}
static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
struct inode *dst, u64 dst_loff)
{
@ -820,6 +834,16 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
len, remap_flags);
}
static bool file_sync_write(const struct file *file)
{
if (file->f_flags & (__O_SYNC | O_DSYNC))
return true;
if (IS_SYNC(file_inode(file)))
return true;
return false;
}
loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
struct file *dst_file, loff_t destoff, loff_t len,
unsigned int remap_flags)
@ -832,10 +856,12 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
if (same_inode)
inode_lock(src_inode);
else
if (same_inode) {
btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP);
} else {
lock_two_nondirectories(src_inode, dst_inode);
btrfs_double_mmap_lock(src_inode, dst_inode);
}
ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
&len, remap_flags);
@ -848,10 +874,27 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
out_unlock:
if (same_inode)
inode_unlock(src_inode);
else
if (same_inode) {
btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP);
} else {
btrfs_double_mmap_unlock(src_inode, dst_inode);
unlock_two_nondirectories(src_inode, dst_inode);
}
/*
* If either the source or the destination file was opened with O_SYNC,
* O_DSYNC or has the S_SYNC attribute, fsync both the destination and
* source files/ranges, so that after a successful return (0) followed
* by a power failure results in the reflinked data to be readable from
* both files/ranges.
*/
if (ret == 0 && len > 0 &&
(file_sync_write(src_file) || file_sync_write(dst_file))) {
ret = btrfs_sync_file(src_file, off, off + len - 1, 0);
if (ret == 0)
ret = btrfs_sync_file(dst_file, destoff,
destoff + len - 1, 0);
}
return ret < 0 ? ret : len;
}

View File

@ -638,9 +638,10 @@ static int __must_check __add_reloc_root(struct btrfs_root *root)
node->bytenr, &node->rb_node);
spin_unlock(&rc->reloc_root_tree.lock);
if (rb_node) {
btrfs_panic(fs_info, -EEXIST,
btrfs_err(fs_info,
"Duplicate root found for start=%llu while inserting into relocation tree",
node->bytenr);
return -EEXIST;
}
list_add_tail(&root->root_list, &rc->reloc_roots);
@ -733,10 +734,12 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
struct extent_buffer *eb;
struct btrfs_root_item *root_item;
struct btrfs_key root_key;
int ret;
int ret = 0;
bool must_abort = false;
root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
BUG_ON(!root_item);
if (!root_item)
return ERR_PTR(-ENOMEM);
root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
root_key.type = BTRFS_ROOT_ITEM_KEY;
@ -748,7 +751,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
/* called by btrfs_init_reloc_root */
ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
BTRFS_TREE_RELOC_OBJECTID);
BUG_ON(ret);
if (ret)
goto fail;
/*
* Set the last_snapshot field to the generation of the commit
* root - like this ctree.c:btrfs_block_can_be_shared() behaves
@ -769,9 +774,16 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
*/
ret = btrfs_copy_root(trans, root, root->node, &eb,
BTRFS_TREE_RELOC_OBJECTID);
BUG_ON(ret);
if (ret)
goto fail;
}
/*
* We have changed references at this point, we must abort the
* transaction if anything fails.
*/
must_abort = true;
memcpy(root_item, &root->root_item, sizeof(*root_item));
btrfs_set_root_bytenr(root_item, eb->start);
btrfs_set_root_level(root_item, btrfs_header_level(eb));
@ -789,14 +801,25 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
ret = btrfs_insert_root(trans, fs_info->tree_root,
&root_key, root_item);
BUG_ON(ret);
if (ret)
goto fail;
kfree(root_item);
reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
BUG_ON(IS_ERR(reloc_root));
if (IS_ERR(reloc_root)) {
ret = PTR_ERR(reloc_root);
goto abort;
}
set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
reloc_root->last_trans = trans->transid;
return reloc_root;
fail:
kfree(root_item);
abort:
if (must_abort)
btrfs_abort_transaction(trans, ret);
return ERR_PTR(ret);
}
/*
@ -856,9 +879,16 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
if (clear_rsv)
trans->block_rsv = rsv;
if (IS_ERR(reloc_root))
return PTR_ERR(reloc_root);
ret = __add_reloc_root(reloc_root);
BUG_ON(ret < 0);
ASSERT(ret != -EEXIST);
if (ret) {
/* Pairs with create_reloc_root */
btrfs_put_root(reloc_root);
return ret;
}
root->reloc_root = btrfs_grab_root(reloc_root);
return 0;
}
@ -875,7 +905,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
int ret;
if (!have_reloc_root(root))
goto out;
return 0;
reloc_root = root->reloc_root;
root_item = &reloc_root->root_item;
@ -908,10 +938,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
ret = btrfs_update_root(trans, fs_info->tree_root,
&reloc_root->root_key, root_item);
BUG_ON(ret);
btrfs_put_root(reloc_root);
out:
return 0;
return ret;
}
/*
@ -1185,8 +1213,8 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
int ret;
int slot;
BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
last_snapshot = btrfs_root_last_snapshot(&src->root_item);
again:
@ -1205,7 +1233,11 @@ again:
if (cow) {
ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
BTRFS_NESTING_COW);
BUG_ON(ret);
if (ret) {
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
return ret;
}
}
if (next_key) {
@ -1217,7 +1249,7 @@ again:
parent = eb;
while (1) {
level = btrfs_header_level(parent);
BUG_ON(level < lowest_level);
ASSERT(level >= lowest_level);
ret = btrfs_bin_search(parent, &key, &slot);
if (ret < 0)
@ -1265,7 +1297,11 @@ again:
ret = btrfs_cow_block(trans, dest, eb, parent,
slot, &eb,
BTRFS_NESTING_COW);
BUG_ON(ret);
if (ret) {
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
break;
}
}
btrfs_tree_unlock(parent);
@ -1289,7 +1325,11 @@ again:
path->lowest_level = level;
ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
path->lowest_level = 0;
BUG_ON(ret);
if (ret) {
if (ret > 0)
ret = -ENOENT;
break;
}
/*
* Info qgroup to trace both subtrees.
@ -1329,27 +1369,39 @@ again:
ref.skip_qgroup = true;
btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
ret = btrfs_inc_extent_ref(trans, &ref);
BUG_ON(ret);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
blocksize, 0);
ref.skip_qgroup = true;
btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
ret = btrfs_inc_extent_ref(trans, &ref);
BUG_ON(ret);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
blocksize, path->nodes[level]->start);
btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
ref.skip_qgroup = true;
ret = btrfs_free_extent(trans, &ref);
BUG_ON(ret);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
blocksize, 0);
btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
ref.skip_qgroup = true;
ret = btrfs_free_extent(trans, &ref);
BUG_ON(ret);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
btrfs_unlock_up_safe(path, 0);
@ -1537,12 +1589,13 @@ static int find_next_key(struct btrfs_path *path, int level,
/*
* Insert current subvolume into reloc_control::dirty_subvol_roots
*/
static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
static int insert_dirty_subvol(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
struct btrfs_root *root)
{
struct btrfs_root *reloc_root = root->reloc_root;
struct btrfs_root_item *reloc_root_item;
int ret;
/* @root must be a subvolume tree root with a valid reloc tree */
ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
@ -1553,12 +1606,16 @@ static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
sizeof(reloc_root_item->drop_progress));
btrfs_set_root_drop_level(reloc_root_item, 0);
btrfs_set_root_refs(reloc_root_item, 0);
btrfs_update_reloc_root(trans, root);
ret = btrfs_update_reloc_root(trans, root);
if (ret)
return ret;
if (list_empty(&root->reloc_dirty_list)) {
btrfs_grab_root(root);
list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
}
return 0;
}
static int clean_dirty_subvols(struct reloc_control *rc)
@ -1760,8 +1817,11 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
out:
btrfs_free_path(path);
if (ret == 0)
insert_dirty_subvol(trans, rc, root);
if (ret == 0) {
ret = insert_dirty_subvol(trans, rc, root);
if (ret)
btrfs_abort_transaction(trans, ret);
}
if (trans)
btrfs_end_transaction_throttle(trans);
@ -1825,8 +1885,18 @@ again:
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
false);
BUG_ON(IS_ERR(root));
BUG_ON(root->reloc_root != reloc_root);
if (IS_ERR(root)) {
/*
* Even if we have an error we need this reloc root
* back on our list so we can clean up properly.
*/
list_add(&reloc_root->root_list, &reloc_roots);
btrfs_abort_transaction(trans, (int)PTR_ERR(root));
if (!err)
err = PTR_ERR(root);
break;
}
ASSERT(root->reloc_root == reloc_root);
/*
* set reference count to 1, so btrfs_recover_relocation
@ -1834,16 +1904,27 @@ again:
*/
if (!err)
btrfs_set_root_refs(&reloc_root->root_item, 1);
btrfs_update_reloc_root(trans, root);
ret = btrfs_update_reloc_root(trans, root);
/*
* Even if we have an error we need this reloc root back on our
* list so we can clean up properly.
*/
list_add(&reloc_root->root_list, &reloc_roots);
btrfs_put_root(root);
if (ret) {
btrfs_abort_transaction(trans, ret);
if (!err)
err = ret;
break;
}
}
list_splice(&reloc_roots, &rc->reloc_roots);
if (!err)
btrfs_commit_transaction(trans);
err = btrfs_commit_transaction(trans);
else
btrfs_end_transaction(trans);
return err;
@ -1888,8 +1969,29 @@ again:
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
false);
if (btrfs_root_refs(&reloc_root->root_item) > 0) {
BUG_ON(IS_ERR(root));
BUG_ON(root->reloc_root != reloc_root);
if (IS_ERR(root)) {
/*
* For recovery we read the fs roots on mount,
* and if we didn't find the root then we marked
* the reloc root as a garbage root. For normal
* relocation obviously the root should exist in
* memory. However there's no reason we can't
* handle the error properly here just in case.
*/
ASSERT(0);
ret = PTR_ERR(root);
goto out;
}
if (root->reloc_root != reloc_root) {
/*
* This is actually impossible without something
* going really wrong (like weird race condition
* or cosmic rays).
*/
ASSERT(0);
ret = -EINVAL;
goto out;
}
ret = merge_reloc_root(rc, root);
btrfs_put_root(root);
if (ret) {
@ -1971,8 +2073,27 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
return 0;
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false);
BUG_ON(IS_ERR(root));
BUG_ON(root->reloc_root != reloc_root);
/*
* This should succeed, since we can't have a reloc root without having
* already looked up the actual root and created the reloc root for this
* root.
*
* However if there's some sort of corruption where we have a ref to a
* reloc root without a corresponding root this could return ENOENT.
*/
if (IS_ERR(root)) {
ASSERT(0);
return PTR_ERR(root);
}
if (root->reloc_root != reloc_root) {
ASSERT(0);
btrfs_err(fs_info,
"root %llu has two reloc roots associated with it",
reloc_root->root_key.offset);
btrfs_put_root(root);
return -EUCLEAN;
}
ret = btrfs_record_root_in_trans(trans, root);
btrfs_put_root(root);
@ -1988,26 +2109,77 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_backref_node *next;
struct btrfs_root *root;
int index = 0;
int ret;
next = node;
while (1) {
cond_resched();
next = walk_up_backref(next, edges, &index);
root = next->root;
BUG_ON(!root);
BUG_ON(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state));
/*
* If there is no root, then our references for this block are
* incomplete, as we should be able to walk all the way up to a
* block that is owned by a root.
*
* This path is only for SHAREABLE roots, so if we come upon a
* non-SHAREABLE root then we have backrefs that resolve
* improperly.
*
* Both of these cases indicate file system corruption, or a bug
* in the backref walking code.
*/
if (!root) {
ASSERT(0);
btrfs_err(trans->fs_info,
"bytenr %llu doesn't have a backref path ending in a root",
node->bytenr);
return ERR_PTR(-EUCLEAN);
}
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
ASSERT(0);
btrfs_err(trans->fs_info,
"bytenr %llu has multiple refs with one ending in a non-shareable root",
node->bytenr);
return ERR_PTR(-EUCLEAN);
}
if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
record_reloc_root_in_trans(trans, root);
ret = record_reloc_root_in_trans(trans, root);
if (ret)
return ERR_PTR(ret);
break;
}
btrfs_record_root_in_trans(trans, root);
ret = btrfs_record_root_in_trans(trans, root);
if (ret)
return ERR_PTR(ret);
root = root->reloc_root;
/*
* We could have raced with another thread which failed, so
* root->reloc_root may not be set, return ENOENT in this case.
*/
if (!root)
return ERR_PTR(-ENOENT);
if (next->new_bytenr != root->node->start) {
BUG_ON(next->new_bytenr);
BUG_ON(!list_empty(&next->list));
/*
* We just created the reloc root, so we shouldn't have
* ->new_bytenr set and this shouldn't be in the changed
* list. If it is then we have multiple roots pointing
* at the same bytenr which indicates corruption, or
* we've made a mistake in the backref walking code.
*/
ASSERT(next->new_bytenr == 0);
ASSERT(list_empty(&next->list));
if (next->new_bytenr || !list_empty(&next->list)) {
btrfs_err(trans->fs_info,
"bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
node->bytenr, next->bytenr);
return ERR_PTR(-EUCLEAN);
}
next->new_bytenr = root->node->start;
btrfs_put_root(next->root);
next->root = btrfs_grab_root(root);
@ -2024,8 +2196,14 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
if (!next || next->level <= node->level)
break;
}
if (!root)
return NULL;
if (!root) {
/*
* This can happen if there's fs corruption or if there's a bug
* in the backref lookup code.
*/
ASSERT(0);
return ERR_PTR(-ENOENT);
}
next = node;
/* setup backref node path for btrfs_reloc_cow_block */
@ -2061,7 +2239,13 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
cond_resched();
next = walk_up_backref(next, edges, &index);
root = next->root;
BUG_ON(!root);
/*
* This can occur if we have incomplete extent refs leading all
* the way up a particular path, in this case return -EUCLEAN.
*/
if (!root)
return ERR_PTR(-EUCLEAN);
/* No other choice for non-shareable tree */
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
@ -2181,7 +2365,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
int slot;
int ret = 0;
BUG_ON(lowest && node->eb);
/*
* If we are lowest then this is the first time we're processing this
* block, and thus shouldn't have an eb associated with it yet.
*/
ASSERT(!lowest || !node->eb);
path->lowest_level = node->level + 1;
rc->backref_cache.path[node->level] = node;
@ -2192,7 +2380,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
upper = edge->node[UPPER];
root = select_reloc_root(trans, rc, upper, edges);
BUG_ON(!root);
if (IS_ERR(root)) {
ret = PTR_ERR(root);
goto next;
}
if (upper->eb && !upper->locked) {
if (!lowest) {
@ -2266,7 +2457,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
free_extent_buffer(eb);
if (ret < 0)
goto next;
BUG_ON(node->eb != eb);
/*
* We've just COWed this block, it should have updated
* the correct backref node entry.
*/
ASSERT(node->eb == eb);
} else {
btrfs_set_node_blockptr(upper->eb, slot,
node->eb->start);
@ -2281,10 +2476,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
btrfs_init_tree_ref(&ref, node->level,
btrfs_header_owner(upper->eb));
ret = btrfs_inc_extent_ref(trans, &ref);
BUG_ON(ret);
ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
BUG_ON(ret);
if (!ret)
ret = btrfs_drop_subtree(trans, root, eb,
upper->eb);
if (ret)
btrfs_abort_transaction(trans, ret);
}
next:
if (!upper->pending)
@ -2302,7 +2498,12 @@ next:
}
path->lowest_level = 0;
BUG_ON(ret == -ENOSPC);
/*
* We should have allocated all of our space in the block rsv and thus
* shouldn't ENOSPC.
*/
ASSERT(ret != -ENOSPC);
return ret;
}
@ -2434,16 +2635,53 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
BUG_ON(node->processed);
root = select_one_root(node);
if (root == ERR_PTR(-ENOENT)) {
if (IS_ERR(root)) {
ret = PTR_ERR(root);
/* See explanation in select_one_root for the -EUCLEAN case. */
ASSERT(ret == -ENOENT);
if (ret == -ENOENT) {
ret = 0;
update_processed_blocks(rc, node);
}
goto out;
}
if (root) {
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
BUG_ON(node->new_bytenr);
BUG_ON(!list_empty(&node->list));
btrfs_record_root_in_trans(trans, root);
/*
* This block was the root block of a root, and this is
* the first time we're processing the block and thus it
* should not have had the ->new_bytenr modified and
* should have not been included on the changed list.
*
* However in the case of corruption we could have
* multiple refs pointing to the same block improperly,
* and thus we would trip over these checks. ASSERT()
* for the developer case, because it could indicate a
* bug in the backref code, however error out for a
* normal user in the case of corruption.
*/
ASSERT(node->new_bytenr == 0);
ASSERT(list_empty(&node->list));
if (node->new_bytenr || !list_empty(&node->list)) {
btrfs_err(root->fs_info,
"bytenr %llu has improper references to it",
node->bytenr);
ret = -EUCLEAN;
goto out;
}
ret = btrfs_record_root_in_trans(trans, root);
if (ret)
goto out;
/*
* Another thread could have failed, need to check if we
* have reloc_root actually set.
*/
if (!root->reloc_root) {
ret = -ENOENT;
goto out;
}
root = root->reloc_root;
node->new_bytenr = root->node->start;
btrfs_put_root(node->root);
@ -2578,7 +2816,7 @@ static noinline_for_stack int prealloc_file_extent_cluster(
return btrfs_end_transaction(trans);
}
inode_lock(&inode->vfs_inode);
btrfs_inode_lock(&inode->vfs_inode, 0);
for (nr = 0; nr < cluster->nr; nr++) {
start = cluster->boundary[nr] - offset;
if (nr + 1 < cluster->nr)
@ -2596,7 +2834,7 @@ static noinline_for_stack int prealloc_file_extent_cluster(
if (ret)
break;
}
inode_unlock(&inode->vfs_inode);
btrfs_inode_unlock(&inode->vfs_inode, 0);
if (cur_offset < prealloc_end)
btrfs_free_reserved_data_space_noquota(inode->root->fs_info,
@ -3220,20 +3458,6 @@ static void unset_reloc_control(struct reloc_control *rc)
mutex_unlock(&fs_info->reloc_mutex);
}
static int check_extent_flags(u64 flags)
{
if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
return 1;
if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
return 1;
if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
return 1;
return 0;
}
static noinline_for_stack
int prepare_to_relocate(struct reloc_control *rc)
{
@ -3272,8 +3496,7 @@ int prepare_to_relocate(struct reloc_control *rc)
*/
return PTR_ERR(trans);
}
btrfs_commit_transaction(trans);
return 0;
return btrfs_commit_transaction(trans);
}
static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
@ -3285,7 +3508,6 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
struct btrfs_path *path;
struct btrfs_extent_item *ei;
u64 flags;
u32 item_size;
int ret;
int err = 0;
int progress = 0;
@ -3334,19 +3556,7 @@ restart:
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_extent_item);
item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
if (item_size >= sizeof(*ei)) {
flags = btrfs_extent_flags(path->nodes[0], ei);
ret = check_extent_flags(flags);
BUG_ON(ret);
} else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
err = -EINVAL;
btrfs_print_v0_err(trans->fs_info);
btrfs_abort_transaction(trans, err);
break;
} else {
BUG();
}
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
ret = add_tree_block(rc, &key, path, &blocks);
@ -3445,7 +3655,9 @@ restart:
err = PTR_ERR(trans);
goto out_free;
}
btrfs_commit_transaction(trans);
ret = btrfs_commit_transaction(trans);
if (ret && !err)
err = ret;
out_free:
ret = clean_dirty_subvols(rc);
if (ret < 0 && !err)
@ -3488,6 +3700,35 @@ out:
return ret;
}
static void delete_orphan_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 objectid)
{
struct btrfs_path *path;
struct btrfs_key key;
int ret = 0;
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto out;
}
key.objectid = objectid;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret) {
if (ret > 0)
ret = -ENOENT;
goto out;
}
ret = btrfs_del_item(trans, root, path);
out:
if (ret)
btrfs_abort_transaction(trans, ret);
btrfs_free_path(path);
}
/*
* helper to create inode for data relocation.
* the inode is in data relocation tree and its link count is 0
@ -3514,10 +3755,16 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
goto out;
err = __insert_orphan_inode(trans, root, objectid);
BUG_ON(err);
if (err)
goto out;
inode = btrfs_iget(fs_info->sb, objectid, root);
BUG_ON(IS_ERR(inode));
if (IS_ERR(inode)) {
delete_orphan_inode(trans, root, objectid);
err = PTR_ERR(inode);
inode = NULL;
goto out;
}
BTRFS_I(inode)->index_cnt = group->start;
err = btrfs_orphan_add(trans, BTRFS_I(inode));
@ -3859,7 +4106,13 @@ int btrfs_recover_relocation(struct btrfs_root *root)
}
err = __add_reloc_root(reloc_root);
BUG_ON(err < 0); /* -ENOMEM or logic error */
ASSERT(err != -EEXIST);
if (err) {
list_add_tail(&reloc_root->root_list, &reloc_roots);
btrfs_put_root(fs_root);
btrfs_end_transaction(trans);
goto out_unset;
}
fs_root->reloc_root = btrfs_grab_root(reloc_root);
btrfs_put_root(fs_root);
}
@ -4074,7 +4327,12 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
return PTR_ERR(reloc_root);
ret = __add_reloc_root(reloc_root);
BUG_ON(ret < 0);
ASSERT(ret != -EEXIST);
if (ret) {
/* Pairs with create_reloc_root */
btrfs_put_root(reloc_root);
return ret;
}
new_root->reloc_root = btrfs_grab_root(reloc_root);
if (rc->create_reloc_tree)

View File

@ -206,9 +206,6 @@ struct full_stripe_lock {
struct mutex mutex;
};
static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct scrub_block *sblocks_for_recheck);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
@ -226,14 +223,11 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
static int scrub_checksum_data(struct scrub_block *sblock);
static int scrub_checksum_tree_block(struct scrub_block *sblock);
static int scrub_checksum_super(struct scrub_block *sblock);
static void scrub_block_get(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
static void scrub_page_get(struct scrub_page *spage);
static void scrub_page_put(struct scrub_page *spage);
static void scrub_parity_get(struct scrub_parity *sparity);
static void scrub_parity_put(struct scrub_parity *sparity);
static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
struct scrub_page *spage);
static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
u64 physical, struct btrfs_device *dev, u64 flags,
u64 gen, int mirror_num, u8 *csum,
@ -251,8 +245,6 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
static void scrub_wr_submit(struct scrub_ctx *sctx);
static void scrub_wr_bio_end_io(struct bio *bio);
static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_put_ctx(struct scrub_ctx *sctx);
static inline int scrub_is_page_on_raid56(struct scrub_page *spage)
@ -3682,8 +3674,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
spin_lock(&cache->lock);
if (!cache->to_copy) {
spin_unlock(&cache->lock);
ro_set = 0;
goto done;
btrfs_put_block_group(cache);
goto skip;
}
spin_unlock(&cache->lock);
}
@ -3841,7 +3833,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
cache, found_key.offset))
ro_set = 0;
done:
down_write(&dev_replace->rwsem);
dev_replace->cursor_left = dev_replace->cursor_right;
dev_replace->item_needs_writeback = 1;

View File

@ -6650,6 +6650,7 @@ static int full_send_tree(struct send_ctx *sctx)
path = alloc_path_for_send();
if (!path)
return -ENOMEM;
path->reada = READA_FORWARD_ALWAYS;
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
key.type = BTRFS_INODE_ITEM_KEY;
@ -6688,15 +6689,35 @@ out:
return ret;
}
static int tree_move_down(struct btrfs_path *path, int *level)
static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen)
{
struct extent_buffer *eb;
struct extent_buffer *parent = path->nodes[*level];
int slot = path->slots[*level];
const int nritems = btrfs_header_nritems(parent);
u64 reada_max;
u64 reada_done = 0;
BUG_ON(*level == 0);
eb = btrfs_read_node_slot(path->nodes[*level], path->slots[*level]);
eb = btrfs_read_node_slot(parent, slot);
if (IS_ERR(eb))
return PTR_ERR(eb);
/*
* Trigger readahead for the next leaves we will process, so that it is
* very likely that when we need them they are already in memory and we
* will not block on disk IO. For nodes we only do readahead for one,
* since the time window between processing nodes is typically larger.
*/
reada_max = (*level == 1 ? SZ_128K : eb->fs_info->nodesize);
for (slot++; slot < nritems && reada_done < reada_max; slot++) {
if (btrfs_node_ptr_generation(parent, slot) > reada_min_gen) {
btrfs_readahead_node_child(parent, slot);
reada_done += eb->fs_info->nodesize;
}
}
path->nodes[*level - 1] = eb;
path->slots[*level - 1] = 0;
(*level)--;
@ -6736,14 +6757,15 @@ static int tree_move_next_or_upnext(struct btrfs_path *path,
static int tree_advance(struct btrfs_path *path,
int *level, int root_level,
int allow_down,
struct btrfs_key *key)
struct btrfs_key *key,
u64 reada_min_gen)
{
int ret;
if (*level == 0 || !allow_down) {
ret = tree_move_next_or_upnext(path, level, root_level);
} else {
ret = tree_move_down(path, level);
ret = tree_move_down(path, level, reada_min_gen);
}
if (ret >= 0) {
if (*level == 0)
@ -6817,6 +6839,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
u64 right_blockptr;
u64 left_gen;
u64 right_gen;
u64 reada_min_gen;
left_path = btrfs_alloc_path();
if (!left_path) {
@ -6896,6 +6919,14 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
ret = -ENOMEM;
goto out;
}
/*
* Our right root is the parent root, while the left root is the "send"
* root. We know that all new nodes/leaves in the left root must have
* a generation greater than the right root's generation, so we trigger
* readahead for those nodes and leaves of the left root, as we know we
* will need to read them at some point.
*/
reada_min_gen = btrfs_header_generation(right_root->commit_root);
up_read(&fs_info->commit_root_sem);
if (left_level == 0)
@ -6920,7 +6951,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
ret = tree_advance(left_path, &left_level,
left_root_level,
advance_left != ADVANCE_ONLY_NEXT,
&left_key);
&left_key, reada_min_gen);
if (ret == -1)
left_end_reached = ADVANCE;
else if (ret < 0)
@ -6931,7 +6962,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
ret = tree_advance(right_path, &right_level,
right_root_level,
advance_right != ADVANCE_ONLY_NEXT,
&right_key);
&right_key, reada_min_gen);
if (ret == -1)
right_end_reached = ADVANCE;
else if (ret < 0)

View File

@ -861,8 +861,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
* of heavy DIO or ordered reservations, preemptive flushing will just
* waste time and cause us to slow down.
*/
ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes);
delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
ordered = percpu_counter_read_positive(&fs_info->ordered_bytes);
delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
if (ordered >= delalloc)
used += fs_info->delayed_refs_rsv.reserved +
fs_info->delayed_block_rsv.reserved;

View File

@ -4,6 +4,64 @@
#include "ctree.h"
#include "subpage.h"
/*
* Subpage (sectorsize < PAGE_SIZE) support overview:
*
* Limitations:
*
* - Only support 64K page size for now
* This is to make metadata handling easier, as 64K page would ensure
* all nodesize would fit inside one page, thus we don't need to handle
* cases where a tree block crosses several pages.
*
* - Only metadata read-write for now
* The data read-write part is in development.
*
* - Metadata can't cross 64K page boundary
* btrfs-progs and kernel have done that for a while, thus only ancient
* filesystems could have such problem. For such case, do a graceful
* rejection.
*
* Special behavior:
*
* - Metadata
* Metadata read is fully supported.
* Meaning when reading one tree block will only trigger the read for the
* needed range, other unrelated range in the same page will not be touched.
*
* Metadata write support is partial.
* The writeback is still for the full page, but we will only submit
* the dirty extent buffers in the page.
*
* This means, if we have a metadata page like this:
*
* Page offset
* 0 16K 32K 48K 64K
* |/////////| |///////////|
* \- Tree block A \- Tree block B
*
* Even if we just want to writeback tree block A, we will also writeback
* tree block B if it's also dirty.
*
* This may cause extra metadata writeback which results more COW.
*
* Implementation:
*
* - Common
* Both metadata and data will use a new structure, btrfs_subpage, to
* record the status of each sector inside a page. This provides the extra
* granularity needed.
*
* - Metadata
* Since we have multiple tree blocks inside one page, we can't rely on page
* locking anymore, or we will have greatly reduced concurrency or even
* deadlocks (hold one tree lock while trying to lock another tree lock in
* the same page).
*
* Thus for metadata locking, subpage support relies on io_tree locking only.
* This means a slightly higher tree locking latency.
*/
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
struct page *page, enum btrfs_subpage_type type)
{
@ -220,6 +278,82 @@ void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info,
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
subpage->dirty_bitmap |= tmp;
spin_unlock_irqrestore(&subpage->lock, flags);
set_page_dirty(page);
}
/*
* Extra clear_and_test function for subpage dirty bitmap.
*
* Return true if we're the last bits in the dirty_bitmap and clear the
* dirty_bitmap.
* Return false otherwise.
*
* NOTE: Callers should manually clear page dirty for true case, as we have
* extra handling for tree blocks.
*/
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
unsigned long flags;
bool last = false;
spin_lock_irqsave(&subpage->lock, flags);
subpage->dirty_bitmap &= ~tmp;
if (subpage->dirty_bitmap == 0)
last = true;
spin_unlock_irqrestore(&subpage->lock, flags);
return last;
}
void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
bool last;
last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len);
if (last)
clear_page_dirty_for_io(page);
}
void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
subpage->writeback_bitmap |= tmp;
set_page_writeback(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
subpage->writeback_bitmap &= ~tmp;
if (subpage->writeback_bitmap == 0)
end_page_writeback(page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
/*
* Unlike set/clear which is dependent on each page status, for test all bits
* are tested in the same way.
@ -240,6 +374,8 @@ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
}
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
/*
* Note that, in selftests (extent-io-tests), we can have empty fs_info passed
@ -276,3 +412,7 @@ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
PageUptodate);
IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError);
IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
PageDirty);
IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
PageWriteback);

View File

@ -20,6 +20,8 @@ struct btrfs_subpage {
spinlock_t lock;
u16 uptodate_bitmap;
u16 error_bitmap;
u16 dirty_bitmap;
u16 writeback_bitmap;
union {
/*
* Structures only used by metadata
@ -87,5 +89,10 @@ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
DECLARE_BTRFS_SUBPAGE_OPS(uptodate);
DECLARE_BTRFS_SUBPAGE_OPS(error);
DECLARE_BTRFS_SUBPAGE_OPS(dirty);
DECLARE_BTRFS_SUBPAGE_OPS(writeback);
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len);
#endif

View File

@ -252,6 +252,32 @@ void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, .
}
#endif
#if BITS_PER_LONG == 32
void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info)
{
if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) {
btrfs_warn(fs_info, "reaching 32bit limit for logical addresses");
btrfs_warn(fs_info,
"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT",
BTRFS_32BIT_MAX_FILE_SIZE >> 40);
btrfs_warn(fs_info,
"please consider upgrading to 64bit kernel/hardware");
}
}
void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
{
if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) {
btrfs_err(fs_info, "reached 32bit limit for logical addresses");
btrfs_err(fs_info,
"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed",
BTRFS_32BIT_MAX_FILE_SIZE >> 40);
btrfs_err(fs_info,
"please consider upgrading to 64bit kernel/hardware");
}
}
#endif
/*
* We only mark the transaction aborted and then set the file system read-only.
* This will prevent new transactions from starting or trying to join this

View File

@ -360,11 +360,26 @@ static ssize_t supported_rescue_options_show(struct kobject *kobj,
BTRFS_ATTR(static_feature, supported_rescue_options,
supported_rescue_options_show);
static ssize_t supported_sectorsizes_show(struct kobject *kobj,
struct kobj_attribute *a,
char *buf)
{
ssize_t ret = 0;
/* Only sectorsize == PAGE_SIZE is now supported */
ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%lu\n", PAGE_SIZE);
return ret;
}
BTRFS_ATTR(static_feature, supported_sectorsizes,
supported_sectorsizes_show);
static struct attribute *btrfs_supported_static_feature_attrs[] = {
BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
BTRFS_ATTR_PTR(static_feature, supported_checksums),
BTRFS_ATTR_PTR(static_feature, send_stream_version),
BTRFS_ATTR_PTR(static_feature, supported_rescue_options),
BTRFS_ATTR_PTR(static_feature, supported_sectorsizes),
NULL
};
@ -965,6 +980,40 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,
}
BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store);
static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj,
struct kobj_attribute *a,
char *buf)
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
ssize_t ret;
ret = scnprintf(buf, PAGE_SIZE, "%d\n", fs_info->bg_reclaim_threshold);
return ret;
}
static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
struct kobj_attribute *a,
const char *buf, size_t len)
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
int thresh;
int ret;
ret = kstrtoint(buf, 10, &thresh);
if (ret)
return ret;
if (thresh <= 50 || thresh > 100)
return -EINVAL;
fs_info->bg_reclaim_threshold = thresh;
return len;
}
BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show,
btrfs_bg_reclaim_threshold_store);
static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, label),
BTRFS_ATTR_PTR(, nodesize),
@ -976,6 +1025,7 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, exclusive_operation),
BTRFS_ATTR_PTR(, generation),
BTRFS_ATTR_PTR(, read_policy),
BTRFS_ATTR_PTR(, bg_reclaim_threshold),
NULL,
};

View File

@ -260,6 +260,7 @@ static inline int extwriter_counter_read(struct btrfs_transaction *trans)
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_transaction *cur_trans = trans->transaction;
if (!trans->chunk_bytes_reserved)
return;
@ -268,6 +269,8 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
trans->chunk_bytes_reserved, NULL);
atomic64_sub(trans->chunk_bytes_reserved, &cur_trans->chunk_bytes_reserved);
cond_wake_up(&cur_trans->chunk_reserve_wait);
trans->chunk_bytes_reserved = 0;
}
@ -383,6 +386,8 @@ loop:
spin_lock_init(&cur_trans->dropped_roots_lock);
INIT_LIST_HEAD(&cur_trans->releasing_ebs);
spin_lock_init(&cur_trans->releasing_ebs_lock);
atomic64_set(&cur_trans->chunk_bytes_reserved, 0);
init_waitqueue_head(&cur_trans->chunk_reserve_wait);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
@ -408,6 +413,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
int force)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
root->last_trans < trans->transid) || force) {
@ -456,11 +462,11 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
* lock. smp_wmb() makes sure that all the writes above are
* done before we pop in the zero below
*/
btrfs_init_reloc_root(trans, root);
ret = btrfs_init_reloc_root(trans, root);
smp_mb__before_atomic();
clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
}
return 0;
return ret;
}
@ -487,6 +493,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
return 0;
@ -501,10 +508,10 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
return 0;
mutex_lock(&fs_info->reloc_mutex);
record_root_in_trans(trans, root, 0);
ret = record_root_in_trans(trans, root, 0);
mutex_unlock(&fs_info->reloc_mutex);
return 0;
return ret;
}
static inline int is_transaction_blocked(struct btrfs_transaction *trans)
@ -741,7 +748,16 @@ got_it:
* Thus it need to be called after current->journal_info initialized,
* or we can deadlock.
*/
btrfs_record_root_in_trans(h, root);
ret = btrfs_record_root_in_trans(h, root);
if (ret) {
/*
* The transaction handle is fully initialized and linked with
* other structures so it needs to be ended in case of errors,
* not just freed.
*/
btrfs_end_transaction(h);
return ERR_PTR(ret);
}
return h;
@ -1347,7 +1363,9 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
spin_unlock(&fs_info->fs_roots_radix_lock);
btrfs_free_log(trans, root);
btrfs_update_reloc_root(trans, root);
ret2 = btrfs_update_reloc_root(trans, root);
if (ret2)
return ret2;
/* see comments in should_cow_block() */
clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
@ -1440,7 +1458,9 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
* recorded root will never be updated again, causing an outdated root
* item.
*/
record_root_in_trans(trans, src, 1);
ret = record_root_in_trans(trans, src, 1);
if (ret)
return ret;
/*
* btrfs_qgroup_inherit relies on a consistent view of the usage for the
@ -1509,7 +1529,7 @@ out:
* insert_dir_item()
*/
if (!ret)
record_root_in_trans(trans, parent, 1);
ret = record_root_in_trans(trans, parent, 1);
return ret;
}
@ -1586,8 +1606,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
dentry = pending->dentry;
parent_inode = pending->dir;
parent_root = BTRFS_I(parent_inode)->root;
record_root_in_trans(trans, parent_root, 0);
ret = record_root_in_trans(trans, parent_root, 0);
if (ret)
goto fail;
cur_time = current_time(parent_inode);
/*
@ -1623,7 +1644,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
record_root_in_trans(trans, root, 0);
ret = record_root_in_trans(trans, root, 0);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
btrfs_check_and_init_root_item(new_root_item);
@ -1961,7 +1986,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
*/
BUG_ON(list_empty(&cur_trans->list));
list_del_init(&cur_trans->list);
if (cur_trans == fs_info->running_transaction) {
cur_trans->state = TRANS_STATE_COMMIT_DOING;
spin_unlock(&fs_info->trans_lock);
@ -1970,6 +1994,17 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
spin_lock(&fs_info->trans_lock);
}
/*
* Now that we know no one else is still using the transaction we can
* remove the transaction from the list of transactions. This avoids
* the transaction kthread from cleaning up the transaction while some
* other task is still using it, which could result in a use-after-free
* on things like log trees, as it forces the transaction kthread to
* wait for this transaction to be cleaned up by us.
*/
list_del_init(&cur_trans->list);
spin_unlock(&fs_info->trans_lock);
btrfs_cleanup_one_transaction(trans->transaction, fs_info);

View File

@ -96,6 +96,13 @@ struct btrfs_transaction {
spinlock_t releasing_ebs_lock;
struct list_head releasing_ebs;
/*
* The number of bytes currently reserved, by all transaction handles
* attached to this transaction, for metadata extents of the chunk tree.
*/
atomic64_t chunk_bytes_reserved;
wait_queue_head_t chunk_reserve_wait;
};
#define __TRANS_FREEZABLE (1U << 0)
@ -175,7 +182,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
spin_lock(&inode->lock);
inode->last_trans = trans->transaction->transid;
inode->last_sub_trans = inode->root->log_transid;
inode->last_log_commit = inode->root->last_log_commit;
inode->last_log_commit = inode->last_sub_trans - 1;
spin_unlock(&inode->lock);
}

View File

@ -1290,6 +1290,11 @@ static int check_extent_item(struct extent_buffer *leaf,
key->offset, fs_info->sectorsize);
return -EUCLEAN;
}
if (unlikely(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
extent_err(leaf, slot,
"invalid extent flag, data has full backref set");
return -EUCLEAN;
}
}
ptr = (unsigned long)(struct btrfs_extent_item *)(ei + 1);

View File

@ -3165,20 +3165,22 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
mutex_unlock(&root->log_mutex);
btrfs_init_log_ctx(&root_log_ctx, NULL);
mutex_lock(&log_root_tree->log_mutex);
if (btrfs_is_zoned(fs_info)) {
mutex_lock(&fs_info->tree_root->log_mutex);
if (!log_root_tree->node) {
ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
if (ret) {
mutex_unlock(&log_root_tree->log_mutex);
mutex_unlock(&fs_info->tree_log_mutex);
goto out;
}
}
mutex_unlock(&fs_info->tree_root->log_mutex);
}
btrfs_init_log_ctx(&root_log_ctx, NULL);
mutex_lock(&log_root_tree->log_mutex);
index2 = log_root_tree->log_transid % 2;
list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
root_log_ctx.log_transid = log_root_tree->log_transid;
@ -6278,7 +6280,12 @@ again:
}
wc.replay_dest->log_root = log;
btrfs_record_root_in_trans(trans, wc.replay_dest);
ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
if (ret)
/* The loop needs to continue due to the root refs */
btrfs_handle_fs_error(fs_info, ret,
"failed to record the log root in transaction");
else
ret = walk_log_tree(trans, log, &wc);
if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {

929
fs/btrfs/tree-mod-log.c Normal file
View File

@ -0,0 +1,929 @@
// SPDX-License-Identifier: GPL-2.0
#include "tree-mod-log.h"
#include "disk-io.h"
struct tree_mod_root {
u64 logical;
u8 level;
};
struct tree_mod_elem {
struct rb_node node;
u64 logical;
u64 seq;
enum btrfs_mod_log_op op;
/*
* This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
* operations.
*/
int slot;
/* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */
u64 generation;
/* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
struct btrfs_disk_key key;
u64 blockptr;
/* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
struct {
int dst_slot;
int nr_items;
} move;
/* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
struct tree_mod_root old_root;
};
/*
* Pull a new tree mod seq number for our operation.
*/
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
return atomic64_inc_return(&fs_info->tree_mod_seq);
}
/*
* This adds a new blocker to the tree mod log's blocker list if the @elem
* passed does not already have a sequence number set. So when a caller expects
* to record tree modifications, it should ensure to set elem->seq to zero
* before calling btrfs_get_tree_mod_seq.
* Returns a fresh, unused tree log modification sequence number, even if no new
* blocker was added.
*/
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct btrfs_seq_list *elem)
{
write_lock(&fs_info->tree_mod_log_lock);
if (!elem->seq) {
elem->seq = btrfs_inc_tree_mod_seq(fs_info);
list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
}
write_unlock(&fs_info->tree_mod_log_lock);
return elem->seq;
}
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct btrfs_seq_list *elem)
{
struct rb_root *tm_root;
struct rb_node *node;
struct rb_node *next;
struct tree_mod_elem *tm;
u64 min_seq = BTRFS_SEQ_LAST;
u64 seq_putting = elem->seq;
if (!seq_putting)
return;
write_lock(&fs_info->tree_mod_log_lock);
list_del(&elem->list);
elem->seq = 0;
if (list_empty(&fs_info->tree_mod_seq_list)) {
clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
} else {
struct btrfs_seq_list *first;
first = list_first_entry(&fs_info->tree_mod_seq_list,
struct btrfs_seq_list, list);
if (seq_putting > first->seq) {
/*
* Blocker with lower sequence number exists, we cannot
* remove anything from the log.
*/
write_unlock(&fs_info->tree_mod_log_lock);
return;
}
min_seq = first->seq;
}
/*
* Anything that's lower than the lowest existing (read: blocked)
* sequence number can be removed from the tree.
*/
tm_root = &fs_info->tree_mod_log;
for (node = rb_first(tm_root); node; node = next) {
next = rb_next(node);
tm = rb_entry(node, struct tree_mod_elem, node);
if (tm->seq >= min_seq)
continue;
rb_erase(node, tm_root);
kfree(tm);
}
write_unlock(&fs_info->tree_mod_log_lock);
}
/*
* Key order of the log:
* node/leaf start address -> sequence
*
* The 'start address' is the logical address of the *new* root node for root
* replace operations, or the logical address of the affected block for all
* other operations.
*/
static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
struct tree_mod_elem *tm)
{
struct rb_root *tm_root;
struct rb_node **new;
struct rb_node *parent = NULL;
struct tree_mod_elem *cur;
lockdep_assert_held_write(&fs_info->tree_mod_log_lock);
tm->seq = btrfs_inc_tree_mod_seq(fs_info);
tm_root = &fs_info->tree_mod_log;
new = &tm_root->rb_node;
while (*new) {
cur = rb_entry(*new, struct tree_mod_elem, node);
parent = *new;
if (cur->logical < tm->logical)
new = &((*new)->rb_left);
else if (cur->logical > tm->logical)
new = &((*new)->rb_right);
else if (cur->seq < tm->seq)
new = &((*new)->rb_left);
else if (cur->seq > tm->seq)
new = &((*new)->rb_right);
else
return -EEXIST;
}
rb_link_node(&tm->node, parent, new);
rb_insert_color(&tm->node, tm_root);
return 0;
}
/*
* Determines if logging can be omitted. Returns true if it can. Otherwise, it
* returns false with the tree_mod_log_lock acquired. The caller must hold
* this until all tree mod log insertions are recorded in the rb tree and then
* write unlock fs_info::tree_mod_log_lock.
*/
static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb)
{
if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
return true;
if (eb && btrfs_header_level(eb) == 0)
return true;
write_lock(&fs_info->tree_mod_log_lock);
if (list_empty(&(fs_info)->tree_mod_seq_list)) {
write_unlock(&fs_info->tree_mod_log_lock);
return true;
}
return false;
}
/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
struct extent_buffer *eb)
{
if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
return false;
if (eb && btrfs_header_level(eb) == 0)
return false;
return true;
}
static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
int slot,
enum btrfs_mod_log_op op,
gfp_t flags)
{
struct tree_mod_elem *tm;
tm = kzalloc(sizeof(*tm), flags);
if (!tm)
return NULL;
tm->logical = eb->start;
if (op != BTRFS_MOD_LOG_KEY_ADD) {
btrfs_node_key(eb, &tm->key, slot);
tm->blockptr = btrfs_node_blockptr(eb, slot);
}
tm->op = op;
tm->slot = slot;
tm->generation = btrfs_node_ptr_generation(eb, slot);
RB_CLEAR_NODE(&tm->node);
return tm;
}
int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
enum btrfs_mod_log_op op, gfp_t flags)
{
struct tree_mod_elem *tm;
int ret;
if (!tree_mod_need_log(eb->fs_info, eb))
return 0;
tm = alloc_tree_mod_elem(eb, slot, op, flags);
if (!tm)
return -ENOMEM;
if (tree_mod_dont_log(eb->fs_info, eb)) {
kfree(tm);
return 0;
}
ret = tree_mod_log_insert(eb->fs_info, tm);
write_unlock(&eb->fs_info->tree_mod_log_lock);
if (ret)
kfree(tm);
return ret;
}
int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
int dst_slot, int src_slot,
int nr_items)
{
struct tree_mod_elem *tm = NULL;
struct tree_mod_elem **tm_list = NULL;
int ret = 0;
int i;
bool locked = false;
if (!tree_mod_need_log(eb->fs_info, eb))
return 0;
tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
if (!tm_list)
return -ENOMEM;
tm = kzalloc(sizeof(*tm), GFP_NOFS);
if (!tm) {
ret = -ENOMEM;
goto free_tms;
}
tm->logical = eb->start;
tm->slot = src_slot;
tm->move.dst_slot = dst_slot;
tm->move.nr_items = nr_items;
tm->op = BTRFS_MOD_LOG_MOVE_KEYS;
for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
if (!tm_list[i]) {
ret = -ENOMEM;
goto free_tms;
}
}
if (tree_mod_dont_log(eb->fs_info, eb))
goto free_tms;
locked = true;
/*
* When we override something during the move, we log these removals.
* This can only happen when we move towards the beginning of the
* buffer, i.e. dst_slot < src_slot.
*/
for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
if (ret)
goto free_tms;
}
ret = tree_mod_log_insert(eb->fs_info, tm);
if (ret)
goto free_tms;
write_unlock(&eb->fs_info->tree_mod_log_lock);
kfree(tm_list);
return 0;
free_tms:
for (i = 0; i < nr_items; i++) {
if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
kfree(tm_list[i]);
}
if (locked)
write_unlock(&eb->fs_info->tree_mod_log_lock);
kfree(tm_list);
kfree(tm);
return ret;
}
static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
struct tree_mod_elem **tm_list,
int nritems)
{
int i, j;
int ret;
for (i = nritems - 1; i >= 0; i--) {
ret = tree_mod_log_insert(fs_info, tm_list[i]);
if (ret) {
for (j = nritems - 1; j > i; j--)
rb_erase(&tm_list[j]->node,
&fs_info->tree_mod_log);
return ret;
}
}
return 0;
}
int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
struct extent_buffer *new_root,
bool log_removal)
{
struct btrfs_fs_info *fs_info = old_root->fs_info;
struct tree_mod_elem *tm = NULL;
struct tree_mod_elem **tm_list = NULL;
int nritems = 0;
int ret = 0;
int i;
if (!tree_mod_need_log(fs_info, NULL))
return 0;
if (log_removal && btrfs_header_level(old_root) > 0) {
nritems = btrfs_header_nritems(old_root);
tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
GFP_NOFS);
if (!tm_list) {
ret = -ENOMEM;
goto free_tms;
}
for (i = 0; i < nritems; i++) {
tm_list[i] = alloc_tree_mod_elem(old_root, i,
BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
if (!tm_list[i]) {
ret = -ENOMEM;
goto free_tms;
}
}
}
tm = kzalloc(sizeof(*tm), GFP_NOFS);
if (!tm) {
ret = -ENOMEM;
goto free_tms;
}
tm->logical = new_root->start;
tm->old_root.logical = old_root->start;
tm->old_root.level = btrfs_header_level(old_root);
tm->generation = btrfs_header_generation(old_root);
tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;
if (tree_mod_dont_log(fs_info, NULL))
goto free_tms;
if (tm_list)
ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
if (!ret)
ret = tree_mod_log_insert(fs_info, tm);
write_unlock(&fs_info->tree_mod_log_lock);
if (ret)
goto free_tms;
kfree(tm_list);
return ret;
free_tms:
if (tm_list) {
for (i = 0; i < nritems; i++)
kfree(tm_list[i]);
kfree(tm_list);
}
kfree(tm);
return ret;
}
static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
u64 start, u64 min_seq,
bool smallest)
{
struct rb_root *tm_root;
struct rb_node *node;
struct tree_mod_elem *cur = NULL;
struct tree_mod_elem *found = NULL;
read_lock(&fs_info->tree_mod_log_lock);
tm_root = &fs_info->tree_mod_log;
node = tm_root->rb_node;
while (node) {
cur = rb_entry(node, struct tree_mod_elem, node);
if (cur->logical < start) {
node = node->rb_left;
} else if (cur->logical > start) {
node = node->rb_right;
} else if (cur->seq < min_seq) {
node = node->rb_left;
} else if (!smallest) {
/* We want the node with the highest seq */
if (found)
BUG_ON(found->seq > cur->seq);
found = cur;
node = node->rb_left;
} else if (cur->seq > min_seq) {
/* We want the node with the smallest seq */
if (found)
BUG_ON(found->seq < cur->seq);
found = cur;
node = node->rb_right;
} else {
found = cur;
break;
}
}
read_unlock(&fs_info->tree_mod_log_lock);
return found;
}
/*
* This returns the element from the log with the smallest time sequence
* value that's in the log (the oldest log item). Any element with a time
* sequence lower than min_seq will be ignored.
*/
static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
u64 start, u64 min_seq)
{
return __tree_mod_log_search(fs_info, start, min_seq, true);
}
/*
* This returns the element from the log with the largest time sequence
* value that's in the log (the most recent log item). Any element with
* a time sequence lower than min_seq will be ignored.
*/
static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
u64 start, u64 min_seq)
{
return __tree_mod_log_search(fs_info, start, min_seq, false);
}
int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
struct extent_buffer *src,
unsigned long dst_offset,
unsigned long src_offset,
int nr_items)
{
struct btrfs_fs_info *fs_info = dst->fs_info;
int ret = 0;
struct tree_mod_elem **tm_list = NULL;
struct tree_mod_elem **tm_list_add, **tm_list_rem;
int i;
bool locked = false;
if (!tree_mod_need_log(fs_info, NULL))
return 0;
if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
return 0;
tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
GFP_NOFS);
if (!tm_list)
return -ENOMEM;
tm_list_add = tm_list;
tm_list_rem = tm_list + nr_items;
for (i = 0; i < nr_items; i++) {
tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
if (!tm_list_rem[i]) {
ret = -ENOMEM;
goto free_tms;
}
tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
if (!tm_list_add[i]) {
ret = -ENOMEM;
goto free_tms;
}
}
if (tree_mod_dont_log(fs_info, NULL))
goto free_tms;
locked = true;
for (i = 0; i < nr_items; i++) {
ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
if (ret)
goto free_tms;
ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
if (ret)
goto free_tms;
}
write_unlock(&fs_info->tree_mod_log_lock);
kfree(tm_list);
return 0;
free_tms:
for (i = 0; i < nr_items * 2; i++) {
if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
kfree(tm_list[i]);
}
if (locked)
write_unlock(&fs_info->tree_mod_log_lock);
kfree(tm_list);
return ret;
}
int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
{
struct tree_mod_elem **tm_list = NULL;
int nritems = 0;
int i;
int ret = 0;
if (!tree_mod_need_log(eb->fs_info, eb))
return 0;
nritems = btrfs_header_nritems(eb);
tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
if (!tm_list)
return -ENOMEM;
for (i = 0; i < nritems; i++) {
tm_list[i] = alloc_tree_mod_elem(eb, i,
BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
if (!tm_list[i]) {
ret = -ENOMEM;
goto free_tms;
}
}
if (tree_mod_dont_log(eb->fs_info, eb))
goto free_tms;
ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
write_unlock(&eb->fs_info->tree_mod_log_lock);
if (ret)
goto free_tms;
kfree(tm_list);
return 0;
free_tms:
for (i = 0; i < nritems; i++)
kfree(tm_list[i]);
kfree(tm_list);
return ret;
}
/*
* Returns the logical address of the oldest predecessor of the given root.
* Entries older than time_seq are ignored.
*/
static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
u64 time_seq)
{
struct tree_mod_elem *tm;
struct tree_mod_elem *found = NULL;
u64 root_logical = eb_root->start;
bool looped = false;
if (!time_seq)
return NULL;
/*
* The very last operation that's logged for a root is the replacement
* operation (if it is replaced at all). This has the logical address
* of the *new* root, making it the very first operation that's logged
* for this root.
*/
while (1) {
tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
time_seq);
if (!looped && !tm)
return NULL;
/*
* If there are no tree operation for the oldest root, we simply
* return it. This should only happen if that (old) root is at
* level 0.
*/
if (!tm)
break;
/*
* If there's an operation that's not a root replacement, we
* found the oldest version of our root. Normally, we'll find a
* BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
*/
if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
break;
found = tm;
root_logical = tm->old_root.logical;
looped = true;
}
/* If there's no old root to return, return what we found instead */
if (!found)
found = tm;
return found;
}
/*
* tm is a pointer to the first operation to rewind within eb. Then, all
* previous operations will be rewound (until we reach something older than
* time_seq).
*/
static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb,
u64 time_seq,
struct tree_mod_elem *first_tm)
{
u32 n;
struct rb_node *next;
struct tree_mod_elem *tm = first_tm;
unsigned long o_dst;
unsigned long o_src;
unsigned long p_size = sizeof(struct btrfs_key_ptr);
n = btrfs_header_nritems(eb);
read_lock(&fs_info->tree_mod_log_lock);
while (tm && tm->seq >= time_seq) {
/*
* All the operations are recorded with the operator used for
* the modification. As we're going backwards, we do the
* opposite of each operation here.
*/
switch (tm->op) {
case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
BUG_ON(tm->slot < n);
fallthrough;
case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
case BTRFS_MOD_LOG_KEY_REMOVE:
btrfs_set_node_key(eb, &tm->key, tm->slot);
btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
btrfs_set_node_ptr_generation(eb, tm->slot,
tm->generation);
n++;
break;
case BTRFS_MOD_LOG_KEY_REPLACE:
BUG_ON(tm->slot >= n);
btrfs_set_node_key(eb, &tm->key, tm->slot);
btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
btrfs_set_node_ptr_generation(eb, tm->slot,
tm->generation);
break;
case BTRFS_MOD_LOG_KEY_ADD:
/* if a move operation is needed it's in the log */
n--;
break;
case BTRFS_MOD_LOG_MOVE_KEYS:
o_dst = btrfs_node_key_ptr_offset(tm->slot);
o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
memmove_extent_buffer(eb, o_dst, o_src,
tm->move.nr_items * p_size);
break;
case BTRFS_MOD_LOG_ROOT_REPLACE:
/*
* This operation is special. For roots, this must be
* handled explicitly before rewinding.
* For non-roots, this operation may exist if the node
* was a root: root A -> child B; then A gets empty and
* B is promoted to the new root. In the mod log, we'll
* have a root-replace operation for B, a tree block
* that is no root. We simply ignore that operation.
*/
break;
}
next = rb_next(&tm->node);
if (!next)
break;
tm = rb_entry(next, struct tree_mod_elem, node);
if (tm->logical != first_tm->logical)
break;
}
read_unlock(&fs_info->tree_mod_log_lock);
btrfs_set_header_nritems(eb, n);
}
/*
* Called with eb read locked. If the buffer cannot be rewound, the same buffer
* is returned. If rewind operations happen, a fresh buffer is returned. The
* returned buffer is always read-locked. If the returned buffer is not the
* input buffer, the lock on the input buffer is released and the input buffer
* is freed (its refcount is decremented).
*/
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
struct extent_buffer *eb,
u64 time_seq)
{
struct extent_buffer *eb_rewin;
struct tree_mod_elem *tm;
if (!time_seq)
return eb;
if (btrfs_header_level(eb) == 0)
return eb;
tm = tree_mod_log_search(fs_info, eb->start, time_seq);
if (!tm)
return eb;
if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
BUG_ON(tm->slot != 0);
eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
if (!eb_rewin) {
btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
return NULL;
}
btrfs_set_header_bytenr(eb_rewin, eb->start);
btrfs_set_header_backref_rev(eb_rewin,
btrfs_header_backref_rev(eb));
btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
} else {
eb_rewin = btrfs_clone_extent_buffer(eb);
if (!eb_rewin) {
btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
return NULL;
}
}
btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
eb_rewin, btrfs_header_level(eb_rewin));
btrfs_tree_read_lock(eb_rewin);
tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
WARN_ON(btrfs_header_nritems(eb_rewin) >
BTRFS_NODEPTRS_PER_BLOCK(fs_info));
return eb_rewin;
}
/*
* Rewind the state of @root's root node to the given @time_seq value.
* If there are no changes, the current root->root_node is returned. If anything
* changed in between, there's a fresh buffer allocated on which the rewind
* operations are done. In any case, the returned buffer is read locked.
* Returns NULL on error (with no locks held).
*/
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct tree_mod_elem *tm;
struct extent_buffer *eb = NULL;
struct extent_buffer *eb_root;
u64 eb_root_owner = 0;
struct extent_buffer *old;
struct tree_mod_root *old_root = NULL;
u64 old_generation = 0;
u64 logical;
int level;
eb_root = btrfs_read_lock_root_node(root);
tm = tree_mod_log_oldest_root(eb_root, time_seq);
if (!tm)
return eb_root;
if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
old_root = &tm->old_root;
old_generation = tm->generation;
logical = old_root->logical;
level = old_root->level;
} else {
logical = eb_root->start;
level = btrfs_header_level(eb_root);
}
tm = tree_mod_log_search(fs_info, logical, time_seq);
if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
old = read_tree_block(fs_info, logical, root->root_key.objectid,
0, level, NULL);
if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
if (!IS_ERR(old))
free_extent_buffer(old);
btrfs_warn(fs_info,
"failed to read tree block %llu from get_old_root",
logical);
} else {
struct tree_mod_elem *tm2;
btrfs_tree_read_lock(old);
eb = btrfs_clone_extent_buffer(old);
/*
* After the lookup for the most recent tree mod operation
* above and before we locked and cloned the extent buffer
* 'old', a new tree mod log operation may have been added.
* So lookup for a more recent one to make sure the number
* of mod log operations we replay is consistent with the
* number of items we have in the cloned extent buffer,
* otherwise we can hit a BUG_ON when rewinding the extent
* buffer.
*/
tm2 = tree_mod_log_search(fs_info, logical, time_seq);
btrfs_tree_read_unlock(old);
free_extent_buffer(old);
ASSERT(tm2);
ASSERT(tm2 == tm || tm2->seq > tm->seq);
if (!tm2 || tm2->seq < tm->seq) {
free_extent_buffer(eb);
return NULL;
}
tm = tm2;
}
} else if (old_root) {
eb_root_owner = btrfs_header_owner(eb_root);
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
eb = alloc_dummy_extent_buffer(fs_info, logical);
} else {
eb = btrfs_clone_extent_buffer(eb_root);
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
}
if (!eb)
return NULL;
if (old_root) {
btrfs_set_header_bytenr(eb, eb->start);
btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
btrfs_set_header_owner(eb, eb_root_owner);
btrfs_set_header_level(eb, old_root->level);
btrfs_set_header_generation(eb, old_generation);
}
btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
btrfs_header_level(eb));
btrfs_tree_read_lock(eb);
if (tm)
tree_mod_log_rewind(fs_info, eb, time_seq, tm);
else
WARN_ON(btrfs_header_level(eb) != 0);
WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
return eb;
}
int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
struct tree_mod_elem *tm;
int level;
struct extent_buffer *eb_root = btrfs_root_node(root);
tm = tree_mod_log_oldest_root(eb_root, time_seq);
if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
level = tm->old_root.level;
else
level = btrfs_header_level(eb_root);
free_extent_buffer(eb_root);
return level;
}
/*
* Return the lowest sequence number in the tree modification log.
*
* Return the sequence number of the oldest tree modification log user, which
* corresponds to the lowest sequence number of all existing users. If there are
* no users it returns 0.
*/
u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
{
u64 ret = 0;
read_lock(&fs_info->tree_mod_log_lock);
if (!list_empty(&fs_info->tree_mod_seq_list)) {
struct btrfs_seq_list *elem;
elem = list_first_entry(&fs_info->tree_mod_seq_list,
struct btrfs_seq_list, list);
ret = elem->seq;
}
read_unlock(&fs_info->tree_mod_log_lock);
return ret;
}

53
fs/btrfs/tree-mod-log.h Normal file
View File

@ -0,0 +1,53 @@
// SPDX-License-Identifier: GPL-2.0
#ifndef BTRFS_TREE_MOD_LOG_H
#define BTRFS_TREE_MOD_LOG_H
#include "ctree.h"
/* Represents a tree mod log user. */
struct btrfs_seq_list {
struct list_head list;
u64 seq;
};
#define BTRFS_SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 }
#define BTRFS_SEQ_LAST ((u64)-1)
enum btrfs_mod_log_op {
BTRFS_MOD_LOG_KEY_REPLACE,
BTRFS_MOD_LOG_KEY_ADD,
BTRFS_MOD_LOG_KEY_REMOVE,
BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING,
BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING,
BTRFS_MOD_LOG_MOVE_KEYS,
BTRFS_MOD_LOG_ROOT_REPLACE,
};
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct btrfs_seq_list *elem);
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct btrfs_seq_list *elem);
int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
struct extent_buffer *new_root,
bool log_removal);
int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
enum btrfs_mod_log_op op, gfp_t flags);
int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb);
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
struct extent_buffer *eb,
u64 time_seq);
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq);
int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
struct extent_buffer *src,
unsigned long dst_offset,
unsigned long src_offset,
int nr_items);
int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
int dst_slot, int src_slot,
int nr_items);
u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info);
#endif

View File

@ -1458,8 +1458,8 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
/* Given hole range was invalid (outside of device) */
if (ret == -ERANGE) {
*hole_start += *hole_size;
*hole_size = 0;
return 1;
*hole_size = false;
return true;
}
*hole_start += zone_size;
@ -3098,11 +3098,12 @@ out:
return ret;
}
static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_trans_handle *trans;
struct btrfs_block_group *block_group;
u64 length;
int ret;
/*
@ -3117,7 +3118,7 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
* we release the path used to search the chunk/dev tree and before
* the current task acquires this mutex and calls us.
*/
lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
lockdep_assert_held(&fs_info->reclaim_bgs_lock);
/* step one, relocate all the extents inside this chunk */
btrfs_scrub_pause(fs_info);
@ -3130,8 +3131,23 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
if (!block_group)
return -ENOENT;
btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
length = block_group->length;
btrfs_put_block_group(block_group);
/*
* On a zoned file system, discard the whole block group, this will
* trigger a REQ_OP_ZONE_RESET operation on the device zone. If
* resetting the zone fails, don't treat it as a fatal problem from the
* filesystem's point of view.
*/
if (btrfs_is_zoned(fs_info)) {
ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
if (ret)
btrfs_info(fs_info,
"failed to reset zone %llu after relocation",
chunk_offset);
}
trans = btrfs_start_trans_remove_block_group(root->fs_info,
chunk_offset);
if (IS_ERR(trans)) {
@ -3172,10 +3188,10 @@ again:
key.type = BTRFS_CHUNK_ITEM_KEY;
while (1) {
mutex_lock(&fs_info->delete_unused_bgs_mutex);
mutex_lock(&fs_info->reclaim_bgs_lock);
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
mutex_unlock(&fs_info->reclaim_bgs_lock);
goto error;
}
BUG_ON(ret == 0); /* Corruption */
@ -3183,7 +3199,7 @@ again:
ret = btrfs_previous_item(chunk_root, path, key.objectid,
key.type);
if (ret)
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret < 0)
goto error;
if (ret > 0)
@ -3204,7 +3220,7 @@ again:
else
BUG_ON(ret);
}
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
mutex_unlock(&fs_info->reclaim_bgs_lock);
if (found_key.offset == 0)
break;
@ -3744,10 +3760,10 @@ again:
goto error;
}
mutex_lock(&fs_info->delete_unused_bgs_mutex);
mutex_lock(&fs_info->reclaim_bgs_lock);
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
mutex_unlock(&fs_info->reclaim_bgs_lock);
goto error;
}
@ -3761,7 +3777,7 @@ again:
ret = btrfs_previous_item(chunk_root, path, 0,
BTRFS_CHUNK_ITEM_KEY);
if (ret) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
mutex_unlock(&fs_info->reclaim_bgs_lock);
ret = 0;
break;
}
@ -3771,7 +3787,7 @@ again:
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.objectid != key.objectid) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
mutex_unlock(&fs_info->reclaim_bgs_lock);
break;
}
@ -3788,12 +3804,12 @@ again:
btrfs_release_path(path);
if (!ret) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
mutex_unlock(&fs_info->reclaim_bgs_lock);
goto loop;
}
if (counting) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
mutex_unlock(&fs_info->reclaim_bgs_lock);
spin_lock(&fs_info->balance_lock);
bctl->stat.expected++;
spin_unlock(&fs_info->balance_lock);
@ -3818,7 +3834,7 @@ again:
count_meta < bctl->meta.limit_min)
|| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
count_sys < bctl->sys.limit_min)) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
mutex_unlock(&fs_info->reclaim_bgs_lock);
goto loop;
}
@ -3832,7 +3848,7 @@ again:
ret = btrfs_may_alloc_data_chunk(fs_info,
found_key.offset);
if (ret < 0) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
mutex_unlock(&fs_info->reclaim_bgs_lock);
goto error;
} else if (ret == 1) {
chunk_reserved = 1;
@ -3840,7 +3856,7 @@ again:
}
ret = btrfs_relocate_chunk(fs_info, found_key.offset);
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret == -ENOSPC) {
enospc_errors++;
} else if (ret == -ETXTBSY) {
@ -4725,16 +4741,16 @@ again:
key.type = BTRFS_DEV_EXTENT_KEY;
do {
mutex_lock(&fs_info->delete_unused_bgs_mutex);
mutex_lock(&fs_info->reclaim_bgs_lock);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
mutex_unlock(&fs_info->reclaim_bgs_lock);
goto done;
}
ret = btrfs_previous_item(root, path, 0, key.type);
if (ret) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret < 0)
goto done;
ret = 0;
@ -4747,7 +4763,7 @@ again:
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
if (key.objectid != device->devid) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_release_path(path);
break;
}
@ -4756,7 +4772,7 @@ again:
length = btrfs_dev_extent_length(l, dev_extent);
if (key.offset + length <= new_size) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_release_path(path);
break;
}
@ -4772,12 +4788,12 @@ again:
*/
ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
if (ret < 0) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
mutex_unlock(&fs_info->reclaim_bgs_lock);
goto done;
}
ret = btrfs_relocate_chunk(fs_info, chunk_offset);
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret == -ENOSPC) {
failed++;
} else if (ret) {
@ -4989,6 +5005,8 @@ static void init_alloc_chunk_ctl_policy_zoned(
ctl->max_chunk_size = 2 * ctl->max_stripe_size;
ctl->devs_max = min_t(int, ctl->devs_max,
BTRFS_MAX_DEVS_SYS_CHUNK);
} else {
BUG();
}
/* We don't want a chunk larger than 10% of writable space */
@ -6787,6 +6805,46 @@ static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
return div_u64(chunk_len, data_stripes);
}
#if BITS_PER_LONG == 32
/*
* Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
* can't be accessed on 32bit systems.
*
* This function do mount time check to reject the fs if it already has
* metadata chunk beyond that limit.
*/
static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
u64 logical, u64 length, u64 type)
{
if (!(type & BTRFS_BLOCK_GROUP_METADATA))
return 0;
if (logical + length < MAX_LFS_FILESIZE)
return 0;
btrfs_err_32bit_limit(fs_info);
return -EOVERFLOW;
}
/*
* This is to give early warning for any metadata chunk reaching
* BTRFS_32BIT_EARLY_WARN_THRESHOLD.
* Although we can still access the metadata, it's not going to be possible
* once the limit is reached.
*/
static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
u64 logical, u64 length, u64 type)
{
if (!(type & BTRFS_BLOCK_GROUP_METADATA))
return;
if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD)
return;
btrfs_warn_32bit_limit(fs_info);
}
#endif
static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
struct btrfs_chunk *chunk)
{
@ -6797,6 +6855,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
u64 logical;
u64 length;
u64 devid;
u64 type;
u8 uuid[BTRFS_UUID_SIZE];
int num_stripes;
int ret;
@ -6804,8 +6863,16 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
logical = key->offset;
length = btrfs_chunk_length(leaf, chunk);
type = btrfs_chunk_type(leaf, chunk);
num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
#if BITS_PER_LONG == 32
ret = check_32bit_meta_chunk(fs_info, logical, length, type);
if (ret < 0)
return ret;
warn_32bit_meta_chunk(fs_info, logical, length, type);
#endif
/*
* Only need to verify chunk item if we're reading from sys chunk array,
* as chunk item in tree block is already verified by tree-checker.
@ -6849,10 +6916,10 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
map->io_width = btrfs_chunk_io_width(leaf, chunk);
map->io_align = btrfs_chunk_io_align(leaf, chunk);
map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
map->type = btrfs_chunk_type(leaf, chunk);
map->type = type;
map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
map->verified_stripes = 0;
em->orig_block_len = calc_stripe_length(map->type, em->len,
em->orig_block_len = calc_stripe_length(type, em->len,
map->num_stripes);
for (i = 0; i < num_stripes; i++) {
map->stripes[i].physical =
@ -8001,7 +8068,7 @@ static int relocating_repair_kthread(void *data)
return -EBUSY;
}
mutex_lock(&fs_info->delete_unused_bgs_mutex);
mutex_lock(&fs_info->reclaim_bgs_lock);
/* Ensure block group still exists */
cache = btrfs_lookup_block_group(fs_info, target);
@ -8023,7 +8090,7 @@ static int relocating_repair_kthread(void *data)
out:
if (cache)
btrfs_put_block_group(cache);
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_exclop_finish(fs_info);
return ret;

View File

@ -484,6 +484,7 @@ void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf);
int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset);
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
int btrfs_uuid_scan_kthread(void *data);

View File

@ -342,6 +342,13 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
if (!IS_ALIGNED(nr_sectors, zone_sectors))
zone_info->nr_zones++;
if (bdev_is_zoned(bdev) && zone_info->max_zone_append_size == 0) {
btrfs_err(fs_info, "zoned: device %pg does not support zone append",
bdev);
ret = -EINVAL;
goto out;
}
zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
if (!zone_info->seq_zones) {
ret = -ENOMEM;

View File

@ -9,6 +9,12 @@
#include "disk-io.h"
#include "block-group.h"
/*
* Block groups with more than this value (percents) of unusable space will be
* scheduled for background reclaim.
*/
#define BTRFS_DEFAULT_RECLAIM_THRESH 75
struct btrfs_zoned_device_info {
/*
* Number of zones, zone size and types of zones if bdev is a

View File

@ -981,6 +981,15 @@ static inline unsigned int readahead_count(struct readahead_control *rac)
return rac->_nr_pages;
}
/**
* readahead_batch_length - The number of bytes in the current batch.
* @rac: The readahead request.
*/
static inline loff_t readahead_batch_length(struct readahead_control *rac)
{
return rac->_batch_count * PAGE_SIZE;
}
static inline unsigned long dir_pages(struct inode *inode)
{
return (unsigned long)(inode->i_size + PAGE_SIZE - 1) >>

View File

@ -1903,6 +1903,18 @@ DEFINE_EVENT(btrfs__block_group, btrfs_add_unused_block_group,
TP_ARGS(bg_cache)
);
DEFINE_EVENT(btrfs__block_group, btrfs_add_reclaim_block_group,
TP_PROTO(const struct btrfs_block_group *bg_cache),
TP_ARGS(bg_cache)
);
DEFINE_EVENT(btrfs__block_group, btrfs_reclaim_block_group,
TP_PROTO(const struct btrfs_block_group *bg_cache),
TP_ARGS(bg_cache)
);
DEFINE_EVENT(btrfs__block_group, btrfs_skip_unused_block_group,
TP_PROTO(const struct btrfs_block_group *bg_cache),