2018-04-04 01:23:33 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
2007-06-12 21:07:21 +08:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2007 Oracle. All rights reserved.
|
|
|
|
*/
|
2018-04-04 01:23:33 +08:00
|
|
|
|
2007-07-11 22:00:37 +08:00
|
|
|
#include <linux/sched.h>
|
Btrfs: fix sysfs warning and missing raid sysfs directories
In the 5.3 merge window, commit 7c7e301406d0a9 ("btrfs: sysfs: Replace
default_attrs in ktypes with groups"), we started using the member
"defaults_groups" for the kobject type "btrfs_raid_ktype". That leads
to a series of warnings when running some test cases of fstests, such
as btrfs/027, btrfs/124 and btrfs/176. The traces produced by those
warnings are like the following:
[116648.059212] kernfs: can not remove 'total_bytes', no directory
[116648.060112] WARNING: CPU: 3 PID: 28500 at fs/kernfs/dir.c:1504 kernfs_remove_by_name_ns+0x75/0x80
(...)
[116648.066482] CPU: 3 PID: 28500 Comm: umount Tainted: G W 5.3.0-rc3-btrfs-next-54 #1
(...)
[116648.069376] RIP: 0010:kernfs_remove_by_name_ns+0x75/0x80
(...)
[116648.072385] RSP: 0018:ffffabfd0090bd08 EFLAGS: 00010282
[116648.073437] RAX: 0000000000000000 RBX: ffffffffc0c11998 RCX: 0000000000000000
[116648.074201] RDX: ffff9fff603a7a00 RSI: ffff9fff603978a8 RDI: ffff9fff603978a8
[116648.074956] RBP: ffffffffc0b9ca2f R08: 0000000000000000 R09: 0000000000000001
[116648.075708] R10: ffff9ffe1f72e1c0 R11: 0000000000000000 R12: ffffffffc0b94120
[116648.076434] R13: ffffffffb3d9b4e0 R14: 0000000000000000 R15: dead000000000100
[116648.077143] FS: 00007f9cdc78a2c0(0000) GS:ffff9fff60380000(0000) knlGS:0000000000000000
[116648.077852] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[116648.078546] CR2: 00007f9fc4747ab4 CR3: 00000005c7832003 CR4: 00000000003606e0
[116648.079235] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[116648.079907] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[116648.080585] Call Trace:
[116648.081262] remove_files+0x31/0x70
[116648.081929] sysfs_remove_group+0x38/0x80
[116648.082596] sysfs_remove_groups+0x34/0x70
[116648.083258] kobject_del+0x20/0x60
[116648.083933] btrfs_free_block_groups+0x405/0x430 [btrfs]
[116648.084608] close_ctree+0x19a/0x380 [btrfs]
[116648.085278] generic_shutdown_super+0x6c/0x110
[116648.085951] kill_anon_super+0xe/0x30
[116648.086621] btrfs_kill_super+0x12/0xa0 [btrfs]
[116648.087289] deactivate_locked_super+0x3a/0x70
[116648.087956] cleanup_mnt+0xb4/0x160
[116648.088620] task_work_run+0x7e/0xc0
[116648.089285] exit_to_usermode_loop+0xfa/0x100
[116648.089933] do_syscall_64+0x1cb/0x220
[116648.090567] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[116648.091197] RIP: 0033:0x7f9cdc073b37
(...)
[116648.100046] ---[ end trace 22e24db328ccadf8 ]---
[116648.100618] ------------[ cut here ]------------
[116648.101175] kernfs: can not remove 'used_bytes', no directory
[116648.101731] WARNING: CPU: 3 PID: 28500 at fs/kernfs/dir.c:1504 kernfs_remove_by_name_ns+0x75/0x80
(...)
[116648.105649] CPU: 3 PID: 28500 Comm: umount Tainted: G W 5.3.0-rc3-btrfs-next-54 #1
(...)
[116648.107461] RIP: 0010:kernfs_remove_by_name_ns+0x75/0x80
(...)
[116648.109336] RSP: 0018:ffffabfd0090bd08 EFLAGS: 00010282
[116648.109979] RAX: 0000000000000000 RBX: ffffffffc0c119a0 RCX: 0000000000000000
[116648.110625] RDX: ffff9fff603a7a00 RSI: ffff9fff603978a8 RDI: ffff9fff603978a8
[116648.111283] RBP: ffffffffc0b9ca41 R08: 0000000000000000 R09: 0000000000000001
[116648.111940] R10: ffff9ffe1f72e1c0 R11: 0000000000000000 R12: ffffffffc0b94120
[116648.112603] R13: ffffffffb3d9b4e0 R14: 0000000000000000 R15: dead000000000100
[116648.113268] FS: 00007f9cdc78a2c0(0000) GS:ffff9fff60380000(0000) knlGS:0000000000000000
[116648.113939] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[116648.114607] CR2: 00007f9fc4747ab4 CR3: 00000005c7832003 CR4: 00000000003606e0
[116648.115286] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[116648.115966] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[116648.116649] Call Trace:
[116648.117326] remove_files+0x31/0x70
[116648.117997] sysfs_remove_group+0x38/0x80
[116648.118671] sysfs_remove_groups+0x34/0x70
[116648.119342] kobject_del+0x20/0x60
[116648.120022] btrfs_free_block_groups+0x405/0x430 [btrfs]
[116648.120707] close_ctree+0x19a/0x380 [btrfs]
[116648.121396] generic_shutdown_super+0x6c/0x110
[116648.122057] kill_anon_super+0xe/0x30
[116648.122702] btrfs_kill_super+0x12/0xa0 [btrfs]
[116648.123335] deactivate_locked_super+0x3a/0x70
[116648.123961] cleanup_mnt+0xb4/0x160
[116648.124586] task_work_run+0x7e/0xc0
[116648.125210] exit_to_usermode_loop+0xfa/0x100
[116648.125830] do_syscall_64+0x1cb/0x220
[116648.126463] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[116648.127080] RIP: 0033:0x7f9cdc073b37
(...)
[116648.135923] ---[ end trace 22e24db328ccadf9 ]---
These happen because, during the unmount path, we call kobject_del() for
raid kobjects that are not fully initialized, meaning that we set their
ktype (as btrfs_raid_ktype) through link_block_group() but we didn't set
their parent kobject, which is done through btrfs_add_raid_kobjects().
We have this split raid kobject setup since commit 75cb379d263521
("btrfs: defer adding raid type kobject until after chunk relocation") in
order to avoid triggering reclaim during contextes where we can not
(either we are holding a transaction handle or some lock required by
the transaction commit path), so that we do the calls to kobject_add(),
which triggers GFP_KERNEL allocations, through btrfs_add_raid_kobjects()
in contextes where it is safe to trigger reclaim. That change expected
that a new raid kobject can only be created either when mounting the
filesystem or after raid profile conversion through the relocation path.
However, we can have new raid kobject created in other two cases at least:
1) During device replace (or scrub) after adding a device a to the
filesystem. The replace procedure (and scrub) do calls to
btrfs_inc_block_group_ro() which can allocate a new block group
with a new raid profile (because we now have more devices). This
can be triggered by test cases btrfs/027 and btrfs/176.
2) During a degraded mount trough any write path. This can be triggered
by test case btrfs/124.
Fixing this by adding extra calls to btrfs_add_raid_kobjects(), not only
makes things more complex and fragile, can also introduce deadlocks with
reclaim the following way:
1) Calling btrfs_add_raid_kobjects() at btrfs_inc_block_group_ro() or
anywhere in the replace/scrub path will cause a deadlock with reclaim
because if reclaim happens and a transaction commit is triggered,
the transaction commit path will block at btrfs_scrub_pause().
2) During degraded mounts it is essentially impossible to figure out where
to add extra calls to btrfs_add_raid_kobjects(), because allocation of
a block group with a new raid profile can happen anywhere, which means
we can't safely figure out which contextes are safe for reclaim, as
we can either hold a transaction handle or some lock needed by the
transaction commit path.
So it is too complex and error prone to have this split setup of raid
kobjects. So fix the issue by consolidating the setup of the kobjects in a
single place, at link_block_group(), and setup a nofs context there in
order to prevent reclaim being triggered by the memory allocations done
through the call chain of kobject_add().
Besides fixing the sysfs warnings during kobject_del(), this also ensures
the sysfs directories for the new raid profiles end up created and visible
to users (a bug that existed before the 5.3 commit 7c7e301406d0a9
("btrfs: sysfs: Replace default_attrs in ktypes with groups")).
Fixes: 75cb379d263521 ("btrfs: defer adding raid type kobject until after chunk relocation")
Fixes: 7c7e301406d0a9 ("btrfs: sysfs: Replace default_attrs in ktypes with groups")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-08-07 18:21:46 +08:00
|
|
|
#include <linux/sched/mm.h>
|
2017-02-04 06:47:37 +08:00
|
|
|
#include <linux/sched/signal.h>
|
2007-12-22 05:27:24 +08:00
|
|
|
#include <linux/pagemap.h>
|
2008-04-29 03:29:52 +08:00
|
|
|
#include <linux/writeback.h>
|
2008-08-12 21:13:26 +08:00
|
|
|
#include <linux/blkdev.h>
|
2009-02-04 22:23:45 +08:00
|
|
|
#include <linux/sort.h>
|
2009-03-11 00:39:20 +08:00
|
|
|
#include <linux/rcupdate.h>
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
#include <linux/kthread.h>
|
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the followings.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and try to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
wildly available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build test were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable easily on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
|
|
|
#include <linux/slab.h>
|
2011-06-14 18:52:17 +08:00
|
|
|
#include <linux/ratelimit.h>
|
2013-06-20 03:00:04 +08:00
|
|
|
#include <linux/percpu_counter.h>
|
2017-10-20 02:15:57 +08:00
|
|
|
#include <linux/lockdep.h>
|
btrfs: Remove custom crc32c init code
The custom crc32 init code was introduced in
14a958e678cd ("Btrfs: fix btrfs boot when compiled as built-in") to
enable using btrfs as a built-in. However, later as pointed out by
60efa5eb2e88 ("Btrfs: use late_initcall instead of module_init") this
wasn't enough and finally btrfs was switched to late_initcall which
comes after the generic crc32c implementation is initiliased. The
latter commit superseeded the former. Now that we don't have to
maintain our own code let's just remove it and switch to using the
generic implementation.
Despite touching a lot of files the patch is really simple. Here is the gist of
the changes:
1. Select LIBCRC32C rather than the low-level modules.
2. s/btrfs_crc32c/crc32c/g
3. replace hash.h with linux/crc32c.h
4. Move the btrfs namehash funcs to ctree.h and change the tree accordingly.
I've tested this with btrfs being both a module and a built-in and xfstest
doesn't complain.
Does seem to fix the longstanding problem of not automatically selectiong
the crc32c module when btrfs is used. Possibly there is a workaround in
dracut.
The modinfo confirms that now all the module dependencies are there:
before:
depends: zstd_compress,zstd_decompress,raid6_pq,xor,zlib_deflate
after:
depends: libcrc32c,zstd_compress,zstd_decompress,raid6_pq,xor,zlib_deflate
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ add more info to changelog from mails ]
Signed-off-by: David Sterba <dsterba@suse.com>
2018-01-08 17:45:05 +08:00
|
|
|
#include <linux/crc32c.h>
|
2014-04-02 19:51:06 +08:00
|
|
|
#include "tree-log.h"
|
2007-02-26 23:40:21 +08:00
|
|
|
#include "disk-io.h"
|
|
|
|
#include "print-tree.h"
|
2008-03-25 03:01:56 +08:00
|
|
|
#include "volumes.h"
|
2013-01-30 07:40:14 +08:00
|
|
|
#include "raid56.h"
|
2008-06-26 04:01:30 +08:00
|
|
|
#include "locking.h"
|
2009-04-03 21:47:43 +08:00
|
|
|
#include "free-space-cache.h"
|
2015-09-30 11:50:37 +08:00
|
|
|
#include "free-space-tree.h"
|
2012-09-13 18:51:36 +08:00
|
|
|
#include "math.h"
|
2013-11-02 01:07:04 +08:00
|
|
|
#include "sysfs.h"
|
2014-05-14 08:30:47 +08:00
|
|
|
#include "qgroup.h"
|
2017-09-30 03:43:50 +08:00
|
|
|
#include "ref-verify.h"
|
2019-06-19 04:09:16 +08:00
|
|
|
#include "space-info.h"
|
2019-06-20 01:47:17 +08:00
|
|
|
#include "block-rsv.h"
|
2019-06-20 03:12:00 +08:00
|
|
|
#include "delalloc-space.h"
|
2007-02-26 23:40:21 +08:00
|
|
|
|
2011-09-12 18:22:57 +08:00
|
|
|
#undef SCRAMBLE_DELAYED_REFS
|
|
|
|
|
2018-10-24 20:24:01 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
|
2018-06-20 20:48:57 +08:00
|
|
|
struct btrfs_delayed_ref_node *node, u64 parent,
|
|
|
|
u64 root_objectid, u64 owner_objectid,
|
|
|
|
u64 owner_offset, int refs_to_drop,
|
|
|
|
struct btrfs_delayed_extent_op *extra_op);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
|
|
|
|
struct extent_buffer *leaf,
|
|
|
|
struct btrfs_extent_item *ei);
|
|
|
|
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
|
|
|
|
u64 parent, u64 root_objectid,
|
|
|
|
u64 flags, u64 owner, u64 offset,
|
|
|
|
struct btrfs_key *ins, int ref_mod);
|
|
|
|
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
|
2018-05-21 17:27:21 +08:00
|
|
|
struct btrfs_delayed_ref_node *node,
|
2018-05-21 17:27:22 +08:00
|
|
|
struct btrfs_delayed_extent_op *extent_op);
|
2009-09-12 04:11:19 +08:00
|
|
|
static int find_next_key(struct btrfs_path *path, int level,
|
|
|
|
struct btrfs_key *key);
|
2009-02-21 00:00:09 +08:00
|
|
|
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
static noinline int
|
|
|
|
block_group_cache_done(struct btrfs_block_group_cache *cache)
|
|
|
|
{
|
|
|
|
smp_mb();
|
2013-08-05 23:15:21 +08:00
|
|
|
return cache->cached == BTRFS_CACHE_FINISHED ||
|
|
|
|
cache->cached == BTRFS_CACHE_ERROR;
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
}
|
|
|
|
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
|
|
|
|
{
|
|
|
|
return (cache->flags & bits) == bits;
|
|
|
|
}
|
|
|
|
|
2015-11-19 19:45:48 +08:00
|
|
|
void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
|
2009-11-14 04:12:59 +08:00
|
|
|
{
|
|
|
|
atomic_inc(&cache->count);
|
|
|
|
}
|
|
|
|
|
|
|
|
void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
|
|
|
|
{
|
2010-05-16 22:46:25 +08:00
|
|
|
if (atomic_dec_and_test(&cache->count)) {
|
|
|
|
WARN_ON(cache->pinned > 0);
|
|
|
|
WARN_ON(cache->reserved > 0);
|
2017-04-14 08:35:54 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If not empty, someone is still holding mutex of
|
|
|
|
* full_stripe_lock, which can only be released by caller.
|
|
|
|
* And it will definitely cause use-after-free when caller
|
|
|
|
* tries to release full stripe lock.
|
|
|
|
*
|
|
|
|
* No better way to resolve, but only to warn.
|
|
|
|
*/
|
|
|
|
WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
|
2011-03-29 13:46:06 +08:00
|
|
|
kfree(cache->free_space_ctl);
|
2009-11-14 04:12:59 +08:00
|
|
|
kfree(cache);
|
2010-05-16 22:46:25 +08:00
|
|
|
}
|
2009-11-14 04:12:59 +08:00
|
|
|
}
|
|
|
|
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
/*
|
|
|
|
* this adds the block group to the fs_info rb tree for the block group
|
|
|
|
* cache
|
|
|
|
*/
|
2008-12-02 22:54:17 +08:00
|
|
|
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
struct btrfs_block_group_cache *block_group)
|
|
|
|
{
|
|
|
|
struct rb_node **p;
|
|
|
|
struct rb_node *parent = NULL;
|
|
|
|
struct btrfs_block_group_cache *cache;
|
|
|
|
|
|
|
|
spin_lock(&info->block_group_cache_lock);
|
|
|
|
p = &info->block_group_cache_tree.rb_node;
|
|
|
|
|
|
|
|
while (*p) {
|
|
|
|
parent = *p;
|
|
|
|
cache = rb_entry(parent, struct btrfs_block_group_cache,
|
|
|
|
cache_node);
|
|
|
|
if (block_group->key.objectid < cache->key.objectid) {
|
|
|
|
p = &(*p)->rb_left;
|
|
|
|
} else if (block_group->key.objectid > cache->key.objectid) {
|
|
|
|
p = &(*p)->rb_right;
|
|
|
|
} else {
|
|
|
|
spin_unlock(&info->block_group_cache_lock);
|
|
|
|
return -EEXIST;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
rb_link_node(&block_group->cache_node, parent, p);
|
|
|
|
rb_insert_color(&block_group->cache_node,
|
|
|
|
&info->block_group_cache_tree);
|
2012-12-27 17:01:23 +08:00
|
|
|
|
|
|
|
if (info->first_logical_byte > block_group->key.objectid)
|
|
|
|
info->first_logical_byte = block_group->key.objectid;
|
|
|
|
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
spin_unlock(&info->block_group_cache_lock);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This will return the block group at or after bytenr if contains is 0, else
|
|
|
|
* it will return the block group that contains the bytenr
|
|
|
|
*/
|
|
|
|
static struct btrfs_block_group_cache *
|
|
|
|
block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
|
|
|
|
int contains)
|
|
|
|
{
|
|
|
|
struct btrfs_block_group_cache *cache, *ret = NULL;
|
|
|
|
struct rb_node *n;
|
|
|
|
u64 end, start;
|
|
|
|
|
|
|
|
spin_lock(&info->block_group_cache_lock);
|
|
|
|
n = info->block_group_cache_tree.rb_node;
|
|
|
|
|
|
|
|
while (n) {
|
|
|
|
cache = rb_entry(n, struct btrfs_block_group_cache,
|
|
|
|
cache_node);
|
|
|
|
end = cache->key.objectid + cache->key.offset - 1;
|
|
|
|
start = cache->key.objectid;
|
|
|
|
|
|
|
|
if (bytenr < start) {
|
|
|
|
if (!contains && (!ret || start < ret->key.objectid))
|
|
|
|
ret = cache;
|
|
|
|
n = n->rb_left;
|
|
|
|
} else if (bytenr > start) {
|
|
|
|
if (contains && bytenr <= end) {
|
|
|
|
ret = cache;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
n = n->rb_right;
|
|
|
|
} else {
|
|
|
|
ret = cache;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2012-12-27 17:01:23 +08:00
|
|
|
if (ret) {
|
2009-11-14 04:12:59 +08:00
|
|
|
btrfs_get_block_group(ret);
|
2012-12-27 17:01:23 +08:00
|
|
|
if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
|
|
|
|
info->first_logical_byte = ret->key.objectid;
|
|
|
|
}
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
spin_unlock(&info->block_group_cache_lock);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
static int add_excluded_extent(struct btrfs_fs_info *fs_info,
|
2009-09-12 04:11:19 +08:00
|
|
|
u64 start, u64 num_bytes)
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
{
|
2009-09-12 04:11:19 +08:00
|
|
|
u64 end = start + num_bytes - 1;
|
2016-06-23 06:54:23 +08:00
|
|
|
set_extent_bits(&fs_info->freed_extents[0],
|
2016-04-27 05:54:39 +08:00
|
|
|
start, end, EXTENT_UPTODATE);
|
2016-06-23 06:54:23 +08:00
|
|
|
set_extent_bits(&fs_info->freed_extents[1],
|
2016-04-27 05:54:39 +08:00
|
|
|
start, end, EXTENT_UPTODATE);
|
2009-09-12 04:11:19 +08:00
|
|
|
return 0;
|
|
|
|
}
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
|
2018-06-20 20:49:08 +08:00
|
|
|
static void free_excluded_extents(struct btrfs_block_group_cache *cache)
|
2009-09-12 04:11:19 +08:00
|
|
|
{
|
2018-06-20 20:49:08 +08:00
|
|
|
struct btrfs_fs_info *fs_info = cache->fs_info;
|
2009-09-12 04:11:19 +08:00
|
|
|
u64 start, end;
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
|
2009-09-12 04:11:19 +08:00
|
|
|
start = cache->key.objectid;
|
|
|
|
end = start + cache->key.offset - 1;
|
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
clear_extent_bits(&fs_info->freed_extents[0],
|
2016-04-27 05:54:39 +08:00
|
|
|
start, end, EXTENT_UPTODATE);
|
2016-06-23 06:54:23 +08:00
|
|
|
clear_extent_bits(&fs_info->freed_extents[1],
|
2016-04-27 05:54:39 +08:00
|
|
|
start, end, EXTENT_UPTODATE);
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
}
|
|
|
|
|
2018-06-20 20:49:09 +08:00
|
|
|
static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
{
|
2018-06-20 20:49:09 +08:00
|
|
|
struct btrfs_fs_info *fs_info = cache->fs_info;
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
u64 bytenr;
|
|
|
|
u64 *logical;
|
|
|
|
int stripe_len;
|
|
|
|
int i, nr, ret;
|
|
|
|
|
2009-11-26 17:31:11 +08:00
|
|
|
if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
|
|
|
|
stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
|
|
|
|
cache->bytes_super += stripe_len;
|
2016-06-23 06:54:24 +08:00
|
|
|
ret = add_excluded_extent(fs_info, cache->key.objectid,
|
2009-11-26 17:31:11 +08:00
|
|
|
stripe_len);
|
2013-03-20 00:13:25 +08:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
2009-11-26 17:31:11 +08:00
|
|
|
}
|
|
|
|
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
|
|
|
|
bytenr = btrfs_sb_offset(i);
|
2016-06-23 06:54:23 +08:00
|
|
|
ret = btrfs_rmap_block(fs_info, cache->key.objectid,
|
2018-05-04 15:53:05 +08:00
|
|
|
bytenr, &logical, &nr, &stripe_len);
|
2013-03-20 00:13:25 +08:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
2009-09-12 04:11:19 +08:00
|
|
|
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
while (nr--) {
|
2013-04-24 00:55:21 +08:00
|
|
|
u64 start, len;
|
|
|
|
|
|
|
|
if (logical[nr] > cache->key.objectid +
|
|
|
|
cache->key.offset)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (logical[nr] + stripe_len <= cache->key.objectid)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
start = logical[nr];
|
|
|
|
if (start < cache->key.objectid) {
|
|
|
|
start = cache->key.objectid;
|
|
|
|
len = (logical[nr] + stripe_len) - start;
|
|
|
|
} else {
|
|
|
|
len = min_t(u64, stripe_len,
|
|
|
|
cache->key.objectid +
|
|
|
|
cache->key.offset - start);
|
|
|
|
}
|
|
|
|
|
|
|
|
cache->bytes_super += len;
|
2016-06-23 06:54:24 +08:00
|
|
|
ret = add_excluded_extent(fs_info, start, len);
|
2013-03-20 00:13:25 +08:00
|
|
|
if (ret) {
|
|
|
|
kfree(logical);
|
|
|
|
return ret;
|
|
|
|
}
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
}
|
2009-09-12 04:11:19 +08:00
|
|
|
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
kfree(logical);
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-09-12 04:11:19 +08:00
|
|
|
static struct btrfs_caching_control *
|
|
|
|
get_caching_control(struct btrfs_block_group_cache *cache)
|
|
|
|
{
|
|
|
|
struct btrfs_caching_control *ctl;
|
|
|
|
|
|
|
|
spin_lock(&cache->lock);
|
2010-09-17 04:17:03 +08:00
|
|
|
if (!cache->caching_ctl) {
|
|
|
|
spin_unlock(&cache->lock);
|
2009-09-12 04:11:19 +08:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
ctl = cache->caching_ctl;
|
2017-03-03 16:55:14 +08:00
|
|
|
refcount_inc(&ctl->count);
|
2009-09-12 04:11:19 +08:00
|
|
|
spin_unlock(&cache->lock);
|
|
|
|
return ctl;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void put_caching_control(struct btrfs_caching_control *ctl)
|
|
|
|
{
|
2017-03-03 16:55:14 +08:00
|
|
|
if (refcount_dec_and_test(&ctl->count))
|
2009-09-12 04:11:19 +08:00
|
|
|
kfree(ctl);
|
|
|
|
}
|
|
|
|
|
2015-09-24 02:54:14 +08:00
|
|
|
#ifdef CONFIG_BTRFS_DEBUG
|
2016-06-23 06:54:24 +08:00
|
|
|
static void fragment_free_space(struct btrfs_block_group_cache *block_group)
|
2015-09-24 02:54:14 +08:00
|
|
|
{
|
2016-06-23 06:54:24 +08:00
|
|
|
struct btrfs_fs_info *fs_info = block_group->fs_info;
|
2015-09-24 02:54:14 +08:00
|
|
|
u64 start = block_group->key.objectid;
|
|
|
|
u64 len = block_group->key.offset;
|
|
|
|
u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
|
2016-06-23 06:54:23 +08:00
|
|
|
fs_info->nodesize : fs_info->sectorsize;
|
2015-09-24 02:54:14 +08:00
|
|
|
u64 step = chunk << 1;
|
|
|
|
|
|
|
|
while (len > chunk) {
|
|
|
|
btrfs_remove_free_space(block_group, start, chunk);
|
|
|
|
start += step;
|
|
|
|
if (len < step)
|
|
|
|
len = 0;
|
|
|
|
else
|
|
|
|
len -= step;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
/*
|
|
|
|
* this is only called by cache_block_group, since we could have freed extents
|
|
|
|
* we need to check the pinned_extents for any extents that can't be used yet
|
|
|
|
* since their free space will be released as soon as the transaction commits.
|
|
|
|
*/
|
2015-09-30 11:50:35 +08:00
|
|
|
u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
|
2018-05-10 20:44:45 +08:00
|
|
|
u64 start, u64 end)
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
{
|
2018-05-10 20:44:45 +08:00
|
|
|
struct btrfs_fs_info *info = block_group->fs_info;
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
u64 extent_start, extent_end, size, total_added = 0;
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
while (start < end) {
|
2009-09-12 04:11:19 +08:00
|
|
|
ret = find_first_extent_bit(info->pinned_extents, start,
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
&extent_start, &extent_end,
|
2012-09-28 05:07:30 +08:00
|
|
|
EXTENT_DIRTY | EXTENT_UPTODATE,
|
|
|
|
NULL);
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
if (ret)
|
|
|
|
break;
|
|
|
|
|
2009-11-26 17:31:11 +08:00
|
|
|
if (extent_start <= start) {
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
start = extent_end + 1;
|
|
|
|
} else if (extent_start > start && extent_start < end) {
|
|
|
|
size = extent_start - start;
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
total_added += size;
|
2008-11-21 01:16:16 +08:00
|
|
|
ret = btrfs_add_free_space(block_group, start,
|
|
|
|
size);
|
2012-03-12 23:03:00 +08:00
|
|
|
BUG_ON(ret); /* -ENOMEM or logic error */
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
start = extent_end + 1;
|
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (start < end) {
|
|
|
|
size = end - start;
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
total_added += size;
|
2008-11-21 01:16:16 +08:00
|
|
|
ret = btrfs_add_free_space(block_group, start, size);
|
2012-03-12 23:03:00 +08:00
|
|
|
BUG_ON(ret); /* -ENOMEM or logic error */
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
}
|
|
|
|
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
return total_added;
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
}
|
|
|
|
|
2015-09-30 11:50:33 +08:00
|
|
|
static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
|
2007-05-10 08:13:14 +08:00
|
|
|
{
|
2016-06-23 06:54:23 +08:00
|
|
|
struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
|
|
|
|
struct btrfs_fs_info *fs_info = block_group->fs_info;
|
|
|
|
struct btrfs_root *extent_root = fs_info->extent_root;
|
2007-05-10 08:13:14 +08:00
|
|
|
struct btrfs_path *path;
|
2007-10-16 04:14:19 +08:00
|
|
|
struct extent_buffer *leaf;
|
2009-09-12 04:11:19 +08:00
|
|
|
struct btrfs_key key;
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
u64 total_found = 0;
|
2009-09-12 04:11:19 +08:00
|
|
|
u64 last = 0;
|
|
|
|
u32 nritems;
|
2015-09-30 11:50:33 +08:00
|
|
|
int ret;
|
2015-09-24 02:54:14 +08:00
|
|
|
bool wakeup = true;
|
2007-10-16 04:14:48 +08:00
|
|
|
|
2007-05-10 08:13:14 +08:00
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
2015-09-30 11:50:33 +08:00
|
|
|
return -ENOMEM;
|
2007-09-15 04:15:28 +08:00
|
|
|
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
|
2009-09-12 04:11:19 +08:00
|
|
|
|
2015-09-24 02:54:14 +08:00
|
|
|
#ifdef CONFIG_BTRFS_DEBUG
|
|
|
|
/*
|
|
|
|
* If we're fragmenting we don't want to make anybody think we can
|
|
|
|
* allocate from this block group until we've had a chance to fragment
|
|
|
|
* the free space.
|
|
|
|
*/
|
2016-06-23 06:54:24 +08:00
|
|
|
if (btrfs_should_fragment_free_space(block_group))
|
2015-09-24 02:54:14 +08:00
|
|
|
wakeup = false;
|
|
|
|
#endif
|
2008-06-26 04:01:30 +08:00
|
|
|
/*
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
* We don't want to deadlock with somebody trying to allocate a new
|
|
|
|
* extent for the extent root while also trying to search the extent
|
|
|
|
* root to add free space. So we skip locking and search the commit
|
|
|
|
* root, since its read-only
|
2008-06-26 04:01:30 +08:00
|
|
|
*/
|
|
|
|
path->skip_locking = 1;
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
path->search_commit_root = 1;
|
2015-11-27 23:31:35 +08:00
|
|
|
path->reada = READA_FORWARD;
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
|
2008-12-12 23:03:26 +08:00
|
|
|
key.objectid = last;
|
2007-05-10 08:13:14 +08:00
|
|
|
key.offset = 0;
|
2009-09-12 04:11:19 +08:00
|
|
|
key.type = BTRFS_EXTENT_ITEM_KEY;
|
2009-08-01 02:57:55 +08:00
|
|
|
|
2013-07-11 17:51:15 +08:00
|
|
|
next:
|
2009-09-12 04:11:19 +08:00
|
|
|
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
|
2007-05-10 08:13:14 +08:00
|
|
|
if (ret < 0)
|
2015-09-30 11:50:33 +08:00
|
|
|
goto out;
|
2008-12-09 05:46:26 +08:00
|
|
|
|
2009-09-12 04:11:19 +08:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
nritems = btrfs_header_nritems(leaf);
|
|
|
|
|
2009-01-06 10:25:51 +08:00
|
|
|
while (1) {
|
2011-06-01 00:07:27 +08:00
|
|
|
if (btrfs_fs_closing(fs_info) > 1) {
|
2009-07-28 20:41:57 +08:00
|
|
|
last = (u64)-1;
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
break;
|
2009-07-28 20:41:57 +08:00
|
|
|
}
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
|
2009-09-12 04:11:19 +08:00
|
|
|
if (path->slots[0] < nritems) {
|
|
|
|
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
|
|
|
|
} else {
|
|
|
|
ret = find_next_key(path, 0, &key);
|
|
|
|
if (ret)
|
2007-05-10 08:13:14 +08:00
|
|
|
break;
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
|
2013-09-19 22:02:11 +08:00
|
|
|
if (need_resched() ||
|
2014-03-14 03:42:13 +08:00
|
|
|
rwsem_is_contended(&fs_info->commit_root_sem)) {
|
2015-09-24 02:54:14 +08:00
|
|
|
if (wakeup)
|
|
|
|
caching_ctl->progress = last;
|
2011-05-28 19:00:39 +08:00
|
|
|
btrfs_release_path(path);
|
2014-03-14 03:42:13 +08:00
|
|
|
up_read(&fs_info->commit_root_sem);
|
2011-05-12 05:30:53 +08:00
|
|
|
mutex_unlock(&caching_ctl->mutex);
|
2009-09-12 04:11:19 +08:00
|
|
|
cond_resched();
|
2015-09-30 11:50:33 +08:00
|
|
|
mutex_lock(&caching_ctl->mutex);
|
|
|
|
down_read(&fs_info->commit_root_sem);
|
|
|
|
goto next;
|
2011-05-12 05:30:53 +08:00
|
|
|
}
|
2013-04-20 02:37:26 +08:00
|
|
|
|
|
|
|
ret = btrfs_next_leaf(extent_root, path);
|
|
|
|
if (ret < 0)
|
2015-09-30 11:50:33 +08:00
|
|
|
goto out;
|
2013-04-20 02:37:26 +08:00
|
|
|
if (ret)
|
|
|
|
break;
|
2011-05-12 05:30:53 +08:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
nritems = btrfs_header_nritems(leaf);
|
|
|
|
continue;
|
2009-09-12 04:11:19 +08:00
|
|
|
}
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
|
2013-07-11 17:51:15 +08:00
|
|
|
if (key.objectid < last) {
|
|
|
|
key.objectid = last;
|
|
|
|
key.offset = 0;
|
|
|
|
key.type = BTRFS_EXTENT_ITEM_KEY;
|
|
|
|
|
2015-09-24 02:54:14 +08:00
|
|
|
if (wakeup)
|
|
|
|
caching_ctl->progress = last;
|
2013-07-11 17:51:15 +08:00
|
|
|
btrfs_release_path(path);
|
|
|
|
goto next;
|
|
|
|
}
|
|
|
|
|
2009-09-12 04:11:19 +08:00
|
|
|
if (key.objectid < block_group->key.objectid) {
|
|
|
|
path->slots[0]++;
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
continue;
|
2007-05-10 08:13:14 +08:00
|
|
|
}
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
|
2007-05-10 08:13:14 +08:00
|
|
|
if (key.objectid >= block_group->key.objectid +
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
block_group->key.offset)
|
2007-05-10 08:13:14 +08:00
|
|
|
break;
|
2007-09-15 04:15:28 +08:00
|
|
|
|
2013-03-08 03:22:04 +08:00
|
|
|
if (key.type == BTRFS_EXTENT_ITEM_KEY ||
|
|
|
|
key.type == BTRFS_METADATA_ITEM_KEY) {
|
2018-05-10 20:44:45 +08:00
|
|
|
total_found += add_new_free_space(block_group, last,
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
key.objectid);
|
2013-03-08 03:22:04 +08:00
|
|
|
if (key.type == BTRFS_METADATA_ITEM_KEY)
|
|
|
|
last = key.objectid +
|
2016-06-15 21:22:56 +08:00
|
|
|
fs_info->nodesize;
|
2013-03-08 03:22:04 +08:00
|
|
|
else
|
|
|
|
last = key.objectid + key.offset;
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
|
2015-09-30 11:50:33 +08:00
|
|
|
if (total_found > CACHING_CTL_WAKE_UP) {
|
2009-09-12 04:11:19 +08:00
|
|
|
total_found = 0;
|
2015-09-24 02:54:14 +08:00
|
|
|
if (wakeup)
|
|
|
|
wake_up(&caching_ctl->wait);
|
2009-09-12 04:11:19 +08:00
|
|
|
}
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
}
|
2007-05-10 08:13:14 +08:00
|
|
|
path->slots[0]++;
|
|
|
|
}
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
ret = 0;
|
2007-05-10 08:13:14 +08:00
|
|
|
|
2018-05-10 20:44:45 +08:00
|
|
|
total_found += add_new_free_space(block_group, last,
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
block_group->key.objectid +
|
|
|
|
block_group->key.offset);
|
2009-09-12 04:11:19 +08:00
|
|
|
caching_ctl->progress = (u64)-1;
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
|
2015-09-30 11:50:33 +08:00
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static noinline void caching_thread(struct btrfs_work *work)
|
|
|
|
{
|
|
|
|
struct btrfs_block_group_cache *block_group;
|
|
|
|
struct btrfs_fs_info *fs_info;
|
|
|
|
struct btrfs_caching_control *caching_ctl;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
caching_ctl = container_of(work, struct btrfs_caching_control, work);
|
|
|
|
block_group = caching_ctl->block_group;
|
|
|
|
fs_info = block_group->fs_info;
|
|
|
|
|
|
|
|
mutex_lock(&caching_ctl->mutex);
|
|
|
|
down_read(&fs_info->commit_root_sem);
|
|
|
|
|
2015-09-30 11:50:37 +08:00
|
|
|
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
|
|
|
|
ret = load_free_space_tree(caching_ctl);
|
|
|
|
else
|
|
|
|
ret = load_extent_tree_free(caching_ctl);
|
2015-09-30 11:50:33 +08:00
|
|
|
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
spin_lock(&block_group->lock);
|
2009-09-12 04:11:19 +08:00
|
|
|
block_group->caching_ctl = NULL;
|
2015-09-30 11:50:33 +08:00
|
|
|
block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
spin_unlock(&block_group->lock);
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
|
2015-09-24 02:54:14 +08:00
|
|
|
#ifdef CONFIG_BTRFS_DEBUG
|
2016-06-23 06:54:24 +08:00
|
|
|
if (btrfs_should_fragment_free_space(block_group)) {
|
2015-09-24 02:54:14 +08:00
|
|
|
u64 bytes_used;
|
|
|
|
|
|
|
|
spin_lock(&block_group->space_info->lock);
|
|
|
|
spin_lock(&block_group->lock);
|
|
|
|
bytes_used = block_group->key.offset -
|
|
|
|
btrfs_block_group_used(&block_group->item);
|
|
|
|
block_group->space_info->bytes_used += bytes_used >> 1;
|
|
|
|
spin_unlock(&block_group->lock);
|
|
|
|
spin_unlock(&block_group->space_info->lock);
|
2016-06-23 06:54:24 +08:00
|
|
|
fragment_free_space(block_group);
|
2015-09-24 02:54:14 +08:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
caching_ctl->progress = (u64)-1;
|
2009-09-12 04:11:19 +08:00
|
|
|
|
2014-03-14 03:42:13 +08:00
|
|
|
up_read(&fs_info->commit_root_sem);
|
2018-06-20 20:49:08 +08:00
|
|
|
free_excluded_extents(block_group);
|
2009-09-12 04:11:19 +08:00
|
|
|
mutex_unlock(&caching_ctl->mutex);
|
2015-09-30 11:50:33 +08:00
|
|
|
|
2009-09-12 04:11:19 +08:00
|
|
|
wake_up(&caching_ctl->wait);
|
|
|
|
|
|
|
|
put_caching_control(caching_ctl);
|
2009-11-14 04:12:59 +08:00
|
|
|
btrfs_put_block_group(block_group);
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
}
|
|
|
|
|
2010-08-26 04:54:15 +08:00
|
|
|
static int cache_block_group(struct btrfs_block_group_cache *cache,
|
|
|
|
int load_cache_only)
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
{
|
2011-11-15 02:52:14 +08:00
|
|
|
DEFINE_WAIT(wait);
|
2009-09-12 04:11:19 +08:00
|
|
|
struct btrfs_fs_info *fs_info = cache->fs_info;
|
|
|
|
struct btrfs_caching_control *caching_ctl;
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
int ret = 0;
|
|
|
|
|
2011-11-15 02:52:14 +08:00
|
|
|
caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (!caching_ctl)
|
|
|
|
return -ENOMEM;
|
2011-11-15 02:52:14 +08:00
|
|
|
|
|
|
|
INIT_LIST_HEAD(&caching_ctl->list);
|
|
|
|
mutex_init(&caching_ctl->mutex);
|
|
|
|
init_waitqueue_head(&caching_ctl->wait);
|
|
|
|
caching_ctl->block_group = cache;
|
|
|
|
caching_ctl->progress = cache->key.objectid;
|
2017-03-03 16:55:14 +08:00
|
|
|
refcount_set(&caching_ctl->count, 1);
|
Btrfs: fix task hang under heavy compressed write
This has been reported and discussed for a long time, and this hang occurs in
both 3.15 and 3.16.
Btrfs now migrates to use kernel workqueue, but it introduces this hang problem.
Btrfs has a kind of work queued as an ordered way, which means that its
ordered_func() must be processed in the way of FIFO, so it usually looks like --
normal_work_helper(arg)
work = container_of(arg, struct btrfs_work, normal_work);
work->func() <---- (we name it work X)
for ordered_work in wq->ordered_list
ordered_work->ordered_func()
ordered_work->ordered_free()
The hang is a rare case, first when we find free space, we get an uncached block
group, then we go to read its free space cache inode for free space information,
so it will
file a readahead request
btrfs_readpages()
for page that is not in page cache
__do_readpage()
submit_extent_page()
btrfs_submit_bio_hook()
btrfs_bio_wq_end_io()
submit_bio()
end_workqueue_bio() <--(ret by the 1st endio)
queue a work(named work Y) for the 2nd
also the real endio()
So the hang occurs when work Y's work_struct and work X's work_struct happens
to share the same address.
A bit more explanation,
A,B,C -- struct btrfs_work
arg -- struct work_struct
kthread:
worker_thread()
pick up a work_struct from @worklist
process_one_work(arg)
worker->current_work = arg; <-- arg is A->normal_work
worker->current_func(arg)
normal_work_helper(arg)
A = container_of(arg, struct btrfs_work, normal_work);
A->func()
A->ordered_func()
A->ordered_free() <-- A gets freed
B->ordered_func()
submit_compressed_extents()
find_free_extent()
load_free_space_inode()
... <-- (the above readhead stack)
end_workqueue_bio()
btrfs_queue_work(work C)
B->ordered_free()
As if work A has a high priority in wq->ordered_list and there are more ordered
works queued after it, such as B->ordered_func(), its memory could have been
freed before normal_work_helper() returns, which means that kernel workqueue
code worker_thread() still has worker->current_work pointer to be work
A->normal_work's, ie. arg's address.
Meanwhile, work C is allocated after work A is freed, work C->normal_work
and work A->normal_work are likely to share the same address(I confirmed this
with ftrace output, so I'm not just guessing, it's rare though).
When another kthread picks up work C->normal_work to process, and finds our
kthread is processing it(see find_worker_executing_work()), it'll think
work C as a collision and skip then, which ends up nobody processing work C.
So the situation is that our kthread is waiting forever on work C.
Besides, there're other cases that can lead to deadlock, but the real problem
is that all btrfs workqueue shares one work->func, -- normal_work_helper,
so this makes each workqueue to have its own helper function, but only a
wraper pf normal_work_helper.
With this patch, I no long hit the above hang.
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-15 23:36:53 +08:00
|
|
|
btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
|
|
|
|
caching_thread, NULL, NULL);
|
2011-11-15 02:52:14 +08:00
|
|
|
|
|
|
|
spin_lock(&cache->lock);
|
|
|
|
/*
|
|
|
|
* This should be a rare occasion, but this could happen I think in the
|
|
|
|
* case where one thread starts to load the space cache info, and then
|
|
|
|
* some other thread starts a transaction commit which tries to do an
|
|
|
|
* allocation while the other thread is still loading the space cache
|
|
|
|
* info. The previous loop should have kept us from choosing this block
|
|
|
|
* group, but if we've moved to the state where we will wait on caching
|
|
|
|
* block groups we need to first check if we're doing a fast load here,
|
|
|
|
* so we can wait for it to finish, otherwise we could end up allocating
|
|
|
|
* from a block group who's cache gets evicted for one reason or
|
|
|
|
* another.
|
|
|
|
*/
|
|
|
|
while (cache->cached == BTRFS_CACHE_FAST) {
|
|
|
|
struct btrfs_caching_control *ctl;
|
|
|
|
|
|
|
|
ctl = cache->caching_ctl;
|
2017-03-03 16:55:14 +08:00
|
|
|
refcount_inc(&ctl->count);
|
2011-11-15 02:52:14 +08:00
|
|
|
prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
|
|
|
|
spin_unlock(&cache->lock);
|
|
|
|
|
|
|
|
schedule();
|
|
|
|
|
|
|
|
finish_wait(&ctl->wait, &wait);
|
|
|
|
put_caching_control(ctl);
|
|
|
|
spin_lock(&cache->lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (cache->cached != BTRFS_CACHE_NO) {
|
|
|
|
spin_unlock(&cache->lock);
|
|
|
|
kfree(caching_ctl);
|
2009-09-12 04:11:19 +08:00
|
|
|
return 0;
|
2011-11-15 02:52:14 +08:00
|
|
|
}
|
|
|
|
WARN_ON(cache->caching_ctl);
|
|
|
|
cache->caching_ctl = caching_ctl;
|
|
|
|
cache->cached = BTRFS_CACHE_FAST;
|
|
|
|
spin_unlock(&cache->lock);
|
2009-09-12 04:11:19 +08:00
|
|
|
|
2017-09-12 19:08:08 +08:00
|
|
|
if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
|
2014-11-27 00:52:54 +08:00
|
|
|
mutex_lock(&caching_ctl->mutex);
|
2019-03-20 20:47:15 +08:00
|
|
|
ret = load_free_space_cache(cache);
|
2010-08-26 04:54:15 +08:00
|
|
|
|
|
|
|
spin_lock(&cache->lock);
|
|
|
|
if (ret == 1) {
|
2011-11-15 02:52:14 +08:00
|
|
|
cache->caching_ctl = NULL;
|
2010-08-26 04:54:15 +08:00
|
|
|
cache->cached = BTRFS_CACHE_FINISHED;
|
|
|
|
cache->last_byte_to_unpin = (u64)-1;
|
2014-11-27 00:52:54 +08:00
|
|
|
caching_ctl->progress = (u64)-1;
|
2010-08-26 04:54:15 +08:00
|
|
|
} else {
|
2011-11-15 02:52:14 +08:00
|
|
|
if (load_cache_only) {
|
|
|
|
cache->caching_ctl = NULL;
|
|
|
|
cache->cached = BTRFS_CACHE_NO;
|
|
|
|
} else {
|
|
|
|
cache->cached = BTRFS_CACHE_STARTED;
|
2014-11-26 23:28:51 +08:00
|
|
|
cache->has_caching_ctl = 1;
|
2011-11-15 02:52:14 +08:00
|
|
|
}
|
2010-08-26 04:54:15 +08:00
|
|
|
}
|
|
|
|
spin_unlock(&cache->lock);
|
2015-09-24 02:54:14 +08:00
|
|
|
#ifdef CONFIG_BTRFS_DEBUG
|
|
|
|
if (ret == 1 &&
|
2016-06-23 06:54:24 +08:00
|
|
|
btrfs_should_fragment_free_space(cache)) {
|
2015-09-24 02:54:14 +08:00
|
|
|
u64 bytes_used;
|
|
|
|
|
|
|
|
spin_lock(&cache->space_info->lock);
|
|
|
|
spin_lock(&cache->lock);
|
|
|
|
bytes_used = cache->key.offset -
|
|
|
|
btrfs_block_group_used(&cache->item);
|
|
|
|
cache->space_info->bytes_used += bytes_used >> 1;
|
|
|
|
spin_unlock(&cache->lock);
|
|
|
|
spin_unlock(&cache->space_info->lock);
|
2016-06-23 06:54:24 +08:00
|
|
|
fragment_free_space(cache);
|
2015-09-24 02:54:14 +08:00
|
|
|
}
|
|
|
|
#endif
|
2014-11-27 00:52:54 +08:00
|
|
|
mutex_unlock(&caching_ctl->mutex);
|
|
|
|
|
2011-11-15 02:52:14 +08:00
|
|
|
wake_up(&caching_ctl->wait);
|
2011-02-02 23:53:47 +08:00
|
|
|
if (ret == 1) {
|
2011-11-15 02:52:14 +08:00
|
|
|
put_caching_control(caching_ctl);
|
2018-06-20 20:49:08 +08:00
|
|
|
free_excluded_extents(cache);
|
2010-08-26 04:54:15 +08:00
|
|
|
return 0;
|
2011-02-02 23:53:47 +08:00
|
|
|
}
|
2011-11-15 02:52:14 +08:00
|
|
|
} else {
|
|
|
|
/*
|
2015-09-30 11:50:37 +08:00
|
|
|
* We're either using the free space tree or no caching at all.
|
|
|
|
* Set cached to the appropriate value and wakeup any waiters.
|
2011-11-15 02:52:14 +08:00
|
|
|
*/
|
|
|
|
spin_lock(&cache->lock);
|
|
|
|
if (load_cache_only) {
|
|
|
|
cache->caching_ctl = NULL;
|
|
|
|
cache->cached = BTRFS_CACHE_NO;
|
|
|
|
} else {
|
|
|
|
cache->cached = BTRFS_CACHE_STARTED;
|
2014-11-26 23:28:51 +08:00
|
|
|
cache->has_caching_ctl = 1;
|
2011-11-15 02:52:14 +08:00
|
|
|
}
|
|
|
|
spin_unlock(&cache->lock);
|
|
|
|
wake_up(&caching_ctl->wait);
|
2010-08-26 04:54:15 +08:00
|
|
|
}
|
|
|
|
|
2011-11-15 02:52:14 +08:00
|
|
|
if (load_cache_only) {
|
|
|
|
put_caching_control(caching_ctl);
|
2009-09-12 04:11:19 +08:00
|
|
|
return 0;
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
}
|
|
|
|
|
2014-03-14 03:42:13 +08:00
|
|
|
down_write(&fs_info->commit_root_sem);
|
2017-03-03 16:55:14 +08:00
|
|
|
refcount_inc(&caching_ctl->count);
|
2009-09-12 04:11:19 +08:00
|
|
|
list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
|
2014-03-14 03:42:13 +08:00
|
|
|
up_write(&fs_info->commit_root_sem);
|
2009-09-12 04:11:19 +08:00
|
|
|
|
2009-11-14 04:12:59 +08:00
|
|
|
btrfs_get_block_group(cache);
|
2009-09-12 04:11:19 +08:00
|
|
|
|
2014-02-28 10:46:12 +08:00
|
|
|
btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
|
2008-09-24 01:14:11 +08:00
|
|
|
return ret;
|
2007-05-10 08:13:14 +08:00
|
|
|
}
|
|
|
|
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
/*
|
|
|
|
* return the block group that starts at or after bytenr
|
|
|
|
*/
|
2009-01-06 10:25:51 +08:00
|
|
|
static struct btrfs_block_group_cache *
|
|
|
|
btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
|
2008-05-25 02:04:53 +08:00
|
|
|
{
|
2016-09-13 03:35:52 +08:00
|
|
|
return block_group_cache_tree_search(info, bytenr, 0);
|
2008-05-25 02:04:53 +08:00
|
|
|
}
|
|
|
|
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
/*
|
2009-05-15 01:52:22 +08:00
|
|
|
* return the block group that contains the given bytenr
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
*/
|
2009-01-06 10:25:51 +08:00
|
|
|
struct btrfs_block_group_cache *btrfs_lookup_block_group(
|
|
|
|
struct btrfs_fs_info *info,
|
|
|
|
u64 bytenr)
|
2007-05-06 22:15:01 +08:00
|
|
|
{
|
2016-09-13 03:35:52 +08:00
|
|
|
return block_group_cache_tree_search(info, bytenr, 1);
|
2007-05-06 22:15:01 +08:00
|
|
|
}
|
2008-03-25 03:01:56 +08:00
|
|
|
|
2019-05-15 07:33:48 +08:00
|
|
|
static u64 generic_ref_to_space_flags(struct btrfs_ref *ref)
|
2017-06-07 07:45:26 +08:00
|
|
|
{
|
2019-04-04 14:45:34 +08:00
|
|
|
if (ref->type == BTRFS_REF_METADATA) {
|
|
|
|
if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID)
|
2019-05-15 07:33:48 +08:00
|
|
|
return BTRFS_BLOCK_GROUP_SYSTEM;
|
2017-06-07 07:45:26 +08:00
|
|
|
else
|
2019-05-15 07:33:48 +08:00
|
|
|
return BTRFS_BLOCK_GROUP_METADATA;
|
2017-06-07 07:45:26 +08:00
|
|
|
}
|
2019-05-15 07:33:48 +08:00
|
|
|
return BTRFS_BLOCK_GROUP_DATA;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void add_pinned_bytes(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_ref *ref)
|
|
|
|
{
|
|
|
|
struct btrfs_space_info *space_info;
|
|
|
|
u64 flags = generic_ref_to_space_flags(ref);
|
|
|
|
|
2019-06-19 04:09:19 +08:00
|
|
|
space_info = btrfs_find_space_info(fs_info, flags);
|
2019-05-15 07:33:48 +08:00
|
|
|
ASSERT(space_info);
|
|
|
|
percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len,
|
|
|
|
BTRFS_TOTAL_BYTES_PINNED_BATCH);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void sub_pinned_bytes(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_ref *ref)
|
|
|
|
{
|
|
|
|
struct btrfs_space_info *space_info;
|
|
|
|
u64 flags = generic_ref_to_space_flags(ref);
|
2017-06-07 07:45:26 +08:00
|
|
|
|
2019-06-19 04:09:19 +08:00
|
|
|
space_info = btrfs_find_space_info(fs_info, flags);
|
2017-06-07 07:45:27 +08:00
|
|
|
ASSERT(space_info);
|
2019-05-15 07:33:48 +08:00
|
|
|
percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len,
|
btrfs: use customized batch size for total_bytes_pinned
In commit b150a4f10d878 ("Btrfs: use a percpu to keep track of possibly
pinned bytes") we use total_bytes_pinned to track how many bytes we are
going to free in this transaction. When we are close to ENOSPC, we check it
and know if we can make the allocation by commit the current transaction.
For every data/metadata extent we are going to free, we add
total_bytes_pinned in btrfs_free_extent() and btrfs_free_tree_block(), and
release it in unpin_extent_range() when we finish the transaction. So this
is a variable we frequently update but rarely read - just the suitable
use of percpu_counter. But in previous commit we update total_bytes_pinned
by default 32 batch size, making every update essentially a spin lock
protected update. Since every spin lock/unlock operation involves syncing
a globally used variable and some kind of barrier in a SMP system, this is
more expensive than using total_bytes_pinned as a simple atomic64_t.
So fix this by using a customized batch size. Since we only read
total_bytes_pinned when we are close to ENOSPC and fail to allocate new
chunk, we can use a really large batch size and have nearly no penalty
in most cases.
[Test]
We tested the patch on a 4-cores x86 machine:
1. fallocate a 16GiB size test file
2. take snapshot (so all following writes will be COW)
3. run a 180 sec, 4 jobs, 4K random write fio on test file
We also added a temporary lockdep class on percpu_counter's spin lock
used by total_bytes_pinned to track it by lock_stat.
[Results]
unpatched:
lock_stat version 0.4
-----------------------------------------------------------------------
class name con-bounces contentions
waittime-min waittime-max waittime-total waittime-avg acq-bounces
acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
total_bytes_pinned_percpu: 82 82
0.21 0.61 29.46 0.36 298340
635973 0.09 11.01 173476.25 0.27
patched:
lock_stat version 0.4
-----------------------------------------------------------------------
class name con-bounces contentions
waittime-min waittime-max waittime-total waittime-avg acq-bounces
acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
total_bytes_pinned_percpu: 1 1
0.62 0.62 0.62 0.62 13601
31542 0.14 9.61 11016.90 0.35
[Analysis]
Since the spin lock only protects a single in-memory variable, the
contentions (number of lock acquisitions that had to wait) in both
unpatched and patched version are low. But when we see acquisitions and
acq-bounces, we get much lower counts in patched version. Here the most
important metric is acq-bounces. It means how many times the lock gets
transferred between different cpus, so the patch can really reduce
cacheline bouncing of spin lock (also the global counter of percpu_counter)
in a SMP system.
Fixes: b150a4f10d878 ("Btrfs: use a percpu to keep track of possibly pinned bytes")
Signed-off-by: Ethan Lien <ethanlien@synology.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-07-13 16:50:42 +08:00
|
|
|
BTRFS_TOTAL_BYTES_PINNED_BATCH);
|
2017-06-07 07:45:26 +08:00
|
|
|
}
|
|
|
|
|
2014-10-27 18:44:24 +08:00
|
|
|
/* simple helper to search for an existing data extent at a given offset */
|
2016-06-23 06:54:24 +08:00
|
|
|
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
|
2008-09-06 04:13:11 +08:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct btrfs_key key;
|
2008-09-24 01:14:14 +08:00
|
|
|
struct btrfs_path *path;
|
2008-09-06 04:13:11 +08:00
|
|
|
|
2008-09-24 01:14:14 +08:00
|
|
|
path = btrfs_alloc_path();
|
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-14 01:38:47 +08:00
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2008-09-06 04:13:11 +08:00
|
|
|
key.objectid = start;
|
|
|
|
key.offset = len;
|
2013-03-08 03:22:04 +08:00
|
|
|
key.type = BTRFS_EXTENT_ITEM_KEY;
|
2016-06-23 06:54:23 +08:00
|
|
|
ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
|
2008-09-24 01:14:14 +08:00
|
|
|
btrfs_free_path(path);
|
2007-12-11 22:25:06 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2010-05-16 22:48:46 +08:00
|
|
|
/*
|
2013-03-08 03:22:04 +08:00
|
|
|
* helper function to lookup reference count and flags of a tree block.
|
2010-05-16 22:48:46 +08:00
|
|
|
*
|
|
|
|
* the head node for delayed ref is used to store the sum of all the
|
|
|
|
* reference count modifications queued up in the rbtree. the head
|
|
|
|
* node may also store the extent flags to set. This way you can check
|
|
|
|
* to see what the reference count and extent flags would be if all of
|
|
|
|
* the delayed refs are not processed.
|
|
|
|
*/
|
|
|
|
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
|
2016-06-23 06:54:24 +08:00
|
|
|
struct btrfs_fs_info *fs_info, u64 bytenr,
|
2013-03-08 03:22:04 +08:00
|
|
|
u64 offset, int metadata, u64 *refs, u64 *flags)
|
2010-05-16 22:48:46 +08:00
|
|
|
{
|
|
|
|
struct btrfs_delayed_ref_head *head;
|
|
|
|
struct btrfs_delayed_ref_root *delayed_refs;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_extent_item *ei;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct btrfs_key key;
|
|
|
|
u32 item_size;
|
|
|
|
u64 num_refs;
|
|
|
|
u64 extent_flags;
|
|
|
|
int ret;
|
|
|
|
|
2013-03-08 03:22:04 +08:00
|
|
|
/*
|
|
|
|
* If we don't have skinny metadata, don't bother doing anything
|
|
|
|
* different
|
|
|
|
*/
|
2016-06-23 06:54:23 +08:00
|
|
|
if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
|
|
|
|
offset = fs_info->nodesize;
|
2013-03-08 03:22:04 +08:00
|
|
|
metadata = 0;
|
|
|
|
}
|
|
|
|
|
2010-05-16 22:48:46 +08:00
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
if (!trans) {
|
|
|
|
path->skip_locking = 1;
|
|
|
|
path->search_commit_root = 1;
|
|
|
|
}
|
Btrfs: don't miss skinny extent items on delayed ref head contention
Currently extent-tree.c:btrfs_lookup_extent_info() can miss the lookup
of skinny extent items. This can happen when the execution flow is the
following:
* We do an extent tree lookup and fail to find a skinny extent item;
* As a result, we attempt to see if a non-skinny extent item exists,
either by looking at previous item in the leaf or by doing another
full extent tree search;
* We have a transaction and then we check for a matching delayed ref
head in the transaction's delayed refs rbtree;
* We find such delayed ref head and then we try to lock it with a
call to mutex_trylock();
* The lock was contended so we jump to the label "again", which repeats
the extent tree search but for a non-skinny extent item, because we set
previously metadata variable to 0 and the search key to look for a
non-skinny extent-item;
* After the jump (and after releasing the transaction's delayed refs
lock), a skinny extent item might have been added to the extent tree
but we will miss it because metadata is set to 0 and the search key
is set for a non-skinny extent-item.
The fix here is to not reset metadata to 0 and to jump to the initial search
key setup if the delayed ref head is contended, instead of jumping directly
to the extent tree search label ("again").
This issue was found while investigating the issue reported at Bugzilla 64961.
David Sterba suspected this function was missing extent items, and that
this could be caused by the last change to this function, which was made
in the following patch:
[PATCH] Btrfs: optimize btrfs_lookup_extent_info()
(commit 74be9510876a66ad9826613ac8a526d26f9e7f01)
But in fact this issue already existed before, because after failing to find
a skinny extent item, the code set the search key for a non-skinny extent
item, and on contention of a matching delayed ref head it would not search
the extent tree for a skinny extent item anymore.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2013-12-08 08:26:29 +08:00
|
|
|
|
|
|
|
search_again:
|
|
|
|
key.objectid = bytenr;
|
|
|
|
key.offset = offset;
|
|
|
|
if (metadata)
|
|
|
|
key.type = BTRFS_METADATA_ITEM_KEY;
|
|
|
|
else
|
|
|
|
key.type = BTRFS_EXTENT_ITEM_KEY;
|
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
|
2010-05-16 22:48:46 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out_free;
|
|
|
|
|
2013-03-08 03:22:04 +08:00
|
|
|
if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
|
2013-07-06 06:12:06 +08:00
|
|
|
if (path->slots[0]) {
|
|
|
|
path->slots[0]--;
|
|
|
|
btrfs_item_key_to_cpu(path->nodes[0], &key,
|
|
|
|
path->slots[0]);
|
|
|
|
if (key.objectid == bytenr &&
|
|
|
|
key.type == BTRFS_EXTENT_ITEM_KEY &&
|
2016-06-23 06:54:23 +08:00
|
|
|
key.offset == fs_info->nodesize)
|
2013-07-06 06:12:06 +08:00
|
|
|
ret = 0;
|
|
|
|
}
|
2013-03-08 03:22:04 +08:00
|
|
|
}
|
|
|
|
|
2010-05-16 22:48:46 +08:00
|
|
|
if (ret == 0) {
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
|
|
|
|
if (item_size >= sizeof(*ei)) {
|
|
|
|
ei = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_extent_item);
|
|
|
|
num_refs = btrfs_extent_refs(leaf, ei);
|
|
|
|
extent_flags = btrfs_extent_flags(leaf, ei);
|
|
|
|
} else {
|
2018-06-26 21:57:36 +08:00
|
|
|
ret = -EINVAL;
|
|
|
|
btrfs_print_v0_err(fs_info);
|
|
|
|
if (trans)
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
else
|
|
|
|
btrfs_handle_fs_error(fs_info, ret, NULL);
|
|
|
|
|
|
|
|
goto out_free;
|
2010-05-16 22:48:46 +08:00
|
|
|
}
|
2018-06-26 21:57:36 +08:00
|
|
|
|
2010-05-16 22:48:46 +08:00
|
|
|
BUG_ON(num_refs == 0);
|
|
|
|
} else {
|
|
|
|
num_refs = 0;
|
|
|
|
extent_flags = 0;
|
|
|
|
ret = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!trans)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
delayed_refs = &trans->transaction->delayed_refs;
|
|
|
|
spin_lock(&delayed_refs->lock);
|
2017-01-31 04:24:37 +08:00
|
|
|
head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
|
2010-05-16 22:48:46 +08:00
|
|
|
if (head) {
|
|
|
|
if (!mutex_trylock(&head->mutex)) {
|
2017-09-30 03:43:57 +08:00
|
|
|
refcount_inc(&head->refs);
|
2010-05-16 22:48:46 +08:00
|
|
|
spin_unlock(&delayed_refs->lock);
|
|
|
|
|
2011-04-21 07:20:15 +08:00
|
|
|
btrfs_release_path(path);
|
2010-05-16 22:48:46 +08:00
|
|
|
|
2011-05-02 21:29:25 +08:00
|
|
|
/*
|
|
|
|
* Mutex was contended, block until it's released and try
|
|
|
|
* again
|
|
|
|
*/
|
2010-05-16 22:48:46 +08:00
|
|
|
mutex_lock(&head->mutex);
|
|
|
|
mutex_unlock(&head->mutex);
|
2017-09-30 03:43:57 +08:00
|
|
|
btrfs_put_delayed_ref_head(head);
|
Btrfs: don't miss skinny extent items on delayed ref head contention
Currently extent-tree.c:btrfs_lookup_extent_info() can miss the lookup
of skinny extent items. This can happen when the execution flow is the
following:
* We do an extent tree lookup and fail to find a skinny extent item;
* As a result, we attempt to see if a non-skinny extent item exists,
either by looking at previous item in the leaf or by doing another
full extent tree search;
* We have a transaction and then we check for a matching delayed ref
head in the transaction's delayed refs rbtree;
* We find such delayed ref head and then we try to lock it with a
call to mutex_trylock();
* The lock was contended so we jump to the label "again", which repeats
the extent tree search but for a non-skinny extent item, because we set
previously metadata variable to 0 and the search key to look for a
non-skinny extent-item;
* After the jump (and after releasing the transaction's delayed refs
lock), a skinny extent item might have been added to the extent tree
but we will miss it because metadata is set to 0 and the search key
is set for a non-skinny extent-item.
The fix here is to not reset metadata to 0 and to jump to the initial search
key setup if the delayed ref head is contended, instead of jumping directly
to the extent tree search label ("again").
This issue was found while investigating the issue reported at Bugzilla 64961.
David Sterba suspected this function was missing extent items, and that
this could be caused by the last change to this function, which was made
in the following patch:
[PATCH] Btrfs: optimize btrfs_lookup_extent_info()
(commit 74be9510876a66ad9826613ac8a526d26f9e7f01)
But in fact this issue already existed before, because after failing to find
a skinny extent item, the code set the search key for a non-skinny extent
item, and on contention of a matching delayed ref head it would not search
the extent tree for a skinny extent item anymore.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2013-12-08 08:26:29 +08:00
|
|
|
goto search_again;
|
2010-05-16 22:48:46 +08:00
|
|
|
}
|
2014-01-23 22:21:38 +08:00
|
|
|
spin_lock(&head->lock);
|
2010-05-16 22:48:46 +08:00
|
|
|
if (head->extent_op && head->extent_op->update_flags)
|
|
|
|
extent_flags |= head->extent_op->flags_to_set;
|
|
|
|
else
|
|
|
|
BUG_ON(num_refs == 0);
|
|
|
|
|
2017-09-30 03:43:57 +08:00
|
|
|
num_refs += head->ref_mod;
|
2014-01-23 22:21:38 +08:00
|
|
|
spin_unlock(&head->lock);
|
2010-05-16 22:48:46 +08:00
|
|
|
mutex_unlock(&head->mutex);
|
|
|
|
}
|
|
|
|
spin_unlock(&delayed_refs->lock);
|
|
|
|
out:
|
|
|
|
WARN_ON(num_refs == 0);
|
|
|
|
if (refs)
|
|
|
|
*refs = num_refs;
|
|
|
|
if (flags)
|
|
|
|
*flags = extent_flags;
|
|
|
|
out_free:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2007-12-12 01:42:00 +08:00
|
|
|
/*
|
|
|
|
* Back reference rules. Back refs have three main goals:
|
|
|
|
*
|
|
|
|
* 1) differentiate between all holders of references to an extent so that
|
|
|
|
* when a reference is dropped we can make sure it was a valid reference
|
|
|
|
* before freeing the extent.
|
|
|
|
*
|
|
|
|
* 2) Provide enough information to quickly find the holders of an extent
|
|
|
|
* if we notice a given block is corrupted or bad.
|
|
|
|
*
|
|
|
|
* 3) Make it easy to migrate blocks for FS shrinking or storage pool
|
|
|
|
* maintenance. This is actually the same as #2, but with a slightly
|
|
|
|
* different use case.
|
|
|
|
*
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
* There are two kinds of back refs. The implicit back refs is optimized
|
|
|
|
* for pointers in non-shared tree blocks. For a given pointer in a block,
|
|
|
|
* back refs of this kind provide information about the block's owner tree
|
|
|
|
* and the pointer's key. These information allow us to find the block by
|
|
|
|
* b-tree searching. The full back refs is for pointers in tree blocks not
|
|
|
|
* referenced by their owner trees. The location of tree block is recorded
|
|
|
|
* in the back refs. Actually the full back refs is generic, and can be
|
|
|
|
* used in all cases the implicit back refs is used. The major shortcoming
|
|
|
|
* of the full back refs is its overhead. Every time a tree block gets
|
|
|
|
* COWed, we have to update back refs entry for all pointers in it.
|
|
|
|
*
|
|
|
|
* For a newly allocated tree block, we use implicit back refs for
|
|
|
|
* pointers in it. This means most tree related operations only involve
|
|
|
|
* implicit back refs. For a tree block created in old transaction, the
|
|
|
|
* only way to drop a reference to it is COW it. So we can detect the
|
|
|
|
* event that tree block loses its owner tree's reference and do the
|
|
|
|
* back refs conversion.
|
|
|
|
*
|
2016-05-20 09:18:45 +08:00
|
|
|
* When a tree block is COWed through a tree, there are four cases:
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
*
|
|
|
|
* The reference count of the block is one and the tree is the block's
|
|
|
|
* owner tree. Nothing to do in this case.
|
|
|
|
*
|
|
|
|
* The reference count of the block is one and the tree is not the
|
|
|
|
* block's owner tree. In this case, full back refs is used for pointers
|
|
|
|
* in the block. Remove these full back refs, add implicit back refs for
|
|
|
|
* every pointers in the new block.
|
|
|
|
*
|
|
|
|
* The reference count of the block is greater than one and the tree is
|
|
|
|
* the block's owner tree. In this case, implicit back refs is used for
|
|
|
|
* pointers in the block. Add full back refs for every pointers in the
|
|
|
|
* block, increase lower level extents' reference counts. The original
|
|
|
|
* implicit back refs are entailed to the new block.
|
|
|
|
*
|
|
|
|
* The reference count of the block is greater than one and the tree is
|
|
|
|
* not the block's owner tree. Add implicit back refs for every pointer in
|
|
|
|
* the new block, increase lower level extents' reference count.
|
|
|
|
*
|
|
|
|
* Back Reference Key composing:
|
|
|
|
*
|
|
|
|
* The key objectid corresponds to the first byte in the extent,
|
|
|
|
* The key type is used to differentiate between types of back refs.
|
|
|
|
* There are different meanings of the key offset for different types
|
|
|
|
* of back refs.
|
|
|
|
*
|
2007-12-12 01:42:00 +08:00
|
|
|
* File extents can be referenced by:
|
|
|
|
*
|
|
|
|
* - multiple snapshots, subvolumes, or different generations in one subvol
|
2008-09-24 01:14:14 +08:00
|
|
|
* - different files inside a single subvolume
|
2007-12-12 01:42:00 +08:00
|
|
|
* - different offsets inside a file (bookend extents in file.c)
|
|
|
|
*
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
* The extent ref structure for the implicit back refs has fields for:
|
2007-12-12 01:42:00 +08:00
|
|
|
*
|
|
|
|
* - Objectid of the subvolume root
|
|
|
|
* - objectid of the file holding the reference
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
* - original offset in the file
|
|
|
|
* - how many bookend extents
|
2007-12-12 01:42:00 +08:00
|
|
|
*
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
* The key offset for the implicit back refs is hash of the first
|
|
|
|
* three fields.
|
2007-12-12 01:42:00 +08:00
|
|
|
*
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
* The extent ref structure for the full back refs has field for:
|
2007-12-12 01:42:00 +08:00
|
|
|
*
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
* - number of pointers in the tree leaf
|
2007-12-12 01:42:00 +08:00
|
|
|
*
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
* The key offset for the implicit back refs is the first byte of
|
|
|
|
* the tree leaf
|
2007-12-12 01:42:00 +08:00
|
|
|
*
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
* When a file extent is allocated, The implicit back refs is used.
|
|
|
|
* the fields are filled in:
|
2007-12-12 01:42:00 +08:00
|
|
|
*
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
* (root_key.objectid, inode objectid, offset in file, 1)
|
2007-12-12 01:42:00 +08:00
|
|
|
*
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
* When a file extent is removed file truncation, we find the
|
|
|
|
* corresponding implicit back refs and check the following fields:
|
2007-12-12 01:42:00 +08:00
|
|
|
*
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
* (btrfs_header_owner(leaf), inode objectid, offset in file)
|
2007-12-12 01:42:00 +08:00
|
|
|
*
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
* Btree extents can be referenced by:
|
2007-12-12 01:42:00 +08:00
|
|
|
*
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
* - Different subvolumes
|
2007-12-12 01:42:00 +08:00
|
|
|
*
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
* Both the implicit back refs and the full back refs for tree blocks
|
|
|
|
* only consist of key. The key offset for the implicit back refs is
|
|
|
|
* objectid of block's owner tree. The key offset for the full back refs
|
|
|
|
* is the first byte of parent block.
|
2007-12-12 01:42:00 +08:00
|
|
|
*
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
* When implicit back refs is used, information about the lowest key and
|
|
|
|
* level of the tree block are required. These information are stored in
|
|
|
|
* tree block info structure.
|
2007-12-12 01:42:00 +08:00
|
|
|
*/
|
2008-09-24 01:14:14 +08:00
|
|
|
|
2017-08-19 05:15:18 +08:00
|
|
|
/*
|
|
|
|
* is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
|
2018-11-28 19:05:13 +08:00
|
|
|
* is_data == BTRFS_REF_TYPE_DATA, data type is requiried,
|
2017-08-19 05:15:18 +08:00
|
|
|
* is_data == BTRFS_REF_TYPE_ANY, either type is OK.
|
|
|
|
*/
|
|
|
|
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
|
|
|
|
struct btrfs_extent_inline_ref *iref,
|
|
|
|
enum btrfs_inline_ref_type is_data)
|
|
|
|
{
|
|
|
|
int type = btrfs_extent_inline_ref_type(eb, iref);
|
2017-08-19 05:15:24 +08:00
|
|
|
u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
|
2017-08-19 05:15:18 +08:00
|
|
|
|
|
|
|
if (type == BTRFS_TREE_BLOCK_REF_KEY ||
|
|
|
|
type == BTRFS_SHARED_BLOCK_REF_KEY ||
|
|
|
|
type == BTRFS_SHARED_DATA_REF_KEY ||
|
|
|
|
type == BTRFS_EXTENT_DATA_REF_KEY) {
|
|
|
|
if (is_data == BTRFS_REF_TYPE_BLOCK) {
|
2017-08-19 05:15:24 +08:00
|
|
|
if (type == BTRFS_TREE_BLOCK_REF_KEY)
|
2017-08-19 05:15:18 +08:00
|
|
|
return type;
|
2017-08-19 05:15:24 +08:00
|
|
|
if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
|
|
|
|
ASSERT(eb->fs_info);
|
|
|
|
/*
|
|
|
|
* Every shared one has parent tree
|
|
|
|
* block, which must be aligned to
|
|
|
|
* nodesize.
|
|
|
|
*/
|
|
|
|
if (offset &&
|
|
|
|
IS_ALIGNED(offset, eb->fs_info->nodesize))
|
|
|
|
return type;
|
|
|
|
}
|
2017-08-19 05:15:18 +08:00
|
|
|
} else if (is_data == BTRFS_REF_TYPE_DATA) {
|
2017-08-19 05:15:24 +08:00
|
|
|
if (type == BTRFS_EXTENT_DATA_REF_KEY)
|
2017-08-19 05:15:18 +08:00
|
|
|
return type;
|
2017-08-19 05:15:24 +08:00
|
|
|
if (type == BTRFS_SHARED_DATA_REF_KEY) {
|
|
|
|
ASSERT(eb->fs_info);
|
|
|
|
/*
|
|
|
|
* Every shared one has parent tree
|
|
|
|
* block, which must be aligned to
|
|
|
|
* nodesize.
|
|
|
|
*/
|
|
|
|
if (offset &&
|
|
|
|
IS_ALIGNED(offset, eb->fs_info->nodesize))
|
|
|
|
return type;
|
|
|
|
}
|
2017-08-19 05:15:18 +08:00
|
|
|
} else {
|
|
|
|
ASSERT(is_data == BTRFS_REF_TYPE_ANY);
|
|
|
|
return type;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
btrfs_print_leaf((struct extent_buffer *)eb);
|
|
|
|
btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
|
|
|
|
eb->start, type);
|
|
|
|
WARN_ON(1);
|
|
|
|
|
|
|
|
return BTRFS_REF_TYPE_INVALID;
|
|
|
|
}
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
|
|
|
|
{
|
|
|
|
u32 high_crc = ~(u32)0;
|
|
|
|
u32 low_crc = ~(u32)0;
|
|
|
|
__le64 lenum;
|
|
|
|
|
|
|
|
lenum = cpu_to_le64(root_objectid);
|
2019-05-22 16:18:59 +08:00
|
|
|
high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
lenum = cpu_to_le64(owner);
|
2019-05-22 16:18:59 +08:00
|
|
|
low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
lenum = cpu_to_le64(offset);
|
2019-05-22 16:18:59 +08:00
|
|
|
low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
|
|
|
return ((u64)high_crc << 31) ^ (u64)low_crc;
|
|
|
|
}
|
|
|
|
|
|
|
|
static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
|
|
|
|
struct btrfs_extent_data_ref *ref)
|
|
|
|
{
|
|
|
|
return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
|
|
|
|
btrfs_extent_data_ref_objectid(leaf, ref),
|
|
|
|
btrfs_extent_data_ref_offset(leaf, ref));
|
|
|
|
}
|
|
|
|
|
|
|
|
static int match_extent_data_ref(struct extent_buffer *leaf,
|
|
|
|
struct btrfs_extent_data_ref *ref,
|
|
|
|
u64 root_objectid, u64 owner, u64 offset)
|
|
|
|
{
|
|
|
|
if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
|
|
|
|
btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
|
|
|
|
btrfs_extent_data_ref_offset(leaf, ref) != offset)
|
|
|
|
return 0;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_path *path,
|
|
|
|
u64 bytenr, u64 parent,
|
|
|
|
u64 root_objectid,
|
|
|
|
u64 owner, u64 offset)
|
|
|
|
{
|
2018-06-20 20:48:51 +08:00
|
|
|
struct btrfs_root *root = trans->fs_info->extent_root;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_extent_data_ref *ref;
|
2008-09-24 01:14:14 +08:00
|
|
|
struct extent_buffer *leaf;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
u32 nritems;
|
2007-12-11 22:25:06 +08:00
|
|
|
int ret;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
int recow;
|
|
|
|
int err = -ENOENT;
|
2007-12-11 22:25:06 +08:00
|
|
|
|
2008-09-24 01:14:14 +08:00
|
|
|
key.objectid = bytenr;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (parent) {
|
|
|
|
key.type = BTRFS_SHARED_DATA_REF_KEY;
|
|
|
|
key.offset = parent;
|
|
|
|
} else {
|
|
|
|
key.type = BTRFS_EXTENT_DATA_REF_KEY;
|
|
|
|
key.offset = hash_extent_data_ref(root_objectid,
|
|
|
|
owner, offset);
|
|
|
|
}
|
|
|
|
again:
|
|
|
|
recow = 0;
|
|
|
|
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
|
|
|
|
if (ret < 0) {
|
|
|
|
err = ret;
|
|
|
|
goto fail;
|
|
|
|
}
|
2008-09-24 01:14:14 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (parent) {
|
|
|
|
if (!ret)
|
|
|
|
return 0;
|
|
|
|
goto fail;
|
2008-09-24 01:14:14 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
nritems = btrfs_header_nritems(leaf);
|
|
|
|
while (1) {
|
|
|
|
if (path->slots[0] >= nritems) {
|
|
|
|
ret = btrfs_next_leaf(root, path);
|
|
|
|
if (ret < 0)
|
|
|
|
err = ret;
|
|
|
|
if (ret)
|
|
|
|
goto fail;
|
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
nritems = btrfs_header_nritems(leaf);
|
|
|
|
recow = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
|
|
|
|
if (key.objectid != bytenr ||
|
|
|
|
key.type != BTRFS_EXTENT_DATA_REF_KEY)
|
|
|
|
goto fail;
|
|
|
|
|
|
|
|
ref = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_extent_data_ref);
|
|
|
|
|
|
|
|
if (match_extent_data_ref(leaf, ref, root_objectid,
|
|
|
|
owner, offset)) {
|
|
|
|
if (recow) {
|
2011-04-21 07:20:15 +08:00
|
|
|
btrfs_release_path(path);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
err = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
path->slots[0]++;
|
2008-09-24 01:14:14 +08:00
|
|
|
}
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
fail:
|
|
|
|
return err;
|
2008-09-24 01:14:14 +08:00
|
|
|
}
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_path *path,
|
|
|
|
u64 bytenr, u64 parent,
|
|
|
|
u64 root_objectid, u64 owner,
|
|
|
|
u64 offset, int refs_to_add)
|
2008-09-24 01:14:14 +08:00
|
|
|
{
|
2018-06-20 20:48:44 +08:00
|
|
|
struct btrfs_root *root = trans->fs_info->extent_root;
|
2008-09-24 01:14:14 +08:00
|
|
|
struct btrfs_key key;
|
|
|
|
struct extent_buffer *leaf;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
u32 size;
|
2008-09-24 01:14:14 +08:00
|
|
|
u32 num_refs;
|
|
|
|
int ret;
|
2007-12-11 22:25:06 +08:00
|
|
|
|
|
|
|
key.objectid = bytenr;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (parent) {
|
|
|
|
key.type = BTRFS_SHARED_DATA_REF_KEY;
|
|
|
|
key.offset = parent;
|
|
|
|
size = sizeof(struct btrfs_shared_data_ref);
|
|
|
|
} else {
|
|
|
|
key.type = BTRFS_EXTENT_DATA_REF_KEY;
|
|
|
|
key.offset = hash_extent_data_ref(root_objectid,
|
|
|
|
owner, offset);
|
|
|
|
size = sizeof(struct btrfs_extent_data_ref);
|
|
|
|
}
|
2007-12-11 22:25:06 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
ret = btrfs_insert_empty_item(trans, root, path, &key, size);
|
|
|
|
if (ret && ret != -EEXIST)
|
|
|
|
goto fail;
|
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
if (parent) {
|
|
|
|
struct btrfs_shared_data_ref *ref;
|
2008-09-24 01:14:14 +08:00
|
|
|
ref = btrfs_item_ptr(leaf, path->slots[0],
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
struct btrfs_shared_data_ref);
|
|
|
|
if (ret == 0) {
|
|
|
|
btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
|
|
|
|
} else {
|
|
|
|
num_refs = btrfs_shared_data_ref_count(leaf, ref);
|
|
|
|
num_refs += refs_to_add;
|
|
|
|
btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
|
2008-09-24 01:14:14 +08:00
|
|
|
}
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
} else {
|
|
|
|
struct btrfs_extent_data_ref *ref;
|
|
|
|
while (ret == -EEXIST) {
|
|
|
|
ref = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_extent_data_ref);
|
|
|
|
if (match_extent_data_ref(leaf, ref, root_objectid,
|
|
|
|
owner, offset))
|
|
|
|
break;
|
2011-04-21 07:20:15 +08:00
|
|
|
btrfs_release_path(path);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
key.offset++;
|
|
|
|
ret = btrfs_insert_empty_item(trans, root, path, &key,
|
|
|
|
size);
|
|
|
|
if (ret && ret != -EEXIST)
|
|
|
|
goto fail;
|
2008-09-24 01:14:14 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
}
|
|
|
|
ref = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_extent_data_ref);
|
|
|
|
if (ret == 0) {
|
|
|
|
btrfs_set_extent_data_ref_root(leaf, ref,
|
|
|
|
root_objectid);
|
|
|
|
btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
|
|
|
|
btrfs_set_extent_data_ref_offset(leaf, ref, offset);
|
|
|
|
btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
|
|
|
|
} else {
|
|
|
|
num_refs = btrfs_extent_data_ref_count(leaf, ref);
|
|
|
|
num_refs += refs_to_add;
|
|
|
|
btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
|
2008-09-24 01:14:14 +08:00
|
|
|
}
|
|
|
|
}
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
|
|
|
ret = 0;
|
|
|
|
fail:
|
2011-04-21 07:20:15 +08:00
|
|
|
btrfs_release_path(path);
|
2007-12-11 22:25:06 +08:00
|
|
|
return ret;
|
2007-12-11 22:25:06 +08:00
|
|
|
}
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_path *path,
|
2014-05-14 08:30:47 +08:00
|
|
|
int refs_to_drop, int *last_ref)
|
2008-09-24 01:14:14 +08:00
|
|
|
{
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_extent_data_ref *ref1 = NULL;
|
|
|
|
struct btrfs_shared_data_ref *ref2 = NULL;
|
2008-09-24 01:14:14 +08:00
|
|
|
struct extent_buffer *leaf;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
u32 num_refs = 0;
|
2008-09-24 01:14:14 +08:00
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
|
|
|
|
|
|
|
|
if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
|
|
|
|
ref1 = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_extent_data_ref);
|
|
|
|
num_refs = btrfs_extent_data_ref_count(leaf, ref1);
|
|
|
|
} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
|
|
|
|
ref2 = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_shared_data_ref);
|
|
|
|
num_refs = btrfs_shared_data_ref_count(leaf, ref2);
|
2018-06-26 22:20:59 +08:00
|
|
|
} else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
|
2018-06-26 21:57:36 +08:00
|
|
|
btrfs_print_v0_err(trans->fs_info);
|
|
|
|
btrfs_abort_transaction(trans, -EINVAL);
|
|
|
|
return -EINVAL;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
} else {
|
|
|
|
BUG();
|
|
|
|
}
|
|
|
|
|
2009-03-13 22:10:06 +08:00
|
|
|
BUG_ON(num_refs < refs_to_drop);
|
|
|
|
num_refs -= refs_to_drop;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
2008-09-24 01:14:14 +08:00
|
|
|
if (num_refs == 0) {
|
2018-06-20 20:48:46 +08:00
|
|
|
ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
|
2014-05-14 08:30:47 +08:00
|
|
|
*last_ref = 1;
|
2008-09-24 01:14:14 +08:00
|
|
|
} else {
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
|
|
|
|
btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
|
|
|
|
else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
|
|
|
|
btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
|
2008-09-24 01:14:14 +08:00
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2015-08-06 22:16:24 +08:00
|
|
|
static noinline u32 extent_data_ref_count(struct btrfs_path *path,
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
struct btrfs_extent_inline_ref *iref)
|
2008-11-20 10:17:22 +08:00
|
|
|
{
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
struct btrfs_key key;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct btrfs_extent_data_ref *ref1;
|
|
|
|
struct btrfs_shared_data_ref *ref2;
|
|
|
|
u32 num_refs = 0;
|
2017-08-19 05:15:19 +08:00
|
|
|
int type;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
|
2018-06-26 21:57:36 +08:00
|
|
|
|
|
|
|
BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (iref) {
|
2017-08-19 05:15:19 +08:00
|
|
|
/*
|
|
|
|
* If type is invalid, we should have bailed out earlier than
|
|
|
|
* this call.
|
|
|
|
*/
|
|
|
|
type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
|
|
|
|
ASSERT(type != BTRFS_REF_TYPE_INVALID);
|
|
|
|
if (type == BTRFS_EXTENT_DATA_REF_KEY) {
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
|
|
|
|
num_refs = btrfs_extent_data_ref_count(leaf, ref1);
|
|
|
|
} else {
|
|
|
|
ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
|
|
|
|
num_refs = btrfs_shared_data_ref_count(leaf, ref2);
|
|
|
|
}
|
|
|
|
} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
|
|
|
|
ref1 = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_extent_data_ref);
|
|
|
|
num_refs = btrfs_extent_data_ref_count(leaf, ref1);
|
|
|
|
} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
|
|
|
|
ref2 = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_shared_data_ref);
|
|
|
|
num_refs = btrfs_shared_data_ref_count(leaf, ref2);
|
|
|
|
} else {
|
|
|
|
WARN_ON(1);
|
|
|
|
}
|
|
|
|
return num_refs;
|
|
|
|
}
|
2008-11-20 10:17:22 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_path *path,
|
|
|
|
u64 bytenr, u64 parent,
|
|
|
|
u64 root_objectid)
|
Btrfs: Fix free block discard calls down to the block layer
This is a patch to fix discard semantic to make Btrfs work with FTL and SSD.
We can improve FTL's performance by telling it which sectors are freed by file
system. But if we don't tell FTL the information of free sectors in proper
time, the transaction mechanism of Btrfs will be destroyed and Btrfs could not
roll back the previous transaction under the power loss condition.
There are some problems in the old implementation:
1, In __free_extent(), the pinned down extents should not be discarded.
2, In free_extents(), the free extents are all pinned, so they need to
be discarded in transaction committing time instead of free_extents().
3, The reserved extent used by log tree should be discard too.
This patch change discard behavior as follows:
1, For the extents which need to be free at once,
we discard them in update_block_group().
2, Delay discarding the pinned extent in btrfs_finish_extent_commit()
when committing transaction.
3, Remove discarding from free_extents() and __free_extent()
4, Add discard interface into btrfs_free_reserved_extent()
5, Discard sectors before updating the free space cache, otherwise,
FTL will destroy file system data.
2009-01-06 04:57:51 +08:00
|
|
|
{
|
2018-06-20 20:48:50 +08:00
|
|
|
struct btrfs_root *root = trans->fs_info->extent_root;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
struct btrfs_key key;
|
Btrfs: Fix free block discard calls down to the block layer
This is a patch to fix discard semantic to make Btrfs work with FTL and SSD.
We can improve FTL's performance by telling it which sectors are freed by file
system. But if we don't tell FTL the information of free sectors in proper
time, the transaction mechanism of Btrfs will be destroyed and Btrfs could not
roll back the previous transaction under the power loss condition.
There are some problems in the old implementation:
1, In __free_extent(), the pinned down extents should not be discarded.
2, In free_extents(), the free extents are all pinned, so they need to
be discarded in transaction committing time instead of free_extents().
3, The reserved extent used by log tree should be discard too.
This patch change discard behavior as follows:
1, For the extents which need to be free at once,
we discard them in update_block_group().
2, Delay discarding the pinned extent in btrfs_finish_extent_commit()
when committing transaction.
3, Remove discarding from free_extents() and __free_extent()
4, Add discard interface into btrfs_free_reserved_extent()
5, Discard sectors before updating the free space cache, otherwise,
FTL will destroy file system data.
2009-01-06 04:57:51 +08:00
|
|
|
int ret;
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
key.objectid = bytenr;
|
|
|
|
if (parent) {
|
|
|
|
key.type = BTRFS_SHARED_BLOCK_REF_KEY;
|
|
|
|
key.offset = parent;
|
|
|
|
} else {
|
|
|
|
key.type = BTRFS_TREE_BLOCK_REF_KEY;
|
|
|
|
key.offset = root_objectid;
|
Btrfs: Fix free block discard calls down to the block layer
This is a patch to fix discard semantic to make Btrfs work with FTL and SSD.
We can improve FTL's performance by telling it which sectors are freed by file
system. But if we don't tell FTL the information of free sectors in proper
time, the transaction mechanism of Btrfs will be destroyed and Btrfs could not
roll back the previous transaction under the power loss condition.
There are some problems in the old implementation:
1, In __free_extent(), the pinned down extents should not be discarded.
2, In free_extents(), the free extents are all pinned, so they need to
be discarded in transaction committing time instead of free_extents().
3, The reserved extent used by log tree should be discard too.
This patch change discard behavior as follows:
1, For the extents which need to be free at once,
we discard them in update_block_group().
2, Delay discarding the pinned extent in btrfs_finish_extent_commit()
when committing transaction.
3, Remove discarding from free_extents() and __free_extent()
4, Add discard interface into btrfs_free_reserved_extent()
5, Discard sectors before updating the free space cache, otherwise,
FTL will destroy file system data.
2009-01-06 04:57:51 +08:00
|
|
|
}
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
|
|
|
|
if (ret > 0)
|
|
|
|
ret = -ENOENT;
|
|
|
|
return ret;
|
Btrfs: Fix free block discard calls down to the block layer
This is a patch to fix discard semantic to make Btrfs work with FTL and SSD.
We can improve FTL's performance by telling it which sectors are freed by file
system. But if we don't tell FTL the information of free sectors in proper
time, the transaction mechanism of Btrfs will be destroyed and Btrfs could not
roll back the previous transaction under the power loss condition.
There are some problems in the old implementation:
1, In __free_extent(), the pinned down extents should not be discarded.
2, In free_extents(), the free extents are all pinned, so they need to
be discarded in transaction committing time instead of free_extents().
3, The reserved extent used by log tree should be discard too.
This patch change discard behavior as follows:
1, For the extents which need to be free at once,
we discard them in update_block_group().
2, Delay discarding the pinned extent in btrfs_finish_extent_commit()
when committing transaction.
3, Remove discarding from free_extents() and __free_extent()
4, Add discard interface into btrfs_free_reserved_extent()
5, Discard sectors before updating the free space cache, otherwise,
FTL will destroy file system data.
2009-01-06 04:57:51 +08:00
|
|
|
}
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_path *path,
|
|
|
|
u64 bytenr, u64 parent,
|
|
|
|
u64 root_objectid)
|
2008-09-24 01:14:14 +08:00
|
|
|
{
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
struct btrfs_key key;
|
2008-09-24 01:14:14 +08:00
|
|
|
int ret;
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
key.objectid = bytenr;
|
|
|
|
if (parent) {
|
|
|
|
key.type = BTRFS_SHARED_BLOCK_REF_KEY;
|
|
|
|
key.offset = parent;
|
|
|
|
} else {
|
|
|
|
key.type = BTRFS_TREE_BLOCK_REF_KEY;
|
|
|
|
key.offset = root_objectid;
|
|
|
|
}
|
|
|
|
|
2018-06-20 20:48:43 +08:00
|
|
|
ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
|
2017-02-16 05:28:27 +08:00
|
|
|
path, &key, 0);
|
2011-04-21 07:20:15 +08:00
|
|
|
btrfs_release_path(path);
|
2008-09-24 01:14:14 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
static inline int extent_ref_type(u64 parent, u64 owner)
|
2008-09-24 01:14:14 +08:00
|
|
|
{
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
int type;
|
|
|
|
if (owner < BTRFS_FIRST_FREE_OBJECTID) {
|
|
|
|
if (parent > 0)
|
|
|
|
type = BTRFS_SHARED_BLOCK_REF_KEY;
|
|
|
|
else
|
|
|
|
type = BTRFS_TREE_BLOCK_REF_KEY;
|
|
|
|
} else {
|
|
|
|
if (parent > 0)
|
|
|
|
type = BTRFS_SHARED_DATA_REF_KEY;
|
|
|
|
else
|
|
|
|
type = BTRFS_EXTENT_DATA_REF_KEY;
|
|
|
|
}
|
|
|
|
return type;
|
2008-09-24 01:14:14 +08:00
|
|
|
}
|
2009-03-13 22:10:06 +08:00
|
|
|
|
2009-06-28 09:07:35 +08:00
|
|
|
static int find_next_key(struct btrfs_path *path, int level,
|
|
|
|
struct btrfs_key *key)
|
2009-03-13 22:10:06 +08:00
|
|
|
|
2007-03-03 05:08:05 +08:00
|
|
|
{
|
2009-06-28 09:07:35 +08:00
|
|
|
for (; level < BTRFS_MAX_LEVEL; level++) {
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (!path->nodes[level])
|
|
|
|
break;
|
|
|
|
if (path->slots[level] + 1 >=
|
|
|
|
btrfs_header_nritems(path->nodes[level]))
|
|
|
|
continue;
|
|
|
|
if (level == 0)
|
|
|
|
btrfs_item_key_to_cpu(path->nodes[level], key,
|
|
|
|
path->slots[level] + 1);
|
|
|
|
else
|
|
|
|
btrfs_node_key_to_cpu(path->nodes[level], key,
|
|
|
|
path->slots[level] + 1);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
return 1;
|
|
|
|
}
|
2007-03-08 00:50:24 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
/*
|
|
|
|
* look for inline back ref. if back ref is found, *ref_ret is set
|
|
|
|
* to the address of inline back ref, and 0 is returned.
|
|
|
|
*
|
|
|
|
* if back ref isn't found, *ref_ret is set to the address where it
|
|
|
|
* should be inserted, and -ENOENT is returned.
|
|
|
|
*
|
|
|
|
* if insert is true and there are too many inline back refs, the path
|
|
|
|
* points to the extent item, and -EAGAIN is returned.
|
|
|
|
*
|
|
|
|
* NOTE: inline back refs are ordered in the same way that back ref
|
|
|
|
* items in the tree are ordered.
|
|
|
|
*/
|
|
|
|
static noinline_for_stack
|
|
|
|
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_path *path,
|
|
|
|
struct btrfs_extent_inline_ref **ref_ret,
|
|
|
|
u64 bytenr, u64 num_bytes,
|
|
|
|
u64 parent, u64 root_objectid,
|
|
|
|
u64 owner, u64 offset, int insert)
|
|
|
|
{
|
2018-06-20 20:48:48 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2017-02-16 05:28:27 +08:00
|
|
|
struct btrfs_root *root = fs_info->extent_root;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
struct btrfs_key key;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct btrfs_extent_item *ei;
|
|
|
|
struct btrfs_extent_inline_ref *iref;
|
|
|
|
u64 flags;
|
|
|
|
u64 item_size;
|
|
|
|
unsigned long ptr;
|
|
|
|
unsigned long end;
|
|
|
|
int extra_size;
|
|
|
|
int type;
|
|
|
|
int want;
|
|
|
|
int ret;
|
|
|
|
int err = 0;
|
2016-06-23 06:54:23 +08:00
|
|
|
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
|
2017-08-19 05:15:19 +08:00
|
|
|
int needed;
|
2007-08-09 08:17:12 +08:00
|
|
|
|
2007-10-16 04:15:53 +08:00
|
|
|
key.objectid = bytenr;
|
2008-09-24 01:14:14 +08:00
|
|
|
key.type = BTRFS_EXTENT_ITEM_KEY;
|
2009-03-13 22:10:06 +08:00
|
|
|
key.offset = num_bytes;
|
2008-09-24 01:14:14 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
want = extent_ref_type(parent, owner);
|
|
|
|
if (insert) {
|
|
|
|
extra_size = btrfs_extent_inline_ref_size(want);
|
2009-06-11 20:51:10 +08:00
|
|
|
path->keep_locks = 1;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
} else
|
|
|
|
extra_size = -1;
|
2013-03-08 03:22:04 +08:00
|
|
|
|
|
|
|
/*
|
2018-06-18 19:59:26 +08:00
|
|
|
* Owner is our level, so we can just add one to get the level for the
|
|
|
|
* block we are interested in.
|
2013-03-08 03:22:04 +08:00
|
|
|
*/
|
|
|
|
if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
|
|
|
|
key.type = BTRFS_METADATA_ITEM_KEY;
|
|
|
|
key.offset = owner;
|
|
|
|
}
|
|
|
|
|
|
|
|
again:
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
|
2009-03-13 23:00:37 +08:00
|
|
|
if (ret < 0) {
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
err = ret;
|
|
|
|
goto out;
|
|
|
|
}
|
2013-03-08 03:22:04 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We may be a newly converted file system which still has the old fat
|
|
|
|
* extent entries for metadata, so try and see if we have one of those.
|
|
|
|
*/
|
|
|
|
if (ret > 0 && skinny_metadata) {
|
|
|
|
skinny_metadata = false;
|
|
|
|
if (path->slots[0]) {
|
|
|
|
path->slots[0]--;
|
|
|
|
btrfs_item_key_to_cpu(path->nodes[0], &key,
|
|
|
|
path->slots[0]);
|
|
|
|
if (key.objectid == bytenr &&
|
|
|
|
key.type == BTRFS_EXTENT_ITEM_KEY &&
|
|
|
|
key.offset == num_bytes)
|
|
|
|
ret = 0;
|
|
|
|
}
|
|
|
|
if (ret) {
|
2014-04-24 22:15:28 +08:00
|
|
|
key.objectid = bytenr;
|
2013-03-08 03:22:04 +08:00
|
|
|
key.type = BTRFS_EXTENT_ITEM_KEY;
|
|
|
|
key.offset = num_bytes;
|
|
|
|
btrfs_release_path(path);
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret && !insert) {
|
|
|
|
err = -ENOENT;
|
|
|
|
goto out;
|
2013-10-31 13:00:08 +08:00
|
|
|
} else if (WARN_ON(ret)) {
|
2013-03-09 04:41:02 +08:00
|
|
|
err = -EIO;
|
|
|
|
goto out;
|
2012-03-12 23:03:00 +08:00
|
|
|
}
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
|
2018-06-26 22:20:59 +08:00
|
|
|
if (unlikely(item_size < sizeof(*ei))) {
|
2018-06-26 21:57:36 +08:00
|
|
|
err = -EINVAL;
|
|
|
|
btrfs_print_v0_err(fs_info);
|
|
|
|
btrfs_abort_transaction(trans, err);
|
|
|
|
goto out;
|
|
|
|
}
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
|
|
|
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
|
|
|
|
flags = btrfs_extent_flags(leaf, ei);
|
|
|
|
|
|
|
|
ptr = (unsigned long)(ei + 1);
|
|
|
|
end = (unsigned long)ei + item_size;
|
|
|
|
|
2013-03-08 03:22:04 +08:00
|
|
|
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
ptr += sizeof(struct btrfs_tree_block_info);
|
|
|
|
BUG_ON(ptr > end);
|
|
|
|
}
|
|
|
|
|
2017-08-19 05:15:19 +08:00
|
|
|
if (owner >= BTRFS_FIRST_FREE_OBJECTID)
|
|
|
|
needed = BTRFS_REF_TYPE_DATA;
|
|
|
|
else
|
|
|
|
needed = BTRFS_REF_TYPE_BLOCK;
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
err = -ENOENT;
|
|
|
|
while (1) {
|
|
|
|
if (ptr >= end) {
|
|
|
|
WARN_ON(ptr > end);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
iref = (struct btrfs_extent_inline_ref *)ptr;
|
2017-08-19 05:15:19 +08:00
|
|
|
type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
|
|
|
|
if (type == BTRFS_REF_TYPE_INVALID) {
|
2018-06-22 16:18:01 +08:00
|
|
|
err = -EUCLEAN;
|
2017-08-19 05:15:19 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (want < type)
|
|
|
|
break;
|
|
|
|
if (want > type) {
|
|
|
|
ptr += btrfs_extent_inline_ref_size(type);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (type == BTRFS_EXTENT_DATA_REF_KEY) {
|
|
|
|
struct btrfs_extent_data_ref *dref;
|
|
|
|
dref = (struct btrfs_extent_data_ref *)(&iref->offset);
|
|
|
|
if (match_extent_data_ref(leaf, dref, root_objectid,
|
|
|
|
owner, offset)) {
|
|
|
|
err = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (hash_extent_data_ref_item(leaf, dref) <
|
|
|
|
hash_extent_data_ref(root_objectid, owner, offset))
|
|
|
|
break;
|
|
|
|
} else {
|
|
|
|
u64 ref_offset;
|
|
|
|
ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
|
|
|
|
if (parent > 0) {
|
|
|
|
if (parent == ref_offset) {
|
|
|
|
err = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (ref_offset < parent)
|
|
|
|
break;
|
|
|
|
} else {
|
|
|
|
if (root_objectid == ref_offset) {
|
|
|
|
err = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (ref_offset < root_objectid)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ptr += btrfs_extent_inline_ref_size(type);
|
|
|
|
}
|
|
|
|
if (err == -ENOENT && insert) {
|
|
|
|
if (item_size + extra_size >=
|
|
|
|
BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
|
|
|
|
err = -EAGAIN;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* To add new inline back ref, we have to make sure
|
|
|
|
* there is no corresponding back ref item.
|
|
|
|
* For simplicity, we just do not add new inline back
|
|
|
|
* ref if there is any kind of item for this block
|
|
|
|
*/
|
2009-06-28 09:07:35 +08:00
|
|
|
if (find_next_key(path, 0, &key) == 0 &&
|
|
|
|
key.objectid == bytenr &&
|
2009-06-11 20:51:10 +08:00
|
|
|
key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
err = -EAGAIN;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
|
|
|
|
out:
|
2009-06-11 20:51:10 +08:00
|
|
|
if (insert) {
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
path->keep_locks = 0;
|
|
|
|
btrfs_unlock_up_safe(path, 1);
|
|
|
|
}
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* helper to add new inline back ref
|
|
|
|
*/
|
|
|
|
static noinline_for_stack
|
2017-02-16 05:28:27 +08:00
|
|
|
void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
|
2012-03-01 21:56:26 +08:00
|
|
|
struct btrfs_path *path,
|
|
|
|
struct btrfs_extent_inline_ref *iref,
|
|
|
|
u64 parent, u64 root_objectid,
|
|
|
|
u64 owner, u64 offset, int refs_to_add,
|
|
|
|
struct btrfs_delayed_extent_op *extent_op)
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
{
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct btrfs_extent_item *ei;
|
|
|
|
unsigned long ptr;
|
|
|
|
unsigned long end;
|
|
|
|
unsigned long item_offset;
|
|
|
|
u64 refs;
|
|
|
|
int size;
|
|
|
|
int type;
|
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
|
|
|
|
item_offset = (unsigned long)iref - (unsigned long)ei;
|
|
|
|
|
|
|
|
type = extent_ref_type(parent, owner);
|
|
|
|
size = btrfs_extent_inline_ref_size(type);
|
|
|
|
|
2019-03-20 21:51:10 +08:00
|
|
|
btrfs_extend_item(path, size);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
|
|
|
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
|
|
|
|
refs = btrfs_extent_refs(leaf, ei);
|
|
|
|
refs += refs_to_add;
|
|
|
|
btrfs_set_extent_refs(leaf, ei, refs);
|
|
|
|
if (extent_op)
|
|
|
|
__run_delayed_extent_op(extent_op, leaf, ei);
|
|
|
|
|
|
|
|
ptr = (unsigned long)ei + item_offset;
|
|
|
|
end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
|
|
|
|
if (ptr < end - size)
|
|
|
|
memmove_extent_buffer(leaf, ptr + size, ptr,
|
|
|
|
end - size - ptr);
|
|
|
|
|
|
|
|
iref = (struct btrfs_extent_inline_ref *)ptr;
|
|
|
|
btrfs_set_extent_inline_ref_type(leaf, iref, type);
|
|
|
|
if (type == BTRFS_EXTENT_DATA_REF_KEY) {
|
|
|
|
struct btrfs_extent_data_ref *dref;
|
|
|
|
dref = (struct btrfs_extent_data_ref *)(&iref->offset);
|
|
|
|
btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
|
|
|
|
btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
|
|
|
|
btrfs_set_extent_data_ref_offset(leaf, dref, offset);
|
|
|
|
btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
|
|
|
|
} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
|
|
|
|
struct btrfs_shared_data_ref *sref;
|
|
|
|
sref = (struct btrfs_shared_data_ref *)(iref + 1);
|
|
|
|
btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
|
|
|
|
btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
|
|
|
|
} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
|
|
|
|
btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
|
|
|
|
} else {
|
|
|
|
btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
|
|
|
|
}
|
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int lookup_extent_backref(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_path *path,
|
|
|
|
struct btrfs_extent_inline_ref **ref_ret,
|
|
|
|
u64 bytenr, u64 num_bytes, u64 parent,
|
|
|
|
u64 root_objectid, u64 owner, u64 offset)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
2018-06-20 20:48:48 +08:00
|
|
|
ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
|
|
|
|
num_bytes, parent, root_objectid,
|
|
|
|
owner, offset, 0);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (ret != -ENOENT)
|
2007-06-23 02:16:25 +08:00
|
|
|
return ret;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
2011-04-21 07:20:15 +08:00
|
|
|
btrfs_release_path(path);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
*ref_ret = NULL;
|
|
|
|
|
|
|
|
if (owner < BTRFS_FIRST_FREE_OBJECTID) {
|
2018-06-20 20:48:50 +08:00
|
|
|
ret = lookup_tree_block_ref(trans, path, bytenr, parent,
|
|
|
|
root_objectid);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
} else {
|
2018-06-20 20:48:51 +08:00
|
|
|
ret = lookup_extent_data_ref(trans, path, bytenr, parent,
|
|
|
|
root_objectid, owner, offset);
|
2009-03-13 23:00:37 +08:00
|
|
|
}
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
return ret;
|
|
|
|
}
|
2008-09-24 01:14:14 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
/*
|
|
|
|
* helper to update/remove inline back ref
|
|
|
|
*/
|
|
|
|
static noinline_for_stack
|
2018-06-20 20:48:49 +08:00
|
|
|
void update_inline_extent_backref(struct btrfs_path *path,
|
2012-03-01 21:56:26 +08:00
|
|
|
struct btrfs_extent_inline_ref *iref,
|
|
|
|
int refs_to_mod,
|
2014-05-14 08:30:47 +08:00
|
|
|
struct btrfs_delayed_extent_op *extent_op,
|
|
|
|
int *last_ref)
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
{
|
2018-06-20 20:48:49 +08:00
|
|
|
struct extent_buffer *leaf = path->nodes[0];
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
struct btrfs_extent_item *ei;
|
|
|
|
struct btrfs_extent_data_ref *dref = NULL;
|
|
|
|
struct btrfs_shared_data_ref *sref = NULL;
|
|
|
|
unsigned long ptr;
|
|
|
|
unsigned long end;
|
|
|
|
u32 item_size;
|
|
|
|
int size;
|
|
|
|
int type;
|
|
|
|
u64 refs;
|
|
|
|
|
|
|
|
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
|
|
|
|
refs = btrfs_extent_refs(leaf, ei);
|
|
|
|
WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
|
|
|
|
refs += refs_to_mod;
|
|
|
|
btrfs_set_extent_refs(leaf, ei, refs);
|
|
|
|
if (extent_op)
|
|
|
|
__run_delayed_extent_op(extent_op, leaf, ei);
|
|
|
|
|
2017-08-19 05:15:19 +08:00
|
|
|
/*
|
|
|
|
* If type is invalid, we should have bailed out after
|
|
|
|
* lookup_inline_extent_backref().
|
|
|
|
*/
|
|
|
|
type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
|
|
|
|
ASSERT(type != BTRFS_REF_TYPE_INVALID);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
|
|
|
if (type == BTRFS_EXTENT_DATA_REF_KEY) {
|
|
|
|
dref = (struct btrfs_extent_data_ref *)(&iref->offset);
|
|
|
|
refs = btrfs_extent_data_ref_count(leaf, dref);
|
|
|
|
} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
|
|
|
|
sref = (struct btrfs_shared_data_ref *)(iref + 1);
|
|
|
|
refs = btrfs_shared_data_ref_count(leaf, sref);
|
|
|
|
} else {
|
|
|
|
refs = 1;
|
|
|
|
BUG_ON(refs_to_mod != -1);
|
2009-03-13 22:10:06 +08:00
|
|
|
}
|
2008-09-24 01:14:14 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
|
|
|
|
refs += refs_to_mod;
|
|
|
|
|
|
|
|
if (refs > 0) {
|
|
|
|
if (type == BTRFS_EXTENT_DATA_REF_KEY)
|
|
|
|
btrfs_set_extent_data_ref_count(leaf, dref, refs);
|
|
|
|
else
|
|
|
|
btrfs_set_shared_data_ref_count(leaf, sref, refs);
|
|
|
|
} else {
|
2014-05-14 08:30:47 +08:00
|
|
|
*last_ref = 1;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
size = btrfs_extent_inline_ref_size(type);
|
|
|
|
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
|
|
|
|
ptr = (unsigned long)iref;
|
|
|
|
end = (unsigned long)ei + item_size;
|
|
|
|
if (ptr + size < end)
|
|
|
|
memmove_extent_buffer(leaf, ptr, ptr + size,
|
|
|
|
end - ptr - size);
|
|
|
|
item_size -= size;
|
2019-03-20 21:49:12 +08:00
|
|
|
btrfs_truncate_item(path, item_size, 1);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
}
|
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
|
|
|
}
|
|
|
|
|
|
|
|
static noinline_for_stack
|
|
|
|
int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_path *path,
|
|
|
|
u64 bytenr, u64 num_bytes, u64 parent,
|
|
|
|
u64 root_objectid, u64 owner,
|
|
|
|
u64 offset, int refs_to_add,
|
|
|
|
struct btrfs_delayed_extent_op *extent_op)
|
|
|
|
{
|
|
|
|
struct btrfs_extent_inline_ref *iref;
|
|
|
|
int ret;
|
|
|
|
|
2018-06-20 20:48:48 +08:00
|
|
|
ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
|
|
|
|
num_bytes, parent, root_objectid,
|
|
|
|
owner, offset, 1);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (ret == 0) {
|
|
|
|
BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
|
2018-06-20 20:48:49 +08:00
|
|
|
update_inline_extent_backref(path, iref, refs_to_add,
|
|
|
|
extent_op, NULL);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
} else if (ret == -ENOENT) {
|
2018-06-20 20:49:10 +08:00
|
|
|
setup_inline_extent_backref(trans->fs_info, path, iref, parent,
|
2012-03-01 21:56:26 +08:00
|
|
|
root_objectid, owner, offset,
|
|
|
|
refs_to_add, extent_op);
|
|
|
|
ret = 0;
|
2008-11-07 11:02:51 +08:00
|
|
|
}
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
return ret;
|
|
|
|
}
|
2008-09-24 01:14:14 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
static int insert_extent_backref(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_path *path,
|
|
|
|
u64 bytenr, u64 parent, u64 root_objectid,
|
|
|
|
u64 owner, u64 offset, int refs_to_add)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
if (owner < BTRFS_FIRST_FREE_OBJECTID) {
|
|
|
|
BUG_ON(refs_to_add != 1);
|
2018-06-20 20:48:43 +08:00
|
|
|
ret = insert_tree_block_ref(trans, path, bytenr, parent,
|
|
|
|
root_objectid);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
} else {
|
2018-06-20 20:48:44 +08:00
|
|
|
ret = insert_extent_data_ref(trans, path, bytenr, parent,
|
|
|
|
root_objectid, owner, offset,
|
|
|
|
refs_to_add);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
2009-03-13 22:10:06 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
static int remove_extent_backref(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_path *path,
|
|
|
|
struct btrfs_extent_inline_ref *iref,
|
2014-05-14 08:30:47 +08:00
|
|
|
int refs_to_drop, int is_data, int *last_ref)
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
{
|
2012-03-01 21:56:26 +08:00
|
|
|
int ret = 0;
|
2009-03-13 23:00:37 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
BUG_ON(!is_data && refs_to_drop != 1);
|
|
|
|
if (iref) {
|
2018-06-20 20:48:49 +08:00
|
|
|
update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
|
|
|
|
last_ref);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
} else if (is_data) {
|
2018-06-20 20:48:46 +08:00
|
|
|
ret = remove_extent_data_ref(trans, path, refs_to_drop,
|
2014-05-14 08:30:47 +08:00
|
|
|
last_ref);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
} else {
|
2014-05-14 08:30:47 +08:00
|
|
|
*last_ref = 1;
|
2018-06-20 20:49:12 +08:00
|
|
|
ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2015-06-15 21:41:14 +08:00
|
|
|
static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
|
|
|
|
u64 *discarded_bytes)
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
{
|
2015-06-15 21:41:16 +08:00
|
|
|
int j, ret = 0;
|
|
|
|
u64 bytes_left, end;
|
2015-06-15 21:41:15 +08:00
|
|
|
u64 aligned_start = ALIGN(start, 1 << 9);
|
2015-06-15 21:41:14 +08:00
|
|
|
|
2015-06-15 21:41:15 +08:00
|
|
|
if (WARN_ON(start != aligned_start)) {
|
|
|
|
len -= aligned_start - start;
|
|
|
|
len = round_down(len, 1 << 9);
|
|
|
|
start = aligned_start;
|
|
|
|
}
|
2015-06-15 21:41:14 +08:00
|
|
|
|
2015-06-15 21:41:15 +08:00
|
|
|
*discarded_bytes = 0;
|
2015-06-15 21:41:16 +08:00
|
|
|
|
|
|
|
if (!len)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
end = start + len;
|
|
|
|
bytes_left = len;
|
|
|
|
|
|
|
|
/* Skip any superblocks on this device. */
|
|
|
|
for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
|
|
|
|
u64 sb_start = btrfs_sb_offset(j);
|
|
|
|
u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
|
|
|
|
u64 size = sb_start - start;
|
|
|
|
|
|
|
|
if (!in_range(sb_start, start, bytes_left) &&
|
|
|
|
!in_range(sb_end, start, bytes_left) &&
|
|
|
|
!in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Superblock spans beginning of range. Adjust start and
|
|
|
|
* try again.
|
|
|
|
*/
|
|
|
|
if (sb_start <= start) {
|
|
|
|
start += sb_end - start;
|
|
|
|
if (start > end) {
|
|
|
|
bytes_left = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
bytes_left = end - start;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (size) {
|
|
|
|
ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
|
|
|
|
GFP_NOFS, 0);
|
|
|
|
if (!ret)
|
|
|
|
*discarded_bytes += size;
|
|
|
|
else if (ret != -EOPNOTSUPP)
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
start = sb_end;
|
|
|
|
if (start > end) {
|
|
|
|
bytes_left = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
bytes_left = end - start;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (bytes_left) {
|
|
|
|
ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
|
2015-06-15 21:41:15 +08:00
|
|
|
GFP_NOFS, 0);
|
|
|
|
if (!ret)
|
2015-06-15 21:41:16 +08:00
|
|
|
*discarded_bytes += bytes_left;
|
2015-06-15 21:41:15 +08:00
|
|
|
}
|
2015-06-15 21:41:14 +08:00
|
|
|
return ret;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
}
|
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
|
2014-12-08 22:01:12 +08:00
|
|
|
u64 num_bytes, u64 *actual_bytes)
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
{
|
|
|
|
int ret;
|
2011-03-24 18:24:27 +08:00
|
|
|
u64 discarded_bytes = 0;
|
2011-08-04 23:15:33 +08:00
|
|
|
struct btrfs_bio *bbio = NULL;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
2009-10-14 21:24:59 +08:00
|
|
|
|
Btrfs: fix race between device replace and discard
While we are finishing a device replace operation, we can make a discard
operation (fs mounted with -o discard) do an invalid memory access like
the one reported by the following trace:
[ 3206.384654] general protection fault: 0000 [#1] PREEMPT SMP
[ 3206.387520] Modules linked in: dm_mod btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis psmouse tpm ppdev sg parport_pc evdev i2c_piix4 parport
processor serio_raw i2c_core pcspkr button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom ata_generic sd_mod virtio_scsi ata_piix libata virtio_pci
virtio_ring scsi_mod e1000 virtio floppy [last unloaded: btrfs]
[ 3206.388595] CPU: 14 PID: 29194 Comm: fsstress Not tainted 4.6.0-rc7-btrfs-next-29+ #1
[ 3206.388595] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[ 3206.388595] task: ffff88017ace0100 ti: ffff880171b98000 task.ti: ffff880171b98000
[ 3206.388595] RIP: 0010:[<ffffffff8124d233>] [<ffffffff8124d233>] blkdev_issue_discard+0x5c/0x2a7
[ 3206.388595] RSP: 0018:ffff880171b9bb80 EFLAGS: 00010246
[ 3206.388595] RAX: ffff880171b9bc28 RBX: 000000000090d000 RCX: 0000000000000000
[ 3206.388595] RDX: ffffffff82fa1b48 RSI: ffffffff8179f46c RDI: ffffffff82fa1b48
[ 3206.388595] RBP: ffff880171b9bcc0 R08: 0000000000000000 R09: 0000000000000001
[ 3206.388595] R10: ffff880171b9bce0 R11: 000000000090f000 R12: ffff880171b9bbe8
[ 3206.388595] R13: 0000000000000010 R14: 0000000000004868 R15: 6b6b6b6b6b6b6b6b
[ 3206.388595] FS: 00007f6182e4e700(0000) GS:ffff88023fdc0000(0000) knlGS:0000000000000000
[ 3206.388595] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 3206.388595] CR2: 00007f617c2bbb18 CR3: 000000017ad9c000 CR4: 00000000000006e0
[ 3206.388595] Stack:
[ 3206.388595] 0000000000004878 0000000000000000 0000000002400040 0000000000000000
[ 3206.388595] 0000000000000000 ffff880171b9bbe8 ffff880171b9bbb0 ffff880171b9bbb0
[ 3206.388595] ffff880171b9bbc0 ffff880171b9bbc0 ffff880171b9bbd0 ffff880171b9bbd0
[ 3206.388595] Call Trace:
[ 3206.388595] [<ffffffffa042899e>] btrfs_issue_discard+0x12f/0x143 [btrfs]
[ 3206.388595] [<ffffffffa042899e>] ? btrfs_issue_discard+0x12f/0x143 [btrfs]
[ 3206.388595] [<ffffffffa042e862>] btrfs_discard_extent+0x87/0xde [btrfs]
[ 3206.388595] [<ffffffffa04303b5>] btrfs_finish_extent_commit+0xb2/0x1df [btrfs]
[ 3206.388595] [<ffffffff8149c246>] ? __mutex_unlock_slowpath+0x150/0x15b
[ 3206.388595] [<ffffffffa04464c4>] btrfs_commit_transaction+0x7fc/0x980 [btrfs]
[ 3206.388595] [<ffffffff8149c246>] ? __mutex_unlock_slowpath+0x150/0x15b
[ 3206.388595] [<ffffffffa0459af6>] btrfs_sync_file+0x38f/0x428 [btrfs]
[ 3206.388595] [<ffffffff811a8292>] vfs_fsync_range+0x8c/0x9e
[ 3206.388595] [<ffffffff811a82c0>] vfs_fsync+0x1c/0x1e
[ 3206.388595] [<ffffffff811a8417>] do_fsync+0x31/0x4a
[ 3206.388595] [<ffffffff811a8637>] SyS_fsync+0x10/0x14
[ 3206.388595] [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[ 3206.388595] [<ffffffff81100c6b>] ? time_hardirqs_off+0x9/0x14
[ 3206.388595] [<ffffffff8108e87d>] ? trace_hardirqs_off_caller+0x1f/0xaa
This happens because when we call btrfs_map_block() from
btrfs_discard_extent() to get a btrfs_bio structure, the device replace
operation has not finished yet, but before we use the device of one of the
stripes from the returned btrfs_bio structure, the device object is freed.
This is illustrated by the following diagram.
CPU 1 CPU 2
btrfs_dev_replace_start()
(...)
btrfs_dev_replace_finishing()
btrfs_start_transaction()
btrfs_commit_transaction()
(...)
btrfs_sync_file()
btrfs_start_transaction()
(...)
btrfs_commit_transaction()
btrfs_finish_extent_commit()
btrfs_discard_extent()
btrfs_map_block()
--> returns a struct btrfs_bio
with a stripe that has a
device field pointing to
source device of the replace
operation (the device that
is being replaced)
mutex_lock(&uuid_mutex)
mutex_lock(&fs_info->fs_devices->device_list_mutex)
mutex_lock(&fs_info->chunk_mutex)
btrfs_dev_replace_update_device_in_mapping_tree()
--> iterates the mapping tree and for each
extent map that has a stripe pointing to
the source device, it updates the stripe
to point to the target device instead
btrfs_rm_dev_replace_blocked()
--> waits for fs_info->bio_counter to go down to 0
btrfs_rm_dev_replace_remove_srcdev()
--> removes source device from the list of devices
mutex_unlock(&fs_info->chunk_mutex)
mutex_unlock(&fs_info->fs_devices->device_list_mutex)
mutex_unlock(&uuid_mutex)
btrfs_rm_dev_replace_free_srcdev()
--> frees the source device
--> iterates over all stripes
of the returned struct
btrfs_bio
--> for each stripe it
dereferences its device
pointer
--> it ends up finding a
pointer to the device
used as the source
device for the replace
operation and that was
already freed
So fix this by surrounding the call to btrfs_map_block(), and the code
that uses the returned struct btrfs_bio, with calls to
btrfs_bio_counter_inc_blocked() and btrfs_bio_counter_dec(), so that
the finishing phase of the device replace operation blocks until the
the bio counter decreases to zero before it frees the source device.
This is the same approach we do at btrfs_map_bio() for example.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
2016-05-28 00:42:05 +08:00
|
|
|
/*
|
|
|
|
* Avoid races with device replace and make sure our bbio has devices
|
|
|
|
* associated to its stripes that don't go away while we are discarding.
|
|
|
|
*/
|
2016-06-23 06:54:23 +08:00
|
|
|
btrfs_bio_counter_inc_blocked(fs_info);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
/* Tell the block device(s) that the sectors can be discarded */
|
2016-06-23 06:54:23 +08:00
|
|
|
ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
|
|
|
|
&bbio, 0);
|
2012-03-12 23:03:00 +08:00
|
|
|
/* Error condition is -ENOMEM */
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (!ret) {
|
2011-08-04 23:15:33 +08:00
|
|
|
struct btrfs_bio_stripe *stripe = bbio->stripes;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
int i;
|
|
|
|
|
|
|
|
|
2011-08-04 23:15:33 +08:00
|
|
|
for (i = 0; i < bbio->num_stripes; i++, stripe++) {
|
2015-06-15 21:41:14 +08:00
|
|
|
u64 bytes;
|
2017-11-29 18:53:43 +08:00
|
|
|
struct request_queue *req_q;
|
|
|
|
|
2018-01-31 02:40:22 +08:00
|
|
|
if (!stripe->dev->bdev) {
|
|
|
|
ASSERT(btrfs_test_opt(fs_info, DEGRADED));
|
|
|
|
continue;
|
|
|
|
}
|
2017-11-29 18:53:43 +08:00
|
|
|
req_q = bdev_get_queue(stripe->dev->bdev);
|
|
|
|
if (!blk_queue_discard(req_q))
|
2011-08-04 22:52:27 +08:00
|
|
|
continue;
|
|
|
|
|
2011-03-24 18:24:27 +08:00
|
|
|
ret = btrfs_issue_discard(stripe->dev->bdev,
|
|
|
|
stripe->physical,
|
2015-06-15 21:41:14 +08:00
|
|
|
stripe->length,
|
|
|
|
&bytes);
|
2011-03-24 18:24:27 +08:00
|
|
|
if (!ret)
|
2015-06-15 21:41:14 +08:00
|
|
|
discarded_bytes += bytes;
|
2011-03-24 18:24:27 +08:00
|
|
|
else if (ret != -EOPNOTSUPP)
|
2012-03-12 23:03:00 +08:00
|
|
|
break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
|
2011-08-04 22:52:27 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Just in case we get back EOPNOTSUPP for some reason,
|
|
|
|
* just ignore the return value so we don't screw up
|
|
|
|
* people calling discard_extent.
|
|
|
|
*/
|
|
|
|
ret = 0;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
}
|
2015-01-20 15:11:34 +08:00
|
|
|
btrfs_put_bbio(bbio);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
}
|
2016-06-23 06:54:23 +08:00
|
|
|
btrfs_bio_counter_dec(fs_info);
|
2011-03-24 18:24:27 +08:00
|
|
|
|
|
|
|
if (actual_bytes)
|
|
|
|
*actual_bytes = discarded_bytes;
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
2013-01-30 07:40:14 +08:00
|
|
|
if (ret == -EOPNOTSUPP)
|
|
|
|
ret = 0;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-03-12 23:03:00 +08:00
|
|
|
/* Can return -ENOMEM */
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
|
2019-04-04 14:45:35 +08:00
|
|
|
struct btrfs_ref *generic_ref)
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
{
|
2019-04-04 14:45:35 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2017-06-07 07:45:31 +08:00
|
|
|
int old_ref_mod, new_ref_mod;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
int ret;
|
2011-09-12 21:26:38 +08:00
|
|
|
|
2019-04-04 14:45:35 +08:00
|
|
|
ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
|
|
|
|
generic_ref->action);
|
|
|
|
BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
|
|
|
|
generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
2019-04-04 14:45:35 +08:00
|
|
|
if (generic_ref->type == BTRFS_REF_METADATA)
|
|
|
|
ret = btrfs_add_delayed_tree_ref(trans, generic_ref,
|
2019-04-04 14:45:31 +08:00
|
|
|
NULL, &old_ref_mod, &new_ref_mod);
|
2019-04-04 14:45:35 +08:00
|
|
|
else
|
|
|
|
ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0,
|
2017-06-07 07:45:31 +08:00
|
|
|
&old_ref_mod, &new_ref_mod);
|
|
|
|
|
2019-04-04 14:45:35 +08:00
|
|
|
btrfs_ref_tree_mod(fs_info, generic_ref);
|
2019-04-04 14:45:33 +08:00
|
|
|
|
2019-04-04 14:45:34 +08:00
|
|
|
if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
|
2019-05-15 07:33:48 +08:00
|
|
|
sub_pinned_bytes(fs_info, generic_ref);
|
2017-06-07 07:45:31 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-06-18 19:59:25 +08:00
|
|
|
/*
|
|
|
|
* __btrfs_inc_extent_ref - insert backreference for a given extent
|
|
|
|
*
|
|
|
|
* @trans: Handle of transaction
|
|
|
|
*
|
|
|
|
* @node: The delayed ref node used to get the bytenr/length for
|
|
|
|
* extent whose references are incremented.
|
|
|
|
*
|
|
|
|
* @parent: If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
|
|
|
|
* BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
|
|
|
|
* bytenr of the parent block. Since new extents are always
|
|
|
|
* created with indirect references, this will only be the case
|
|
|
|
* when relocating a shared extent. In that case, root_objectid
|
|
|
|
* will be BTRFS_TREE_RELOC_OBJECTID. Otheriwse, parent must
|
|
|
|
* be 0
|
|
|
|
*
|
|
|
|
* @root_objectid: The id of the root where this modification has originated,
|
|
|
|
* this can be either one of the well-known metadata trees or
|
|
|
|
* the subvolume id which references this extent.
|
|
|
|
*
|
|
|
|
* @owner: For data extents it is the inode number of the owning file.
|
|
|
|
* For metadata extents this parameter holds the level in the
|
|
|
|
* tree of the extent.
|
|
|
|
*
|
|
|
|
* @offset: For metadata extents the offset is ignored and is currently
|
|
|
|
* always passed as 0. For data extents it is the fileoffset
|
|
|
|
* this extent belongs to.
|
|
|
|
*
|
|
|
|
* @refs_to_add Number of references to add
|
|
|
|
*
|
|
|
|
* @extent_op Pointer to a structure, holding information necessary when
|
|
|
|
* updating a tree block's flags
|
|
|
|
*
|
|
|
|
*/
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
|
2015-03-17 16:59:47 +08:00
|
|
|
struct btrfs_delayed_ref_node *node,
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
u64 parent, u64 root_objectid,
|
|
|
|
u64 owner, u64 offset, int refs_to_add,
|
|
|
|
struct btrfs_delayed_extent_op *extent_op)
|
|
|
|
{
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct btrfs_extent_item *item;
|
2014-05-14 08:30:47 +08:00
|
|
|
struct btrfs_key key;
|
2015-03-17 16:59:47 +08:00
|
|
|
u64 bytenr = node->bytenr;
|
|
|
|
u64 num_bytes = node->num_bytes;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
u64 refs;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2015-11-27 23:31:35 +08:00
|
|
|
path->reada = READA_FORWARD;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
path->leave_spinning = 1;
|
|
|
|
/* this will setup the path even if it fails to insert the back ref */
|
2018-06-20 20:49:10 +08:00
|
|
|
ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
|
|
|
|
parent, root_objectid, owner,
|
|
|
|
offset, refs_to_add, extent_op);
|
2015-04-16 16:55:08 +08:00
|
|
|
if ((ret < 0 && ret != -EAGAIN) || !ret)
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
goto out;
|
2014-05-14 08:30:47 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Ok we had -EAGAIN which means we didn't have space to insert and
|
|
|
|
* inline extent ref, so just update the reference count and add a
|
|
|
|
* normal backref.
|
|
|
|
*/
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
leaf = path->nodes[0];
|
2014-05-14 08:30:47 +08:00
|
|
|
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
|
|
|
|
refs = btrfs_extent_refs(leaf, item);
|
|
|
|
btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
|
|
|
|
if (extent_op)
|
|
|
|
__run_delayed_extent_op(extent_op, leaf, item);
|
2009-03-13 22:10:06 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
2011-04-21 07:20:15 +08:00
|
|
|
btrfs_release_path(path);
|
2009-03-13 22:10:06 +08:00
|
|
|
|
2015-11-27 23:31:35 +08:00
|
|
|
path->reada = READA_FORWARD;
|
2009-03-13 23:00:37 +08:00
|
|
|
path->leave_spinning = 1;
|
2009-03-13 22:10:06 +08:00
|
|
|
/* now insert the actual backref */
|
2018-06-20 20:48:45 +08:00
|
|
|
ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
|
|
|
|
owner, offset, refs_to_add);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret)
|
2016-06-11 06:19:25 +08:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
out:
|
2009-03-13 22:10:06 +08:00
|
|
|
btrfs_free_path(path);
|
2013-10-11 16:30:23 +08:00
|
|
|
return ret;
|
2009-03-13 22:10:06 +08:00
|
|
|
}
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_delayed_ref_node *node,
|
|
|
|
struct btrfs_delayed_extent_op *extent_op,
|
|
|
|
int insert_reserved)
|
2009-03-13 22:10:06 +08:00
|
|
|
{
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
int ret = 0;
|
|
|
|
struct btrfs_delayed_data_ref *ref;
|
|
|
|
struct btrfs_key ins;
|
|
|
|
u64 parent = 0;
|
|
|
|
u64 ref_root = 0;
|
|
|
|
u64 flags = 0;
|
|
|
|
|
|
|
|
ins.objectid = node->bytenr;
|
|
|
|
ins.offset = node->num_bytes;
|
|
|
|
ins.type = BTRFS_EXTENT_ITEM_KEY;
|
|
|
|
|
|
|
|
ref = btrfs_delayed_node_to_data_ref(node);
|
2018-06-20 20:49:00 +08:00
|
|
|
trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
|
2013-07-16 19:03:36 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (node->type == BTRFS_SHARED_DATA_REF_KEY)
|
|
|
|
parent = ref->parent;
|
2014-05-14 08:30:47 +08:00
|
|
|
ref_root = ref->root;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
|
|
|
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
|
2013-03-08 03:22:04 +08:00
|
|
|
if (extent_op)
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
flags |= extent_op->flags_to_set;
|
2018-06-20 20:48:58 +08:00
|
|
|
ret = alloc_reserved_file_extent(trans, parent, ref_root,
|
|
|
|
flags, ref->objectid,
|
|
|
|
ref->offset, &ins,
|
|
|
|
node->ref_mod);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
|
2018-06-20 20:48:59 +08:00
|
|
|
ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
|
|
|
|
ref->objectid, ref->offset,
|
|
|
|
node->ref_mod, extent_op);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
|
2018-06-20 20:48:57 +08:00
|
|
|
ret = __btrfs_free_extent(trans, node, parent,
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
ref_root, ref->objectid,
|
|
|
|
ref->offset, node->ref_mod,
|
2015-03-17 16:59:47 +08:00
|
|
|
extent_op);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
} else {
|
|
|
|
BUG();
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
|
|
|
|
struct extent_buffer *leaf,
|
|
|
|
struct btrfs_extent_item *ei)
|
|
|
|
{
|
|
|
|
u64 flags = btrfs_extent_flags(leaf, ei);
|
|
|
|
if (extent_op->update_flags) {
|
|
|
|
flags |= extent_op->flags_to_set;
|
|
|
|
btrfs_set_extent_flags(leaf, ei, flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (extent_op->update_key) {
|
|
|
|
struct btrfs_tree_block_info *bi;
|
|
|
|
BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
|
|
|
|
bi = (struct btrfs_tree_block_info *)(ei + 1);
|
|
|
|
btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
|
2017-09-30 03:43:57 +08:00
|
|
|
struct btrfs_delayed_ref_head *head,
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
struct btrfs_delayed_extent_op *extent_op)
|
|
|
|
{
|
2018-06-20 20:49:01 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_extent_item *ei;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
u32 item_size;
|
2009-03-13 22:10:06 +08:00
|
|
|
int ret;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
int err = 0;
|
2013-05-10 01:49:30 +08:00
|
|
|
int metadata = !extent_op->is_data;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
2012-03-12 23:03:00 +08:00
|
|
|
if (trans->aborted)
|
|
|
|
return 0;
|
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
|
2013-03-08 03:22:04 +08:00
|
|
|
metadata = 0;
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2017-09-30 03:43:57 +08:00
|
|
|
key.objectid = head->bytenr;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
2013-03-08 03:22:04 +08:00
|
|
|
if (metadata) {
|
|
|
|
key.type = BTRFS_METADATA_ITEM_KEY;
|
2013-05-10 01:49:30 +08:00
|
|
|
key.offset = extent_op->level;
|
2013-03-08 03:22:04 +08:00
|
|
|
} else {
|
|
|
|
key.type = BTRFS_EXTENT_ITEM_KEY;
|
2017-09-30 03:43:57 +08:00
|
|
|
key.offset = head->num_bytes;
|
2013-03-08 03:22:04 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
again:
|
2015-11-27 23:31:35 +08:00
|
|
|
path->reada = READA_FORWARD;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
path->leave_spinning = 1;
|
2016-06-23 06:54:23 +08:00
|
|
|
ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (ret < 0) {
|
|
|
|
err = ret;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (ret > 0) {
|
2013-03-08 03:22:04 +08:00
|
|
|
if (metadata) {
|
2013-10-18 22:42:56 +08:00
|
|
|
if (path->slots[0] > 0) {
|
|
|
|
path->slots[0]--;
|
|
|
|
btrfs_item_key_to_cpu(path->nodes[0], &key,
|
|
|
|
path->slots[0]);
|
2017-09-30 03:43:57 +08:00
|
|
|
if (key.objectid == head->bytenr &&
|
2013-10-18 22:42:56 +08:00
|
|
|
key.type == BTRFS_EXTENT_ITEM_KEY &&
|
2017-09-30 03:43:57 +08:00
|
|
|
key.offset == head->num_bytes)
|
2013-10-18 22:42:56 +08:00
|
|
|
ret = 0;
|
|
|
|
}
|
|
|
|
if (ret > 0) {
|
|
|
|
btrfs_release_path(path);
|
|
|
|
metadata = 0;
|
2013-03-08 03:22:04 +08:00
|
|
|
|
2017-09-30 03:43:57 +08:00
|
|
|
key.objectid = head->bytenr;
|
|
|
|
key.offset = head->num_bytes;
|
2013-10-18 22:42:56 +08:00
|
|
|
key.type = BTRFS_EXTENT_ITEM_KEY;
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
err = -EIO;
|
|
|
|
goto out;
|
2013-03-08 03:22:04 +08:00
|
|
|
}
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
|
2018-06-26 21:57:36 +08:00
|
|
|
|
2018-06-26 22:20:59 +08:00
|
|
|
if (unlikely(item_size < sizeof(*ei))) {
|
2018-06-26 21:57:36 +08:00
|
|
|
err = -EINVAL;
|
|
|
|
btrfs_print_v0_err(fs_info);
|
|
|
|
btrfs_abort_transaction(trans, err);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
|
|
|
|
__run_delayed_extent_op(extent_op, leaf, ei);
|
2009-03-13 22:10:06 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return err;
|
2009-03-13 22:10:06 +08:00
|
|
|
}
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_delayed_ref_node *node,
|
|
|
|
struct btrfs_delayed_extent_op *extent_op,
|
|
|
|
int insert_reserved)
|
2009-03-13 22:10:06 +08:00
|
|
|
{
|
|
|
|
int ret = 0;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
struct btrfs_delayed_tree_ref *ref;
|
|
|
|
u64 parent = 0;
|
|
|
|
u64 ref_root = 0;
|
2009-03-13 22:10:06 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
ref = btrfs_delayed_node_to_tree_ref(node);
|
2018-06-20 20:49:04 +08:00
|
|
|
trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
|
2013-07-16 19:03:36 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
|
|
|
|
parent = ref->parent;
|
2014-05-14 08:30:47 +08:00
|
|
|
ref_root = ref->root;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
2016-09-15 10:19:05 +08:00
|
|
|
if (node->ref_mod != 1) {
|
2018-06-20 20:49:04 +08:00
|
|
|
btrfs_err(trans->fs_info,
|
2016-09-15 10:19:05 +08:00
|
|
|
"btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
|
|
|
|
node->bytenr, node->ref_mod, node->action, ref_root,
|
|
|
|
parent);
|
|
|
|
return -EIO;
|
|
|
|
}
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
|
2013-03-08 03:22:04 +08:00
|
|
|
BUG_ON(!extent_op || !extent_op->update_flags);
|
2018-05-21 17:27:22 +08:00
|
|
|
ret = alloc_reserved_tree_block(trans, node, extent_op);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
|
2018-06-20 20:48:59 +08:00
|
|
|
ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
|
|
|
|
ref->level, 0, 1, extent_op);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
|
2018-06-20 20:48:57 +08:00
|
|
|
ret = __btrfs_free_extent(trans, node, parent, ref_root,
|
2015-03-17 16:59:47 +08:00
|
|
|
ref->level, 0, 1, extent_op);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
} else {
|
|
|
|
BUG();
|
|
|
|
}
|
2009-03-13 22:10:06 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* helper function to actually process a single delayed ref entry */
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_delayed_ref_node *node,
|
|
|
|
struct btrfs_delayed_extent_op *extent_op,
|
|
|
|
int insert_reserved)
|
2009-03-13 22:10:06 +08:00
|
|
|
{
|
2012-03-12 23:03:00 +08:00
|
|
|
int ret = 0;
|
|
|
|
|
2013-10-08 03:21:08 +08:00
|
|
|
if (trans->aborted) {
|
|
|
|
if (insert_reserved)
|
2018-06-20 20:49:11 +08:00
|
|
|
btrfs_pin_extent(trans->fs_info, node->bytenr,
|
2013-10-08 03:21:08 +08:00
|
|
|
node->num_bytes, 1);
|
2012-03-12 23:03:00 +08:00
|
|
|
return 0;
|
2013-10-08 03:21:08 +08:00
|
|
|
}
|
2012-03-12 23:03:00 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
|
|
|
|
node->type == BTRFS_SHARED_BLOCK_REF_KEY)
|
2018-06-20 20:49:04 +08:00
|
|
|
ret = run_delayed_tree_ref(trans, node, extent_op,
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
insert_reserved);
|
|
|
|
else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
|
|
|
|
node->type == BTRFS_SHARED_DATA_REF_KEY)
|
2018-06-20 20:49:00 +08:00
|
|
|
ret = run_delayed_data_ref(trans, node, extent_op,
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
insert_reserved);
|
|
|
|
else
|
|
|
|
BUG();
|
2018-10-12 03:54:22 +08:00
|
|
|
if (ret && insert_reserved)
|
|
|
|
btrfs_pin_extent(trans->fs_info, node->bytenr,
|
|
|
|
node->num_bytes, 1);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
return ret;
|
2009-03-13 22:10:06 +08:00
|
|
|
}
|
|
|
|
|
2015-03-30 17:03:00 +08:00
|
|
|
static inline struct btrfs_delayed_ref_node *
|
2009-03-13 22:10:06 +08:00
|
|
|
select_delayed_ref(struct btrfs_delayed_ref_head *head)
|
|
|
|
{
|
Btrfs: fix order by which delayed references are run
When we have an extent that got N references removed and N new references
added in the same transaction, we must run the insertion of the references
first because otherwise the last removed reference will remove the extent
item from the extent tree, resulting in a failure for the insertions.
This is a regression introduced in the 4.2-rc1 release and this fix just
brings back the behaviour of selecting reference additions before any
reference removals.
The following test case for fstests reproduces the issue:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
_cleanup_flakey
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
. ./common/dmflakey
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_dm_flakey
_require_cloner
_require_metadata_journaling $SCRATCH_DEV
rm -f $seqres.full
_scratch_mkfs >>$seqres.full 2>&1
_init_flakey
_mount_flakey
# Create prealloc extent covering range [160K, 620K[
$XFS_IO_PROG -f -c "falloc 160K 460K" $SCRATCH_MNT/foo
# Now write to the last 80K of the prealloc extent plus 40K to the unallocated
# space that immediately follows it. This creates a new extent of 40K that spans
# the range [620K, 660K[.
$XFS_IO_PROG -c "pwrite -S 0xaa 540K 120K" $SCRATCH_MNT/foo | _filter_xfs_io
# At this point, there are now 2 back references to the prealloc extent in our
# extent tree. Both are for our file offset 160K and one relates to a file
# extent item with a data offset of 0 and a length of 380K, while the other
# relates to a file extent item with a data offset of 380K and a length of 80K.
# Make sure everything done so far is durably persisted (all back references are
# in the extent tree, etc).
sync
# Now clone all extents of our file that cover the offset 160K up to its eof
# (660K at this point) into itself at offset 2M. This leaves a hole in the file
# covering the range [660K, 2M[. The prealloc extent will now be referenced by
# the file twice, once for offset 160K and once for offset 2M. The 40K extent
# that follows the prealloc extent will also be referenced twice by our file,
# once for offset 620K and once for offset 2M + 460K.
$CLONER_PROG -s $((160 * 1024)) -d $((2 * 1024 * 1024)) -l 0 $SCRATCH_MNT/foo \
$SCRATCH_MNT/foo
# Now create one new extent in our file with a size of 100Kb. It will span the
# range [3M, 3M + 100K[. It also will cause creation of a hole spanning the
# range [2M + 460K, 3M[. Our new file size is 3M + 100K.
$XFS_IO_PROG -c "pwrite -S 0xbb 3M 100K" $SCRATCH_MNT/foo | _filter_xfs_io
# At this point, there are now (in memory) 4 back references to the prealloc
# extent.
#
# Two of them are for file offset 160K, related to file extent items
# matching the file offsets 160K and 540K respectively, with data offsets of
# 0 and 380K respectively, and with lengths of 380K and 80K respectively.
#
# The other two references are for file offset 2M, related to file extent items
# matching the file offsets 2M and 2M + 380K respectively, with data offsets of
# 0 and 380K respectively, and with lengths of 389K and 80K respectively.
#
# The 40K extent has 2 back references, one for file offset 620K and the other
# for file offset 2M + 460K.
#
# The 100K extent has a single back reference and it relates to file offset 3M.
# Now clone our 100K extent into offset 600K. That offset covers the last 20K
# of the prealloc extent, the whole 40K extent and 40K of the hole starting at
# offset 660K.
$CLONER_PROG -s $((3 * 1024 * 1024)) -d $((600 * 1024)) -l $((100 * 1024)) \
$SCRATCH_MNT/foo $SCRATCH_MNT/foo
# At this point there's only one reference to the 40K extent, at file offset
# 2M + 460K, we have 4 references for the prealloc extent (2 for file offset
# 160K and 2 for file offset 2M) and 2 references for the 100K extent (1 for
# file offset 3M and a new one for file offset 600K).
# Now fsync our file to make all its new data and metadata updates are durably
# persisted and present if a power failure/crash happens after a successful
# fsync and before the next transaction commit.
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo
echo "File digest before power failure:"
md5sum $SCRATCH_MNT/foo | _filter_scratch
# Silently drop all writes and ummount to simulate a crash/power failure.
_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey
# Allow writes again, mount to trigger log replay and validate file contents.
# During log replay, the btrfs delayed references implementation used to run the
# deletion of back references before the addition of new back references, which
# made the addition fail as it didn't find the key in the extent tree that it
# was looking for. The failure triggered by this test was related to the 40K
# extent, which got 1 reference dropped and 1 reference added during the fsync
# log replay - when running the delayed references at transaction commit time,
# btrfs was applying the deletion before the insertion, resulting in a failure
# of the insertion that ended up turning the fs into read-only mode.
_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey
echo "File digest after log replay:"
md5sum $SCRATCH_MNT/foo | _filter_scratch
_unmount_flakey
status=0
exit
This issue turned the filesystem into read-only mode (current transaction
aborted) and produced the following traces:
[ 8247.578385] ------------[ cut here ]------------
[ 8247.579947] WARNING: CPU: 0 PID: 11341 at fs/btrfs/extent-tree.c:1547 lookup_inline_extent_backref+0x17d/0x45d [btrfs]()
(...)
[ 8247.601697] Call Trace:
[ 8247.602222] [<ffffffff8145f077>] dump_stack+0x4f/0x7b
[ 8247.604320] [<ffffffff8104b3b0>] warn_slowpath_common+0xa1/0xbb
[ 8247.605488] [<ffffffffa0506c8d>] ? lookup_inline_extent_backref+0x17d/0x45d [btrfs]
[ 8247.608226] [<ffffffffa0506c8d>] lookup_inline_extent_backref+0x17d/0x45d [btrfs]
[ 8247.617061] [<ffffffffa0507957>] insert_inline_extent_backref+0x41/0xb2 [btrfs]
[ 8247.621856] [<ffffffffa0507c4f>] __btrfs_inc_extent_ref+0x8c/0x20a [btrfs]
[ 8247.624366] [<ffffffffa050ee60>] __btrfs_run_delayed_refs+0xb0c/0xd49 [btrfs]
[ 8247.626176] [<ffffffffa0510dcd>] btrfs_run_delayed_refs+0x6d/0x1d4 [btrfs]
[ 8247.627435] [<ffffffff81155c9b>] ? __cache_free+0x4a7/0x4b6
[ 8247.628531] [<ffffffffa0520482>] btrfs_commit_transaction+0x4c/0xa20 [btrfs]
(...)
[ 8247.648430] ---[ end trace 2461e55f92c2ac2d ]---
[ 8247.727263] WARNING: CPU: 3 PID: 11341 at fs/btrfs/extent-tree.c:2771 btrfs_run_delayed_refs+0xa4/0x1d4 [btrfs]()
[ 8247.728954] BTRFS: Transaction aborted (error -5)
(...)
[ 8247.760866] Call Trace:
[ 8247.761534] [<ffffffff8145f077>] dump_stack+0x4f/0x7b
[ 8247.764271] [<ffffffff8104b3b0>] warn_slowpath_common+0xa1/0xbb
[ 8247.767582] [<ffffffffa0510e04>] ? btrfs_run_delayed_refs+0xa4/0x1d4 [btrfs]
[ 8247.769373] [<ffffffff8104b410>] warn_slowpath_fmt+0x46/0x48
[ 8247.770836] [<ffffffffa0510e04>] btrfs_run_delayed_refs+0xa4/0x1d4 [btrfs]
[ 8247.772532] [<ffffffff81155c9b>] ? __cache_free+0x4a7/0x4b6
[ 8247.773664] [<ffffffffa0520482>] btrfs_commit_transaction+0x4c/0xa20 [btrfs]
[ 8247.775047] [<ffffffff81087310>] ? trace_hardirqs_on+0xd/0xf
[ 8247.776176] [<ffffffff81155dd5>] ? kmem_cache_free+0x12b/0x189
[ 8247.777427] [<ffffffffa055a920>] btrfs_recover_log_trees+0x2da/0x33d [btrfs]
[ 8247.778575] [<ffffffffa055898e>] ? replay_one_extent+0x4fc/0x4fc [btrfs]
[ 8247.779838] [<ffffffffa051e265>] open_ctree+0x1cc0/0x201a [btrfs]
[ 8247.781020] [<ffffffff81120f48>] ? register_shrinker+0x56/0x81
[ 8247.782285] [<ffffffffa04fb12c>] btrfs_mount+0x5f0/0x734 [btrfs]
(...)
[ 8247.793394] ---[ end trace 2461e55f92c2ac2e ]---
[ 8247.794276] BTRFS: error (device dm-0) in btrfs_run_delayed_refs:2771: errno=-5 IO failure
[ 8247.797335] BTRFS: error (device dm-0) in btrfs_replay_log:2375: errno=-5 IO failure (Failed to recover log tree)
Fixes: c6fc24549960 ("btrfs: delayed-ref: Use list to replace the ref_root in ref_head.")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Acked-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
2015-07-09 20:13:44 +08:00
|
|
|
struct btrfs_delayed_ref_node *ref;
|
|
|
|
|
2018-08-23 03:51:50 +08:00
|
|
|
if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
|
2015-03-30 17:03:00 +08:00
|
|
|
return NULL;
|
2014-01-23 22:21:38 +08:00
|
|
|
|
Btrfs: fix order by which delayed references are run
When we have an extent that got N references removed and N new references
added in the same transaction, we must run the insertion of the references
first because otherwise the last removed reference will remove the extent
item from the extent tree, resulting in a failure for the insertions.
This is a regression introduced in the 4.2-rc1 release and this fix just
brings back the behaviour of selecting reference additions before any
reference removals.
The following test case for fstests reproduces the issue:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
_cleanup_flakey
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
. ./common/dmflakey
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_dm_flakey
_require_cloner
_require_metadata_journaling $SCRATCH_DEV
rm -f $seqres.full
_scratch_mkfs >>$seqres.full 2>&1
_init_flakey
_mount_flakey
# Create prealloc extent covering range [160K, 620K[
$XFS_IO_PROG -f -c "falloc 160K 460K" $SCRATCH_MNT/foo
# Now write to the last 80K of the prealloc extent plus 40K to the unallocated
# space that immediately follows it. This creates a new extent of 40K that spans
# the range [620K, 660K[.
$XFS_IO_PROG -c "pwrite -S 0xaa 540K 120K" $SCRATCH_MNT/foo | _filter_xfs_io
# At this point, there are now 2 back references to the prealloc extent in our
# extent tree. Both are for our file offset 160K and one relates to a file
# extent item with a data offset of 0 and a length of 380K, while the other
# relates to a file extent item with a data offset of 380K and a length of 80K.
# Make sure everything done so far is durably persisted (all back references are
# in the extent tree, etc).
sync
# Now clone all extents of our file that cover the offset 160K up to its eof
# (660K at this point) into itself at offset 2M. This leaves a hole in the file
# covering the range [660K, 2M[. The prealloc extent will now be referenced by
# the file twice, once for offset 160K and once for offset 2M. The 40K extent
# that follows the prealloc extent will also be referenced twice by our file,
# once for offset 620K and once for offset 2M + 460K.
$CLONER_PROG -s $((160 * 1024)) -d $((2 * 1024 * 1024)) -l 0 $SCRATCH_MNT/foo \
$SCRATCH_MNT/foo
# Now create one new extent in our file with a size of 100Kb. It will span the
# range [3M, 3M + 100K[. It also will cause creation of a hole spanning the
# range [2M + 460K, 3M[. Our new file size is 3M + 100K.
$XFS_IO_PROG -c "pwrite -S 0xbb 3M 100K" $SCRATCH_MNT/foo | _filter_xfs_io
# At this point, there are now (in memory) 4 back references to the prealloc
# extent.
#
# Two of them are for file offset 160K, related to file extent items
# matching the file offsets 160K and 540K respectively, with data offsets of
# 0 and 380K respectively, and with lengths of 380K and 80K respectively.
#
# The other two references are for file offset 2M, related to file extent items
# matching the file offsets 2M and 2M + 380K respectively, with data offsets of
# 0 and 380K respectively, and with lengths of 389K and 80K respectively.
#
# The 40K extent has 2 back references, one for file offset 620K and the other
# for file offset 2M + 460K.
#
# The 100K extent has a single back reference and it relates to file offset 3M.
# Now clone our 100K extent into offset 600K. That offset covers the last 20K
# of the prealloc extent, the whole 40K extent and 40K of the hole starting at
# offset 660K.
$CLONER_PROG -s $((3 * 1024 * 1024)) -d $((600 * 1024)) -l $((100 * 1024)) \
$SCRATCH_MNT/foo $SCRATCH_MNT/foo
# At this point there's only one reference to the 40K extent, at file offset
# 2M + 460K, we have 4 references for the prealloc extent (2 for file offset
# 160K and 2 for file offset 2M) and 2 references for the 100K extent (1 for
# file offset 3M and a new one for file offset 600K).
# Now fsync our file to make all its new data and metadata updates are durably
# persisted and present if a power failure/crash happens after a successful
# fsync and before the next transaction commit.
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo
echo "File digest before power failure:"
md5sum $SCRATCH_MNT/foo | _filter_scratch
# Silently drop all writes and ummount to simulate a crash/power failure.
_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey
# Allow writes again, mount to trigger log replay and validate file contents.
# During log replay, the btrfs delayed references implementation used to run the
# deletion of back references before the addition of new back references, which
# made the addition fail as it didn't find the key in the extent tree that it
# was looking for. The failure triggered by this test was related to the 40K
# extent, which got 1 reference dropped and 1 reference added during the fsync
# log replay - when running the delayed references at transaction commit time,
# btrfs was applying the deletion before the insertion, resulting in a failure
# of the insertion that ended up turning the fs into read-only mode.
_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey
echo "File digest after log replay:"
md5sum $SCRATCH_MNT/foo | _filter_scratch
_unmount_flakey
status=0
exit
This issue turned the filesystem into read-only mode (current transaction
aborted) and produced the following traces:
[ 8247.578385] ------------[ cut here ]------------
[ 8247.579947] WARNING: CPU: 0 PID: 11341 at fs/btrfs/extent-tree.c:1547 lookup_inline_extent_backref+0x17d/0x45d [btrfs]()
(...)
[ 8247.601697] Call Trace:
[ 8247.602222] [<ffffffff8145f077>] dump_stack+0x4f/0x7b
[ 8247.604320] [<ffffffff8104b3b0>] warn_slowpath_common+0xa1/0xbb
[ 8247.605488] [<ffffffffa0506c8d>] ? lookup_inline_extent_backref+0x17d/0x45d [btrfs]
[ 8247.608226] [<ffffffffa0506c8d>] lookup_inline_extent_backref+0x17d/0x45d [btrfs]
[ 8247.617061] [<ffffffffa0507957>] insert_inline_extent_backref+0x41/0xb2 [btrfs]
[ 8247.621856] [<ffffffffa0507c4f>] __btrfs_inc_extent_ref+0x8c/0x20a [btrfs]
[ 8247.624366] [<ffffffffa050ee60>] __btrfs_run_delayed_refs+0xb0c/0xd49 [btrfs]
[ 8247.626176] [<ffffffffa0510dcd>] btrfs_run_delayed_refs+0x6d/0x1d4 [btrfs]
[ 8247.627435] [<ffffffff81155c9b>] ? __cache_free+0x4a7/0x4b6
[ 8247.628531] [<ffffffffa0520482>] btrfs_commit_transaction+0x4c/0xa20 [btrfs]
(...)
[ 8247.648430] ---[ end trace 2461e55f92c2ac2d ]---
[ 8247.727263] WARNING: CPU: 3 PID: 11341 at fs/btrfs/extent-tree.c:2771 btrfs_run_delayed_refs+0xa4/0x1d4 [btrfs]()
[ 8247.728954] BTRFS: Transaction aborted (error -5)
(...)
[ 8247.760866] Call Trace:
[ 8247.761534] [<ffffffff8145f077>] dump_stack+0x4f/0x7b
[ 8247.764271] [<ffffffff8104b3b0>] warn_slowpath_common+0xa1/0xbb
[ 8247.767582] [<ffffffffa0510e04>] ? btrfs_run_delayed_refs+0xa4/0x1d4 [btrfs]
[ 8247.769373] [<ffffffff8104b410>] warn_slowpath_fmt+0x46/0x48
[ 8247.770836] [<ffffffffa0510e04>] btrfs_run_delayed_refs+0xa4/0x1d4 [btrfs]
[ 8247.772532] [<ffffffff81155c9b>] ? __cache_free+0x4a7/0x4b6
[ 8247.773664] [<ffffffffa0520482>] btrfs_commit_transaction+0x4c/0xa20 [btrfs]
[ 8247.775047] [<ffffffff81087310>] ? trace_hardirqs_on+0xd/0xf
[ 8247.776176] [<ffffffff81155dd5>] ? kmem_cache_free+0x12b/0x189
[ 8247.777427] [<ffffffffa055a920>] btrfs_recover_log_trees+0x2da/0x33d [btrfs]
[ 8247.778575] [<ffffffffa055898e>] ? replay_one_extent+0x4fc/0x4fc [btrfs]
[ 8247.779838] [<ffffffffa051e265>] open_ctree+0x1cc0/0x201a [btrfs]
[ 8247.781020] [<ffffffff81120f48>] ? register_shrinker+0x56/0x81
[ 8247.782285] [<ffffffffa04fb12c>] btrfs_mount+0x5f0/0x734 [btrfs]
(...)
[ 8247.793394] ---[ end trace 2461e55f92c2ac2e ]---
[ 8247.794276] BTRFS: error (device dm-0) in btrfs_run_delayed_refs:2771: errno=-5 IO failure
[ 8247.797335] BTRFS: error (device dm-0) in btrfs_replay_log:2375: errno=-5 IO failure (Failed to recover log tree)
Fixes: c6fc24549960 ("btrfs: delayed-ref: Use list to replace the ref_root in ref_head.")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Acked-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
2015-07-09 20:13:44 +08:00
|
|
|
/*
|
|
|
|
* Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
|
|
|
|
* This is to prevent a ref count from going down to zero, which deletes
|
|
|
|
* the extent item from the extent tree, when there still are references
|
|
|
|
* to add, which would fail because they would not find the extent item.
|
|
|
|
*/
|
btrfs: improve delayed refs iterations
This issue was found when I tried to delete a heavily reflinked file,
when deleting such files, other transaction operation will not have a
chance to make progress, for example, start_transaction() will blocked
in wait_current_trans(root) for long time, sometimes it even triggers
soft lockups, and the time taken to delete such heavily reflinked file
is also very large, often hundreds of seconds. Using perf top, it reports
that:
PerfTop: 7416 irqs/sec kernel:99.8% exact: 0.0% [4000Hz cpu-clock], (all, 4 CPUs)
---------------------------------------------------------------------------------------
84.37% [btrfs] [k] __btrfs_run_delayed_refs.constprop.80
11.02% [kernel] [k] delay_tsc
0.79% [kernel] [k] _raw_spin_unlock_irq
0.78% [kernel] [k] _raw_spin_unlock_irqrestore
0.45% [kernel] [k] do_raw_spin_lock
0.18% [kernel] [k] __slab_alloc
It seems __btrfs_run_delayed_refs() took most cpu time, after some debug
work, I found it's select_delayed_ref() causing this issue, for a delayed
head, in our case, it'll be full of BTRFS_DROP_DELAYED_REF nodes, but
select_delayed_ref() will firstly try to iterate node list to find
BTRFS_ADD_DELAYED_REF nodes, obviously it's a disaster in this case, and
waste much time.
To fix this issue, we introduce a new ref_add_list in struct btrfs_delayed_ref_head,
then in select_delayed_ref(), if this list is not empty, we can directly use
nodes in this list. With this patch, it just took about 10~15 seconds to
delte the same file. Now using perf top, it reports that:
PerfTop: 2734 irqs/sec kernel:99.5% exact: 0.0% [4000Hz cpu-clock], (all, 4 CPUs)
----------------------------------------------------------------------------------------
20.74% [kernel] [k] _raw_spin_unlock_irqrestore
16.33% [kernel] [k] __slab_alloc
5.41% [kernel] [k] lock_acquired
4.42% [kernel] [k] lock_acquire
4.05% [kernel] [k] lock_release
3.37% [kernel] [k] _raw_spin_unlock_irq
For normal files, this patch also gives help, at least we do not need to
iterate whole list to found BTRFS_ADD_DELAYED_REF nodes.
Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2016-10-26 18:07:33 +08:00
|
|
|
if (!list_empty(&head->ref_add_list))
|
|
|
|
return list_first_entry(&head->ref_add_list,
|
|
|
|
struct btrfs_delayed_ref_node, add_list);
|
|
|
|
|
2018-08-23 03:51:50 +08:00
|
|
|
ref = rb_entry(rb_first_cached(&head->ref_tree),
|
2017-10-20 02:16:00 +08:00
|
|
|
struct btrfs_delayed_ref_node, ref_node);
|
btrfs: improve delayed refs iterations
This issue was found when I tried to delete a heavily reflinked file,
when deleting such files, other transaction operation will not have a
chance to make progress, for example, start_transaction() will blocked
in wait_current_trans(root) for long time, sometimes it even triggers
soft lockups, and the time taken to delete such heavily reflinked file
is also very large, often hundreds of seconds. Using perf top, it reports
that:
PerfTop: 7416 irqs/sec kernel:99.8% exact: 0.0% [4000Hz cpu-clock], (all, 4 CPUs)
---------------------------------------------------------------------------------------
84.37% [btrfs] [k] __btrfs_run_delayed_refs.constprop.80
11.02% [kernel] [k] delay_tsc
0.79% [kernel] [k] _raw_spin_unlock_irq
0.78% [kernel] [k] _raw_spin_unlock_irqrestore
0.45% [kernel] [k] do_raw_spin_lock
0.18% [kernel] [k] __slab_alloc
It seems __btrfs_run_delayed_refs() took most cpu time, after some debug
work, I found it's select_delayed_ref() causing this issue, for a delayed
head, in our case, it'll be full of BTRFS_DROP_DELAYED_REF nodes, but
select_delayed_ref() will firstly try to iterate node list to find
BTRFS_ADD_DELAYED_REF nodes, obviously it's a disaster in this case, and
waste much time.
To fix this issue, we introduce a new ref_add_list in struct btrfs_delayed_ref_head,
then in select_delayed_ref(), if this list is not empty, we can directly use
nodes in this list. With this patch, it just took about 10~15 seconds to
delte the same file. Now using perf top, it reports that:
PerfTop: 2734 irqs/sec kernel:99.5% exact: 0.0% [4000Hz cpu-clock], (all, 4 CPUs)
----------------------------------------------------------------------------------------
20.74% [kernel] [k] _raw_spin_unlock_irqrestore
16.33% [kernel] [k] __slab_alloc
5.41% [kernel] [k] lock_acquired
4.42% [kernel] [k] lock_acquire
4.05% [kernel] [k] lock_release
3.37% [kernel] [k] _raw_spin_unlock_irq
For normal files, this patch also gives help, at least we do not need to
iterate whole list to found BTRFS_ADD_DELAYED_REF nodes.
Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2016-10-26 18:07:33 +08:00
|
|
|
ASSERT(list_empty(&ref->add_list));
|
|
|
|
return ref;
|
2009-03-13 22:10:06 +08:00
|
|
|
}
|
|
|
|
|
2017-09-30 03:43:52 +08:00
|
|
|
static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
|
|
|
|
struct btrfs_delayed_ref_head *head)
|
|
|
|
{
|
|
|
|
spin_lock(&delayed_refs->lock);
|
|
|
|
head->processing = 0;
|
|
|
|
delayed_refs->num_heads_ready++;
|
|
|
|
spin_unlock(&delayed_refs->lock);
|
|
|
|
btrfs_delayed_ref_unlock(head);
|
|
|
|
}
|
|
|
|
|
2018-12-03 23:20:31 +08:00
|
|
|
static struct btrfs_delayed_extent_op *cleanup_extent_op(
|
|
|
|
struct btrfs_delayed_ref_head *head)
|
2017-09-30 03:43:53 +08:00
|
|
|
{
|
|
|
|
struct btrfs_delayed_extent_op *extent_op = head->extent_op;
|
|
|
|
|
|
|
|
if (!extent_op)
|
2018-12-03 23:20:31 +08:00
|
|
|
return NULL;
|
|
|
|
|
2017-09-30 03:43:53 +08:00
|
|
|
if (head->must_insert_reserved) {
|
2018-12-03 23:20:31 +08:00
|
|
|
head->extent_op = NULL;
|
2017-09-30 03:43:53 +08:00
|
|
|
btrfs_free_delayed_extent_op(extent_op);
|
2018-12-03 23:20:31 +08:00
|
|
|
return NULL;
|
2017-09-30 03:43:53 +08:00
|
|
|
}
|
2018-12-03 23:20:31 +08:00
|
|
|
return extent_op;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_delayed_ref_head *head)
|
|
|
|
{
|
|
|
|
struct btrfs_delayed_extent_op *extent_op;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
extent_op = cleanup_extent_op(head);
|
|
|
|
if (!extent_op)
|
|
|
|
return 0;
|
|
|
|
head->extent_op = NULL;
|
2017-09-30 03:43:53 +08:00
|
|
|
spin_unlock(&head->lock);
|
2018-06-20 20:49:01 +08:00
|
|
|
ret = run_delayed_extent_op(trans, head, extent_op);
|
2017-09-30 03:43:53 +08:00
|
|
|
btrfs_free_delayed_extent_op(extent_op);
|
|
|
|
return ret ? ret : 1;
|
|
|
|
}
|
|
|
|
|
2018-11-22 03:05:41 +08:00
|
|
|
void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_delayed_ref_root *delayed_refs,
|
|
|
|
struct btrfs_delayed_ref_head *head)
|
2018-12-03 23:20:30 +08:00
|
|
|
{
|
btrfs: introduce delayed_refs_rsv
Traditionally we've had voodoo in btrfs to account for the space that
delayed refs may take up by having a global_block_rsv. This works most
of the time, except when it doesn't. We've had issues reported and seen
in production where sometimes the global reserve is exhausted during
transaction commit before we can run all of our delayed refs, resulting
in an aborted transaction. Because of this voodoo we have equally
dubious flushing semantics around throttling delayed refs which we often
get wrong.
So instead give them their own block_rsv. This way we can always know
exactly how much outstanding space we need for delayed refs. This
allows us to make sure we are constantly filling that reservation up
with space, and allows us to put more precise pressure on the enospc
system. Instead of doing math to see if its a good time to throttle,
the normal enospc code will be invoked if we have a lot of delayed refs
pending, and they will be run via the normal flushing mechanism.
For now the delayed_refs_rsv will hold the reservations for the delayed
refs, the block group updates, and deleting csums. We could have a
separate rsv for the block group updates, but the csum deletion stuff is
still handled via the delayed_refs so that will stay there.
Historical background:
The global reserve has grown to cover everything we don't reserve space
explicitly for, and we've grown a lot of weird ad-hoc heuristics to know
if we're running short on space and when it's time to force a commit. A
failure rate of 20-40 file systems when we run hundreds of thousands of
them isn't super high, but cleaning up this code will make things less
ugly and more predictible.
Thus the delayed refs rsv. We always know how many delayed refs we have
outstanding, and although running them generates more we can use the
global reserve for that spill over, which fits better into it's desired
use than a full blown reservation. This first approach is to simply
take how many times we're reserving space for and multiply that by 2 in
order to save enough space for the delayed refs that could be generated.
This is a niave approach and will probably evolve, but for now it works.
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com> # high-level review
[ added background notes from the cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-03 23:20:33 +08:00
|
|
|
int nr_items = 1; /* Dropping this ref head update. */
|
2018-12-03 23:20:30 +08:00
|
|
|
|
|
|
|
if (head->total_ref_mod < 0) {
|
|
|
|
struct btrfs_space_info *space_info;
|
|
|
|
u64 flags;
|
|
|
|
|
|
|
|
if (head->is_data)
|
|
|
|
flags = BTRFS_BLOCK_GROUP_DATA;
|
|
|
|
else if (head->is_system)
|
|
|
|
flags = BTRFS_BLOCK_GROUP_SYSTEM;
|
|
|
|
else
|
|
|
|
flags = BTRFS_BLOCK_GROUP_METADATA;
|
2019-06-19 04:09:19 +08:00
|
|
|
space_info = btrfs_find_space_info(fs_info, flags);
|
2018-12-03 23:20:30 +08:00
|
|
|
ASSERT(space_info);
|
|
|
|
percpu_counter_add_batch(&space_info->total_bytes_pinned,
|
|
|
|
-head->num_bytes,
|
|
|
|
BTRFS_TOTAL_BYTES_PINNED_BATCH);
|
|
|
|
|
btrfs: introduce delayed_refs_rsv
Traditionally we've had voodoo in btrfs to account for the space that
delayed refs may take up by having a global_block_rsv. This works most
of the time, except when it doesn't. We've had issues reported and seen
in production where sometimes the global reserve is exhausted during
transaction commit before we can run all of our delayed refs, resulting
in an aborted transaction. Because of this voodoo we have equally
dubious flushing semantics around throttling delayed refs which we often
get wrong.
So instead give them their own block_rsv. This way we can always know
exactly how much outstanding space we need for delayed refs. This
allows us to make sure we are constantly filling that reservation up
with space, and allows us to put more precise pressure on the enospc
system. Instead of doing math to see if its a good time to throttle,
the normal enospc code will be invoked if we have a lot of delayed refs
pending, and they will be run via the normal flushing mechanism.
For now the delayed_refs_rsv will hold the reservations for the delayed
refs, the block group updates, and deleting csums. We could have a
separate rsv for the block group updates, but the csum deletion stuff is
still handled via the delayed_refs so that will stay there.
Historical background:
The global reserve has grown to cover everything we don't reserve space
explicitly for, and we've grown a lot of weird ad-hoc heuristics to know
if we're running short on space and when it's time to force a commit. A
failure rate of 20-40 file systems when we run hundreds of thousands of
them isn't super high, but cleaning up this code will make things less
ugly and more predictible.
Thus the delayed refs rsv. We always know how many delayed refs we have
outstanding, and although running them generates more we can use the
global reserve for that spill over, which fits better into it's desired
use than a full blown reservation. This first approach is to simply
take how many times we're reserving space for and multiply that by 2 in
order to save enough space for the delayed refs that could be generated.
This is a niave approach and will probably evolve, but for now it works.
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com> # high-level review
[ added background notes from the cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-03 23:20:33 +08:00
|
|
|
/*
|
|
|
|
* We had csum deletions accounted for in our delayed refs rsv,
|
|
|
|
* we need to drop the csum leaves for this update from our
|
|
|
|
* delayed_refs_rsv.
|
|
|
|
*/
|
2018-12-03 23:20:30 +08:00
|
|
|
if (head->is_data) {
|
|
|
|
spin_lock(&delayed_refs->lock);
|
|
|
|
delayed_refs->pending_csums -= head->num_bytes;
|
|
|
|
spin_unlock(&delayed_refs->lock);
|
btrfs: introduce delayed_refs_rsv
Traditionally we've had voodoo in btrfs to account for the space that
delayed refs may take up by having a global_block_rsv. This works most
of the time, except when it doesn't. We've had issues reported and seen
in production where sometimes the global reserve is exhausted during
transaction commit before we can run all of our delayed refs, resulting
in an aborted transaction. Because of this voodoo we have equally
dubious flushing semantics around throttling delayed refs which we often
get wrong.
So instead give them their own block_rsv. This way we can always know
exactly how much outstanding space we need for delayed refs. This
allows us to make sure we are constantly filling that reservation up
with space, and allows us to put more precise pressure on the enospc
system. Instead of doing math to see if its a good time to throttle,
the normal enospc code will be invoked if we have a lot of delayed refs
pending, and they will be run via the normal flushing mechanism.
For now the delayed_refs_rsv will hold the reservations for the delayed
refs, the block group updates, and deleting csums. We could have a
separate rsv for the block group updates, but the csum deletion stuff is
still handled via the delayed_refs so that will stay there.
Historical background:
The global reserve has grown to cover everything we don't reserve space
explicitly for, and we've grown a lot of weird ad-hoc heuristics to know
if we're running short on space and when it's time to force a commit. A
failure rate of 20-40 file systems when we run hundreds of thousands of
them isn't super high, but cleaning up this code will make things less
ugly and more predictible.
Thus the delayed refs rsv. We always know how many delayed refs we have
outstanding, and although running them generates more we can use the
global reserve for that spill over, which fits better into it's desired
use than a full blown reservation. This first approach is to simply
take how many times we're reserving space for and multiply that by 2 in
order to save enough space for the delayed refs that could be generated.
This is a niave approach and will probably evolve, but for now it works.
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com> # high-level review
[ added background notes from the cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-03 23:20:33 +08:00
|
|
|
nr_items += btrfs_csum_bytes_to_leaves(fs_info,
|
|
|
|
head->num_bytes);
|
2018-12-03 23:20:30 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
btrfs: introduce delayed_refs_rsv
Traditionally we've had voodoo in btrfs to account for the space that
delayed refs may take up by having a global_block_rsv. This works most
of the time, except when it doesn't. We've had issues reported and seen
in production where sometimes the global reserve is exhausted during
transaction commit before we can run all of our delayed refs, resulting
in an aborted transaction. Because of this voodoo we have equally
dubious flushing semantics around throttling delayed refs which we often
get wrong.
So instead give them their own block_rsv. This way we can always know
exactly how much outstanding space we need for delayed refs. This
allows us to make sure we are constantly filling that reservation up
with space, and allows us to put more precise pressure on the enospc
system. Instead of doing math to see if its a good time to throttle,
the normal enospc code will be invoked if we have a lot of delayed refs
pending, and they will be run via the normal flushing mechanism.
For now the delayed_refs_rsv will hold the reservations for the delayed
refs, the block group updates, and deleting csums. We could have a
separate rsv for the block group updates, but the csum deletion stuff is
still handled via the delayed_refs so that will stay there.
Historical background:
The global reserve has grown to cover everything we don't reserve space
explicitly for, and we've grown a lot of weird ad-hoc heuristics to know
if we're running short on space and when it's time to force a commit. A
failure rate of 20-40 file systems when we run hundreds of thousands of
them isn't super high, but cleaning up this code will make things less
ugly and more predictible.
Thus the delayed refs rsv. We always know how many delayed refs we have
outstanding, and although running them generates more we can use the
global reserve for that spill over, which fits better into it's desired
use than a full blown reservation. This first approach is to simply
take how many times we're reserving space for and multiply that by 2 in
order to save enough space for the delayed refs that could be generated.
This is a niave approach and will probably evolve, but for now it works.
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com> # high-level review
[ added background notes from the cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-03 23:20:33 +08:00
|
|
|
btrfs_delayed_refs_rsv_release(fs_info, nr_items);
|
2018-12-03 23:20:30 +08:00
|
|
|
}
|
|
|
|
|
2017-09-30 03:43:54 +08:00
|
|
|
static int cleanup_ref_head(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_delayed_ref_head *head)
|
|
|
|
{
|
2018-06-20 20:49:03 +08:00
|
|
|
|
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2017-09-30 03:43:54 +08:00
|
|
|
struct btrfs_delayed_ref_root *delayed_refs;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
delayed_refs = &trans->transaction->delayed_refs;
|
|
|
|
|
2018-12-03 23:20:31 +08:00
|
|
|
ret = run_and_cleanup_extent_op(trans, head);
|
2017-09-30 03:43:54 +08:00
|
|
|
if (ret < 0) {
|
|
|
|
unselect_delayed_ref_head(delayed_refs, head);
|
|
|
|
btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
|
|
|
|
return ret;
|
|
|
|
} else if (ret) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Need to drop our head ref lock and re-acquire the delayed ref lock
|
|
|
|
* and then re-check to make sure nobody got added.
|
|
|
|
*/
|
|
|
|
spin_unlock(&head->lock);
|
|
|
|
spin_lock(&delayed_refs->lock);
|
|
|
|
spin_lock(&head->lock);
|
2018-08-23 03:51:50 +08:00
|
|
|
if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
|
2017-09-30 03:43:54 +08:00
|
|
|
spin_unlock(&head->lock);
|
|
|
|
spin_unlock(&delayed_refs->lock);
|
|
|
|
return 1;
|
|
|
|
}
|
2018-12-03 23:20:29 +08:00
|
|
|
btrfs_delete_ref_head(delayed_refs, head);
|
2017-09-30 03:43:56 +08:00
|
|
|
spin_unlock(&head->lock);
|
2018-04-11 16:21:18 +08:00
|
|
|
spin_unlock(&delayed_refs->lock);
|
2017-09-30 03:43:56 +08:00
|
|
|
|
|
|
|
if (head->must_insert_reserved) {
|
2017-09-30 03:43:57 +08:00
|
|
|
btrfs_pin_extent(fs_info, head->bytenr,
|
|
|
|
head->num_bytes, 1);
|
2017-09-30 03:43:56 +08:00
|
|
|
if (head->is_data) {
|
2017-09-30 03:43:57 +08:00
|
|
|
ret = btrfs_del_csums(trans, fs_info, head->bytenr,
|
|
|
|
head->num_bytes);
|
2017-09-30 03:43:56 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-11-22 03:05:41 +08:00
|
|
|
btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
|
2018-12-03 23:20:30 +08:00
|
|
|
|
|
|
|
trace_run_delayed_ref_head(fs_info, head, 0);
|
2017-09-30 03:43:56 +08:00
|
|
|
btrfs_delayed_ref_unlock(head);
|
2017-09-30 03:43:57 +08:00
|
|
|
btrfs_put_delayed_ref_head(head);
|
2017-09-30 03:43:54 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-08-15 15:39:54 +08:00
|
|
|
static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
|
|
|
|
struct btrfs_trans_handle *trans)
|
|
|
|
{
|
|
|
|
struct btrfs_delayed_ref_root *delayed_refs =
|
|
|
|
&trans->transaction->delayed_refs;
|
|
|
|
struct btrfs_delayed_ref_head *head = NULL;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
spin_lock(&delayed_refs->lock);
|
2018-10-11 13:40:33 +08:00
|
|
|
head = btrfs_select_ref_head(delayed_refs);
|
2018-08-15 15:39:54 +08:00
|
|
|
if (!head) {
|
|
|
|
spin_unlock(&delayed_refs->lock);
|
|
|
|
return head;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Grab the lock that says we are going to process all the refs for
|
|
|
|
* this head
|
|
|
|
*/
|
2018-10-11 13:40:34 +08:00
|
|
|
ret = btrfs_delayed_ref_lock(delayed_refs, head);
|
2018-08-15 15:39:54 +08:00
|
|
|
spin_unlock(&delayed_refs->lock);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We may have dropped the spin lock to get the head mutex lock, and
|
|
|
|
* that might have given someone else time to free the head. If that's
|
|
|
|
* true, it has been removed from our list and we can move on.
|
|
|
|
*/
|
|
|
|
if (ret == -EAGAIN)
|
|
|
|
head = ERR_PTR(-EAGAIN);
|
|
|
|
|
|
|
|
return head;
|
|
|
|
}
|
|
|
|
|
2018-08-15 15:39:55 +08:00
|
|
|
static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_delayed_ref_head *locked_ref,
|
|
|
|
unsigned long *run_refs)
|
|
|
|
{
|
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
|
|
|
struct btrfs_delayed_ref_root *delayed_refs;
|
|
|
|
struct btrfs_delayed_extent_op *extent_op;
|
|
|
|
struct btrfs_delayed_ref_node *ref;
|
|
|
|
int must_insert_reserved = 0;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
delayed_refs = &trans->transaction->delayed_refs;
|
|
|
|
|
2018-08-15 15:39:56 +08:00
|
|
|
lockdep_assert_held(&locked_ref->mutex);
|
|
|
|
lockdep_assert_held(&locked_ref->lock);
|
|
|
|
|
2018-08-15 15:39:55 +08:00
|
|
|
while ((ref = select_delayed_ref(locked_ref))) {
|
|
|
|
if (ref->seq &&
|
|
|
|
btrfs_check_delayed_seq(fs_info, ref->seq)) {
|
|
|
|
spin_unlock(&locked_ref->lock);
|
|
|
|
unselect_delayed_ref_head(delayed_refs, locked_ref);
|
|
|
|
return -EAGAIN;
|
|
|
|
}
|
|
|
|
|
|
|
|
(*run_refs)++;
|
|
|
|
ref->in_tree = 0;
|
|
|
|
rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
|
|
|
|
RB_CLEAR_NODE(&ref->ref_node);
|
|
|
|
if (!list_empty(&ref->add_list))
|
|
|
|
list_del(&ref->add_list);
|
|
|
|
/*
|
|
|
|
* When we play the delayed ref, also correct the ref_mod on
|
|
|
|
* head
|
|
|
|
*/
|
|
|
|
switch (ref->action) {
|
|
|
|
case BTRFS_ADD_DELAYED_REF:
|
|
|
|
case BTRFS_ADD_DELAYED_EXTENT:
|
|
|
|
locked_ref->ref_mod -= ref->ref_mod;
|
|
|
|
break;
|
|
|
|
case BTRFS_DROP_DELAYED_REF:
|
|
|
|
locked_ref->ref_mod += ref->ref_mod;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
WARN_ON(1);
|
|
|
|
}
|
|
|
|
atomic_dec(&delayed_refs->num_entries);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Record the must_insert_reserved flag before we drop the
|
|
|
|
* spin lock.
|
|
|
|
*/
|
|
|
|
must_insert_reserved = locked_ref->must_insert_reserved;
|
|
|
|
locked_ref->must_insert_reserved = 0;
|
|
|
|
|
|
|
|
extent_op = locked_ref->extent_op;
|
|
|
|
locked_ref->extent_op = NULL;
|
|
|
|
spin_unlock(&locked_ref->lock);
|
|
|
|
|
|
|
|
ret = run_one_delayed_ref(trans, ref, extent_op,
|
|
|
|
must_insert_reserved);
|
|
|
|
|
|
|
|
btrfs_free_delayed_extent_op(extent_op);
|
|
|
|
if (ret) {
|
|
|
|
unselect_delayed_ref_head(delayed_refs, locked_ref);
|
|
|
|
btrfs_put_delayed_ref(ref);
|
|
|
|
btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
|
|
|
|
ret);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
btrfs_put_delayed_ref(ref);
|
|
|
|
cond_resched();
|
|
|
|
|
|
|
|
spin_lock(&locked_ref->lock);
|
|
|
|
btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2012-03-12 23:03:00 +08:00
|
|
|
/*
|
|
|
|
* Returns 0 on success or if called with an already aborted transaction.
|
|
|
|
* Returns -ENOMEM or -EIO on failure and will abort the transaction.
|
|
|
|
*/
|
2014-01-23 22:21:38 +08:00
|
|
|
static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
|
|
|
|
unsigned long nr)
|
2009-03-13 22:10:06 +08:00
|
|
|
{
|
2018-03-15 22:00:27 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2009-03-13 22:10:06 +08:00
|
|
|
struct btrfs_delayed_ref_root *delayed_refs;
|
|
|
|
struct btrfs_delayed_ref_head *locked_ref = NULL;
|
2014-01-23 23:54:11 +08:00
|
|
|
ktime_t start = ktime_get();
|
2009-03-13 22:10:06 +08:00
|
|
|
int ret;
|
2014-01-23 22:21:38 +08:00
|
|
|
unsigned long count = 0;
|
2014-01-23 23:54:11 +08:00
|
|
|
unsigned long actual_count = 0;
|
2009-03-13 22:10:06 +08:00
|
|
|
|
|
|
|
delayed_refs = &trans->transaction->delayed_refs;
|
2018-08-15 15:39:56 +08:00
|
|
|
do {
|
2009-03-13 22:10:06 +08:00
|
|
|
if (!locked_ref) {
|
2018-08-15 15:39:54 +08:00
|
|
|
locked_ref = btrfs_obtain_ref_head(trans);
|
2018-08-15 15:39:56 +08:00
|
|
|
if (IS_ERR_OR_NULL(locked_ref)) {
|
|
|
|
if (PTR_ERR(locked_ref) == -EAGAIN) {
|
|
|
|
continue;
|
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
}
|
2009-03-13 22:10:06 +08:00
|
|
|
}
|
2018-08-15 15:39:56 +08:00
|
|
|
count++;
|
2009-03-13 22:10:06 +08:00
|
|
|
}
|
Btrfs: fix regression when running delayed references
In the kernel 4.2 merge window we had a refactoring/rework of the delayed
references implementation in order to fix certain problems with qgroups.
However that rework introduced one more regression that leads to the
following trace when running delayed references for metadata:
[35908.064664] kernel BUG at fs/btrfs/extent-tree.c:1832!
[35908.065201] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[35908.065201] Modules linked in: dm_flakey dm_mod btrfs crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse parport_pc psmouse i2
[35908.065201] CPU: 14 PID: 15014 Comm: kworker/u32:9 Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1
[35908.065201] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014
[35908.065201] Workqueue: btrfs-extent-refs btrfs_extent_refs_helper [btrfs]
[35908.065201] task: ffff880114b7d780 ti: ffff88010c4c8000 task.ti: ffff88010c4c8000
[35908.065201] RIP: 0010:[<ffffffffa04928b5>] [<ffffffffa04928b5>] insert_inline_extent_backref+0x52/0xb1 [btrfs]
[35908.065201] RSP: 0018:ffff88010c4cbb08 EFLAGS: 00010293
[35908.065201] RAX: 0000000000000000 RBX: ffff88008a661000 RCX: 0000000000000000
[35908.065201] RDX: ffffffffa04dd58f RSI: 0000000000000001 RDI: 0000000000000000
[35908.065201] RBP: ffff88010c4cbb40 R08: 0000000000001000 R09: ffff88010c4cb9f8
[35908.065201] R10: 0000000000000000 R11: 000000000000002c R12: 0000000000000000
[35908.065201] R13: ffff88020a74c578 R14: 0000000000000000 R15: 0000000000000000
[35908.065201] FS: 0000000000000000(0000) GS:ffff88023edc0000(0000) knlGS:0000000000000000
[35908.065201] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[35908.065201] CR2: 00000000015e8708 CR3: 0000000102185000 CR4: 00000000000006e0
[35908.065201] Stack:
[35908.065201] ffff88010c4cbb18 0000000000000f37 ffff88020a74c578 ffff88015a408000
[35908.065201] ffff880154a44000 0000000000000000 0000000000000005 ffff88010c4cbbd8
[35908.065201] ffffffffa0492b9a 0000000000000005 0000000000000000 0000000000000000
[35908.065201] Call Trace:
[35908.065201] [<ffffffffa0492b9a>] __btrfs_inc_extent_ref+0x8b/0x208 [btrfs]
[35908.065201] [<ffffffffa0497117>] ? __btrfs_run_delayed_refs+0x4d4/0xd33 [btrfs]
[35908.065201] [<ffffffffa049773d>] __btrfs_run_delayed_refs+0xafa/0xd33 [btrfs]
[35908.065201] [<ffffffffa04a976a>] ? join_transaction.isra.10+0x25/0x41f [btrfs]
[35908.065201] [<ffffffffa04a97ed>] ? join_transaction.isra.10+0xa8/0x41f [btrfs]
[35908.065201] [<ffffffffa049914d>] btrfs_run_delayed_refs+0x75/0x1dd [btrfs]
[35908.065201] [<ffffffffa04992f1>] delayed_ref_async_start+0x3c/0x7b [btrfs]
[35908.065201] [<ffffffffa04d4b4f>] normal_work_helper+0x14c/0x32a [btrfs]
[35908.065201] [<ffffffffa04d4e93>] btrfs_extent_refs_helper+0x12/0x14 [btrfs]
[35908.065201] [<ffffffff81063b23>] process_one_work+0x24a/0x4ac
[35908.065201] [<ffffffff81064285>] worker_thread+0x206/0x2c2
[35908.065201] [<ffffffff8106407f>] ? rescuer_thread+0x2cb/0x2cb
[35908.065201] [<ffffffff8106407f>] ? rescuer_thread+0x2cb/0x2cb
[35908.065201] [<ffffffff8106904d>] kthread+0xef/0xf7
[35908.065201] [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24
[35908.065201] [<ffffffff8147d10f>] ret_from_fork+0x3f/0x70
[35908.065201] [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24
[35908.065201] Code: 6a 01 41 56 41 54 ff 75 10 41 51 4d 89 c1 49 89 c8 48 8d 4d d0 e8 f6 f1 ff ff 48 83 c4 28 85 c0 75 2c 49 81 fc ff 00 00 00 77 02 <0f> 0b 4c 8b 45 30 8b 4d 28 45 31
[35908.065201] RIP [<ffffffffa04928b5>] insert_inline_extent_backref+0x52/0xb1 [btrfs]
[35908.065201] RSP <ffff88010c4cbb08>
[35908.310885] ---[ end trace fe4299baf0666457 ]---
This happens because the new delayed references code no longer merges
delayed references that have different sequence values. The following
steps are an example sequence leading to this issue:
1) Transaction N starts, fs_info->tree_mod_seq has value 0;
2) Extent buffer (btree node) A is allocated, delayed reference Ref1 for
bytenr A is created, with a value of 1 and a seq value of 0;
3) fs_info->tree_mod_seq is incremented to 1;
4) Extent buffer A is deleted through btrfs_del_items(), which calls
btrfs_del_leaf(), which in turn calls btrfs_free_tree_block(). The
later returns the metadata extent associated to extent buffer A to
the free space cache (the range is not pinned), because the extent
buffer was created in the current transaction (N) and writeback never
happened for the extent buffer (flag BTRFS_HEADER_FLAG_WRITTEN not set
in the extent buffer).
This creates the delayed reference Ref2 for bytenr A, with a value
of -1 and a seq value of 1;
5) Delayed reference Ref2 is not merged with Ref1 when we create it,
because they have different sequence numbers (decided at
add_delayed_ref_tail_merge());
6) fs_info->tree_mod_seq is incremented to 2;
7) Some task attempts to allocate a new extent buffer (done at
extent-tree.c:find_free_extent()), but due to heavy fragmentation
and running low on metadata space the clustered allocation fails
and we fall back to unclustered allocation, which finds the
extent at offset A, so a new extent buffer at offset A is allocated.
This creates delayed reference Ref3 for bytenr A, with a value of 1
and a seq value of 2;
8) Ref3 is not merged neither with Ref2 nor Ref1, again because they
all have different seq values;
9) We start running the delayed references (__btrfs_run_delayed_refs());
10) The delayed Ref1 is the first one being applied, which ends up
creating an inline extent backref in the extent tree;
10) Next the delayed reference Ref3 is selected for execution, and not
Ref2, because select_delayed_ref() always gives a preference for
positive references (that have an action of BTRFS_ADD_DELAYED_REF);
11) When running Ref3 we encounter alreay the inline extent backref
in the extent tree at insert_inline_extent_backref(), which makes
us hit the following BUG_ON:
BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
This is always true because owner corresponds to the level of the
extent buffer/btree node in the btree.
For the scenario described above we hit the BUG_ON because we never merge
references that have different seq values.
We used to do the merging before the 4.2 kernel, more specifically, before
the commmits:
c6fc24549960 ("btrfs: delayed-ref: Use list to replace the ref_root in ref_head.")
c43d160fcd5e ("btrfs: delayed-ref: Cleanup the unneeded functions.")
This issue became more exposed after the following change that was added
to 4.2 as well:
cffc3374e567 ("Btrfs: fix order by which delayed references are run")
Which in turn fixed another regression by the two commits previously
mentioned.
So fix this by bringing back the delayed reference merge code, with the
proper adaptations so that it operates against the new data structure
(linked list vs old red black tree implementation).
This issue was hit running fstest btrfs/063 in a loop. Several people have
reported this issue in the mailing list when running on kernels 4.2+.
Very special thanks to Stéphane Lesimple for helping debugging this issue
and testing this fix on his multi terabyte filesystem (which took more
than one day to balance alone, plus fsck, etc).
Fixes: c6fc24549960 ("btrfs: delayed-ref: Use list to replace the ref_root in ref_head.")
Reported-by: Peter Becker <floyd.net@gmail.com>
Reported-by: Stéphane Lesimple <stephane_btrfs@lesimple.fr>
Tested-by: Stéphane Lesimple <stephane_btrfs@lesimple.fr>
Reported-by: Malte Schröder <malte@tnxip.de>
Reported-by: Derek Dongray <derek@valedon.co.uk>
Reported-by: Erkki Seppala <flux-btrfs@inside.org>
Cc: stable@vger.kernel.org # 4.2+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
2015-10-22 16:47:34 +08:00
|
|
|
/*
|
|
|
|
* We need to try and merge add/drops of the same ref since we
|
|
|
|
* can run into issues with relocate dropping the implicit ref
|
|
|
|
* and then it being added back again before the drop can
|
|
|
|
* finish. If we merged anything we need to re-loop so we can
|
|
|
|
* get a good ref.
|
|
|
|
* Or we can get node references of the same type that weren't
|
|
|
|
* merged when created due to bumps in the tree mod seq, and
|
|
|
|
* we need to merge them to prevent adding an inline extent
|
|
|
|
* backref before dropping it (triggering a BUG_ON at
|
|
|
|
* insert_inline_extent_backref()).
|
|
|
|
*/
|
2014-01-23 22:21:38 +08:00
|
|
|
spin_lock(&locked_ref->lock);
|
2018-04-19 16:06:39 +08:00
|
|
|
btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
|
2012-08-08 04:00:32 +08:00
|
|
|
|
2018-08-15 15:39:56 +08:00
|
|
|
ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
|
|
|
|
&actual_count);
|
|
|
|
if (ret < 0 && ret != -EAGAIN) {
|
|
|
|
/*
|
|
|
|
* Error, btrfs_run_delayed_refs_for_head already
|
|
|
|
* unlocked everything so just bail out
|
|
|
|
*/
|
|
|
|
return ret;
|
|
|
|
} else if (!ret) {
|
|
|
|
/*
|
|
|
|
* Success, perform the usual cleanup of a processed
|
|
|
|
* head
|
|
|
|
*/
|
2018-06-20 20:49:03 +08:00
|
|
|
ret = cleanup_ref_head(trans, locked_ref);
|
2017-09-30 03:43:54 +08:00
|
|
|
if (ret > 0 ) {
|
2017-09-30 03:43:53 +08:00
|
|
|
/* We dropped our lock, we need to loop. */
|
|
|
|
ret = 0;
|
2014-01-23 22:21:38 +08:00
|
|
|
continue;
|
2017-09-30 03:43:54 +08:00
|
|
|
} else if (ret) {
|
|
|
|
return ret;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
}
|
2012-08-09 14:16:53 +08:00
|
|
|
}
|
2017-09-30 03:43:55 +08:00
|
|
|
|
2017-09-30 03:43:53 +08:00
|
|
|
/*
|
2018-08-15 15:39:56 +08:00
|
|
|
* Either success case or btrfs_run_delayed_refs_for_head
|
|
|
|
* returned -EAGAIN, meaning we need to select another head
|
2017-09-30 03:43:53 +08:00
|
|
|
*/
|
|
|
|
|
2018-08-15 15:39:56 +08:00
|
|
|
locked_ref = NULL;
|
2009-03-13 22:17:05 +08:00
|
|
|
cond_resched();
|
2018-08-15 15:39:56 +08:00
|
|
|
} while ((nr != -1 && count < nr) || locked_ref);
|
2014-01-23 23:54:11 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We don't want to include ref heads since we can have empty ref heads
|
|
|
|
* and those will drastically skew our runtime down since we just do
|
|
|
|
* accounting, no actual extent tree updates.
|
|
|
|
*/
|
|
|
|
if (actual_count > 0) {
|
|
|
|
u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
|
|
|
|
u64 avg;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We weigh the current average higher than our current runtime
|
|
|
|
* to avoid large swings in the average.
|
|
|
|
*/
|
|
|
|
spin_lock(&delayed_refs->lock);
|
|
|
|
avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
|
2015-01-17 00:21:12 +08:00
|
|
|
fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */
|
2014-01-23 23:54:11 +08:00
|
|
|
spin_unlock(&delayed_refs->lock);
|
|
|
|
}
|
2014-01-23 22:21:38 +08:00
|
|
|
return 0;
|
2009-03-13 22:17:05 +08:00
|
|
|
}
|
|
|
|
|
2011-09-12 18:22:57 +08:00
|
|
|
#ifdef SCRAMBLE_DELAYED_REFS
|
|
|
|
/*
|
|
|
|
* Normally delayed refs get processed in ascending bytenr order. This
|
|
|
|
* correlates in most cases to the order added. To expose dependencies on this
|
|
|
|
* order, we start to process the tree in the middle instead of the beginning
|
|
|
|
*/
|
|
|
|
static u64 find_middle(struct rb_root *root)
|
|
|
|
{
|
|
|
|
struct rb_node *n = root->rb_node;
|
|
|
|
struct btrfs_delayed_ref_node *entry;
|
|
|
|
int alt = 1;
|
|
|
|
u64 middle;
|
|
|
|
u64 first = 0, last = 0;
|
|
|
|
|
|
|
|
n = rb_first(root);
|
|
|
|
if (n) {
|
|
|
|
entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
|
|
|
|
first = entry->bytenr;
|
|
|
|
}
|
|
|
|
n = rb_last(root);
|
|
|
|
if (n) {
|
|
|
|
entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
|
|
|
|
last = entry->bytenr;
|
|
|
|
}
|
|
|
|
n = root->rb_node;
|
|
|
|
|
|
|
|
while (n) {
|
|
|
|
entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
|
|
|
|
WARN_ON(!entry->in_tree);
|
|
|
|
|
|
|
|
middle = entry->bytenr;
|
|
|
|
|
|
|
|
if (alt)
|
|
|
|
n = n->rb_left;
|
|
|
|
else
|
|
|
|
n = n->rb_right;
|
|
|
|
|
|
|
|
alt = 1 - alt;
|
|
|
|
}
|
|
|
|
return middle;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
|
2013-06-13 01:56:06 +08:00
|
|
|
{
|
|
|
|
u64 num_bytes;
|
|
|
|
|
|
|
|
num_bytes = heads * (sizeof(struct btrfs_extent_item) +
|
|
|
|
sizeof(struct btrfs_extent_inline_ref));
|
2016-06-23 06:54:23 +08:00
|
|
|
if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
|
2013-06-13 01:56:06 +08:00
|
|
|
num_bytes += heads * sizeof(struct btrfs_tree_block_info);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We don't ever fill up leaves all the way so multiply by 2 just to be
|
2016-05-20 09:18:45 +08:00
|
|
|
* closer to what we're really going to want to use.
|
2013-06-13 01:56:06 +08:00
|
|
|
*/
|
2016-06-23 06:54:23 +08:00
|
|
|
return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
|
2013-06-13 01:56:06 +08:00
|
|
|
}
|
|
|
|
|
2015-02-03 23:50:16 +08:00
|
|
|
/*
|
|
|
|
* Takes the number of bytes to be csumm'ed and figures out how many leaves it
|
|
|
|
* would require to store the csums for that many bytes.
|
|
|
|
*/
|
2016-06-23 06:54:24 +08:00
|
|
|
u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
|
2015-02-03 23:50:16 +08:00
|
|
|
{
|
|
|
|
u64 csum_size;
|
|
|
|
u64 num_csums_per_leaf;
|
|
|
|
u64 num_csums;
|
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
|
2015-02-03 23:50:16 +08:00
|
|
|
num_csums_per_leaf = div64_u64(csum_size,
|
2016-06-23 06:54:23 +08:00
|
|
|
(u64)btrfs_super_csum_size(fs_info->super_copy));
|
|
|
|
num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
|
2015-02-03 23:50:16 +08:00
|
|
|
num_csums += num_csums_per_leaf - 1;
|
|
|
|
num_csums = div64_u64(num_csums, num_csums_per_leaf);
|
|
|
|
return num_csums;
|
|
|
|
}
|
|
|
|
|
2009-03-13 22:17:05 +08:00
|
|
|
/*
|
|
|
|
* this starts processing the delayed reference count updates and
|
|
|
|
* extent insertions we have queued up so far. count can be
|
|
|
|
* 0, which means to process everything in the tree at the start
|
|
|
|
* of the run (but not newly added entries), or it can be some target
|
|
|
|
* number you'd like to process.
|
2012-03-12 23:03:00 +08:00
|
|
|
*
|
|
|
|
* Returns 0 on success or if called with an aborted transaction
|
|
|
|
* Returns <0 on error and aborts the transaction
|
2009-03-13 22:17:05 +08:00
|
|
|
*/
|
|
|
|
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
|
2018-03-15 23:27:37 +08:00
|
|
|
unsigned long count)
|
2009-03-13 22:17:05 +08:00
|
|
|
{
|
2018-03-15 23:27:37 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2009-03-13 22:17:05 +08:00
|
|
|
struct rb_node *node;
|
|
|
|
struct btrfs_delayed_ref_root *delayed_refs;
|
2013-10-14 12:59:45 +08:00
|
|
|
struct btrfs_delayed_ref_head *head;
|
2009-03-13 22:17:05 +08:00
|
|
|
int ret;
|
|
|
|
int run_all = count == (unsigned long)-1;
|
|
|
|
|
2012-03-12 23:03:00 +08:00
|
|
|
/* We'll clean this up in btrfs_cleanup_transaction */
|
|
|
|
if (trans->aborted)
|
|
|
|
return 0;
|
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
|
2015-12-30 23:52:35 +08:00
|
|
|
return 0;
|
|
|
|
|
2009-03-13 22:17:05 +08:00
|
|
|
delayed_refs = &trans->transaction->delayed_refs;
|
2014-12-17 16:14:09 +08:00
|
|
|
if (count == 0)
|
2014-01-23 22:21:38 +08:00
|
|
|
count = atomic_read(&delayed_refs->num_entries) * 2;
|
2013-01-30 07:44:12 +08:00
|
|
|
|
2009-03-13 22:17:05 +08:00
|
|
|
again:
|
2011-09-12 18:22:57 +08:00
|
|
|
#ifdef SCRAMBLE_DELAYED_REFS
|
|
|
|
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
|
|
|
|
#endif
|
2018-03-15 22:00:27 +08:00
|
|
|
ret = __btrfs_run_delayed_refs(trans, count);
|
2014-01-23 22:21:38 +08:00
|
|
|
if (ret < 0) {
|
2016-06-11 06:19:25 +08:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2014-01-23 22:21:38 +08:00
|
|
|
return ret;
|
2009-02-12 22:27:38 +08:00
|
|
|
}
|
2009-03-13 22:17:05 +08:00
|
|
|
|
2009-03-13 22:10:06 +08:00
|
|
|
if (run_all) {
|
2018-11-22 03:05:42 +08:00
|
|
|
btrfs_create_pending_block_groups(trans);
|
2012-09-12 04:57:25 +08:00
|
|
|
|
2014-01-23 22:21:38 +08:00
|
|
|
spin_lock(&delayed_refs->lock);
|
2018-08-23 03:51:49 +08:00
|
|
|
node = rb_first_cached(&delayed_refs->href_root);
|
2014-01-23 22:21:38 +08:00
|
|
|
if (!node) {
|
|
|
|
spin_unlock(&delayed_refs->lock);
|
2009-03-13 22:10:06 +08:00
|
|
|
goto out;
|
2014-01-23 22:21:38 +08:00
|
|
|
}
|
2017-09-30 03:43:57 +08:00
|
|
|
head = rb_entry(node, struct btrfs_delayed_ref_head,
|
|
|
|
href_node);
|
|
|
|
refcount_inc(&head->refs);
|
|
|
|
spin_unlock(&delayed_refs->lock);
|
2007-08-11 02:06:19 +08:00
|
|
|
|
2017-09-30 03:43:57 +08:00
|
|
|
/* Mutex was contended, block until it's released and retry. */
|
|
|
|
mutex_lock(&head->mutex);
|
|
|
|
mutex_unlock(&head->mutex);
|
2009-03-13 22:10:06 +08:00
|
|
|
|
2017-09-30 03:43:57 +08:00
|
|
|
btrfs_put_delayed_ref_head(head);
|
2014-01-23 22:21:38 +08:00
|
|
|
cond_resched();
|
2009-03-13 22:10:06 +08:00
|
|
|
goto again;
|
2007-10-16 04:14:19 +08:00
|
|
|
}
|
2007-06-23 02:16:25 +08:00
|
|
|
out:
|
2007-03-07 09:08:01 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
|
|
|
|
u64 bytenr, u64 num_bytes, u64 flags,
|
2013-05-10 01:49:30 +08:00
|
|
|
int level, int is_data)
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
{
|
|
|
|
struct btrfs_delayed_extent_op *extent_op;
|
|
|
|
int ret;
|
|
|
|
|
2012-11-21 10:21:28 +08:00
|
|
|
extent_op = btrfs_alloc_delayed_extent_op();
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (!extent_op)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
extent_op->flags_to_set = flags;
|
2015-11-30 23:51:29 +08:00
|
|
|
extent_op->update_flags = true;
|
|
|
|
extent_op->update_key = false;
|
|
|
|
extent_op->is_data = is_data ? true : false;
|
2013-05-10 01:49:30 +08:00
|
|
|
extent_op->level = level;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
2019-03-20 18:42:34 +08:00
|
|
|
ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (ret)
|
2012-11-21 10:21:28 +08:00
|
|
|
btrfs_free_delayed_extent_op(extent_op);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2017-01-31 04:25:28 +08:00
|
|
|
static noinline int check_delayed_ref(struct btrfs_root *root,
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
struct btrfs_path *path,
|
|
|
|
u64 objectid, u64 offset, u64 bytenr)
|
|
|
|
{
|
|
|
|
struct btrfs_delayed_ref_head *head;
|
|
|
|
struct btrfs_delayed_ref_node *ref;
|
|
|
|
struct btrfs_delayed_data_ref *data_ref;
|
|
|
|
struct btrfs_delayed_ref_root *delayed_refs;
|
2017-01-31 04:25:28 +08:00
|
|
|
struct btrfs_transaction *cur_trans;
|
2017-10-20 02:16:00 +08:00
|
|
|
struct rb_node *node;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
int ret = 0;
|
|
|
|
|
2018-04-29 15:59:42 +08:00
|
|
|
spin_lock(&root->fs_info->trans_lock);
|
2017-01-31 04:25:28 +08:00
|
|
|
cur_trans = root->fs_info->running_transaction;
|
2018-04-29 15:59:42 +08:00
|
|
|
if (cur_trans)
|
|
|
|
refcount_inc(&cur_trans->use_count);
|
|
|
|
spin_unlock(&root->fs_info->trans_lock);
|
2017-01-31 04:25:28 +08:00
|
|
|
if (!cur_trans)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
delayed_refs = &cur_trans->delayed_refs;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
spin_lock(&delayed_refs->lock);
|
2017-01-31 04:24:37 +08:00
|
|
|
head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
|
2014-01-23 22:21:38 +08:00
|
|
|
if (!head) {
|
|
|
|
spin_unlock(&delayed_refs->lock);
|
2018-04-29 15:59:42 +08:00
|
|
|
btrfs_put_transaction(cur_trans);
|
2014-01-23 22:21:38 +08:00
|
|
|
return 0;
|
|
|
|
}
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
|
|
|
if (!mutex_trylock(&head->mutex)) {
|
2017-09-30 03:43:57 +08:00
|
|
|
refcount_inc(&head->refs);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
spin_unlock(&delayed_refs->lock);
|
|
|
|
|
2011-04-21 07:20:15 +08:00
|
|
|
btrfs_release_path(path);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
2011-05-02 21:29:25 +08:00
|
|
|
/*
|
|
|
|
* Mutex was contended, block until it's released and let
|
|
|
|
* caller try again
|
|
|
|
*/
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
mutex_lock(&head->mutex);
|
|
|
|
mutex_unlock(&head->mutex);
|
2017-09-30 03:43:57 +08:00
|
|
|
btrfs_put_delayed_ref_head(head);
|
2018-04-29 15:59:42 +08:00
|
|
|
btrfs_put_transaction(cur_trans);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
return -EAGAIN;
|
|
|
|
}
|
2014-01-23 22:21:38 +08:00
|
|
|
spin_unlock(&delayed_refs->lock);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
2014-01-23 22:21:38 +08:00
|
|
|
spin_lock(&head->lock);
|
2017-10-20 02:16:00 +08:00
|
|
|
/*
|
|
|
|
* XXX: We should replace this with a proper search function in the
|
|
|
|
* future.
|
|
|
|
*/
|
2018-08-23 03:51:50 +08:00
|
|
|
for (node = rb_first_cached(&head->ref_tree); node;
|
|
|
|
node = rb_next(node)) {
|
2017-10-20 02:16:00 +08:00
|
|
|
ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
|
2014-01-23 22:21:38 +08:00
|
|
|
/* If it's a shared ref we know a cross reference exists */
|
|
|
|
if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
|
|
|
|
ret = 1;
|
|
|
|
break;
|
|
|
|
}
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
2014-01-23 22:21:38 +08:00
|
|
|
data_ref = btrfs_delayed_node_to_data_ref(ref);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
2014-01-23 22:21:38 +08:00
|
|
|
/*
|
|
|
|
* If our ref doesn't match the one we're currently looking at
|
|
|
|
* then we have a cross reference.
|
|
|
|
*/
|
|
|
|
if (data_ref->root != root->root_key.objectid ||
|
|
|
|
data_ref->objectid != objectid ||
|
|
|
|
data_ref->offset != offset) {
|
|
|
|
ret = 1;
|
|
|
|
break;
|
|
|
|
}
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
}
|
2014-01-23 22:21:38 +08:00
|
|
|
spin_unlock(&head->lock);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
mutex_unlock(&head->mutex);
|
2018-04-29 15:59:42 +08:00
|
|
|
btrfs_put_transaction(cur_trans);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2017-01-31 04:25:28 +08:00
|
|
|
static noinline int check_committed_ref(struct btrfs_root *root,
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
struct btrfs_path *path,
|
|
|
|
u64 objectid, u64 offset, u64 bytenr)
|
2007-12-18 09:14:01 +08:00
|
|
|
{
|
2016-06-23 06:54:23 +08:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
|
|
|
struct btrfs_root *extent_root = fs_info->extent_root;
|
2008-07-30 21:26:11 +08:00
|
|
|
struct extent_buffer *leaf;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
struct btrfs_extent_data_ref *ref;
|
|
|
|
struct btrfs_extent_inline_ref *iref;
|
|
|
|
struct btrfs_extent_item *ei;
|
2008-07-30 21:26:11 +08:00
|
|
|
struct btrfs_key key;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
u32 item_size;
|
2017-08-19 05:15:19 +08:00
|
|
|
int type;
|
2007-12-18 09:14:01 +08:00
|
|
|
int ret;
|
2008-06-26 04:01:30 +08:00
|
|
|
|
2007-12-18 09:14:01 +08:00
|
|
|
key.objectid = bytenr;
|
2008-09-24 01:14:14 +08:00
|
|
|
key.offset = (u64)-1;
|
2008-07-30 21:26:11 +08:00
|
|
|
key.type = BTRFS_EXTENT_ITEM_KEY;
|
2007-12-18 09:14:01 +08:00
|
|
|
|
|
|
|
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2012-03-12 23:03:00 +08:00
|
|
|
BUG_ON(ret == 0); /* Corruption */
|
2008-10-31 02:20:02 +08:00
|
|
|
|
|
|
|
ret = -ENOENT;
|
|
|
|
if (path->slots[0] == 0)
|
2008-09-24 01:14:14 +08:00
|
|
|
goto out;
|
2007-12-18 09:14:01 +08:00
|
|
|
|
2008-09-24 01:14:14 +08:00
|
|
|
path->slots[0]--;
|
2008-07-30 21:26:11 +08:00
|
|
|
leaf = path->nodes[0];
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
|
2007-12-18 09:14:01 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
|
2007-12-18 09:14:01 +08:00
|
|
|
goto out;
|
2008-07-30 21:26:11 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
ret = 1;
|
|
|
|
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
|
|
|
|
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
|
2008-01-04 02:23:19 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (item_size != sizeof(*ei) +
|
|
|
|
btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
|
|
|
|
goto out;
|
2007-12-18 09:14:01 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (btrfs_extent_generation(leaf, ei) <=
|
|
|
|
btrfs_root_last_snapshot(&root->root_item))
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
iref = (struct btrfs_extent_inline_ref *)(ei + 1);
|
2017-08-19 05:15:19 +08:00
|
|
|
|
|
|
|
type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
|
|
|
|
if (type != BTRFS_EXTENT_DATA_REF_KEY)
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
goto out;
|
|
|
|
|
|
|
|
ref = (struct btrfs_extent_data_ref *)(&iref->offset);
|
|
|
|
if (btrfs_extent_refs(leaf, ei) !=
|
|
|
|
btrfs_extent_data_ref_count(leaf, ref) ||
|
|
|
|
btrfs_extent_data_ref_root(leaf, ref) !=
|
|
|
|
root->root_key.objectid ||
|
|
|
|
btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
|
|
|
|
btrfs_extent_data_ref_offset(leaf, ref) != offset)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = 0;
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2017-01-31 04:25:28 +08:00
|
|
|
int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
|
|
|
|
u64 bytenr)
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
{
|
|
|
|
struct btrfs_path *path;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
2018-05-30 14:49:10 +08:00
|
|
|
return -ENOMEM;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
|
|
|
do {
|
2017-01-31 04:25:28 +08:00
|
|
|
ret = check_committed_ref(root, path, objectid,
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
offset, bytenr);
|
|
|
|
if (ret && ret != -ENOENT)
|
2008-07-30 21:26:11 +08:00
|
|
|
goto out;
|
2008-10-31 02:20:02 +08:00
|
|
|
|
2018-08-30 09:59:16 +08:00
|
|
|
ret = check_delayed_ref(root, path, objectid, offset, bytenr);
|
|
|
|
} while (ret == -EAGAIN);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
2007-12-18 09:14:01 +08:00
|
|
|
out:
|
2008-10-31 02:20:02 +08:00
|
|
|
btrfs_free_path(path);
|
2010-05-16 22:46:25 +08:00
|
|
|
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
|
|
|
|
WARN_ON(ret > 0);
|
2008-07-30 21:26:11 +08:00
|
|
|
return ret;
|
2007-12-18 09:14:01 +08:00
|
|
|
}
|
2007-04-10 21:27:04 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
|
2009-02-04 22:23:45 +08:00
|
|
|
struct btrfs_root *root,
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
struct extent_buffer *buf,
|
2014-07-03 01:54:25 +08:00
|
|
|
int full_backref, int inc)
|
2008-09-24 01:14:14 +08:00
|
|
|
{
|
2016-06-23 06:54:23 +08:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2008-09-24 01:14:14 +08:00
|
|
|
u64 bytenr;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
u64 num_bytes;
|
|
|
|
u64 parent;
|
2008-09-24 01:14:14 +08:00
|
|
|
u64 ref_root;
|
|
|
|
u32 nritems;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_file_extent_item *fi;
|
2019-04-04 14:45:35 +08:00
|
|
|
struct btrfs_ref generic_ref = { 0 };
|
|
|
|
bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
|
2008-09-24 01:14:14 +08:00
|
|
|
int i;
|
2019-04-04 14:45:35 +08:00
|
|
|
int action;
|
2008-09-24 01:14:14 +08:00
|
|
|
int level;
|
|
|
|
int ret = 0;
|
2014-09-30 05:53:21 +08:00
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
if (btrfs_is_testing(fs_info))
|
2014-05-08 05:06:09 +08:00
|
|
|
return 0;
|
2014-09-30 05:53:21 +08:00
|
|
|
|
2008-09-24 01:14:14 +08:00
|
|
|
ref_root = btrfs_header_owner(buf);
|
|
|
|
nritems = btrfs_header_nritems(buf);
|
|
|
|
level = btrfs_header_level(buf);
|
|
|
|
|
2014-04-02 19:51:05 +08:00
|
|
|
if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
return 0;
|
2008-09-24 01:14:14 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (full_backref)
|
|
|
|
parent = buf->start;
|
|
|
|
else
|
|
|
|
parent = 0;
|
2019-04-04 14:45:35 +08:00
|
|
|
if (inc)
|
|
|
|
action = BTRFS_ADD_DELAYED_REF;
|
|
|
|
else
|
|
|
|
action = BTRFS_DROP_DELAYED_REF;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
|
|
|
for (i = 0; i < nritems; i++) {
|
2008-09-24 01:14:14 +08:00
|
|
|
if (level == 0) {
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
btrfs_item_key_to_cpu(buf, &key, i);
|
2014-06-05 00:41:45 +08:00
|
|
|
if (key.type != BTRFS_EXTENT_DATA_KEY)
|
2008-09-24 01:14:14 +08:00
|
|
|
continue;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
fi = btrfs_item_ptr(buf, i,
|
2008-09-24 01:14:14 +08:00
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
if (btrfs_file_extent_type(buf, fi) ==
|
|
|
|
BTRFS_FILE_EXTENT_INLINE)
|
|
|
|
continue;
|
|
|
|
bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
|
|
|
|
if (bytenr == 0)
|
|
|
|
continue;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
|
|
|
num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
|
|
|
|
key.offset -= btrfs_file_extent_offset(buf, fi);
|
2019-04-04 14:45:35 +08:00
|
|
|
btrfs_init_generic_ref(&generic_ref, action, bytenr,
|
|
|
|
num_bytes, parent);
|
|
|
|
generic_ref.real_root = root->root_key.objectid;
|
|
|
|
btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
|
|
|
|
key.offset);
|
|
|
|
generic_ref.skip_qgroup = for_reloc;
|
2019-04-04 14:45:30 +08:00
|
|
|
if (inc)
|
2019-04-04 14:45:35 +08:00
|
|
|
ret = btrfs_inc_extent_ref(trans, &generic_ref);
|
2019-04-04 14:45:30 +08:00
|
|
|
else
|
2019-04-04 14:45:36 +08:00
|
|
|
ret = btrfs_free_extent(trans, &generic_ref);
|
2008-09-24 01:14:14 +08:00
|
|
|
if (ret)
|
|
|
|
goto fail;
|
|
|
|
} else {
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
bytenr = btrfs_node_blockptr(buf, i);
|
2016-06-23 06:54:23 +08:00
|
|
|
num_bytes = fs_info->nodesize;
|
2019-04-04 14:45:35 +08:00
|
|
|
btrfs_init_generic_ref(&generic_ref, action, bytenr,
|
|
|
|
num_bytes, parent);
|
|
|
|
generic_ref.real_root = root->root_key.objectid;
|
|
|
|
btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
|
|
|
|
generic_ref.skip_qgroup = for_reloc;
|
2019-04-04 14:45:30 +08:00
|
|
|
if (inc)
|
2019-04-04 14:45:35 +08:00
|
|
|
ret = btrfs_inc_extent_ref(trans, &generic_ref);
|
2019-04-04 14:45:30 +08:00
|
|
|
else
|
2019-04-04 14:45:36 +08:00
|
|
|
ret = btrfs_free_extent(trans, &generic_ref);
|
2008-09-24 01:14:14 +08:00
|
|
|
if (ret)
|
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
fail:
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
|
2014-07-03 01:54:25 +08:00
|
|
|
struct extent_buffer *buf, int full_backref)
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
{
|
2014-07-03 01:54:25 +08:00
|
|
|
return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
|
2014-07-03 01:54:25 +08:00
|
|
|
struct extent_buffer *buf, int full_backref)
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
{
|
2014-07-03 01:54:25 +08:00
|
|
|
return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
|
2008-09-24 01:14:14 +08:00
|
|
|
}
|
|
|
|
|
2007-04-27 04:46:15 +08:00
|
|
|
static int write_one_cache_group(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_path *path,
|
|
|
|
struct btrfs_block_group_cache *cache)
|
|
|
|
{
|
2019-03-20 18:57:46 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2007-04-27 04:46:15 +08:00
|
|
|
int ret;
|
2016-06-23 06:54:23 +08:00
|
|
|
struct btrfs_root *extent_root = fs_info->extent_root;
|
2007-10-16 04:14:19 +08:00
|
|
|
unsigned long bi;
|
|
|
|
struct extent_buffer *leaf;
|
2007-04-27 04:46:15 +08:00
|
|
|
|
|
|
|
ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
|
2014-12-13 05:02:20 +08:00
|
|
|
if (ret) {
|
|
|
|
if (ret > 0)
|
|
|
|
ret = -ENOENT;
|
2007-06-23 02:16:25 +08:00
|
|
|
goto fail;
|
2014-12-13 05:02:20 +08:00
|
|
|
}
|
2007-10-16 04:14:19 +08:00
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
|
|
|
|
write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
|
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
2007-06-23 02:16:25 +08:00
|
|
|
fail:
|
2015-04-26 01:31:05 +08:00
|
|
|
btrfs_release_path(path);
|
2014-12-13 05:02:20 +08:00
|
|
|
return ret;
|
2007-04-27 04:46:15 +08:00
|
|
|
|
|
|
|
}
|
|
|
|
|
2019-03-20 19:01:07 +08:00
|
|
|
static struct btrfs_block_group_cache *next_block_group(
|
|
|
|
struct btrfs_block_group_cache *cache)
|
2009-07-22 22:07:05 +08:00
|
|
|
{
|
2019-03-20 19:01:07 +08:00
|
|
|
struct btrfs_fs_info *fs_info = cache->fs_info;
|
2009-07-22 22:07:05 +08:00
|
|
|
struct rb_node *node;
|
2014-11-26 23:28:50 +08:00
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
spin_lock(&fs_info->block_group_cache_lock);
|
2014-11-26 23:28:50 +08:00
|
|
|
|
|
|
|
/* If our block group was removed, we need a full search. */
|
|
|
|
if (RB_EMPTY_NODE(&cache->cache_node)) {
|
|
|
|
const u64 next_bytenr = cache->key.objectid + cache->key.offset;
|
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
spin_unlock(&fs_info->block_group_cache_lock);
|
2014-11-26 23:28:50 +08:00
|
|
|
btrfs_put_block_group(cache);
|
2016-06-23 06:54:23 +08:00
|
|
|
cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
|
2014-11-26 23:28:50 +08:00
|
|
|
}
|
2009-07-22 22:07:05 +08:00
|
|
|
node = rb_next(&cache->cache_node);
|
|
|
|
btrfs_put_block_group(cache);
|
|
|
|
if (node) {
|
|
|
|
cache = rb_entry(node, struct btrfs_block_group_cache,
|
|
|
|
cache_node);
|
2009-11-14 04:12:59 +08:00
|
|
|
btrfs_get_block_group(cache);
|
2009-07-22 22:07:05 +08:00
|
|
|
} else
|
|
|
|
cache = NULL;
|
2016-06-23 06:54:23 +08:00
|
|
|
spin_unlock(&fs_info->block_group_cache_lock);
|
2009-07-22 22:07:05 +08:00
|
|
|
return cache;
|
|
|
|
}
|
|
|
|
|
2010-06-22 02:48:16 +08:00
|
|
|
static int cache_save_setup(struct btrfs_block_group_cache *block_group,
|
|
|
|
struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_path *path)
|
|
|
|
{
|
2016-06-23 06:54:23 +08:00
|
|
|
struct btrfs_fs_info *fs_info = block_group->fs_info;
|
|
|
|
struct btrfs_root *root = fs_info->tree_root;
|
2010-06-22 02:48:16 +08:00
|
|
|
struct inode *inode = NULL;
|
2017-02-27 15:10:38 +08:00
|
|
|
struct extent_changeset *data_reserved = NULL;
|
2010-06-22 02:48:16 +08:00
|
|
|
u64 alloc_hint = 0;
|
2010-12-04 02:17:53 +08:00
|
|
|
int dcs = BTRFS_DC_ERROR;
|
2015-01-17 00:21:12 +08:00
|
|
|
u64 num_pages = 0;
|
2010-06-22 02:48:16 +08:00
|
|
|
int retries = 0;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If this block group is smaller than 100 megs don't bother caching the
|
|
|
|
* block group.
|
|
|
|
*/
|
2015-12-15 00:42:10 +08:00
|
|
|
if (block_group->key.offset < (100 * SZ_1M)) {
|
2010-06-22 02:48:16 +08:00
|
|
|
spin_lock(&block_group->lock);
|
|
|
|
block_group->disk_cache_state = BTRFS_DC_WRITTEN;
|
|
|
|
spin_unlock(&block_group->lock);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-02-12 22:43:51 +08:00
|
|
|
if (trans->aborted)
|
|
|
|
return 0;
|
2010-06-22 02:48:16 +08:00
|
|
|
again:
|
2019-03-20 20:40:19 +08:00
|
|
|
inode = lookup_free_space_inode(block_group, path);
|
2010-06-22 02:48:16 +08:00
|
|
|
if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
|
|
|
|
ret = PTR_ERR(inode);
|
2011-04-21 07:20:15 +08:00
|
|
|
btrfs_release_path(path);
|
2010-06-22 02:48:16 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (IS_ERR(inode)) {
|
|
|
|
BUG_ON(retries);
|
|
|
|
retries++;
|
|
|
|
|
|
|
|
if (block_group->ro)
|
|
|
|
goto out_free;
|
|
|
|
|
2019-03-20 20:42:57 +08:00
|
|
|
ret = create_free_space_inode(trans, block_group, path);
|
2010-06-22 02:48:16 +08:00
|
|
|
if (ret)
|
|
|
|
goto out_free;
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We want to set the generation to 0, that way if anything goes wrong
|
|
|
|
* from here on out we know not to trust this cache when we load up next
|
|
|
|
* time.
|
|
|
|
*/
|
|
|
|
BTRFS_I(inode)->generation = 0;
|
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
2015-02-12 22:43:51 +08:00
|
|
|
if (ret) {
|
|
|
|
/*
|
|
|
|
* So theoretically we could recover from this, simply set the
|
|
|
|
* super cache generation to 0 so we know to invalidate the
|
|
|
|
* cache, but then we'd have to keep track of the block groups
|
|
|
|
* that fail this way so we know we _have_ to reset this cache
|
|
|
|
* before the next commit or risk reading stale cache. So to
|
|
|
|
* limit our exposure to horrible edge cases lets just abort the
|
|
|
|
* transaction, this only happens in really bad situations
|
|
|
|
* anyway.
|
|
|
|
*/
|
2016-06-11 06:19:25 +08:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2015-02-12 22:43:51 +08:00
|
|
|
goto out_put;
|
|
|
|
}
|
2010-06-22 02:48:16 +08:00
|
|
|
WARN_ON(ret);
|
|
|
|
|
2017-11-18 03:50:46 +08:00
|
|
|
/* We've already setup this transaction, go ahead and exit */
|
|
|
|
if (block_group->cache_generation == trans->transid &&
|
|
|
|
i_size_read(inode)) {
|
|
|
|
dcs = BTRFS_DC_SETUP;
|
|
|
|
goto out_put;
|
|
|
|
}
|
|
|
|
|
2010-06-22 02:48:16 +08:00
|
|
|
if (i_size_read(inode) > 0) {
|
2016-06-23 06:54:24 +08:00
|
|
|
ret = btrfs_check_trunc_cache_free_space(fs_info,
|
2016-06-23 06:54:23 +08:00
|
|
|
&fs_info->global_block_rsv);
|
2013-05-13 21:55:09 +08:00
|
|
|
if (ret)
|
|
|
|
goto out_put;
|
|
|
|
|
2017-02-16 05:28:30 +08:00
|
|
|
ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
|
2010-06-22 02:48:16 +08:00
|
|
|
if (ret)
|
|
|
|
goto out_put;
|
|
|
|
}
|
|
|
|
|
|
|
|
spin_lock(&block_group->lock);
|
Btrfs: fix a bug of writting free space cache during balance
Here is the whole story:
1)
A free space cache consists of two parts:
o free space cache inode, which is special becase it's stored in root tree.
o free space info, which is stored as the above inode's file data.
But we only build up another new inode and does not flush its free space info
onto disk when we _clear and setup_ free space cache, and this ends up with
that the block group cache's cache_state remains DC_SETUP instead of DC_WRITTEN.
And holding DC_SETUP means that we will not truncate this free space cache inode,
which means the disk offset of its file extent will remain _unchanged_ at least
until next transaction finishes committing itself.
2)
We can set a block group readonly when we relocate the block group.
However,
if the readonly block group covers the disk offset where our free space cache
inode is going to write, it will force the free space cache inode into
cow_file_range() and it'll end up hitting a BUG_ON.
3)
Due to the above analysis, we fix this bug by adding the missing dirty flag.
4)
However, it's not over, there is still another case, nospace_cache.
With nospace_cache, we do not want to set dirty flag, instead we just truncate
free space cache inode and bail out with setting cache state DC_WRITTEN.
We can benifit from it since it saves us another 'pre-allocation' part which
usually costs a lot.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-07-06 17:31:34 +08:00
|
|
|
if (block_group->cached != BTRFS_CACHE_FINISHED ||
|
2016-06-23 06:54:23 +08:00
|
|
|
!btrfs_test_opt(fs_info, SPACE_CACHE)) {
|
Btrfs: fix a bug of writting free space cache during balance
Here is the whole story:
1)
A free space cache consists of two parts:
o free space cache inode, which is special becase it's stored in root tree.
o free space info, which is stored as the above inode's file data.
But we only build up another new inode and does not flush its free space info
onto disk when we _clear and setup_ free space cache, and this ends up with
that the block group cache's cache_state remains DC_SETUP instead of DC_WRITTEN.
And holding DC_SETUP means that we will not truncate this free space cache inode,
which means the disk offset of its file extent will remain _unchanged_ at least
until next transaction finishes committing itself.
2)
We can set a block group readonly when we relocate the block group.
However,
if the readonly block group covers the disk offset where our free space cache
inode is going to write, it will force the free space cache inode into
cow_file_range() and it'll end up hitting a BUG_ON.
3)
Due to the above analysis, we fix this bug by adding the missing dirty flag.
4)
However, it's not over, there is still another case, nospace_cache.
With nospace_cache, we do not want to set dirty flag, instead we just truncate
free space cache inode and bail out with setting cache state DC_WRITTEN.
We can benifit from it since it saves us another 'pre-allocation' part which
usually costs a lot.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-07-06 17:31:34 +08:00
|
|
|
/*
|
|
|
|
* don't bother trying to write stuff out _if_
|
|
|
|
* a) we're not cached,
|
2017-03-07 05:49:02 +08:00
|
|
|
* b) we're with nospace_cache mount option,
|
|
|
|
* c) we're with v2 space_cache (FREE_SPACE_TREE).
|
Btrfs: fix a bug of writting free space cache during balance
Here is the whole story:
1)
A free space cache consists of two parts:
o free space cache inode, which is special becase it's stored in root tree.
o free space info, which is stored as the above inode's file data.
But we only build up another new inode and does not flush its free space info
onto disk when we _clear and setup_ free space cache, and this ends up with
that the block group cache's cache_state remains DC_SETUP instead of DC_WRITTEN.
And holding DC_SETUP means that we will not truncate this free space cache inode,
which means the disk offset of its file extent will remain _unchanged_ at least
until next transaction finishes committing itself.
2)
We can set a block group readonly when we relocate the block group.
However,
if the readonly block group covers the disk offset where our free space cache
inode is going to write, it will force the free space cache inode into
cow_file_range() and it'll end up hitting a BUG_ON.
3)
Due to the above analysis, we fix this bug by adding the missing dirty flag.
4)
However, it's not over, there is still another case, nospace_cache.
With nospace_cache, we do not want to set dirty flag, instead we just truncate
free space cache inode and bail out with setting cache state DC_WRITTEN.
We can benifit from it since it saves us another 'pre-allocation' part which
usually costs a lot.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-07-06 17:31:34 +08:00
|
|
|
*/
|
2010-12-04 02:17:53 +08:00
|
|
|
dcs = BTRFS_DC_WRITTEN;
|
2010-06-22 02:48:16 +08:00
|
|
|
spin_unlock(&block_group->lock);
|
|
|
|
goto out_put;
|
|
|
|
}
|
|
|
|
spin_unlock(&block_group->lock);
|
|
|
|
|
2015-10-02 00:55:18 +08:00
|
|
|
/*
|
|
|
|
* We hit an ENOSPC when setting up the cache in this transaction, just
|
|
|
|
* skip doing the setup, we've already cleared the cache so we're safe.
|
|
|
|
*/
|
|
|
|
if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
|
|
|
|
ret = -ENOSPC;
|
|
|
|
goto out_put;
|
|
|
|
}
|
|
|
|
|
2012-08-07 03:46:38 +08:00
|
|
|
/*
|
|
|
|
* Try to preallocate enough space based on how big the block group is.
|
|
|
|
* Keep in mind this has to include any pinned space which could end up
|
|
|
|
* taking up quite a bit since it's not folded into the other space
|
|
|
|
* cache.
|
|
|
|
*/
|
2015-12-15 00:42:10 +08:00
|
|
|
num_pages = div_u64(block_group->key.offset, SZ_256M);
|
2010-06-22 02:48:16 +08:00
|
|
|
if (!num_pages)
|
|
|
|
num_pages = 1;
|
|
|
|
|
|
|
|
num_pages *= 16;
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
num_pages *= PAGE_SIZE;
|
2010-06-22 02:48:16 +08:00
|
|
|
|
2017-02-27 15:10:38 +08:00
|
|
|
ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
|
2010-06-22 02:48:16 +08:00
|
|
|
if (ret)
|
|
|
|
goto out_put;
|
|
|
|
|
|
|
|
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
|
|
|
|
num_pages, num_pages,
|
|
|
|
&alloc_hint);
|
2015-10-02 00:55:18 +08:00
|
|
|
/*
|
|
|
|
* Our cache requires contiguous chunks so that we don't modify a bunch
|
|
|
|
* of metadata or split extents when writing the cache out, which means
|
|
|
|
* we can enospc if we are heavily fragmented in addition to just normal
|
|
|
|
* out of space conditions. So if we hit this just skip setting up any
|
|
|
|
* other block groups for this transaction, maybe we'll unpin enough
|
|
|
|
* space the next time around.
|
|
|
|
*/
|
2010-12-04 02:17:53 +08:00
|
|
|
if (!ret)
|
|
|
|
dcs = BTRFS_DC_SETUP;
|
2015-10-02 00:55:18 +08:00
|
|
|
else if (ret == -ENOSPC)
|
|
|
|
set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
|
2011-08-30 22:19:10 +08:00
|
|
|
|
2010-06-22 02:48:16 +08:00
|
|
|
out_put:
|
|
|
|
iput(inode);
|
|
|
|
out_free:
|
2011-04-21 07:20:15 +08:00
|
|
|
btrfs_release_path(path);
|
2010-06-22 02:48:16 +08:00
|
|
|
out:
|
|
|
|
spin_lock(&block_group->lock);
|
2011-12-14 05:04:54 +08:00
|
|
|
if (!ret && dcs == BTRFS_DC_SETUP)
|
2011-10-06 20:58:24 +08:00
|
|
|
block_group->cache_generation = trans->transid;
|
2010-12-04 02:17:53 +08:00
|
|
|
block_group->disk_cache_state = dcs;
|
2010-06-22 02:48:16 +08:00
|
|
|
spin_unlock(&block_group->lock);
|
|
|
|
|
2017-02-27 15:10:38 +08:00
|
|
|
extent_changeset_free(data_reserved);
|
2010-06-22 02:48:16 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2019-03-20 19:02:55 +08:00
|
|
|
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
|
2015-03-03 05:37:31 +08:00
|
|
|
{
|
2019-03-20 19:02:55 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2015-03-03 05:37:31 +08:00
|
|
|
struct btrfs_block_group_cache *cache, *tmp;
|
|
|
|
struct btrfs_transaction *cur_trans = trans->transaction;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
|
|
|
|
if (list_empty(&cur_trans->dirty_bgs) ||
|
2016-06-23 06:54:23 +08:00
|
|
|
!btrfs_test_opt(fs_info, SPACE_CACHE))
|
2015-03-03 05:37:31 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
/* Could add new block groups, use _safe just in case */
|
|
|
|
list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
|
|
|
|
dirty_list) {
|
|
|
|
if (cache->disk_cache_state == BTRFS_DC_CLEAR)
|
|
|
|
cache_save_setup(cache, trans, path);
|
|
|
|
}
|
|
|
|
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-04-07 03:46:08 +08:00
|
|
|
/*
|
|
|
|
* transaction commit does final block group cache writeback during a
|
|
|
|
* critical section where nothing is allowed to change the FS. This is
|
|
|
|
* required in order for the cache to actually match the block group,
|
|
|
|
* but can introduce a lot of latency into the commit.
|
|
|
|
*
|
|
|
|
* So, btrfs_start_dirty_block_groups is here to kick off block group
|
|
|
|
* cache IO. There's a chance we'll have to redo some of it if the
|
|
|
|
* block group changes again during the commit, but it greatly reduces
|
|
|
|
* the commit latency by getting rid of the easy block groups while
|
|
|
|
* we're still allowing others to join the commit.
|
|
|
|
*/
|
2018-02-07 23:55:41 +08:00
|
|
|
int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
|
2007-04-27 04:46:15 +08:00
|
|
|
{
|
2018-02-07 23:55:41 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2009-07-22 22:07:05 +08:00
|
|
|
struct btrfs_block_group_cache *cache;
|
2014-11-18 04:45:48 +08:00
|
|
|
struct btrfs_transaction *cur_trans = trans->transaction;
|
|
|
|
int ret = 0;
|
2015-04-05 08:14:42 +08:00
|
|
|
int should_put;
|
2015-04-07 03:46:08 +08:00
|
|
|
struct btrfs_path *path = NULL;
|
|
|
|
LIST_HEAD(dirty);
|
|
|
|
struct list_head *io = &cur_trans->io_bgs;
|
2015-04-05 08:14:42 +08:00
|
|
|
int num_started = 0;
|
2015-04-07 03:46:08 +08:00
|
|
|
int loops = 0;
|
|
|
|
|
|
|
|
spin_lock(&cur_trans->dirty_bgs_lock);
|
2015-04-26 01:29:16 +08:00
|
|
|
if (list_empty(&cur_trans->dirty_bgs)) {
|
|
|
|
spin_unlock(&cur_trans->dirty_bgs_lock);
|
|
|
|
return 0;
|
2015-04-07 03:46:08 +08:00
|
|
|
}
|
2015-04-26 01:29:16 +08:00
|
|
|
list_splice_init(&cur_trans->dirty_bgs, &dirty);
|
2015-04-07 03:46:08 +08:00
|
|
|
spin_unlock(&cur_trans->dirty_bgs_lock);
|
2014-11-18 04:45:48 +08:00
|
|
|
|
2015-04-07 03:46:08 +08:00
|
|
|
again:
|
|
|
|
/*
|
|
|
|
* make sure all the block groups on our dirty list actually
|
|
|
|
* exist
|
|
|
|
*/
|
2018-02-07 23:55:40 +08:00
|
|
|
btrfs_create_pending_block_groups(trans);
|
2015-04-07 03:46:08 +08:00
|
|
|
|
|
|
|
if (!path) {
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
2015-04-26 01:29:16 +08:00
|
|
|
/*
|
|
|
|
* cache_write_mutex is here only to save us from balance or automatic
|
|
|
|
* removal of empty block groups deleting this block group while we are
|
|
|
|
* writing out the cache
|
|
|
|
*/
|
|
|
|
mutex_lock(&trans->transaction->cache_write_mutex);
|
2015-04-07 03:46:08 +08:00
|
|
|
while (!list_empty(&dirty)) {
|
btrfs: introduce delayed_refs_rsv
Traditionally we've had voodoo in btrfs to account for the space that
delayed refs may take up by having a global_block_rsv. This works most
of the time, except when it doesn't. We've had issues reported and seen
in production where sometimes the global reserve is exhausted during
transaction commit before we can run all of our delayed refs, resulting
in an aborted transaction. Because of this voodoo we have equally
dubious flushing semantics around throttling delayed refs which we often
get wrong.
So instead give them their own block_rsv. This way we can always know
exactly how much outstanding space we need for delayed refs. This
allows us to make sure we are constantly filling that reservation up
with space, and allows us to put more precise pressure on the enospc
system. Instead of doing math to see if its a good time to throttle,
the normal enospc code will be invoked if we have a lot of delayed refs
pending, and they will be run via the normal flushing mechanism.
For now the delayed_refs_rsv will hold the reservations for the delayed
refs, the block group updates, and deleting csums. We could have a
separate rsv for the block group updates, but the csum deletion stuff is
still handled via the delayed_refs so that will stay there.
Historical background:
The global reserve has grown to cover everything we don't reserve space
explicitly for, and we've grown a lot of weird ad-hoc heuristics to know
if we're running short on space and when it's time to force a commit. A
failure rate of 20-40 file systems when we run hundreds of thousands of
them isn't super high, but cleaning up this code will make things less
ugly and more predictible.
Thus the delayed refs rsv. We always know how many delayed refs we have
outstanding, and although running them generates more we can use the
global reserve for that spill over, which fits better into it's desired
use than a full blown reservation. This first approach is to simply
take how many times we're reserving space for and multiply that by 2 in
order to save enough space for the delayed refs that could be generated.
This is a niave approach and will probably evolve, but for now it works.
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com> # high-level review
[ added background notes from the cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-03 23:20:33 +08:00
|
|
|
bool drop_reserve = true;
|
|
|
|
|
2015-04-07 03:46:08 +08:00
|
|
|
cache = list_first_entry(&dirty,
|
|
|
|
struct btrfs_block_group_cache,
|
|
|
|
dirty_list);
|
|
|
|
/*
|
|
|
|
* this can happen if something re-dirties a block
|
|
|
|
* group that is already under IO. Just wait for it to
|
|
|
|
* finish and then do it all again
|
|
|
|
*/
|
|
|
|
if (!list_empty(&cache->io_list)) {
|
|
|
|
list_del_init(&cache->io_list);
|
2016-09-10 00:09:35 +08:00
|
|
|
btrfs_wait_cache_io(trans, cache, path);
|
2015-04-07 03:46:08 +08:00
|
|
|
btrfs_put_block_group(cache);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* btrfs_wait_cache_io uses the cache->dirty_list to decide
|
|
|
|
* if it should update the cache_state. Don't delete
|
|
|
|
* until after we wait.
|
|
|
|
*
|
|
|
|
* Since we're not running in the commit critical section
|
|
|
|
* we need the dirty_bgs_lock to protect from update_block_group
|
|
|
|
*/
|
|
|
|
spin_lock(&cur_trans->dirty_bgs_lock);
|
|
|
|
list_del_init(&cache->dirty_list);
|
|
|
|
spin_unlock(&cur_trans->dirty_bgs_lock);
|
|
|
|
|
|
|
|
should_put = 1;
|
|
|
|
|
|
|
|
cache_save_setup(cache, trans, path);
|
|
|
|
|
|
|
|
if (cache->disk_cache_state == BTRFS_DC_SETUP) {
|
|
|
|
cache->io_ctl.inode = NULL;
|
2019-03-20 20:51:56 +08:00
|
|
|
ret = btrfs_write_out_cache(trans, cache, path);
|
2015-04-07 03:46:08 +08:00
|
|
|
if (ret == 0 && cache->io_ctl.inode) {
|
|
|
|
num_started++;
|
|
|
|
should_put = 0;
|
|
|
|
|
|
|
|
/*
|
2018-02-09 00:25:18 +08:00
|
|
|
* The cache_write_mutex is protecting the
|
|
|
|
* io_list, also refer to the definition of
|
|
|
|
* btrfs_transaction::io_bgs for more details
|
2015-04-07 03:46:08 +08:00
|
|
|
*/
|
|
|
|
list_add_tail(&cache->io_list, io);
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* if we failed to write the cache, the
|
|
|
|
* generation will be bad and life goes on
|
|
|
|
*/
|
|
|
|
ret = 0;
|
|
|
|
}
|
|
|
|
}
|
Btrfs: fix race between block group creation and their cache writeout
So creating a block group has 2 distinct phases:
Phase 1 - creates the btrfs_block_group_cache item and adds it to the
rbtree fs_info->block_group_cache_tree and to the corresponding list
space_info->block_groups[];
Phase 2 - adds the block group item to the extent tree and corresponding
items to the chunk tree.
The first phase adds the block_group_cache_item to a list of pending block
groups in the transaction handle, and phase 2 happens when
btrfs_end_transaction() is called against the transaction handle.
It happens that once phase 1 completes, other concurrent tasks that use
their own transaction handle, but points to the same running transaction
(struct btrfs_trans_handle->transaction), can use this block group for
space allocations and therefore mark it dirty. Dirty block groups are
tracked in a list belonging to the currently running transaction (struct
btrfs_transaction) and not in the transaction handle (btrfs_trans_handle).
This is a problem because once a task calls btrfs_commit_transaction(),
it calls btrfs_start_dirty_block_groups() which will see all dirty block
groups and attempt to start their writeout, including those that are
still attached to the transaction handle of some concurrent task that
hasn't called btrfs_end_transaction() yet - which means those block
groups haven't gone through phase 2 yet and therefore when
write_one_cache_group() is called, it won't find the block group items
in the extent tree and abort the current transaction with -ENOENT,
turning the fs into readonly mode and require a remount.
Fix this by ignoring -ENOENT when looking for block group items in the
extent tree when we attempt to start the writeout of the block group
caches outside the critical section of the transaction commit. We will
try again later during the critical section and if there we still don't
find the block group item in the extent tree, we then abort the current
transaction.
This issue happened twice, once while running fstests btrfs/067 and once
for btrfs/078, which produced the following trace:
[ 3278.703014] WARNING: CPU: 7 PID: 18499 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x114 [btrfs]()
[ 3278.707329] BTRFS: Transaction aborted (error -2)
(...)
[ 3278.731555] Call Trace:
[ 3278.732396] [<ffffffff8142fa46>] dump_stack+0x4f/0x7b
[ 3278.733860] [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad
[ 3278.735312] [<ffffffff81045ea5>] warn_slowpath_common+0xa1/0xbb
[ 3278.736874] [<ffffffffa03ada6d>] ? __btrfs_abort_transaction+0x52/0x114 [btrfs]
[ 3278.738302] [<ffffffff81045f05>] warn_slowpath_fmt+0x46/0x48
[ 3278.739520] [<ffffffffa03ada6d>] __btrfs_abort_transaction+0x52/0x114 [btrfs]
[ 3278.741222] [<ffffffffa03b9e56>] write_one_cache_group+0xae/0xbf [btrfs]
[ 3278.742797] [<ffffffffa03c487b>] btrfs_start_dirty_block_groups+0x170/0x2b2 [btrfs]
[ 3278.744492] [<ffffffffa03d309c>] btrfs_commit_transaction+0x130/0x9c9 [btrfs]
[ 3278.746084] [<ffffffff8107d33d>] ? trace_hardirqs_on+0xd/0xf
[ 3278.747249] [<ffffffffa03e5660>] btrfs_sync_file+0x313/0x387 [btrfs]
[ 3278.748744] [<ffffffff8117acad>] vfs_fsync_range+0x95/0xa4
[ 3278.749958] [<ffffffff81435b54>] ? ret_from_sys_call+0x1d/0x58
[ 3278.751218] [<ffffffff8117acd8>] vfs_fsync+0x1c/0x1e
[ 3278.754197] [<ffffffff8117ae54>] do_fsync+0x34/0x4e
[ 3278.755192] [<ffffffff8117b07c>] SyS_fsync+0x10/0x14
[ 3278.756236] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[ 3278.757366] ---[ end trace 9a4d4df4969709aa ]---
Fixes: 1bbc621ef284 ("Btrfs: allow block group cache writeout
outside critical section in commit")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-06 23:15:09 +08:00
|
|
|
if (!ret) {
|
2019-03-20 18:57:46 +08:00
|
|
|
ret = write_one_cache_group(trans, path, cache);
|
Btrfs: fix race between block group creation and their cache writeout
So creating a block group has 2 distinct phases:
Phase 1 - creates the btrfs_block_group_cache item and adds it to the
rbtree fs_info->block_group_cache_tree and to the corresponding list
space_info->block_groups[];
Phase 2 - adds the block group item to the extent tree and corresponding
items to the chunk tree.
The first phase adds the block_group_cache_item to a list of pending block
groups in the transaction handle, and phase 2 happens when
btrfs_end_transaction() is called against the transaction handle.
It happens that once phase 1 completes, other concurrent tasks that use
their own transaction handle, but points to the same running transaction
(struct btrfs_trans_handle->transaction), can use this block group for
space allocations and therefore mark it dirty. Dirty block groups are
tracked in a list belonging to the currently running transaction (struct
btrfs_transaction) and not in the transaction handle (btrfs_trans_handle).
This is a problem because once a task calls btrfs_commit_transaction(),
it calls btrfs_start_dirty_block_groups() which will see all dirty block
groups and attempt to start their writeout, including those that are
still attached to the transaction handle of some concurrent task that
hasn't called btrfs_end_transaction() yet - which means those block
groups haven't gone through phase 2 yet and therefore when
write_one_cache_group() is called, it won't find the block group items
in the extent tree and abort the current transaction with -ENOENT,
turning the fs into readonly mode and require a remount.
Fix this by ignoring -ENOENT when looking for block group items in the
extent tree when we attempt to start the writeout of the block group
caches outside the critical section of the transaction commit. We will
try again later during the critical section and if there we still don't
find the block group item in the extent tree, we then abort the current
transaction.
This issue happened twice, once while running fstests btrfs/067 and once
for btrfs/078, which produced the following trace:
[ 3278.703014] WARNING: CPU: 7 PID: 18499 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x114 [btrfs]()
[ 3278.707329] BTRFS: Transaction aborted (error -2)
(...)
[ 3278.731555] Call Trace:
[ 3278.732396] [<ffffffff8142fa46>] dump_stack+0x4f/0x7b
[ 3278.733860] [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad
[ 3278.735312] [<ffffffff81045ea5>] warn_slowpath_common+0xa1/0xbb
[ 3278.736874] [<ffffffffa03ada6d>] ? __btrfs_abort_transaction+0x52/0x114 [btrfs]
[ 3278.738302] [<ffffffff81045f05>] warn_slowpath_fmt+0x46/0x48
[ 3278.739520] [<ffffffffa03ada6d>] __btrfs_abort_transaction+0x52/0x114 [btrfs]
[ 3278.741222] [<ffffffffa03b9e56>] write_one_cache_group+0xae/0xbf [btrfs]
[ 3278.742797] [<ffffffffa03c487b>] btrfs_start_dirty_block_groups+0x170/0x2b2 [btrfs]
[ 3278.744492] [<ffffffffa03d309c>] btrfs_commit_transaction+0x130/0x9c9 [btrfs]
[ 3278.746084] [<ffffffff8107d33d>] ? trace_hardirqs_on+0xd/0xf
[ 3278.747249] [<ffffffffa03e5660>] btrfs_sync_file+0x313/0x387 [btrfs]
[ 3278.748744] [<ffffffff8117acad>] vfs_fsync_range+0x95/0xa4
[ 3278.749958] [<ffffffff81435b54>] ? ret_from_sys_call+0x1d/0x58
[ 3278.751218] [<ffffffff8117acd8>] vfs_fsync+0x1c/0x1e
[ 3278.754197] [<ffffffff8117ae54>] do_fsync+0x34/0x4e
[ 3278.755192] [<ffffffff8117b07c>] SyS_fsync+0x10/0x14
[ 3278.756236] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[ 3278.757366] ---[ end trace 9a4d4df4969709aa ]---
Fixes: 1bbc621ef284 ("Btrfs: allow block group cache writeout
outside critical section in commit")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-06 23:15:09 +08:00
|
|
|
/*
|
|
|
|
* Our block group might still be attached to the list
|
|
|
|
* of new block groups in the transaction handle of some
|
|
|
|
* other task (struct btrfs_trans_handle->new_bgs). This
|
|
|
|
* means its block group item isn't yet in the extent
|
|
|
|
* tree. If this happens ignore the error, as we will
|
|
|
|
* try again later in the critical section of the
|
|
|
|
* transaction commit.
|
|
|
|
*/
|
|
|
|
if (ret == -ENOENT) {
|
|
|
|
ret = 0;
|
|
|
|
spin_lock(&cur_trans->dirty_bgs_lock);
|
|
|
|
if (list_empty(&cache->dirty_list)) {
|
|
|
|
list_add_tail(&cache->dirty_list,
|
|
|
|
&cur_trans->dirty_bgs);
|
|
|
|
btrfs_get_block_group(cache);
|
btrfs: introduce delayed_refs_rsv
Traditionally we've had voodoo in btrfs to account for the space that
delayed refs may take up by having a global_block_rsv. This works most
of the time, except when it doesn't. We've had issues reported and seen
in production where sometimes the global reserve is exhausted during
transaction commit before we can run all of our delayed refs, resulting
in an aborted transaction. Because of this voodoo we have equally
dubious flushing semantics around throttling delayed refs which we often
get wrong.
So instead give them their own block_rsv. This way we can always know
exactly how much outstanding space we need for delayed refs. This
allows us to make sure we are constantly filling that reservation up
with space, and allows us to put more precise pressure on the enospc
system. Instead of doing math to see if its a good time to throttle,
the normal enospc code will be invoked if we have a lot of delayed refs
pending, and they will be run via the normal flushing mechanism.
For now the delayed_refs_rsv will hold the reservations for the delayed
refs, the block group updates, and deleting csums. We could have a
separate rsv for the block group updates, but the csum deletion stuff is
still handled via the delayed_refs so that will stay there.
Historical background:
The global reserve has grown to cover everything we don't reserve space
explicitly for, and we've grown a lot of weird ad-hoc heuristics to know
if we're running short on space and when it's time to force a commit. A
failure rate of 20-40 file systems when we run hundreds of thousands of
them isn't super high, but cleaning up this code will make things less
ugly and more predictible.
Thus the delayed refs rsv. We always know how many delayed refs we have
outstanding, and although running them generates more we can use the
global reserve for that spill over, which fits better into it's desired
use than a full blown reservation. This first approach is to simply
take how many times we're reserving space for and multiply that by 2 in
order to save enough space for the delayed refs that could be generated.
This is a niave approach and will probably evolve, but for now it works.
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com> # high-level review
[ added background notes from the cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-03 23:20:33 +08:00
|
|
|
drop_reserve = false;
|
Btrfs: fix race between block group creation and their cache writeout
So creating a block group has 2 distinct phases:
Phase 1 - creates the btrfs_block_group_cache item and adds it to the
rbtree fs_info->block_group_cache_tree and to the corresponding list
space_info->block_groups[];
Phase 2 - adds the block group item to the extent tree and corresponding
items to the chunk tree.
The first phase adds the block_group_cache_item to a list of pending block
groups in the transaction handle, and phase 2 happens when
btrfs_end_transaction() is called against the transaction handle.
It happens that once phase 1 completes, other concurrent tasks that use
their own transaction handle, but points to the same running transaction
(struct btrfs_trans_handle->transaction), can use this block group for
space allocations and therefore mark it dirty. Dirty block groups are
tracked in a list belonging to the currently running transaction (struct
btrfs_transaction) and not in the transaction handle (btrfs_trans_handle).
This is a problem because once a task calls btrfs_commit_transaction(),
it calls btrfs_start_dirty_block_groups() which will see all dirty block
groups and attempt to start their writeout, including those that are
still attached to the transaction handle of some concurrent task that
hasn't called btrfs_end_transaction() yet - which means those block
groups haven't gone through phase 2 yet and therefore when
write_one_cache_group() is called, it won't find the block group items
in the extent tree and abort the current transaction with -ENOENT,
turning the fs into readonly mode and require a remount.
Fix this by ignoring -ENOENT when looking for block group items in the
extent tree when we attempt to start the writeout of the block group
caches outside the critical section of the transaction commit. We will
try again later during the critical section and if there we still don't
find the block group item in the extent tree, we then abort the current
transaction.
This issue happened twice, once while running fstests btrfs/067 and once
for btrfs/078, which produced the following trace:
[ 3278.703014] WARNING: CPU: 7 PID: 18499 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x114 [btrfs]()
[ 3278.707329] BTRFS: Transaction aborted (error -2)
(...)
[ 3278.731555] Call Trace:
[ 3278.732396] [<ffffffff8142fa46>] dump_stack+0x4f/0x7b
[ 3278.733860] [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad
[ 3278.735312] [<ffffffff81045ea5>] warn_slowpath_common+0xa1/0xbb
[ 3278.736874] [<ffffffffa03ada6d>] ? __btrfs_abort_transaction+0x52/0x114 [btrfs]
[ 3278.738302] [<ffffffff81045f05>] warn_slowpath_fmt+0x46/0x48
[ 3278.739520] [<ffffffffa03ada6d>] __btrfs_abort_transaction+0x52/0x114 [btrfs]
[ 3278.741222] [<ffffffffa03b9e56>] write_one_cache_group+0xae/0xbf [btrfs]
[ 3278.742797] [<ffffffffa03c487b>] btrfs_start_dirty_block_groups+0x170/0x2b2 [btrfs]
[ 3278.744492] [<ffffffffa03d309c>] btrfs_commit_transaction+0x130/0x9c9 [btrfs]
[ 3278.746084] [<ffffffff8107d33d>] ? trace_hardirqs_on+0xd/0xf
[ 3278.747249] [<ffffffffa03e5660>] btrfs_sync_file+0x313/0x387 [btrfs]
[ 3278.748744] [<ffffffff8117acad>] vfs_fsync_range+0x95/0xa4
[ 3278.749958] [<ffffffff81435b54>] ? ret_from_sys_call+0x1d/0x58
[ 3278.751218] [<ffffffff8117acd8>] vfs_fsync+0x1c/0x1e
[ 3278.754197] [<ffffffff8117ae54>] do_fsync+0x34/0x4e
[ 3278.755192] [<ffffffff8117b07c>] SyS_fsync+0x10/0x14
[ 3278.756236] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[ 3278.757366] ---[ end trace 9a4d4df4969709aa ]---
Fixes: 1bbc621ef284 ("Btrfs: allow block group cache writeout
outside critical section in commit")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-06 23:15:09 +08:00
|
|
|
}
|
|
|
|
spin_unlock(&cur_trans->dirty_bgs_lock);
|
|
|
|
} else if (ret) {
|
2016-06-11 06:19:25 +08:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
Btrfs: fix race between block group creation and their cache writeout
So creating a block group has 2 distinct phases:
Phase 1 - creates the btrfs_block_group_cache item and adds it to the
rbtree fs_info->block_group_cache_tree and to the corresponding list
space_info->block_groups[];
Phase 2 - adds the block group item to the extent tree and corresponding
items to the chunk tree.
The first phase adds the block_group_cache_item to a list of pending block
groups in the transaction handle, and phase 2 happens when
btrfs_end_transaction() is called against the transaction handle.
It happens that once phase 1 completes, other concurrent tasks that use
their own transaction handle, but points to the same running transaction
(struct btrfs_trans_handle->transaction), can use this block group for
space allocations and therefore mark it dirty. Dirty block groups are
tracked in a list belonging to the currently running transaction (struct
btrfs_transaction) and not in the transaction handle (btrfs_trans_handle).
This is a problem because once a task calls btrfs_commit_transaction(),
it calls btrfs_start_dirty_block_groups() which will see all dirty block
groups and attempt to start their writeout, including those that are
still attached to the transaction handle of some concurrent task that
hasn't called btrfs_end_transaction() yet - which means those block
groups haven't gone through phase 2 yet and therefore when
write_one_cache_group() is called, it won't find the block group items
in the extent tree and abort the current transaction with -ENOENT,
turning the fs into readonly mode and require a remount.
Fix this by ignoring -ENOENT when looking for block group items in the
extent tree when we attempt to start the writeout of the block group
caches outside the critical section of the transaction commit. We will
try again later during the critical section and if there we still don't
find the block group item in the extent tree, we then abort the current
transaction.
This issue happened twice, once while running fstests btrfs/067 and once
for btrfs/078, which produced the following trace:
[ 3278.703014] WARNING: CPU: 7 PID: 18499 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x114 [btrfs]()
[ 3278.707329] BTRFS: Transaction aborted (error -2)
(...)
[ 3278.731555] Call Trace:
[ 3278.732396] [<ffffffff8142fa46>] dump_stack+0x4f/0x7b
[ 3278.733860] [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad
[ 3278.735312] [<ffffffff81045ea5>] warn_slowpath_common+0xa1/0xbb
[ 3278.736874] [<ffffffffa03ada6d>] ? __btrfs_abort_transaction+0x52/0x114 [btrfs]
[ 3278.738302] [<ffffffff81045f05>] warn_slowpath_fmt+0x46/0x48
[ 3278.739520] [<ffffffffa03ada6d>] __btrfs_abort_transaction+0x52/0x114 [btrfs]
[ 3278.741222] [<ffffffffa03b9e56>] write_one_cache_group+0xae/0xbf [btrfs]
[ 3278.742797] [<ffffffffa03c487b>] btrfs_start_dirty_block_groups+0x170/0x2b2 [btrfs]
[ 3278.744492] [<ffffffffa03d309c>] btrfs_commit_transaction+0x130/0x9c9 [btrfs]
[ 3278.746084] [<ffffffff8107d33d>] ? trace_hardirqs_on+0xd/0xf
[ 3278.747249] [<ffffffffa03e5660>] btrfs_sync_file+0x313/0x387 [btrfs]
[ 3278.748744] [<ffffffff8117acad>] vfs_fsync_range+0x95/0xa4
[ 3278.749958] [<ffffffff81435b54>] ? ret_from_sys_call+0x1d/0x58
[ 3278.751218] [<ffffffff8117acd8>] vfs_fsync+0x1c/0x1e
[ 3278.754197] [<ffffffff8117ae54>] do_fsync+0x34/0x4e
[ 3278.755192] [<ffffffff8117b07c>] SyS_fsync+0x10/0x14
[ 3278.756236] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[ 3278.757366] ---[ end trace 9a4d4df4969709aa ]---
Fixes: 1bbc621ef284 ("Btrfs: allow block group cache writeout
outside critical section in commit")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-06 23:15:09 +08:00
|
|
|
}
|
|
|
|
}
|
2015-04-07 03:46:08 +08:00
|
|
|
|
2018-11-28 19:05:13 +08:00
|
|
|
/* if it's not on the io list, we need to put the block group */
|
2015-04-07 03:46:08 +08:00
|
|
|
if (should_put)
|
|
|
|
btrfs_put_block_group(cache);
|
btrfs: introduce delayed_refs_rsv
Traditionally we've had voodoo in btrfs to account for the space that
delayed refs may take up by having a global_block_rsv. This works most
of the time, except when it doesn't. We've had issues reported and seen
in production where sometimes the global reserve is exhausted during
transaction commit before we can run all of our delayed refs, resulting
in an aborted transaction. Because of this voodoo we have equally
dubious flushing semantics around throttling delayed refs which we often
get wrong.
So instead give them their own block_rsv. This way we can always know
exactly how much outstanding space we need for delayed refs. This
allows us to make sure we are constantly filling that reservation up
with space, and allows us to put more precise pressure on the enospc
system. Instead of doing math to see if its a good time to throttle,
the normal enospc code will be invoked if we have a lot of delayed refs
pending, and they will be run via the normal flushing mechanism.
For now the delayed_refs_rsv will hold the reservations for the delayed
refs, the block group updates, and deleting csums. We could have a
separate rsv for the block group updates, but the csum deletion stuff is
still handled via the delayed_refs so that will stay there.
Historical background:
The global reserve has grown to cover everything we don't reserve space
explicitly for, and we've grown a lot of weird ad-hoc heuristics to know
if we're running short on space and when it's time to force a commit. A
failure rate of 20-40 file systems when we run hundreds of thousands of
them isn't super high, but cleaning up this code will make things less
ugly and more predictible.
Thus the delayed refs rsv. We always know how many delayed refs we have
outstanding, and although running them generates more we can use the
global reserve for that spill over, which fits better into it's desired
use than a full blown reservation. This first approach is to simply
take how many times we're reserving space for and multiply that by 2 in
order to save enough space for the delayed refs that could be generated.
This is a niave approach and will probably evolve, but for now it works.
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com> # high-level review
[ added background notes from the cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-03 23:20:33 +08:00
|
|
|
if (drop_reserve)
|
|
|
|
btrfs_delayed_refs_rsv_release(fs_info, 1);
|
2015-04-07 03:46:08 +08:00
|
|
|
|
|
|
|
if (ret)
|
|
|
|
break;
|
2015-04-26 01:29:16 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Avoid blocking other tasks for too long. It might even save
|
|
|
|
* us from writing caches for block groups that are going to be
|
|
|
|
* removed.
|
|
|
|
*/
|
|
|
|
mutex_unlock(&trans->transaction->cache_write_mutex);
|
|
|
|
mutex_lock(&trans->transaction->cache_write_mutex);
|
2015-04-07 03:46:08 +08:00
|
|
|
}
|
2015-04-26 01:29:16 +08:00
|
|
|
mutex_unlock(&trans->transaction->cache_write_mutex);
|
2015-04-07 03:46:08 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* go through delayed refs for all the stuff we've just kicked off
|
|
|
|
* and then loop back (just once)
|
|
|
|
*/
|
2018-03-15 23:27:37 +08:00
|
|
|
ret = btrfs_run_delayed_refs(trans, 0);
|
2015-04-07 03:46:08 +08:00
|
|
|
if (!ret && loops == 0) {
|
|
|
|
loops++;
|
|
|
|
spin_lock(&cur_trans->dirty_bgs_lock);
|
|
|
|
list_splice_init(&cur_trans->dirty_bgs, &dirty);
|
2015-04-26 01:29:16 +08:00
|
|
|
/*
|
|
|
|
* dirty_bgs_lock protects us from concurrent block group
|
|
|
|
* deletes too (not just cache_write_mutex).
|
|
|
|
*/
|
|
|
|
if (!list_empty(&dirty)) {
|
|
|
|
spin_unlock(&cur_trans->dirty_bgs_lock);
|
|
|
|
goto again;
|
|
|
|
}
|
2015-04-07 03:46:08 +08:00
|
|
|
spin_unlock(&cur_trans->dirty_bgs_lock);
|
2016-07-21 08:44:12 +08:00
|
|
|
} else if (ret < 0) {
|
2016-06-23 06:54:24 +08:00
|
|
|
btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
|
2015-04-07 03:46:08 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2019-03-20 19:04:08 +08:00
|
|
|
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
|
2015-04-07 03:46:08 +08:00
|
|
|
{
|
2019-03-20 19:04:08 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2015-04-07 03:46:08 +08:00
|
|
|
struct btrfs_block_group_cache *cache;
|
|
|
|
struct btrfs_transaction *cur_trans = trans->transaction;
|
|
|
|
int ret = 0;
|
|
|
|
int should_put;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct list_head *io = &cur_trans->io_bgs;
|
|
|
|
int num_started = 0;
|
2007-04-27 04:46:15 +08:00
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2014-11-18 04:45:48 +08:00
|
|
|
/*
|
2015-12-18 11:02:48 +08:00
|
|
|
* Even though we are in the critical section of the transaction commit,
|
|
|
|
* we can still have concurrent tasks adding elements to this
|
|
|
|
* transaction's list of dirty block groups. These tasks correspond to
|
|
|
|
* endio free space workers started when writeback finishes for a
|
|
|
|
* space cache, which run inode.c:btrfs_finish_ordered_io(), and can
|
|
|
|
* allocate new block groups as a result of COWing nodes of the root
|
|
|
|
* tree when updating the free space inode. The writeback for the space
|
|
|
|
* caches is triggered by an earlier call to
|
|
|
|
* btrfs_start_dirty_block_groups() and iterations of the following
|
|
|
|
* loop.
|
|
|
|
* Also we want to do the cache_save_setup first and then run the
|
2014-11-18 04:45:48 +08:00
|
|
|
* delayed refs to make sure we have the best chance at doing this all
|
|
|
|
* in one shot.
|
|
|
|
*/
|
2015-12-18 11:02:48 +08:00
|
|
|
spin_lock(&cur_trans->dirty_bgs_lock);
|
2014-11-18 04:45:48 +08:00
|
|
|
while (!list_empty(&cur_trans->dirty_bgs)) {
|
|
|
|
cache = list_first_entry(&cur_trans->dirty_bgs,
|
|
|
|
struct btrfs_block_group_cache,
|
|
|
|
dirty_list);
|
2015-04-05 08:14:42 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* this can happen if cache_save_setup re-dirties a block
|
|
|
|
* group that is already under IO. Just wait for it to
|
|
|
|
* finish and then do it all again
|
|
|
|
*/
|
|
|
|
if (!list_empty(&cache->io_list)) {
|
2015-12-18 11:02:48 +08:00
|
|
|
spin_unlock(&cur_trans->dirty_bgs_lock);
|
2015-04-05 08:14:42 +08:00
|
|
|
list_del_init(&cache->io_list);
|
2016-09-10 00:09:35 +08:00
|
|
|
btrfs_wait_cache_io(trans, cache, path);
|
2015-04-05 08:14:42 +08:00
|
|
|
btrfs_put_block_group(cache);
|
2015-12-18 11:02:48 +08:00
|
|
|
spin_lock(&cur_trans->dirty_bgs_lock);
|
2015-04-05 08:14:42 +08:00
|
|
|
}
|
|
|
|
|
2015-04-07 03:46:08 +08:00
|
|
|
/*
|
|
|
|
* don't remove from the dirty list until after we've waited
|
|
|
|
* on any pending IO
|
|
|
|
*/
|
2014-11-18 04:45:48 +08:00
|
|
|
list_del_init(&cache->dirty_list);
|
2015-12-18 11:02:48 +08:00
|
|
|
spin_unlock(&cur_trans->dirty_bgs_lock);
|
2015-04-05 08:14:42 +08:00
|
|
|
should_put = 1;
|
|
|
|
|
2015-04-07 03:46:08 +08:00
|
|
|
cache_save_setup(cache, trans, path);
|
2015-04-05 08:14:42 +08:00
|
|
|
|
2014-11-18 04:45:48 +08:00
|
|
|
if (!ret)
|
2018-03-15 23:27:37 +08:00
|
|
|
ret = btrfs_run_delayed_refs(trans,
|
2016-06-23 06:54:24 +08:00
|
|
|
(unsigned long) -1);
|
2015-04-05 08:14:42 +08:00
|
|
|
|
|
|
|
if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
|
|
|
|
cache->io_ctl.inode = NULL;
|
2019-03-20 20:51:56 +08:00
|
|
|
ret = btrfs_write_out_cache(trans, cache, path);
|
2015-04-05 08:14:42 +08:00
|
|
|
if (ret == 0 && cache->io_ctl.inode) {
|
|
|
|
num_started++;
|
|
|
|
should_put = 0;
|
2015-04-07 03:46:08 +08:00
|
|
|
list_add_tail(&cache->io_list, io);
|
2015-04-05 08:14:42 +08:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* if we failed to write the cache, the
|
|
|
|
* generation will be bad and life goes on
|
|
|
|
*/
|
|
|
|
ret = 0;
|
|
|
|
}
|
|
|
|
}
|
Btrfs: fix race between block group creation and their cache writeout
So creating a block group has 2 distinct phases:
Phase 1 - creates the btrfs_block_group_cache item and adds it to the
rbtree fs_info->block_group_cache_tree and to the corresponding list
space_info->block_groups[];
Phase 2 - adds the block group item to the extent tree and corresponding
items to the chunk tree.
The first phase adds the block_group_cache_item to a list of pending block
groups in the transaction handle, and phase 2 happens when
btrfs_end_transaction() is called against the transaction handle.
It happens that once phase 1 completes, other concurrent tasks that use
their own transaction handle, but points to the same running transaction
(struct btrfs_trans_handle->transaction), can use this block group for
space allocations and therefore mark it dirty. Dirty block groups are
tracked in a list belonging to the currently running transaction (struct
btrfs_transaction) and not in the transaction handle (btrfs_trans_handle).
This is a problem because once a task calls btrfs_commit_transaction(),
it calls btrfs_start_dirty_block_groups() which will see all dirty block
groups and attempt to start their writeout, including those that are
still attached to the transaction handle of some concurrent task that
hasn't called btrfs_end_transaction() yet - which means those block
groups haven't gone through phase 2 yet and therefore when
write_one_cache_group() is called, it won't find the block group items
in the extent tree and abort the current transaction with -ENOENT,
turning the fs into readonly mode and require a remount.
Fix this by ignoring -ENOENT when looking for block group items in the
extent tree when we attempt to start the writeout of the block group
caches outside the critical section of the transaction commit. We will
try again later during the critical section and if there we still don't
find the block group item in the extent tree, we then abort the current
transaction.
This issue happened twice, once while running fstests btrfs/067 and once
for btrfs/078, which produced the following trace:
[ 3278.703014] WARNING: CPU: 7 PID: 18499 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x114 [btrfs]()
[ 3278.707329] BTRFS: Transaction aborted (error -2)
(...)
[ 3278.731555] Call Trace:
[ 3278.732396] [<ffffffff8142fa46>] dump_stack+0x4f/0x7b
[ 3278.733860] [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad
[ 3278.735312] [<ffffffff81045ea5>] warn_slowpath_common+0xa1/0xbb
[ 3278.736874] [<ffffffffa03ada6d>] ? __btrfs_abort_transaction+0x52/0x114 [btrfs]
[ 3278.738302] [<ffffffff81045f05>] warn_slowpath_fmt+0x46/0x48
[ 3278.739520] [<ffffffffa03ada6d>] __btrfs_abort_transaction+0x52/0x114 [btrfs]
[ 3278.741222] [<ffffffffa03b9e56>] write_one_cache_group+0xae/0xbf [btrfs]
[ 3278.742797] [<ffffffffa03c487b>] btrfs_start_dirty_block_groups+0x170/0x2b2 [btrfs]
[ 3278.744492] [<ffffffffa03d309c>] btrfs_commit_transaction+0x130/0x9c9 [btrfs]
[ 3278.746084] [<ffffffff8107d33d>] ? trace_hardirqs_on+0xd/0xf
[ 3278.747249] [<ffffffffa03e5660>] btrfs_sync_file+0x313/0x387 [btrfs]
[ 3278.748744] [<ffffffff8117acad>] vfs_fsync_range+0x95/0xa4
[ 3278.749958] [<ffffffff81435b54>] ? ret_from_sys_call+0x1d/0x58
[ 3278.751218] [<ffffffff8117acd8>] vfs_fsync+0x1c/0x1e
[ 3278.754197] [<ffffffff8117ae54>] do_fsync+0x34/0x4e
[ 3278.755192] [<ffffffff8117b07c>] SyS_fsync+0x10/0x14
[ 3278.756236] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[ 3278.757366] ---[ end trace 9a4d4df4969709aa ]---
Fixes: 1bbc621ef284 ("Btrfs: allow block group cache writeout
outside critical section in commit")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-06 23:15:09 +08:00
|
|
|
if (!ret) {
|
2019-03-20 18:57:46 +08:00
|
|
|
ret = write_one_cache_group(trans, path, cache);
|
2015-12-30 10:42:30 +08:00
|
|
|
/*
|
|
|
|
* One of the free space endio workers might have
|
|
|
|
* created a new block group while updating a free space
|
|
|
|
* cache's inode (at inode.c:btrfs_finish_ordered_io())
|
|
|
|
* and hasn't released its transaction handle yet, in
|
|
|
|
* which case the new block group is still attached to
|
|
|
|
* its transaction handle and its creation has not
|
|
|
|
* finished yet (no block group item in the extent tree
|
|
|
|
* yet, etc). If this is the case, wait for all free
|
|
|
|
* space endio workers to finish and retry. This is a
|
|
|
|
* a very rare case so no need for a more efficient and
|
|
|
|
* complex approach.
|
|
|
|
*/
|
|
|
|
if (ret == -ENOENT) {
|
|
|
|
wait_event(cur_trans->writer_wait,
|
|
|
|
atomic_read(&cur_trans->num_writers) == 1);
|
2019-03-20 18:57:46 +08:00
|
|
|
ret = write_one_cache_group(trans, path, cache);
|
2015-12-30 10:42:30 +08:00
|
|
|
}
|
Btrfs: fix race between block group creation and their cache writeout
So creating a block group has 2 distinct phases:
Phase 1 - creates the btrfs_block_group_cache item and adds it to the
rbtree fs_info->block_group_cache_tree and to the corresponding list
space_info->block_groups[];
Phase 2 - adds the block group item to the extent tree and corresponding
items to the chunk tree.
The first phase adds the block_group_cache_item to a list of pending block
groups in the transaction handle, and phase 2 happens when
btrfs_end_transaction() is called against the transaction handle.
It happens that once phase 1 completes, other concurrent tasks that use
their own transaction handle, but points to the same running transaction
(struct btrfs_trans_handle->transaction), can use this block group for
space allocations and therefore mark it dirty. Dirty block groups are
tracked in a list belonging to the currently running transaction (struct
btrfs_transaction) and not in the transaction handle (btrfs_trans_handle).
This is a problem because once a task calls btrfs_commit_transaction(),
it calls btrfs_start_dirty_block_groups() which will see all dirty block
groups and attempt to start their writeout, including those that are
still attached to the transaction handle of some concurrent task that
hasn't called btrfs_end_transaction() yet - which means those block
groups haven't gone through phase 2 yet and therefore when
write_one_cache_group() is called, it won't find the block group items
in the extent tree and abort the current transaction with -ENOENT,
turning the fs into readonly mode and require a remount.
Fix this by ignoring -ENOENT when looking for block group items in the
extent tree when we attempt to start the writeout of the block group
caches outside the critical section of the transaction commit. We will
try again later during the critical section and if there we still don't
find the block group item in the extent tree, we then abort the current
transaction.
This issue happened twice, once while running fstests btrfs/067 and once
for btrfs/078, which produced the following trace:
[ 3278.703014] WARNING: CPU: 7 PID: 18499 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x114 [btrfs]()
[ 3278.707329] BTRFS: Transaction aborted (error -2)
(...)
[ 3278.731555] Call Trace:
[ 3278.732396] [<ffffffff8142fa46>] dump_stack+0x4f/0x7b
[ 3278.733860] [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad
[ 3278.735312] [<ffffffff81045ea5>] warn_slowpath_common+0xa1/0xbb
[ 3278.736874] [<ffffffffa03ada6d>] ? __btrfs_abort_transaction+0x52/0x114 [btrfs]
[ 3278.738302] [<ffffffff81045f05>] warn_slowpath_fmt+0x46/0x48
[ 3278.739520] [<ffffffffa03ada6d>] __btrfs_abort_transaction+0x52/0x114 [btrfs]
[ 3278.741222] [<ffffffffa03b9e56>] write_one_cache_group+0xae/0xbf [btrfs]
[ 3278.742797] [<ffffffffa03c487b>] btrfs_start_dirty_block_groups+0x170/0x2b2 [btrfs]
[ 3278.744492] [<ffffffffa03d309c>] btrfs_commit_transaction+0x130/0x9c9 [btrfs]
[ 3278.746084] [<ffffffff8107d33d>] ? trace_hardirqs_on+0xd/0xf
[ 3278.747249] [<ffffffffa03e5660>] btrfs_sync_file+0x313/0x387 [btrfs]
[ 3278.748744] [<ffffffff8117acad>] vfs_fsync_range+0x95/0xa4
[ 3278.749958] [<ffffffff81435b54>] ? ret_from_sys_call+0x1d/0x58
[ 3278.751218] [<ffffffff8117acd8>] vfs_fsync+0x1c/0x1e
[ 3278.754197] [<ffffffff8117ae54>] do_fsync+0x34/0x4e
[ 3278.755192] [<ffffffff8117b07c>] SyS_fsync+0x10/0x14
[ 3278.756236] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[ 3278.757366] ---[ end trace 9a4d4df4969709aa ]---
Fixes: 1bbc621ef284 ("Btrfs: allow block group cache writeout
outside critical section in commit")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-06 23:15:09 +08:00
|
|
|
if (ret)
|
2016-06-11 06:19:25 +08:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
Btrfs: fix race between block group creation and their cache writeout
So creating a block group has 2 distinct phases:
Phase 1 - creates the btrfs_block_group_cache item and adds it to the
rbtree fs_info->block_group_cache_tree and to the corresponding list
space_info->block_groups[];
Phase 2 - adds the block group item to the extent tree and corresponding
items to the chunk tree.
The first phase adds the block_group_cache_item to a list of pending block
groups in the transaction handle, and phase 2 happens when
btrfs_end_transaction() is called against the transaction handle.
It happens that once phase 1 completes, other concurrent tasks that use
their own transaction handle, but points to the same running transaction
(struct btrfs_trans_handle->transaction), can use this block group for
space allocations and therefore mark it dirty. Dirty block groups are
tracked in a list belonging to the currently running transaction (struct
btrfs_transaction) and not in the transaction handle (btrfs_trans_handle).
This is a problem because once a task calls btrfs_commit_transaction(),
it calls btrfs_start_dirty_block_groups() which will see all dirty block
groups and attempt to start their writeout, including those that are
still attached to the transaction handle of some concurrent task that
hasn't called btrfs_end_transaction() yet - which means those block
groups haven't gone through phase 2 yet and therefore when
write_one_cache_group() is called, it won't find the block group items
in the extent tree and abort the current transaction with -ENOENT,
turning the fs into readonly mode and require a remount.
Fix this by ignoring -ENOENT when looking for block group items in the
extent tree when we attempt to start the writeout of the block group
caches outside the critical section of the transaction commit. We will
try again later during the critical section and if there we still don't
find the block group item in the extent tree, we then abort the current
transaction.
This issue happened twice, once while running fstests btrfs/067 and once
for btrfs/078, which produced the following trace:
[ 3278.703014] WARNING: CPU: 7 PID: 18499 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x114 [btrfs]()
[ 3278.707329] BTRFS: Transaction aborted (error -2)
(...)
[ 3278.731555] Call Trace:
[ 3278.732396] [<ffffffff8142fa46>] dump_stack+0x4f/0x7b
[ 3278.733860] [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad
[ 3278.735312] [<ffffffff81045ea5>] warn_slowpath_common+0xa1/0xbb
[ 3278.736874] [<ffffffffa03ada6d>] ? __btrfs_abort_transaction+0x52/0x114 [btrfs]
[ 3278.738302] [<ffffffff81045f05>] warn_slowpath_fmt+0x46/0x48
[ 3278.739520] [<ffffffffa03ada6d>] __btrfs_abort_transaction+0x52/0x114 [btrfs]
[ 3278.741222] [<ffffffffa03b9e56>] write_one_cache_group+0xae/0xbf [btrfs]
[ 3278.742797] [<ffffffffa03c487b>] btrfs_start_dirty_block_groups+0x170/0x2b2 [btrfs]
[ 3278.744492] [<ffffffffa03d309c>] btrfs_commit_transaction+0x130/0x9c9 [btrfs]
[ 3278.746084] [<ffffffff8107d33d>] ? trace_hardirqs_on+0xd/0xf
[ 3278.747249] [<ffffffffa03e5660>] btrfs_sync_file+0x313/0x387 [btrfs]
[ 3278.748744] [<ffffffff8117acad>] vfs_fsync_range+0x95/0xa4
[ 3278.749958] [<ffffffff81435b54>] ? ret_from_sys_call+0x1d/0x58
[ 3278.751218] [<ffffffff8117acd8>] vfs_fsync+0x1c/0x1e
[ 3278.754197] [<ffffffff8117ae54>] do_fsync+0x34/0x4e
[ 3278.755192] [<ffffffff8117b07c>] SyS_fsync+0x10/0x14
[ 3278.756236] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[ 3278.757366] ---[ end trace 9a4d4df4969709aa ]---
Fixes: 1bbc621ef284 ("Btrfs: allow block group cache writeout
outside critical section in commit")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-06 23:15:09 +08:00
|
|
|
}
|
2015-04-05 08:14:42 +08:00
|
|
|
|
|
|
|
/* if its not on the io list, we need to put the block group */
|
|
|
|
if (should_put)
|
|
|
|
btrfs_put_block_group(cache);
|
btrfs: introduce delayed_refs_rsv
Traditionally we've had voodoo in btrfs to account for the space that
delayed refs may take up by having a global_block_rsv. This works most
of the time, except when it doesn't. We've had issues reported and seen
in production where sometimes the global reserve is exhausted during
transaction commit before we can run all of our delayed refs, resulting
in an aborted transaction. Because of this voodoo we have equally
dubious flushing semantics around throttling delayed refs which we often
get wrong.
So instead give them their own block_rsv. This way we can always know
exactly how much outstanding space we need for delayed refs. This
allows us to make sure we are constantly filling that reservation up
with space, and allows us to put more precise pressure on the enospc
system. Instead of doing math to see if its a good time to throttle,
the normal enospc code will be invoked if we have a lot of delayed refs
pending, and they will be run via the normal flushing mechanism.
For now the delayed_refs_rsv will hold the reservations for the delayed
refs, the block group updates, and deleting csums. We could have a
separate rsv for the block group updates, but the csum deletion stuff is
still handled via the delayed_refs so that will stay there.
Historical background:
The global reserve has grown to cover everything we don't reserve space
explicitly for, and we've grown a lot of weird ad-hoc heuristics to know
if we're running short on space and when it's time to force a commit. A
failure rate of 20-40 file systems when we run hundreds of thousands of
them isn't super high, but cleaning up this code will make things less
ugly and more predictible.
Thus the delayed refs rsv. We always know how many delayed refs we have
outstanding, and although running them generates more we can use the
global reserve for that spill over, which fits better into it's desired
use than a full blown reservation. This first approach is to simply
take how many times we're reserving space for and multiply that by 2 in
order to save enough space for the delayed refs that could be generated.
This is a niave approach and will probably evolve, but for now it works.
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com> # high-level review
[ added background notes from the cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-03 23:20:33 +08:00
|
|
|
btrfs_delayed_refs_rsv_release(fs_info, 1);
|
2015-12-18 11:02:48 +08:00
|
|
|
spin_lock(&cur_trans->dirty_bgs_lock);
|
2015-04-05 08:14:42 +08:00
|
|
|
}
|
2015-12-18 11:02:48 +08:00
|
|
|
spin_unlock(&cur_trans->dirty_bgs_lock);
|
2015-04-05 08:14:42 +08:00
|
|
|
|
2018-02-09 00:25:18 +08:00
|
|
|
/*
|
|
|
|
* Refer to the definition of io_bgs member for details why it's safe
|
|
|
|
* to use it without any locking
|
|
|
|
*/
|
2015-04-07 03:46:08 +08:00
|
|
|
while (!list_empty(io)) {
|
|
|
|
cache = list_first_entry(io, struct btrfs_block_group_cache,
|
2015-04-05 08:14:42 +08:00
|
|
|
io_list);
|
|
|
|
list_del_init(&cache->io_list);
|
2016-09-10 00:09:35 +08:00
|
|
|
btrfs_wait_cache_io(trans, cache, path);
|
2010-07-03 00:14:14 +08:00
|
|
|
btrfs_put_block_group(cache);
|
|
|
|
}
|
|
|
|
|
2007-04-27 04:46:15 +08:00
|
|
|
btrfs_free_path(path);
|
2014-11-18 04:45:48 +08:00
|
|
|
return ret;
|
2007-04-27 04:46:15 +08:00
|
|
|
}
|
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
|
2008-12-12 05:30:39 +08:00
|
|
|
{
|
|
|
|
struct btrfs_block_group_cache *block_group;
|
|
|
|
int readonly = 0;
|
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
block_group = btrfs_lookup_block_group(fs_info, bytenr);
|
2008-12-12 05:30:39 +08:00
|
|
|
if (!block_group || block_group->ro)
|
|
|
|
readonly = 1;
|
|
|
|
if (block_group)
|
2009-04-03 21:47:43 +08:00
|
|
|
btrfs_put_block_group(block_group);
|
2008-12-12 05:30:39 +08:00
|
|
|
return readonly;
|
|
|
|
}
|
|
|
|
|
2016-05-09 20:15:41 +08:00
|
|
|
bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
|
|
|
|
{
|
|
|
|
struct btrfs_block_group_cache *bg;
|
|
|
|
bool ret = true;
|
|
|
|
|
|
|
|
bg = btrfs_lookup_block_group(fs_info, bytenr);
|
|
|
|
if (!bg)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
spin_lock(&bg->lock);
|
|
|
|
if (bg->ro)
|
|
|
|
ret = false;
|
|
|
|
else
|
|
|
|
atomic_inc(&bg->nocow_writers);
|
|
|
|
spin_unlock(&bg->lock);
|
|
|
|
|
|
|
|
/* no put on block group, done by btrfs_dec_nocow_writers */
|
|
|
|
if (!ret)
|
|
|
|
btrfs_put_block_group(bg);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
|
|
|
|
{
|
|
|
|
struct btrfs_block_group_cache *bg;
|
|
|
|
|
|
|
|
bg = btrfs_lookup_block_group(fs_info, bytenr);
|
|
|
|
ASSERT(bg);
|
|
|
|
if (atomic_dec_and_test(&bg->nocow_writers))
|
2018-03-15 18:43:08 +08:00
|
|
|
wake_up_var(&bg->nocow_writers);
|
2016-05-09 20:15:41 +08:00
|
|
|
/*
|
|
|
|
* Once for our lookup and once for the lookup done by a previous call
|
|
|
|
* to btrfs_inc_nocow_writers()
|
|
|
|
*/
|
|
|
|
btrfs_put_block_group(bg);
|
|
|
|
btrfs_put_block_group(bg);
|
|
|
|
}
|
|
|
|
|
|
|
|
void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
|
|
|
|
{
|
2018-03-15 18:43:08 +08:00
|
|
|
wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
|
2016-05-09 20:15:41 +08:00
|
|
|
}
|
|
|
|
|
2008-04-04 04:29:03 +08:00
|
|
|
static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
|
|
|
|
{
|
2012-03-27 22:09:16 +08:00
|
|
|
u64 extra_flags = chunk_to_extended(flags) &
|
|
|
|
BTRFS_EXTENDED_PROFILE_MASK;
|
2012-01-17 04:04:47 +08:00
|
|
|
|
2013-01-29 18:13:12 +08:00
|
|
|
write_seqlock(&fs_info->profiles_lock);
|
2012-01-17 04:04:47 +08:00
|
|
|
if (flags & BTRFS_BLOCK_GROUP_DATA)
|
|
|
|
fs_info->avail_data_alloc_bits |= extra_flags;
|
|
|
|
if (flags & BTRFS_BLOCK_GROUP_METADATA)
|
|
|
|
fs_info->avail_metadata_alloc_bits |= extra_flags;
|
|
|
|
if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
|
|
|
|
fs_info->avail_system_alloc_bits |= extra_flags;
|
2013-01-29 18:13:12 +08:00
|
|
|
write_sequnlock(&fs_info->profiles_lock);
|
2008-04-04 04:29:03 +08:00
|
|
|
}
|
2008-03-26 04:50:33 +08:00
|
|
|
|
2012-03-27 22:09:17 +08:00
|
|
|
/*
|
|
|
|
* returns target flags in extended format or 0 if restripe for this
|
|
|
|
* chunk_type is not in progress
|
2012-04-13 04:03:56 +08:00
|
|
|
*
|
2018-03-21 07:20:05 +08:00
|
|
|
* should be called with balance_lock held
|
2012-03-27 22:09:17 +08:00
|
|
|
*/
|
|
|
|
static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
|
|
|
|
{
|
|
|
|
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
|
|
|
|
u64 target = 0;
|
|
|
|
|
|
|
|
if (!bctl)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (flags & BTRFS_BLOCK_GROUP_DATA &&
|
|
|
|
bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
|
|
|
|
target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
|
|
|
|
} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
|
|
|
|
bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
|
|
|
|
target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
|
|
|
|
} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
|
|
|
|
bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
|
|
|
|
target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
|
|
|
|
}
|
|
|
|
|
|
|
|
return target;
|
|
|
|
}
|
|
|
|
|
2012-01-17 04:04:47 +08:00
|
|
|
/*
|
|
|
|
* @flags: available profiles in extended format (see ctree.h)
|
|
|
|
*
|
2012-01-17 04:04:48 +08:00
|
|
|
* Returns reduced profile in chunk format. If profile changing is in
|
|
|
|
* progress (either running or paused) picks the target profile (if it's
|
|
|
|
* already available), otherwise falls back to plain reducing.
|
2012-01-17 04:04:47 +08:00
|
|
|
*/
|
2016-06-23 06:54:24 +08:00
|
|
|
static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
|
2008-04-29 03:29:52 +08:00
|
|
|
{
|
2016-06-23 06:54:23 +08:00
|
|
|
u64 num_devices = fs_info->fs_devices->rw_devices;
|
2012-03-27 22:09:17 +08:00
|
|
|
u64 target;
|
2015-09-15 21:08:08 +08:00
|
|
|
u64 raid_type;
|
|
|
|
u64 allowed = 0;
|
2008-05-07 23:43:44 +08:00
|
|
|
|
2012-03-27 22:09:17 +08:00
|
|
|
/*
|
|
|
|
* see if restripe for this chunk_type is in progress, if so
|
|
|
|
* try to reduce to the target profile
|
|
|
|
*/
|
2016-06-23 06:54:23 +08:00
|
|
|
spin_lock(&fs_info->balance_lock);
|
|
|
|
target = get_restripe_target(fs_info, flags);
|
2012-03-27 22:09:17 +08:00
|
|
|
if (target) {
|
|
|
|
/* pick target profile only if it's already available */
|
|
|
|
if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
|
2016-06-23 06:54:23 +08:00
|
|
|
spin_unlock(&fs_info->balance_lock);
|
2012-03-27 22:09:17 +08:00
|
|
|
return extended_to_chunk(target);
|
2012-01-17 04:04:48 +08:00
|
|
|
}
|
|
|
|
}
|
2016-06-23 06:54:23 +08:00
|
|
|
spin_unlock(&fs_info->balance_lock);
|
2012-01-17 04:04:48 +08:00
|
|
|
|
2013-01-30 07:40:14 +08:00
|
|
|
/* First, mask out the RAID levels which aren't possible */
|
2015-09-15 21:08:08 +08:00
|
|
|
for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
|
|
|
|
if (num_devices >= btrfs_raid_array[raid_type].devs_min)
|
2018-04-25 19:01:43 +08:00
|
|
|
allowed |= btrfs_raid_array[raid_type].bg_flag;
|
2015-09-15 21:08:08 +08:00
|
|
|
}
|
|
|
|
allowed &= flags;
|
|
|
|
|
|
|
|
if (allowed & BTRFS_BLOCK_GROUP_RAID6)
|
|
|
|
allowed = BTRFS_BLOCK_GROUP_RAID6;
|
|
|
|
else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
|
|
|
|
allowed = BTRFS_BLOCK_GROUP_RAID5;
|
|
|
|
else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
|
|
|
|
allowed = BTRFS_BLOCK_GROUP_RAID10;
|
|
|
|
else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
|
|
|
|
allowed = BTRFS_BLOCK_GROUP_RAID1;
|
|
|
|
else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
|
|
|
|
allowed = BTRFS_BLOCK_GROUP_RAID0;
|
|
|
|
|
|
|
|
flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
|
|
|
|
|
|
|
|
return extended_to_chunk(flags | allowed);
|
2008-04-29 03:29:52 +08:00
|
|
|
}
|
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
|
2009-02-21 00:00:09 +08:00
|
|
|
{
|
2013-01-29 18:13:12 +08:00
|
|
|
unsigned seq;
|
2014-04-24 22:15:29 +08:00
|
|
|
u64 flags;
|
2013-01-29 18:13:12 +08:00
|
|
|
|
|
|
|
do {
|
2014-04-24 22:15:29 +08:00
|
|
|
flags = orig_flags;
|
2016-06-23 06:54:23 +08:00
|
|
|
seq = read_seqbegin(&fs_info->profiles_lock);
|
2013-01-29 18:13:12 +08:00
|
|
|
|
|
|
|
if (flags & BTRFS_BLOCK_GROUP_DATA)
|
2016-06-23 06:54:23 +08:00
|
|
|
flags |= fs_info->avail_data_alloc_bits;
|
2013-01-29 18:13:12 +08:00
|
|
|
else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
|
2016-06-23 06:54:23 +08:00
|
|
|
flags |= fs_info->avail_system_alloc_bits;
|
2013-01-29 18:13:12 +08:00
|
|
|
else if (flags & BTRFS_BLOCK_GROUP_METADATA)
|
2016-06-23 06:54:23 +08:00
|
|
|
flags |= fs_info->avail_metadata_alloc_bits;
|
|
|
|
} while (read_seqretry(&fs_info->profiles_lock, seq));
|
2012-01-17 04:04:47 +08:00
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
return btrfs_reduce_alloc_profile(fs_info, flags);
|
2009-02-21 00:00:09 +08:00
|
|
|
}
|
|
|
|
|
2017-05-17 23:38:35 +08:00
|
|
|
static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
|
2009-09-12 04:12:44 +08:00
|
|
|
{
|
2016-06-23 06:54:23 +08:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2010-05-16 22:46:24 +08:00
|
|
|
u64 flags;
|
2013-01-30 07:40:14 +08:00
|
|
|
u64 ret;
|
2009-09-12 04:12:44 +08:00
|
|
|
|
2010-05-16 22:46:24 +08:00
|
|
|
if (data)
|
|
|
|
flags = BTRFS_BLOCK_GROUP_DATA;
|
2016-06-23 06:54:23 +08:00
|
|
|
else if (root == fs_info->chunk_root)
|
2010-05-16 22:46:24 +08:00
|
|
|
flags = BTRFS_BLOCK_GROUP_SYSTEM;
|
2009-09-12 04:12:44 +08:00
|
|
|
else
|
2010-05-16 22:46:24 +08:00
|
|
|
flags = BTRFS_BLOCK_GROUP_METADATA;
|
2009-09-12 04:12:44 +08:00
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
ret = get_alloc_profile(fs_info, flags);
|
2013-01-30 07:40:14 +08:00
|
|
|
return ret;
|
2009-02-21 00:00:09 +08:00
|
|
|
}
|
2009-09-12 04:12:44 +08:00
|
|
|
|
2017-05-17 23:38:35 +08:00
|
|
|
u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
|
|
|
|
}
|
|
|
|
|
|
|
|
u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
|
|
|
|
}
|
|
|
|
|
|
|
|
u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
|
|
|
|
}
|
|
|
|
|
2009-04-22 05:40:57 +08:00
|
|
|
static void force_metadata_allocation(struct btrfs_fs_info *info)
|
2009-10-08 08:44:34 +08:00
|
|
|
{
|
2009-04-22 05:40:57 +08:00
|
|
|
struct list_head *head = &info->space_info;
|
|
|
|
struct btrfs_space_info *found;
|
2009-10-08 08:44:34 +08:00
|
|
|
|
2009-04-22 05:40:57 +08:00
|
|
|
rcu_read_lock();
|
|
|
|
list_for_each_entry_rcu(found, head, list) {
|
|
|
|
if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
|
2011-04-16 04:05:44 +08:00
|
|
|
found->force_alloc = CHUNK_ALLOC_FORCE;
|
2009-10-08 08:44:34 +08:00
|
|
|
}
|
2009-04-22 05:40:57 +08:00
|
|
|
rcu_read_unlock();
|
2009-10-08 08:44:34 +08:00
|
|
|
}
|
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
|
2012-09-13 02:08:47 +08:00
|
|
|
struct btrfs_space_info *sinfo, int force)
|
2009-10-09 01:34:05 +08:00
|
|
|
{
|
2017-06-22 21:51:48 +08:00
|
|
|
u64 bytes_used = btrfs_space_info_used(sinfo, false);
|
2010-10-27 01:37:56 +08:00
|
|
|
u64 thresh;
|
2009-10-08 08:44:34 +08:00
|
|
|
|
2011-04-16 04:05:44 +08:00
|
|
|
if (force == CHUNK_ALLOC_FORCE)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* in limited mode, we want to have some free space up to
|
|
|
|
* about 1% of the FS size.
|
|
|
|
*/
|
|
|
|
if (force == CHUNK_ALLOC_LIMITED) {
|
2016-06-23 06:54:23 +08:00
|
|
|
thresh = btrfs_super_total_bytes(fs_info->super_copy);
|
2015-12-15 00:42:10 +08:00
|
|
|
thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
|
2011-04-16 04:05:44 +08:00
|
|
|
|
2017-06-22 21:51:48 +08:00
|
|
|
if (sinfo->total_bytes - bytes_used < thresh)
|
2011-04-16 04:05:44 +08:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2017-06-22 21:51:48 +08:00
|
|
|
if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
|
2010-10-16 03:23:48 +08:00
|
|
|
return 0;
|
2010-05-16 22:46:25 +08:00
|
|
|
return 1;
|
2009-10-09 01:34:05 +08:00
|
|
|
}
|
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
|
2012-03-29 21:57:44 +08:00
|
|
|
{
|
|
|
|
u64 num_dev;
|
|
|
|
|
2019-05-17 17:43:20 +08:00
|
|
|
num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
|
|
|
|
if (!num_dev)
|
2016-06-23 06:54:23 +08:00
|
|
|
num_dev = fs_info->fs_devices->rw_devices;
|
2012-03-29 21:57:44 +08:00
|
|
|
|
Btrfs: fix -ENOSPC on block group removal
Unlike when attempting to allocate a new block group, where we check
that we have enough space in the system space_info to update the device
items and insert a new chunk item in the chunk tree, we were not checking
if the system space_info had enough space for updating the device items
and deleting the chunk item in the chunk tree. This often lead to -ENOSPC
error when attempting to allocate blocks for the chunk tree (during btree
node/leaf COW operations) while updating the device items or deleting the
chunk item, which resulted in the current transaction being aborted and
turning the filesystem into read-only mode.
While running fstests generic/038, which stresses allocation of block
groups and removal of unused block groups, with a large scratch device
(750Gb) this happened often, despite more than enough unallocated space,
and resulted in the following trace:
[68663.586604] WARNING: CPU: 3 PID: 1521 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x114 [btrfs]()
[68663.600407] BTRFS: Transaction aborted (error -28)
(...)
[68663.730829] Call Trace:
[68663.732585] [<ffffffff8142fa46>] dump_stack+0x4f/0x7b
[68663.734334] [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad
[68663.739980] [<ffffffff81045ea5>] warn_slowpath_common+0xa1/0xbb
[68663.757153] [<ffffffffa036ca6d>] ? __btrfs_abort_transaction+0x52/0x114 [btrfs]
[68663.760925] [<ffffffff81045f05>] warn_slowpath_fmt+0x46/0x48
[68663.762854] [<ffffffffa03b159d>] ? btrfs_update_device+0x15a/0x16c [btrfs]
[68663.764073] [<ffffffffa036ca6d>] __btrfs_abort_transaction+0x52/0x114 [btrfs]
[68663.765130] [<ffffffffa03b3638>] btrfs_remove_chunk+0x597/0x5ee [btrfs]
[68663.765998] [<ffffffffa0384663>] ? btrfs_delete_unused_bgs+0x245/0x296 [btrfs]
[68663.767068] [<ffffffffa0384676>] btrfs_delete_unused_bgs+0x258/0x296 [btrfs]
[68663.768227] [<ffffffff8143527f>] ? _raw_spin_unlock_irq+0x2d/0x4c
[68663.769081] [<ffffffffa038b109>] cleaner_kthread+0x13d/0x16c [btrfs]
[68663.799485] [<ffffffffa038afcc>] ? btrfs_alloc_root+0x28/0x28 [btrfs]
[68663.809208] [<ffffffff8105f367>] kthread+0xef/0xf7
[68663.828795] [<ffffffff810e603f>] ? time_hardirqs_on+0x15/0x28
[68663.844942] [<ffffffff8105f278>] ? __kthread_parkme+0xad/0xad
[68663.846486] [<ffffffff81435a88>] ret_from_fork+0x58/0x90
[68663.847760] [<ffffffff8105f278>] ? __kthread_parkme+0xad/0xad
[68663.849503] ---[ end trace 798477c6d6dbaad6 ]---
[68663.850525] BTRFS: error (device sdc) in btrfs_remove_chunk:2652: errno=-28 No space left
So fix this by verifying that enough space exists in system space_info,
and reserving the space in the chunk block reserve, before attempting to
delete the block group and allocate a new system chunk if we don't have
enough space to perform the necessary updates and delete in the chunk
tree. Like for the block group creation case, we don't error our if we
fail to allocate a new system chunk, since we might end up not needing
it (no node/leaf splits happen during the COW operations and/or we end
up not needing to COW any btree nodes or leafs because they were already
COWed in the current transaction and their writeback didn't start yet).
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-20 21:01:55 +08:00
|
|
|
return num_dev;
|
2012-03-29 21:57:44 +08:00
|
|
|
}
|
|
|
|
|
Btrfs: fix -ENOSPC on block group removal
Unlike when attempting to allocate a new block group, where we check
that we have enough space in the system space_info to update the device
items and insert a new chunk item in the chunk tree, we were not checking
if the system space_info had enough space for updating the device items
and deleting the chunk item in the chunk tree. This often lead to -ENOSPC
error when attempting to allocate blocks for the chunk tree (during btree
node/leaf COW operations) while updating the device items or deleting the
chunk item, which resulted in the current transaction being aborted and
turning the filesystem into read-only mode.
While running fstests generic/038, which stresses allocation of block
groups and removal of unused block groups, with a large scratch device
(750Gb) this happened often, despite more than enough unallocated space,
and resulted in the following trace:
[68663.586604] WARNING: CPU: 3 PID: 1521 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x114 [btrfs]()
[68663.600407] BTRFS: Transaction aborted (error -28)
(...)
[68663.730829] Call Trace:
[68663.732585] [<ffffffff8142fa46>] dump_stack+0x4f/0x7b
[68663.734334] [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad
[68663.739980] [<ffffffff81045ea5>] warn_slowpath_common+0xa1/0xbb
[68663.757153] [<ffffffffa036ca6d>] ? __btrfs_abort_transaction+0x52/0x114 [btrfs]
[68663.760925] [<ffffffff81045f05>] warn_slowpath_fmt+0x46/0x48
[68663.762854] [<ffffffffa03b159d>] ? btrfs_update_device+0x15a/0x16c [btrfs]
[68663.764073] [<ffffffffa036ca6d>] __btrfs_abort_transaction+0x52/0x114 [btrfs]
[68663.765130] [<ffffffffa03b3638>] btrfs_remove_chunk+0x597/0x5ee [btrfs]
[68663.765998] [<ffffffffa0384663>] ? btrfs_delete_unused_bgs+0x245/0x296 [btrfs]
[68663.767068] [<ffffffffa0384676>] btrfs_delete_unused_bgs+0x258/0x296 [btrfs]
[68663.768227] [<ffffffff8143527f>] ? _raw_spin_unlock_irq+0x2d/0x4c
[68663.769081] [<ffffffffa038b109>] cleaner_kthread+0x13d/0x16c [btrfs]
[68663.799485] [<ffffffffa038afcc>] ? btrfs_alloc_root+0x28/0x28 [btrfs]
[68663.809208] [<ffffffff8105f367>] kthread+0xef/0xf7
[68663.828795] [<ffffffff810e603f>] ? time_hardirqs_on+0x15/0x28
[68663.844942] [<ffffffff8105f278>] ? __kthread_parkme+0xad/0xad
[68663.846486] [<ffffffff81435a88>] ret_from_fork+0x58/0x90
[68663.847760] [<ffffffff8105f278>] ? __kthread_parkme+0xad/0xad
[68663.849503] ---[ end trace 798477c6d6dbaad6 ]---
[68663.850525] BTRFS: error (device sdc) in btrfs_remove_chunk:2652: errno=-28 No space left
So fix this by verifying that enough space exists in system space_info,
and reserving the space in the chunk block reserve, before attempting to
delete the block group and allocate a new system chunk if we don't have
enough space to perform the necessary updates and delete in the chunk
tree. Like for the block group creation case, we don't error our if we
fail to allocate a new system chunk, since we might end up not needing
it (no node/leaf splits happen during the COW operations and/or we end
up not needing to COW any btree nodes or leafs because they were already
COWed in the current transaction and their writeback didn't start yet).
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-20 21:01:55 +08:00
|
|
|
/*
|
|
|
|
* If @is_allocation is true, reserve space in the system space info necessary
|
|
|
|
* for allocating a chunk, otherwise if it's false, reserve space necessary for
|
|
|
|
* removing a chunk.
|
|
|
|
*/
|
2018-06-20 20:49:07 +08:00
|
|
|
void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
|
2012-03-29 21:57:44 +08:00
|
|
|
{
|
2018-06-20 20:49:07 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2012-03-29 21:57:44 +08:00
|
|
|
struct btrfs_space_info *info;
|
|
|
|
u64 left;
|
|
|
|
u64 thresh;
|
Btrfs: fix -ENOSPC when finishing block group creation
While creating a block group, we often end up getting ENOSPC while updating
the chunk tree, which leads to a transaction abortion that produces a trace
like the following:
[30670.116368] WARNING: CPU: 4 PID: 20735 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x106 [btrfs]()
[30670.117777] BTRFS: Transaction aborted (error -28)
(...)
[30670.163567] Call Trace:
[30670.163906] [<ffffffff8142fa46>] dump_stack+0x4f/0x7b
[30670.164522] [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad
[30670.165171] [<ffffffff81045ea5>] warn_slowpath_common+0xa1/0xbb
[30670.166323] [<ffffffffa035daa7>] ? __btrfs_abort_transaction+0x52/0x106 [btrfs]
[30670.167213] [<ffffffff81045f05>] warn_slowpath_fmt+0x46/0x48
[30670.167862] [<ffffffffa035daa7>] __btrfs_abort_transaction+0x52/0x106 [btrfs]
[30670.169116] [<ffffffffa03743d7>] btrfs_create_pending_block_groups+0x101/0x130 [btrfs]
[30670.170593] [<ffffffffa038426a>] __btrfs_end_transaction+0x84/0x366 [btrfs]
[30670.171960] [<ffffffffa038455c>] btrfs_end_transaction+0x10/0x12 [btrfs]
[30670.174649] [<ffffffffa036eb6b>] btrfs_check_data_free_space+0x11f/0x27c [btrfs]
[30670.176092] [<ffffffffa039450d>] btrfs_fallocate+0x7c8/0xb96 [btrfs]
[30670.177218] [<ffffffff812459f2>] ? __this_cpu_preempt_check+0x13/0x15
[30670.178622] [<ffffffff81152447>] vfs_fallocate+0x14c/0x1de
[30670.179642] [<ffffffff8116b915>] ? __fget_light+0x2d/0x4f
[30670.180692] [<ffffffff81152863>] SyS_fallocate+0x47/0x62
[30670.186737] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[30670.187792] ---[ end trace 0373e6b491c4a8cc ]---
This is because we don't do proper space reservation for the chunk block
reserve when we have multiple tasks allocating chunks in parallel.
So block group creation has 2 phases, and the first phase essentially
checks if there is enough space in the system space_info, allocating a
new system chunk if there isn't, while the second phase updates the
device, extent and chunk trees. However, because the updates to the
chunk tree happen in the second phase, if we have N tasks, each with
its own transaction handle, allocating new chunks in parallel and if
there is only enough space in the system space_info to allocate M chunks,
where M < N, none of the tasks ends up allocating a new system chunk in
the first phase and N - M tasks will get -ENOSPC when attempting to
update the chunk tree in phase 2 if they need to COW any nodes/leafs
from the chunk tree.
Fix this by doing proper reservation in the chunk block reserve.
The issue could be reproduced by running fstests generic/038 in a loop,
which eventually triggered the problem.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-20 21:01:54 +08:00
|
|
|
int ret = 0;
|
Btrfs: fix -ENOSPC on block group removal
Unlike when attempting to allocate a new block group, where we check
that we have enough space in the system space_info to update the device
items and insert a new chunk item in the chunk tree, we were not checking
if the system space_info had enough space for updating the device items
and deleting the chunk item in the chunk tree. This often lead to -ENOSPC
error when attempting to allocate blocks for the chunk tree (during btree
node/leaf COW operations) while updating the device items or deleting the
chunk item, which resulted in the current transaction being aborted and
turning the filesystem into read-only mode.
While running fstests generic/038, which stresses allocation of block
groups and removal of unused block groups, with a large scratch device
(750Gb) this happened often, despite more than enough unallocated space,
and resulted in the following trace:
[68663.586604] WARNING: CPU: 3 PID: 1521 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x114 [btrfs]()
[68663.600407] BTRFS: Transaction aborted (error -28)
(...)
[68663.730829] Call Trace:
[68663.732585] [<ffffffff8142fa46>] dump_stack+0x4f/0x7b
[68663.734334] [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad
[68663.739980] [<ffffffff81045ea5>] warn_slowpath_common+0xa1/0xbb
[68663.757153] [<ffffffffa036ca6d>] ? __btrfs_abort_transaction+0x52/0x114 [btrfs]
[68663.760925] [<ffffffff81045f05>] warn_slowpath_fmt+0x46/0x48
[68663.762854] [<ffffffffa03b159d>] ? btrfs_update_device+0x15a/0x16c [btrfs]
[68663.764073] [<ffffffffa036ca6d>] __btrfs_abort_transaction+0x52/0x114 [btrfs]
[68663.765130] [<ffffffffa03b3638>] btrfs_remove_chunk+0x597/0x5ee [btrfs]
[68663.765998] [<ffffffffa0384663>] ? btrfs_delete_unused_bgs+0x245/0x296 [btrfs]
[68663.767068] [<ffffffffa0384676>] btrfs_delete_unused_bgs+0x258/0x296 [btrfs]
[68663.768227] [<ffffffff8143527f>] ? _raw_spin_unlock_irq+0x2d/0x4c
[68663.769081] [<ffffffffa038b109>] cleaner_kthread+0x13d/0x16c [btrfs]
[68663.799485] [<ffffffffa038afcc>] ? btrfs_alloc_root+0x28/0x28 [btrfs]
[68663.809208] [<ffffffff8105f367>] kthread+0xef/0xf7
[68663.828795] [<ffffffff810e603f>] ? time_hardirqs_on+0x15/0x28
[68663.844942] [<ffffffff8105f278>] ? __kthread_parkme+0xad/0xad
[68663.846486] [<ffffffff81435a88>] ret_from_fork+0x58/0x90
[68663.847760] [<ffffffff8105f278>] ? __kthread_parkme+0xad/0xad
[68663.849503] ---[ end trace 798477c6d6dbaad6 ]---
[68663.850525] BTRFS: error (device sdc) in btrfs_remove_chunk:2652: errno=-28 No space left
So fix this by verifying that enough space exists in system space_info,
and reserving the space in the chunk block reserve, before attempting to
delete the block group and allocate a new system chunk if we don't have
enough space to perform the necessary updates and delete in the chunk
tree. Like for the block group creation case, we don't error our if we
fail to allocate a new system chunk, since we might end up not needing
it (no node/leaf splits happen during the COW operations and/or we end
up not needing to COW any btree nodes or leafs because they were already
COWed in the current transaction and their writeback didn't start yet).
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-20 21:01:55 +08:00
|
|
|
u64 num_devs;
|
Btrfs: fix -ENOSPC when finishing block group creation
While creating a block group, we often end up getting ENOSPC while updating
the chunk tree, which leads to a transaction abortion that produces a trace
like the following:
[30670.116368] WARNING: CPU: 4 PID: 20735 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x106 [btrfs]()
[30670.117777] BTRFS: Transaction aborted (error -28)
(...)
[30670.163567] Call Trace:
[30670.163906] [<ffffffff8142fa46>] dump_stack+0x4f/0x7b
[30670.164522] [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad
[30670.165171] [<ffffffff81045ea5>] warn_slowpath_common+0xa1/0xbb
[30670.166323] [<ffffffffa035daa7>] ? __btrfs_abort_transaction+0x52/0x106 [btrfs]
[30670.167213] [<ffffffff81045f05>] warn_slowpath_fmt+0x46/0x48
[30670.167862] [<ffffffffa035daa7>] __btrfs_abort_transaction+0x52/0x106 [btrfs]
[30670.169116] [<ffffffffa03743d7>] btrfs_create_pending_block_groups+0x101/0x130 [btrfs]
[30670.170593] [<ffffffffa038426a>] __btrfs_end_transaction+0x84/0x366 [btrfs]
[30670.171960] [<ffffffffa038455c>] btrfs_end_transaction+0x10/0x12 [btrfs]
[30670.174649] [<ffffffffa036eb6b>] btrfs_check_data_free_space+0x11f/0x27c [btrfs]
[30670.176092] [<ffffffffa039450d>] btrfs_fallocate+0x7c8/0xb96 [btrfs]
[30670.177218] [<ffffffff812459f2>] ? __this_cpu_preempt_check+0x13/0x15
[30670.178622] [<ffffffff81152447>] vfs_fallocate+0x14c/0x1de
[30670.179642] [<ffffffff8116b915>] ? __fget_light+0x2d/0x4f
[30670.180692] [<ffffffff81152863>] SyS_fallocate+0x47/0x62
[30670.186737] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[30670.187792] ---[ end trace 0373e6b491c4a8cc ]---
This is because we don't do proper space reservation for the chunk block
reserve when we have multiple tasks allocating chunks in parallel.
So block group creation has 2 phases, and the first phase essentially
checks if there is enough space in the system space_info, allocating a
new system chunk if there isn't, while the second phase updates the
device, extent and chunk trees. However, because the updates to the
chunk tree happen in the second phase, if we have N tasks, each with
its own transaction handle, allocating new chunks in parallel and if
there is only enough space in the system space_info to allocate M chunks,
where M < N, none of the tasks ends up allocating a new system chunk in
the first phase and N - M tasks will get -ENOSPC when attempting to
update the chunk tree in phase 2 if they need to COW any nodes/leafs
from the chunk tree.
Fix this by doing proper reservation in the chunk block reserve.
The issue could be reproduced by running fstests generic/038 in a loop,
which eventually triggered the problem.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-20 21:01:54 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Needed because we can end up allocating a system chunk and for an
|
|
|
|
* atomic and race free space reservation in the chunk block reserve.
|
|
|
|
*/
|
2018-03-16 09:21:22 +08:00
|
|
|
lockdep_assert_held(&fs_info->chunk_mutex);
|
2012-03-29 21:57:44 +08:00
|
|
|
|
2019-06-19 04:09:19 +08:00
|
|
|
info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
|
2012-03-29 21:57:44 +08:00
|
|
|
spin_lock(&info->lock);
|
2017-02-14 07:42:21 +08:00
|
|
|
left = info->total_bytes - btrfs_space_info_used(info, true);
|
2012-03-29 21:57:44 +08:00
|
|
|
spin_unlock(&info->lock);
|
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
num_devs = get_profile_num_devs(fs_info, type);
|
Btrfs: fix -ENOSPC on block group removal
Unlike when attempting to allocate a new block group, where we check
that we have enough space in the system space_info to update the device
items and insert a new chunk item in the chunk tree, we were not checking
if the system space_info had enough space for updating the device items
and deleting the chunk item in the chunk tree. This often lead to -ENOSPC
error when attempting to allocate blocks for the chunk tree (during btree
node/leaf COW operations) while updating the device items or deleting the
chunk item, which resulted in the current transaction being aborted and
turning the filesystem into read-only mode.
While running fstests generic/038, which stresses allocation of block
groups and removal of unused block groups, with a large scratch device
(750Gb) this happened often, despite more than enough unallocated space,
and resulted in the following trace:
[68663.586604] WARNING: CPU: 3 PID: 1521 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x114 [btrfs]()
[68663.600407] BTRFS: Transaction aborted (error -28)
(...)
[68663.730829] Call Trace:
[68663.732585] [<ffffffff8142fa46>] dump_stack+0x4f/0x7b
[68663.734334] [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad
[68663.739980] [<ffffffff81045ea5>] warn_slowpath_common+0xa1/0xbb
[68663.757153] [<ffffffffa036ca6d>] ? __btrfs_abort_transaction+0x52/0x114 [btrfs]
[68663.760925] [<ffffffff81045f05>] warn_slowpath_fmt+0x46/0x48
[68663.762854] [<ffffffffa03b159d>] ? btrfs_update_device+0x15a/0x16c [btrfs]
[68663.764073] [<ffffffffa036ca6d>] __btrfs_abort_transaction+0x52/0x114 [btrfs]
[68663.765130] [<ffffffffa03b3638>] btrfs_remove_chunk+0x597/0x5ee [btrfs]
[68663.765998] [<ffffffffa0384663>] ? btrfs_delete_unused_bgs+0x245/0x296 [btrfs]
[68663.767068] [<ffffffffa0384676>] btrfs_delete_unused_bgs+0x258/0x296 [btrfs]
[68663.768227] [<ffffffff8143527f>] ? _raw_spin_unlock_irq+0x2d/0x4c
[68663.769081] [<ffffffffa038b109>] cleaner_kthread+0x13d/0x16c [btrfs]
[68663.799485] [<ffffffffa038afcc>] ? btrfs_alloc_root+0x28/0x28 [btrfs]
[68663.809208] [<ffffffff8105f367>] kthread+0xef/0xf7
[68663.828795] [<ffffffff810e603f>] ? time_hardirqs_on+0x15/0x28
[68663.844942] [<ffffffff8105f278>] ? __kthread_parkme+0xad/0xad
[68663.846486] [<ffffffff81435a88>] ret_from_fork+0x58/0x90
[68663.847760] [<ffffffff8105f278>] ? __kthread_parkme+0xad/0xad
[68663.849503] ---[ end trace 798477c6d6dbaad6 ]---
[68663.850525] BTRFS: error (device sdc) in btrfs_remove_chunk:2652: errno=-28 No space left
So fix this by verifying that enough space exists in system space_info,
and reserving the space in the chunk block reserve, before attempting to
delete the block group and allocate a new system chunk if we don't have
enough space to perform the necessary updates and delete in the chunk
tree. Like for the block group creation case, we don't error our if we
fail to allocate a new system chunk, since we might end up not needing
it (no node/leaf splits happen during the COW operations and/or we end
up not needing to COW any btree nodes or leafs because they were already
COWed in the current transaction and their writeback didn't start yet).
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-20 21:01:55 +08:00
|
|
|
|
|
|
|
/* num_devs device items to update and 1 chunk item to add or remove */
|
2016-06-23 06:54:23 +08:00
|
|
|
thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
|
|
|
|
btrfs_calc_trans_metadata_size(fs_info, 1);
|
Btrfs: fix -ENOSPC on block group removal
Unlike when attempting to allocate a new block group, where we check
that we have enough space in the system space_info to update the device
items and insert a new chunk item in the chunk tree, we were not checking
if the system space_info had enough space for updating the device items
and deleting the chunk item in the chunk tree. This often lead to -ENOSPC
error when attempting to allocate blocks for the chunk tree (during btree
node/leaf COW operations) while updating the device items or deleting the
chunk item, which resulted in the current transaction being aborted and
turning the filesystem into read-only mode.
While running fstests generic/038, which stresses allocation of block
groups and removal of unused block groups, with a large scratch device
(750Gb) this happened often, despite more than enough unallocated space,
and resulted in the following trace:
[68663.586604] WARNING: CPU: 3 PID: 1521 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x114 [btrfs]()
[68663.600407] BTRFS: Transaction aborted (error -28)
(...)
[68663.730829] Call Trace:
[68663.732585] [<ffffffff8142fa46>] dump_stack+0x4f/0x7b
[68663.734334] [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad
[68663.739980] [<ffffffff81045ea5>] warn_slowpath_common+0xa1/0xbb
[68663.757153] [<ffffffffa036ca6d>] ? __btrfs_abort_transaction+0x52/0x114 [btrfs]
[68663.760925] [<ffffffff81045f05>] warn_slowpath_fmt+0x46/0x48
[68663.762854] [<ffffffffa03b159d>] ? btrfs_update_device+0x15a/0x16c [btrfs]
[68663.764073] [<ffffffffa036ca6d>] __btrfs_abort_transaction+0x52/0x114 [btrfs]
[68663.765130] [<ffffffffa03b3638>] btrfs_remove_chunk+0x597/0x5ee [btrfs]
[68663.765998] [<ffffffffa0384663>] ? btrfs_delete_unused_bgs+0x245/0x296 [btrfs]
[68663.767068] [<ffffffffa0384676>] btrfs_delete_unused_bgs+0x258/0x296 [btrfs]
[68663.768227] [<ffffffff8143527f>] ? _raw_spin_unlock_irq+0x2d/0x4c
[68663.769081] [<ffffffffa038b109>] cleaner_kthread+0x13d/0x16c [btrfs]
[68663.799485] [<ffffffffa038afcc>] ? btrfs_alloc_root+0x28/0x28 [btrfs]
[68663.809208] [<ffffffff8105f367>] kthread+0xef/0xf7
[68663.828795] [<ffffffff810e603f>] ? time_hardirqs_on+0x15/0x28
[68663.844942] [<ffffffff8105f278>] ? __kthread_parkme+0xad/0xad
[68663.846486] [<ffffffff81435a88>] ret_from_fork+0x58/0x90
[68663.847760] [<ffffffff8105f278>] ? __kthread_parkme+0xad/0xad
[68663.849503] ---[ end trace 798477c6d6dbaad6 ]---
[68663.850525] BTRFS: error (device sdc) in btrfs_remove_chunk:2652: errno=-28 No space left
So fix this by verifying that enough space exists in system space_info,
and reserving the space in the chunk block reserve, before attempting to
delete the block group and allocate a new system chunk if we don't have
enough space to perform the necessary updates and delete in the chunk
tree. Like for the block group creation case, we don't error our if we
fail to allocate a new system chunk, since we might end up not needing
it (no node/leaf splits happen during the COW operations and/or we end
up not needing to COW any btree nodes or leafs because they were already
COWed in the current transaction and their writeback didn't start yet).
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-20 21:01:55 +08:00
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
|
|
|
|
btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
|
|
|
|
left, thresh, type);
|
2019-06-19 04:09:24 +08:00
|
|
|
btrfs_dump_space_info(fs_info, info, 0, 0);
|
2012-03-29 21:57:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (left < thresh) {
|
2017-05-17 23:38:35 +08:00
|
|
|
u64 flags = btrfs_system_alloc_profile(fs_info);
|
2012-03-29 21:57:44 +08:00
|
|
|
|
Btrfs: fix -ENOSPC when finishing block group creation
While creating a block group, we often end up getting ENOSPC while updating
the chunk tree, which leads to a transaction abortion that produces a trace
like the following:
[30670.116368] WARNING: CPU: 4 PID: 20735 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x106 [btrfs]()
[30670.117777] BTRFS: Transaction aborted (error -28)
(...)
[30670.163567] Call Trace:
[30670.163906] [<ffffffff8142fa46>] dump_stack+0x4f/0x7b
[30670.164522] [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad
[30670.165171] [<ffffffff81045ea5>] warn_slowpath_common+0xa1/0xbb
[30670.166323] [<ffffffffa035daa7>] ? __btrfs_abort_transaction+0x52/0x106 [btrfs]
[30670.167213] [<ffffffff81045f05>] warn_slowpath_fmt+0x46/0x48
[30670.167862] [<ffffffffa035daa7>] __btrfs_abort_transaction+0x52/0x106 [btrfs]
[30670.169116] [<ffffffffa03743d7>] btrfs_create_pending_block_groups+0x101/0x130 [btrfs]
[30670.170593] [<ffffffffa038426a>] __btrfs_end_transaction+0x84/0x366 [btrfs]
[30670.171960] [<ffffffffa038455c>] btrfs_end_transaction+0x10/0x12 [btrfs]
[30670.174649] [<ffffffffa036eb6b>] btrfs_check_data_free_space+0x11f/0x27c [btrfs]
[30670.176092] [<ffffffffa039450d>] btrfs_fallocate+0x7c8/0xb96 [btrfs]
[30670.177218] [<ffffffff812459f2>] ? __this_cpu_preempt_check+0x13/0x15
[30670.178622] [<ffffffff81152447>] vfs_fallocate+0x14c/0x1de
[30670.179642] [<ffffffff8116b915>] ? __fget_light+0x2d/0x4f
[30670.180692] [<ffffffff81152863>] SyS_fallocate+0x47/0x62
[30670.186737] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[30670.187792] ---[ end trace 0373e6b491c4a8cc ]---
This is because we don't do proper space reservation for the chunk block
reserve when we have multiple tasks allocating chunks in parallel.
So block group creation has 2 phases, and the first phase essentially
checks if there is enough space in the system space_info, allocating a
new system chunk if there isn't, while the second phase updates the
device, extent and chunk trees. However, because the updates to the
chunk tree happen in the second phase, if we have N tasks, each with
its own transaction handle, allocating new chunks in parallel and if
there is only enough space in the system space_info to allocate M chunks,
where M < N, none of the tasks ends up allocating a new system chunk in
the first phase and N - M tasks will get -ENOSPC when attempting to
update the chunk tree in phase 2 if they need to COW any nodes/leafs
from the chunk tree.
Fix this by doing proper reservation in the chunk block reserve.
The issue could be reproduced by running fstests generic/038 in a loop,
which eventually triggered the problem.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-20 21:01:54 +08:00
|
|
|
/*
|
|
|
|
* Ignore failure to create system chunk. We might end up not
|
|
|
|
* needing it, as we might not need to COW all nodes/leafs from
|
|
|
|
* the paths we visit in the chunk tree (they were already COWed
|
|
|
|
* or created in the current transaction for example).
|
|
|
|
*/
|
2018-06-20 20:49:06 +08:00
|
|
|
ret = btrfs_alloc_chunk(trans, flags);
|
Btrfs: fix -ENOSPC when finishing block group creation
While creating a block group, we often end up getting ENOSPC while updating
the chunk tree, which leads to a transaction abortion that produces a trace
like the following:
[30670.116368] WARNING: CPU: 4 PID: 20735 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x106 [btrfs]()
[30670.117777] BTRFS: Transaction aborted (error -28)
(...)
[30670.163567] Call Trace:
[30670.163906] [<ffffffff8142fa46>] dump_stack+0x4f/0x7b
[30670.164522] [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad
[30670.165171] [<ffffffff81045ea5>] warn_slowpath_common+0xa1/0xbb
[30670.166323] [<ffffffffa035daa7>] ? __btrfs_abort_transaction+0x52/0x106 [btrfs]
[30670.167213] [<ffffffff81045f05>] warn_slowpath_fmt+0x46/0x48
[30670.167862] [<ffffffffa035daa7>] __btrfs_abort_transaction+0x52/0x106 [btrfs]
[30670.169116] [<ffffffffa03743d7>] btrfs_create_pending_block_groups+0x101/0x130 [btrfs]
[30670.170593] [<ffffffffa038426a>] __btrfs_end_transaction+0x84/0x366 [btrfs]
[30670.171960] [<ffffffffa038455c>] btrfs_end_transaction+0x10/0x12 [btrfs]
[30670.174649] [<ffffffffa036eb6b>] btrfs_check_data_free_space+0x11f/0x27c [btrfs]
[30670.176092] [<ffffffffa039450d>] btrfs_fallocate+0x7c8/0xb96 [btrfs]
[30670.177218] [<ffffffff812459f2>] ? __this_cpu_preempt_check+0x13/0x15
[30670.178622] [<ffffffff81152447>] vfs_fallocate+0x14c/0x1de
[30670.179642] [<ffffffff8116b915>] ? __fget_light+0x2d/0x4f
[30670.180692] [<ffffffff81152863>] SyS_fallocate+0x47/0x62
[30670.186737] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[30670.187792] ---[ end trace 0373e6b491c4a8cc ]---
This is because we don't do proper space reservation for the chunk block
reserve when we have multiple tasks allocating chunks in parallel.
So block group creation has 2 phases, and the first phase essentially
checks if there is enough space in the system space_info, allocating a
new system chunk if there isn't, while the second phase updates the
device, extent and chunk trees. However, because the updates to the
chunk tree happen in the second phase, if we have N tasks, each with
its own transaction handle, allocating new chunks in parallel and if
there is only enough space in the system space_info to allocate M chunks,
where M < N, none of the tasks ends up allocating a new system chunk in
the first phase and N - M tasks will get -ENOSPC when attempting to
update the chunk tree in phase 2 if they need to COW any nodes/leafs
from the chunk tree.
Fix this by doing proper reservation in the chunk block reserve.
The issue could be reproduced by running fstests generic/038 in a loop,
which eventually triggered the problem.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-20 21:01:54 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!ret) {
|
2016-06-23 06:54:23 +08:00
|
|
|
ret = btrfs_block_rsv_add(fs_info->chunk_root,
|
|
|
|
&fs_info->chunk_block_rsv,
|
Btrfs: fix -ENOSPC when finishing block group creation
While creating a block group, we often end up getting ENOSPC while updating
the chunk tree, which leads to a transaction abortion that produces a trace
like the following:
[30670.116368] WARNING: CPU: 4 PID: 20735 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x106 [btrfs]()
[30670.117777] BTRFS: Transaction aborted (error -28)
(...)
[30670.163567] Call Trace:
[30670.163906] [<ffffffff8142fa46>] dump_stack+0x4f/0x7b
[30670.164522] [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad
[30670.165171] [<ffffffff81045ea5>] warn_slowpath_common+0xa1/0xbb
[30670.166323] [<ffffffffa035daa7>] ? __btrfs_abort_transaction+0x52/0x106 [btrfs]
[30670.167213] [<ffffffff81045f05>] warn_slowpath_fmt+0x46/0x48
[30670.167862] [<ffffffffa035daa7>] __btrfs_abort_transaction+0x52/0x106 [btrfs]
[30670.169116] [<ffffffffa03743d7>] btrfs_create_pending_block_groups+0x101/0x130 [btrfs]
[30670.170593] [<ffffffffa038426a>] __btrfs_end_transaction+0x84/0x366 [btrfs]
[30670.171960] [<ffffffffa038455c>] btrfs_end_transaction+0x10/0x12 [btrfs]
[30670.174649] [<ffffffffa036eb6b>] btrfs_check_data_free_space+0x11f/0x27c [btrfs]
[30670.176092] [<ffffffffa039450d>] btrfs_fallocate+0x7c8/0xb96 [btrfs]
[30670.177218] [<ffffffff812459f2>] ? __this_cpu_preempt_check+0x13/0x15
[30670.178622] [<ffffffff81152447>] vfs_fallocate+0x14c/0x1de
[30670.179642] [<ffffffff8116b915>] ? __fget_light+0x2d/0x4f
[30670.180692] [<ffffffff81152863>] SyS_fallocate+0x47/0x62
[30670.186737] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[30670.187792] ---[ end trace 0373e6b491c4a8cc ]---
This is because we don't do proper space reservation for the chunk block
reserve when we have multiple tasks allocating chunks in parallel.
So block group creation has 2 phases, and the first phase essentially
checks if there is enough space in the system space_info, allocating a
new system chunk if there isn't, while the second phase updates the
device, extent and chunk trees. However, because the updates to the
chunk tree happen in the second phase, if we have N tasks, each with
its own transaction handle, allocating new chunks in parallel and if
there is only enough space in the system space_info to allocate M chunks,
where M < N, none of the tasks ends up allocating a new system chunk in
the first phase and N - M tasks will get -ENOSPC when attempting to
update the chunk tree in phase 2 if they need to COW any nodes/leafs
from the chunk tree.
Fix this by doing proper reservation in the chunk block reserve.
The issue could be reproduced by running fstests generic/038 in a loop,
which eventually triggered the problem.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-20 21:01:54 +08:00
|
|
|
thresh, BTRFS_RESERVE_NO_FLUSH);
|
|
|
|
if (!ret)
|
|
|
|
trans->chunk_bytes_reserved += thresh;
|
2012-03-29 21:57:44 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-07-30 02:09:50 +08:00
|
|
|
/*
|
|
|
|
* If force is CHUNK_ALLOC_FORCE:
|
|
|
|
* - return 1 if it successfully allocates a chunk,
|
|
|
|
* - return errors including -ENOSPC otherwise.
|
|
|
|
* If force is NOT CHUNK_ALLOC_FORCE:
|
|
|
|
* - return 0 if it doesn't need to allocate a new chunk,
|
|
|
|
* - return 1 if it successfully allocates a chunk,
|
|
|
|
* - return errors including -ENOSPC otherwise.
|
|
|
|
*/
|
2019-06-19 04:09:17 +08:00
|
|
|
int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
|
|
|
|
enum btrfs_chunk_alloc_enum force)
|
2009-09-12 04:12:44 +08:00
|
|
|
{
|
2018-06-20 20:49:05 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2008-03-25 03:01:59 +08:00
|
|
|
struct btrfs_space_info *space_info;
|
2018-04-18 15:27:57 +08:00
|
|
|
bool wait_for_alloc = false;
|
|
|
|
bool should_alloc = false;
|
2009-09-12 04:12:44 +08:00
|
|
|
int ret = 0;
|
|
|
|
|
2012-12-18 22:16:16 +08:00
|
|
|
/* Don't re-enter if we're already allocating a chunk */
|
|
|
|
if (trans->allocating_chunk)
|
|
|
|
return -ENOSPC;
|
|
|
|
|
2019-06-19 04:09:19 +08:00
|
|
|
space_info = btrfs_find_space_info(fs_info, flags);
|
2018-03-21 03:25:25 +08:00
|
|
|
ASSERT(space_info);
|
2009-09-12 04:12:44 +08:00
|
|
|
|
2018-04-18 15:27:57 +08:00
|
|
|
do {
|
|
|
|
spin_lock(&space_info->lock);
|
|
|
|
if (force < space_info->force_alloc)
|
|
|
|
force = space_info->force_alloc;
|
|
|
|
should_alloc = should_alloc_chunk(fs_info, space_info, force);
|
|
|
|
if (space_info->full) {
|
|
|
|
/* No more free physical space */
|
|
|
|
if (should_alloc)
|
|
|
|
ret = -ENOSPC;
|
|
|
|
else
|
|
|
|
ret = 0;
|
|
|
|
spin_unlock(&space_info->lock);
|
|
|
|
return ret;
|
|
|
|
} else if (!should_alloc) {
|
|
|
|
spin_unlock(&space_info->lock);
|
|
|
|
return 0;
|
|
|
|
} else if (space_info->chunk_alloc) {
|
|
|
|
/*
|
|
|
|
* Someone is already allocating, so we need to block
|
|
|
|
* until this someone is finished and then loop to
|
|
|
|
* recheck if we should continue with our allocation
|
|
|
|
* attempt.
|
|
|
|
*/
|
|
|
|
wait_for_alloc = true;
|
|
|
|
spin_unlock(&space_info->lock);
|
|
|
|
mutex_lock(&fs_info->chunk_mutex);
|
|
|
|
mutex_unlock(&fs_info->chunk_mutex);
|
|
|
|
} else {
|
|
|
|
/* Proceed with allocation */
|
|
|
|
space_info->chunk_alloc = 1;
|
|
|
|
wait_for_alloc = false;
|
|
|
|
spin_unlock(&space_info->lock);
|
|
|
|
}
|
2011-04-12 08:20:11 +08:00
|
|
|
|
2018-04-05 15:40:15 +08:00
|
|
|
cond_resched();
|
2018-04-18 15:27:57 +08:00
|
|
|
} while (wait_for_alloc);
|
2011-04-12 08:20:11 +08:00
|
|
|
|
2018-04-18 15:27:57 +08:00
|
|
|
mutex_lock(&fs_info->chunk_mutex);
|
2012-12-18 22:16:16 +08:00
|
|
|
trans->allocating_chunk = true;
|
|
|
|
|
2010-09-17 04:19:09 +08:00
|
|
|
/*
|
|
|
|
* If we have mixed data/metadata chunks we want to make sure we keep
|
|
|
|
* allocating mixed chunks instead of individual chunks.
|
|
|
|
*/
|
|
|
|
if (btrfs_mixed_space_info(space_info))
|
|
|
|
flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
|
|
|
|
|
2009-04-22 05:40:57 +08:00
|
|
|
/*
|
|
|
|
* if we're doing a data chunk, go ahead and make sure that
|
|
|
|
* we keep a reasonable number of metadata chunks allocated in the
|
|
|
|
* FS as well.
|
|
|
|
*/
|
2009-09-12 04:12:44 +08:00
|
|
|
if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
|
2009-04-22 05:40:57 +08:00
|
|
|
fs_info->data_chunk_allocations++;
|
|
|
|
if (!(fs_info->data_chunk_allocations %
|
|
|
|
fs_info->metadata_ratio))
|
|
|
|
force_metadata_allocation(fs_info);
|
2009-09-12 04:12:44 +08:00
|
|
|
}
|
|
|
|
|
2012-03-29 21:57:44 +08:00
|
|
|
/*
|
|
|
|
* Check if we have enough space in SYSTEM chunk because we may need
|
|
|
|
* to update devices.
|
|
|
|
*/
|
2018-06-20 20:49:07 +08:00
|
|
|
check_system_chunk(trans, flags);
|
2012-03-29 21:57:44 +08:00
|
|
|
|
2018-06-20 20:49:06 +08:00
|
|
|
ret = btrfs_alloc_chunk(trans, flags);
|
2012-12-18 22:16:16 +08:00
|
|
|
trans->allocating_chunk = false;
|
2011-07-13 01:57:59 +08:00
|
|
|
|
2009-09-12 04:12:44 +08:00
|
|
|
spin_lock(&space_info->lock);
|
2018-04-11 16:21:19 +08:00
|
|
|
if (ret < 0) {
|
|
|
|
if (ret == -ENOSPC)
|
|
|
|
space_info->full = 1;
|
|
|
|
else
|
|
|
|
goto out;
|
|
|
|
} else {
|
2010-05-16 22:46:25 +08:00
|
|
|
ret = 1;
|
2018-10-12 03:54:03 +08:00
|
|
|
space_info->max_extent_size = 0;
|
2018-04-11 16:21:19 +08:00
|
|
|
}
|
2011-04-12 08:20:11 +08:00
|
|
|
|
2011-04-16 04:05:44 +08:00
|
|
|
space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
|
clear chunk_alloc flag on retryable failure
I've experienced filesystem freezes with permanent spikes in the active
process count for quite a while, particularly on filesystems whose
available raw space has already been fully allocated to chunks.
While looking into this, I found a pretty obvious error in
do_chunk_alloc: it sets space_info->chunk_alloc, but if
btrfs_alloc_chunk returns an error other than ENOSPC, it returns leaving
that flag set, which causes any other threads waiting for
space_info->chunk_alloc to become zero to spin indefinitely.
I haven't double-checked that this patch fixes the failure I've observed
fully (it's not exactly trivial to trigger), but it surely is a bug and
the fix is trivial, so... Please put it in :-)
What I saw in that function also happens to explain why in some cases I
see filesystems allocate a huge number of chunks that remain unused
(leading to the scenario above, of not having more chunks to allocate).
It happens for data and metadata, but not necessarily both. I'm
guessing some thread sets the force_alloc flag on the corresponding
space_info, and then several threads trying to get disk space end up
attempting to allocate a new chunk concurrently. All of them will see
the force_alloc flag and bump their local copy of force up to the level
they see first, and they won't clear it even if another thread succeeds
in allocating a chunk, thus clearing the force flag. Then each thread
that observed the force flag will, on its turn, force the allocation of
a new chunk. And any threads that come in while it does that will see
the force flag still set and pick it up, and so on. This sounds like a
problem to me, but... what should the correct behavior be? Clear
force_flag once we copy it to a local force? Reset force to the
incoming value on every loop? Set the flag to our incoming force if we
have it at first, clear our local flag, and move it from the space_info
when we determined that we are the thread that's going to perform the
allocation?
btrfs: clear chunk_alloc flag on retryable failure
From: Alexandre Oliva <oliva@gnu.org>
If btrfs_alloc_chunk fails with e.g. ENOMEM, we exit do_chunk_alloc
without clearing chunk_alloc in space_info. As a result, any further
calls to do_chunk_alloc on that filesystem will start busy-waiting for
chunk_alloc to be cleared, but it never will be. This patch adjusts
do_chunk_alloc so that it clears this flag in case of an error.
Signed-off-by: Alexandre Oliva <oliva@gnu.org>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-02-22 05:15:14 +08:00
|
|
|
out:
|
2011-04-12 08:20:11 +08:00
|
|
|
space_info->chunk_alloc = 0;
|
2009-09-12 04:12:44 +08:00
|
|
|
spin_unlock(&space_info->lock);
|
2012-04-18 14:59:29 +08:00
|
|
|
mutex_unlock(&fs_info->chunk_mutex);
|
2015-07-20 21:56:20 +08:00
|
|
|
/*
|
|
|
|
* When we allocate a new chunk we reserve space in the chunk block
|
|
|
|
* reserve to make sure we can COW nodes/leafs in the chunk tree or
|
|
|
|
* add new nodes/leafs to it if we end up needing to do it when
|
|
|
|
* inserting the chunk item and updating device items as part of the
|
|
|
|
* second phase of chunk allocation, performed by
|
|
|
|
* btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
|
|
|
|
* large number of new block groups to create in our transaction
|
|
|
|
* handle's new_bgs list to avoid exhausting the chunk block reserve
|
|
|
|
* in extreme cases - like having a single transaction create many new
|
|
|
|
* block groups when starting to write out the free space caches of all
|
|
|
|
* the block groups that were made dirty during the lifetime of the
|
|
|
|
* transaction.
|
|
|
|
*/
|
2018-10-12 17:03:55 +08:00
|
|
|
if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
|
2018-02-07 23:55:40 +08:00
|
|
|
btrfs_create_pending_block_groups(trans);
|
2018-10-12 17:03:55 +08:00
|
|
|
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
return ret;
|
2008-03-25 03:01:59 +08:00
|
|
|
}
|
2009-09-12 04:12:44 +08:00
|
|
|
|
2014-11-18 04:45:48 +08:00
|
|
|
static int update_block_group(struct btrfs_trans_handle *trans,
|
2019-03-20 19:10:15 +08:00
|
|
|
u64 bytenr, u64 num_bytes, int alloc)
|
2007-04-27 04:46:15 +08:00
|
|
|
{
|
2019-03-20 19:10:15 +08:00
|
|
|
struct btrfs_fs_info *info = trans->fs_info;
|
2010-06-22 02:48:16 +08:00
|
|
|
struct btrfs_block_group_cache *cache = NULL;
|
2007-10-16 04:15:53 +08:00
|
|
|
u64 total = num_bytes;
|
2007-04-27 04:46:15 +08:00
|
|
|
u64 old_val;
|
2007-10-16 04:15:53 +08:00
|
|
|
u64 byte_in_group;
|
2010-06-22 02:48:16 +08:00
|
|
|
int factor;
|
btrfs: introduce delayed_refs_rsv
Traditionally we've had voodoo in btrfs to account for the space that
delayed refs may take up by having a global_block_rsv. This works most
of the time, except when it doesn't. We've had issues reported and seen
in production where sometimes the global reserve is exhausted during
transaction commit before we can run all of our delayed refs, resulting
in an aborted transaction. Because of this voodoo we have equally
dubious flushing semantics around throttling delayed refs which we often
get wrong.
So instead give them their own block_rsv. This way we can always know
exactly how much outstanding space we need for delayed refs. This
allows us to make sure we are constantly filling that reservation up
with space, and allows us to put more precise pressure on the enospc
system. Instead of doing math to see if its a good time to throttle,
the normal enospc code will be invoked if we have a lot of delayed refs
pending, and they will be run via the normal flushing mechanism.
For now the delayed_refs_rsv will hold the reservations for the delayed
refs, the block group updates, and deleting csums. We could have a
separate rsv for the block group updates, but the csum deletion stuff is
still handled via the delayed_refs so that will stay there.
Historical background:
The global reserve has grown to cover everything we don't reserve space
explicitly for, and we've grown a lot of weird ad-hoc heuristics to know
if we're running short on space and when it's time to force a commit. A
failure rate of 20-40 file systems when we run hundreds of thousands of
them isn't super high, but cleaning up this code will make things less
ugly and more predictible.
Thus the delayed refs rsv. We always know how many delayed refs we have
outstanding, and although running them generates more we can use the
global reserve for that spill over, which fits better into it's desired
use than a full blown reservation. This first approach is to simply
take how many times we're reserving space for and multiply that by 2 in
order to save enough space for the delayed refs that could be generated.
This is a niave approach and will probably evolve, but for now it works.
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com> # high-level review
[ added background notes from the cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-03 23:20:33 +08:00
|
|
|
int ret = 0;
|
2007-05-08 08:03:49 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
/* block accounting for super block */
|
2013-05-15 15:48:22 +08:00
|
|
|
spin_lock(&info->delalloc_root_lock);
|
2011-04-13 21:41:04 +08:00
|
|
|
old_val = btrfs_super_bytes_used(info->super_copy);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (alloc)
|
|
|
|
old_val += num_bytes;
|
|
|
|
else
|
|
|
|
old_val -= num_bytes;
|
2011-04-13 21:41:04 +08:00
|
|
|
btrfs_set_super_bytes_used(info->super_copy, old_val);
|
2013-05-15 15:48:22 +08:00
|
|
|
spin_unlock(&info->delalloc_root_lock);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
2009-01-06 10:25:51 +08:00
|
|
|
while (total) {
|
2007-10-16 04:15:53 +08:00
|
|
|
cache = btrfs_lookup_block_group(info, bytenr);
|
btrfs: introduce delayed_refs_rsv
Traditionally we've had voodoo in btrfs to account for the space that
delayed refs may take up by having a global_block_rsv. This works most
of the time, except when it doesn't. We've had issues reported and seen
in production where sometimes the global reserve is exhausted during
transaction commit before we can run all of our delayed refs, resulting
in an aborted transaction. Because of this voodoo we have equally
dubious flushing semantics around throttling delayed refs which we often
get wrong.
So instead give them their own block_rsv. This way we can always know
exactly how much outstanding space we need for delayed refs. This
allows us to make sure we are constantly filling that reservation up
with space, and allows us to put more precise pressure on the enospc
system. Instead of doing math to see if its a good time to throttle,
the normal enospc code will be invoked if we have a lot of delayed refs
pending, and they will be run via the normal flushing mechanism.
For now the delayed_refs_rsv will hold the reservations for the delayed
refs, the block group updates, and deleting csums. We could have a
separate rsv for the block group updates, but the csum deletion stuff is
still handled via the delayed_refs so that will stay there.
Historical background:
The global reserve has grown to cover everything we don't reserve space
explicitly for, and we've grown a lot of weird ad-hoc heuristics to know
if we're running short on space and when it's time to force a commit. A
failure rate of 20-40 file systems when we run hundreds of thousands of
them isn't super high, but cleaning up this code will make things less
ugly and more predictible.
Thus the delayed refs rsv. We always know how many delayed refs we have
outstanding, and although running them generates more we can use the
global reserve for that spill over, which fits better into it's desired
use than a full blown reservation. This first approach is to simply
take how many times we're reserving space for and multiply that by 2 in
order to save enough space for the delayed refs that could be generated.
This is a niave approach and will probably evolve, but for now it works.
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com> # high-level review
[ added background notes from the cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-03 23:20:33 +08:00
|
|
|
if (!cache) {
|
|
|
|
ret = -ENOENT;
|
|
|
|
break;
|
|
|
|
}
|
2018-07-14 02:46:30 +08:00
|
|
|
factor = btrfs_bg_type_to_factor(cache->flags);
|
|
|
|
|
2010-08-26 04:54:15 +08:00
|
|
|
/*
|
|
|
|
* If this block group has free space cache written out, we
|
|
|
|
* need to make sure to load it if we are removing space. This
|
|
|
|
* is because we need the unpinning stage to actually add the
|
|
|
|
* space back to the block group, otherwise we will leak space.
|
|
|
|
*/
|
|
|
|
if (!alloc && cache->cached == BTRFS_CACHE_NO)
|
2012-12-27 17:01:18 +08:00
|
|
|
cache_block_group(cache, 1);
|
2010-06-22 02:48:16 +08:00
|
|
|
|
2007-10-16 04:15:53 +08:00
|
|
|
byte_in_group = bytenr - cache->key.objectid;
|
|
|
|
WARN_ON(byte_in_group > cache->key.offset);
|
2007-04-27 04:46:15 +08:00
|
|
|
|
Btrfs: nuke fs wide allocation mutex V2
This patch removes the giant fs_info->alloc_mutex and replaces it with a bunch
of little locks.
There is now a pinned_mutex, which is used when messing with the pinned_extents
extent io tree, and the extent_ins_mutex which is used with the pending_del and
extent_ins extent io trees.
The locking for the extent tree stuff was inspired by a patch that Yan Zheng
wrote to fix a race condition, I cleaned it up some and changed the locking
around a little bit, but the idea remains the same. Basically instead of
holding the extent_ins_mutex throughout the processing of an extent on the
extent_ins or pending_del trees, we just hold it while we're searching and when
we clear the bits on those trees, and lock the extent for the duration of the
operations on the extent.
Also to keep from getting hung up waiting to lock an extent, I've added a
try_lock_extent so if we cannot lock the extent, move on to the next one in the
tree and we'll come back to that one. I have tested this heavily and it does
not appear to break anything. This has to be applied on top of my
find_free_extent redo patch.
I tested this patch on top of Yan's space reblancing code and it worked fine.
The only thing that has changed since the last version is I pulled out all my
debugging stuff, apparently I forgot to run guilt refresh before I sent the
last patch out. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
2008-10-30 02:49:05 +08:00
|
|
|
spin_lock(&cache->space_info->lock);
|
2008-07-23 11:06:41 +08:00
|
|
|
spin_lock(&cache->lock);
|
2010-06-22 02:48:16 +08:00
|
|
|
|
2016-06-23 06:54:22 +08:00
|
|
|
if (btrfs_test_opt(info, SPACE_CACHE) &&
|
2010-06-22 02:48:16 +08:00
|
|
|
cache->disk_cache_state < BTRFS_DC_CLEAR)
|
|
|
|
cache->disk_cache_state = BTRFS_DC_CLEAR;
|
|
|
|
|
2007-04-27 04:46:15 +08:00
|
|
|
old_val = btrfs_block_group_used(&cache->item);
|
2007-10-16 04:15:53 +08:00
|
|
|
num_bytes = min(total, cache->key.offset - byte_in_group);
|
2007-04-27 22:08:34 +08:00
|
|
|
if (alloc) {
|
2007-10-16 04:15:53 +08:00
|
|
|
old_val += num_bytes;
|
2009-09-12 04:11:19 +08:00
|
|
|
btrfs_set_block_group_used(&cache->item, old_val);
|
|
|
|
cache->reserved -= num_bytes;
|
|
|
|
cache->space_info->bytes_reserved -= num_bytes;
|
2010-05-16 22:46:24 +08:00
|
|
|
cache->space_info->bytes_used += num_bytes;
|
|
|
|
cache->space_info->disk_used += num_bytes * factor;
|
2008-07-23 11:06:41 +08:00
|
|
|
spin_unlock(&cache->lock);
|
Btrfs: nuke fs wide allocation mutex V2
This patch removes the giant fs_info->alloc_mutex and replaces it with a bunch
of little locks.
There is now a pinned_mutex, which is used when messing with the pinned_extents
extent io tree, and the extent_ins_mutex which is used with the pending_del and
extent_ins extent io trees.
The locking for the extent tree stuff was inspired by a patch that Yan Zheng
wrote to fix a race condition, I cleaned it up some and changed the locking
around a little bit, but the idea remains the same. Basically instead of
holding the extent_ins_mutex throughout the processing of an extent on the
extent_ins or pending_del trees, we just hold it while we're searching and when
we clear the bits on those trees, and lock the extent for the duration of the
operations on the extent.
Also to keep from getting hung up waiting to lock an extent, I've added a
try_lock_extent so if we cannot lock the extent, move on to the next one in the
tree and we'll come back to that one. I have tested this heavily and it does
not appear to break anything. This has to be applied on top of my
find_free_extent redo patch.
I tested this patch on top of Yan's space reblancing code and it worked fine.
The only thing that has changed since the last version is I pulled out all my
debugging stuff, apparently I forgot to run guilt refresh before I sent the
last patch out. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
2008-10-30 02:49:05 +08:00
|
|
|
spin_unlock(&cache->space_info->lock);
|
2007-04-27 22:08:34 +08:00
|
|
|
} else {
|
2007-10-16 04:15:53 +08:00
|
|
|
old_val -= num_bytes;
|
2014-11-26 23:28:52 +08:00
|
|
|
btrfs_set_block_group_used(&cache->item, old_val);
|
|
|
|
cache->pinned += num_bytes;
|
2019-06-19 04:09:21 +08:00
|
|
|
btrfs_space_info_update_bytes_pinned(info,
|
|
|
|
cache->space_info, num_bytes);
|
2014-11-26 23:28:52 +08:00
|
|
|
cache->space_info->bytes_used -= num_bytes;
|
|
|
|
cache->space_info->disk_used -= num_bytes * factor;
|
|
|
|
spin_unlock(&cache->lock);
|
|
|
|
spin_unlock(&cache->space_info->lock);
|
2014-09-18 23:20:02 +08:00
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
trace_btrfs_space_reservation(info, "pinned",
|
2016-03-26 01:25:54 +08:00
|
|
|
cache->space_info->flags,
|
|
|
|
num_bytes, 1);
|
btrfs: use customized batch size for total_bytes_pinned
In commit b150a4f10d878 ("Btrfs: use a percpu to keep track of possibly
pinned bytes") we use total_bytes_pinned to track how many bytes we are
going to free in this transaction. When we are close to ENOSPC, we check it
and know if we can make the allocation by commit the current transaction.
For every data/metadata extent we are going to free, we add
total_bytes_pinned in btrfs_free_extent() and btrfs_free_tree_block(), and
release it in unpin_extent_range() when we finish the transaction. So this
is a variable we frequently update but rarely read - just the suitable
use of percpu_counter. But in previous commit we update total_bytes_pinned
by default 32 batch size, making every update essentially a spin lock
protected update. Since every spin lock/unlock operation involves syncing
a globally used variable and some kind of barrier in a SMP system, this is
more expensive than using total_bytes_pinned as a simple atomic64_t.
So fix this by using a customized batch size. Since we only read
total_bytes_pinned when we are close to ENOSPC and fail to allocate new
chunk, we can use a really large batch size and have nearly no penalty
in most cases.
[Test]
We tested the patch on a 4-cores x86 machine:
1. fallocate a 16GiB size test file
2. take snapshot (so all following writes will be COW)
3. run a 180 sec, 4 jobs, 4K random write fio on test file
We also added a temporary lockdep class on percpu_counter's spin lock
used by total_bytes_pinned to track it by lock_stat.
[Results]
unpatched:
lock_stat version 0.4
-----------------------------------------------------------------------
class name con-bounces contentions
waittime-min waittime-max waittime-total waittime-avg acq-bounces
acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
total_bytes_pinned_percpu: 82 82
0.21 0.61 29.46 0.36 298340
635973 0.09 11.01 173476.25 0.27
patched:
lock_stat version 0.4
-----------------------------------------------------------------------
class name con-bounces contentions
waittime-min waittime-max waittime-total waittime-avg acq-bounces
acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
total_bytes_pinned_percpu: 1 1
0.62 0.62 0.62 0.62 13601
31542 0.14 9.61 11016.90 0.35
[Analysis]
Since the spin lock only protects a single in-memory variable, the
contentions (number of lock acquisitions that had to wait) in both
unpatched and patched version are low. But when we see acquisitions and
acq-bounces, we get much lower counts in patched version. Here the most
important metric is acq-bounces. It means how many times the lock gets
transferred between different cpus, so the patch can really reduce
cacheline bouncing of spin lock (also the global counter of percpu_counter)
in a SMP system.
Fixes: b150a4f10d878 ("Btrfs: use a percpu to keep track of possibly pinned bytes")
Signed-off-by: Ethan Lien <ethanlien@synology.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-07-13 16:50:42 +08:00
|
|
|
percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
|
|
|
|
num_bytes,
|
|
|
|
BTRFS_TOTAL_BYTES_PINNED_BATCH);
|
2014-11-26 23:28:52 +08:00
|
|
|
set_extent_dirty(info->pinned_extents,
|
|
|
|
bytenr, bytenr + num_bytes - 1,
|
|
|
|
GFP_NOFS | __GFP_NOFAIL);
|
2007-04-27 22:08:34 +08:00
|
|
|
}
|
2015-04-07 03:46:08 +08:00
|
|
|
|
|
|
|
spin_lock(&trans->transaction->dirty_bgs_lock);
|
|
|
|
if (list_empty(&cache->dirty_list)) {
|
|
|
|
list_add_tail(&cache->dirty_list,
|
|
|
|
&trans->transaction->dirty_bgs);
|
btrfs: introduce delayed_refs_rsv
Traditionally we've had voodoo in btrfs to account for the space that
delayed refs may take up by having a global_block_rsv. This works most
of the time, except when it doesn't. We've had issues reported and seen
in production where sometimes the global reserve is exhausted during
transaction commit before we can run all of our delayed refs, resulting
in an aborted transaction. Because of this voodoo we have equally
dubious flushing semantics around throttling delayed refs which we often
get wrong.
So instead give them their own block_rsv. This way we can always know
exactly how much outstanding space we need for delayed refs. This
allows us to make sure we are constantly filling that reservation up
with space, and allows us to put more precise pressure on the enospc
system. Instead of doing math to see if its a good time to throttle,
the normal enospc code will be invoked if we have a lot of delayed refs
pending, and they will be run via the normal flushing mechanism.
For now the delayed_refs_rsv will hold the reservations for the delayed
refs, the block group updates, and deleting csums. We could have a
separate rsv for the block group updates, but the csum deletion stuff is
still handled via the delayed_refs so that will stay there.
Historical background:
The global reserve has grown to cover everything we don't reserve space
explicitly for, and we've grown a lot of weird ad-hoc heuristics to know
if we're running short on space and when it's time to force a commit. A
failure rate of 20-40 file systems when we run hundreds of thousands of
them isn't super high, but cleaning up this code will make things less
ugly and more predictible.
Thus the delayed refs rsv. We always know how many delayed refs we have
outstanding, and although running them generates more we can use the
global reserve for that spill over, which fits better into it's desired
use than a full blown reservation. This first approach is to simply
take how many times we're reserving space for and multiply that by 2 in
order to save enough space for the delayed refs that could be generated.
This is a niave approach and will probably evolve, but for now it works.
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com> # high-level review
[ added background notes from the cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-03 23:20:33 +08:00
|
|
|
trans->delayed_ref_updates++;
|
2015-04-07 03:46:08 +08:00
|
|
|
btrfs_get_block_group(cache);
|
|
|
|
}
|
|
|
|
spin_unlock(&trans->transaction->dirty_bgs_lock);
|
|
|
|
|
2015-11-23 23:25:16 +08:00
|
|
|
/*
|
|
|
|
* No longer have used bytes in this block group, queue it for
|
|
|
|
* deletion. We do this after adding the block group to the
|
|
|
|
* dirty list to avoid races between cleaner kthread and space
|
|
|
|
* cache writeout.
|
|
|
|
*/
|
2018-05-22 16:43:47 +08:00
|
|
|
if (!alloc && old_val == 0)
|
|
|
|
btrfs_mark_bg_unused(cache);
|
2015-11-23 23:25:16 +08:00
|
|
|
|
2009-04-03 21:47:43 +08:00
|
|
|
btrfs_put_block_group(cache);
|
2007-10-16 04:15:53 +08:00
|
|
|
total -= num_bytes;
|
|
|
|
bytenr += num_bytes;
|
2007-04-27 04:46:15 +08:00
|
|
|
}
|
btrfs: introduce delayed_refs_rsv
Traditionally we've had voodoo in btrfs to account for the space that
delayed refs may take up by having a global_block_rsv. This works most
of the time, except when it doesn't. We've had issues reported and seen
in production where sometimes the global reserve is exhausted during
transaction commit before we can run all of our delayed refs, resulting
in an aborted transaction. Because of this voodoo we have equally
dubious flushing semantics around throttling delayed refs which we often
get wrong.
So instead give them their own block_rsv. This way we can always know
exactly how much outstanding space we need for delayed refs. This
allows us to make sure we are constantly filling that reservation up
with space, and allows us to put more precise pressure on the enospc
system. Instead of doing math to see if its a good time to throttle,
the normal enospc code will be invoked if we have a lot of delayed refs
pending, and they will be run via the normal flushing mechanism.
For now the delayed_refs_rsv will hold the reservations for the delayed
refs, the block group updates, and deleting csums. We could have a
separate rsv for the block group updates, but the csum deletion stuff is
still handled via the delayed_refs so that will stay there.
Historical background:
The global reserve has grown to cover everything we don't reserve space
explicitly for, and we've grown a lot of weird ad-hoc heuristics to know
if we're running short on space and when it's time to force a commit. A
failure rate of 20-40 file systems when we run hundreds of thousands of
them isn't super high, but cleaning up this code will make things less
ugly and more predictible.
Thus the delayed refs rsv. We always know how many delayed refs we have
outstanding, and although running them generates more we can use the
global reserve for that spill over, which fits better into it's desired
use than a full blown reservation. This first approach is to simply
take how many times we're reserving space for and multiply that by 2 in
order to save enough space for the delayed refs that could be generated.
This is a niave approach and will probably evolve, but for now it works.
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com> # high-level review
[ added background notes from the cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-03 23:20:33 +08:00
|
|
|
|
|
|
|
/* Modified block groups are accounted for in the delayed_refs_rsv. */
|
|
|
|
btrfs_update_delayed_refs_rsv(trans);
|
|
|
|
return ret;
|
2007-04-27 04:46:15 +08:00
|
|
|
}
|
2008-03-25 03:01:59 +08:00
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
|
2008-05-07 23:43:44 +08:00
|
|
|
{
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
struct btrfs_block_group_cache *cache;
|
2008-12-12 05:30:39 +08:00
|
|
|
u64 bytenr;
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
spin_lock(&fs_info->block_group_cache_lock);
|
|
|
|
bytenr = fs_info->first_logical_byte;
|
|
|
|
spin_unlock(&fs_info->block_group_cache_lock);
|
2012-12-27 17:01:23 +08:00
|
|
|
|
|
|
|
if (bytenr < (u64)-1)
|
|
|
|
return bytenr;
|
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
cache = btrfs_lookup_first_block_group(fs_info, search_start);
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
if (!cache)
|
2008-05-07 23:43:44 +08:00
|
|
|
return 0;
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
|
2008-12-12 05:30:39 +08:00
|
|
|
bytenr = cache->key.objectid;
|
2009-04-03 21:47:43 +08:00
|
|
|
btrfs_put_block_group(cache);
|
2008-12-12 05:30:39 +08:00
|
|
|
|
|
|
|
return bytenr;
|
2008-05-07 23:43:44 +08:00
|
|
|
}
|
|
|
|
|
2019-03-20 19:12:32 +08:00
|
|
|
static int pin_down_extent(struct btrfs_block_group_cache *cache,
|
2010-05-16 22:46:25 +08:00
|
|
|
u64 bytenr, u64 num_bytes, int reserved)
|
2007-11-17 03:57:08 +08:00
|
|
|
{
|
2019-03-20 19:12:32 +08:00
|
|
|
struct btrfs_fs_info *fs_info = cache->fs_info;
|
|
|
|
|
2009-09-12 04:11:19 +08:00
|
|
|
spin_lock(&cache->space_info->lock);
|
|
|
|
spin_lock(&cache->lock);
|
|
|
|
cache->pinned += num_bytes;
|
2019-06-19 04:09:21 +08:00
|
|
|
btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
|
|
|
|
num_bytes);
|
2009-09-12 04:11:19 +08:00
|
|
|
if (reserved) {
|
|
|
|
cache->reserved -= num_bytes;
|
|
|
|
cache->space_info->bytes_reserved -= num_bytes;
|
|
|
|
}
|
|
|
|
spin_unlock(&cache->lock);
|
|
|
|
spin_unlock(&cache->space_info->lock);
|
Btrfs: change how we unpin extents
We are racy with async block caching and unpinning extents. This patch makes
things much less complicated by only unpinning the extent if the block group is
cached. We check the block_group->cached var under the block_group->lock spin
lock. If it is set to BTRFS_CACHE_FINISHED then we update the pinned counters,
and unpin the extent and add the free space back. If it is not set to this, we
start the caching of the block group so the next time we unpin extents we can
unpin the extent. This keeps us from racing with the async caching threads,
lets us kill the fs wide async thread counter, and keeps us from having to set
DELALLOC bits for every extent we hit if there are caching kthreads going.
One thing that needed to be changed was btrfs_free_super_mirror_extents. Now
instead of just looking for LOCKED extents, we also look for DIRTY extents,
since we could have left some extents pinned in the previous transaction that
will never get freed now that we are unmounting, which would cause us to leak
memory. So btrfs_free_super_mirror_extents has been changed to
btrfs_free_pinned_extents, and it will clear the extents locked for the super
mirror, and any remaining pinned extents that may be present. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-28 01:57:01 +08:00
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
trace_btrfs_space_reservation(fs_info, "pinned",
|
2016-03-26 01:25:54 +08:00
|
|
|
cache->space_info->flags, num_bytes, 1);
|
btrfs: use customized batch size for total_bytes_pinned
In commit b150a4f10d878 ("Btrfs: use a percpu to keep track of possibly
pinned bytes") we use total_bytes_pinned to track how many bytes we are
going to free in this transaction. When we are close to ENOSPC, we check it
and know if we can make the allocation by commit the current transaction.
For every data/metadata extent we are going to free, we add
total_bytes_pinned in btrfs_free_extent() and btrfs_free_tree_block(), and
release it in unpin_extent_range() when we finish the transaction. So this
is a variable we frequently update but rarely read - just the suitable
use of percpu_counter. But in previous commit we update total_bytes_pinned
by default 32 batch size, making every update essentially a spin lock
protected update. Since every spin lock/unlock operation involves syncing
a globally used variable and some kind of barrier in a SMP system, this is
more expensive than using total_bytes_pinned as a simple atomic64_t.
So fix this by using a customized batch size. Since we only read
total_bytes_pinned when we are close to ENOSPC and fail to allocate new
chunk, we can use a really large batch size and have nearly no penalty
in most cases.
[Test]
We tested the patch on a 4-cores x86 machine:
1. fallocate a 16GiB size test file
2. take snapshot (so all following writes will be COW)
3. run a 180 sec, 4 jobs, 4K random write fio on test file
We also added a temporary lockdep class on percpu_counter's spin lock
used by total_bytes_pinned to track it by lock_stat.
[Results]
unpatched:
lock_stat version 0.4
-----------------------------------------------------------------------
class name con-bounces contentions
waittime-min waittime-max waittime-total waittime-avg acq-bounces
acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
total_bytes_pinned_percpu: 82 82
0.21 0.61 29.46 0.36 298340
635973 0.09 11.01 173476.25 0.27
patched:
lock_stat version 0.4
-----------------------------------------------------------------------
class name con-bounces contentions
waittime-min waittime-max waittime-total waittime-avg acq-bounces
acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
total_bytes_pinned_percpu: 1 1
0.62 0.62 0.62 0.62 13601
31542 0.14 9.61 11016.90 0.35
[Analysis]
Since the spin lock only protects a single in-memory variable, the
contentions (number of lock acquisitions that had to wait) in both
unpatched and patched version are low. But when we see acquisitions and
acq-bounces, we get much lower counts in patched version. Here the most
important metric is acq-bounces. It means how many times the lock gets
transferred between different cpus, so the patch can really reduce
cacheline bouncing of spin lock (also the global counter of percpu_counter)
in a SMP system.
Fixes: b150a4f10d878 ("Btrfs: use a percpu to keep track of possibly pinned bytes")
Signed-off-by: Ethan Lien <ethanlien@synology.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-07-13 16:50:42 +08:00
|
|
|
percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
|
|
|
|
num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
|
2016-06-23 06:54:23 +08:00
|
|
|
set_extent_dirty(fs_info->pinned_extents, bytenr,
|
2010-05-16 22:46:25 +08:00
|
|
|
bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
|
|
|
|
return 0;
|
|
|
|
}
|
Btrfs: change how we unpin extents
We are racy with async block caching and unpinning extents. This patch makes
things much less complicated by only unpinning the extent if the block group is
cached. We check the block_group->cached var under the block_group->lock spin
lock. If it is set to BTRFS_CACHE_FINISHED then we update the pinned counters,
and unpin the extent and add the free space back. If it is not set to this, we
start the caching of the block group so the next time we unpin extents we can
unpin the extent. This keeps us from racing with the async caching threads,
lets us kill the fs wide async thread counter, and keeps us from having to set
DELALLOC bits for every extent we hit if there are caching kthreads going.
One thing that needed to be changed was btrfs_free_super_mirror_extents. Now
instead of just looking for LOCKED extents, we also look for DIRTY extents,
since we could have left some extents pinned in the previous transaction that
will never get freed now that we are unmounting, which would cause us to leak
memory. So btrfs_free_super_mirror_extents has been changed to
btrfs_free_pinned_extents, and it will clear the extents locked for the super
mirror, and any remaining pinned extents that may be present. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-28 01:57:01 +08:00
|
|
|
|
2010-05-16 22:46:25 +08:00
|
|
|
/*
|
|
|
|
* this function must be called within transaction
|
|
|
|
*/
|
2016-06-23 06:54:24 +08:00
|
|
|
int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
|
2010-05-16 22:46:25 +08:00
|
|
|
u64 bytenr, u64 num_bytes, int reserved)
|
|
|
|
{
|
|
|
|
struct btrfs_block_group_cache *cache;
|
Btrfs: change how we unpin extents
We are racy with async block caching and unpinning extents. This patch makes
things much less complicated by only unpinning the extent if the block group is
cached. We check the block_group->cached var under the block_group->lock spin
lock. If it is set to BTRFS_CACHE_FINISHED then we update the pinned counters,
and unpin the extent and add the free space back. If it is not set to this, we
start the caching of the block group so the next time we unpin extents we can
unpin the extent. This keeps us from racing with the async caching threads,
lets us kill the fs wide async thread counter, and keeps us from having to set
DELALLOC bits for every extent we hit if there are caching kthreads going.
One thing that needed to be changed was btrfs_free_super_mirror_extents. Now
instead of just looking for LOCKED extents, we also look for DIRTY extents,
since we could have left some extents pinned in the previous transaction that
will never get freed now that we are unmounting, which would cause us to leak
memory. So btrfs_free_super_mirror_extents has been changed to
btrfs_free_pinned_extents, and it will clear the extents locked for the super
mirror, and any remaining pinned extents that may be present. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-28 01:57:01 +08:00
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
cache = btrfs_lookup_block_group(fs_info, bytenr);
|
2012-03-12 23:03:00 +08:00
|
|
|
BUG_ON(!cache); /* Logic error */
|
2010-05-16 22:46:25 +08:00
|
|
|
|
2019-03-20 19:12:32 +08:00
|
|
|
pin_down_extent(cache, bytenr, num_bytes, reserved);
|
2010-05-16 22:46:25 +08:00
|
|
|
|
|
|
|
btrfs_put_block_group(cache);
|
2009-09-12 04:11:19 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2010-05-16 22:46:25 +08:00
|
|
|
/*
|
2011-11-01 08:52:39 +08:00
|
|
|
* this function must be called within transaction
|
|
|
|
*/
|
2016-06-23 06:54:24 +08:00
|
|
|
int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
|
2011-11-01 08:52:39 +08:00
|
|
|
u64 bytenr, u64 num_bytes)
|
|
|
|
{
|
|
|
|
struct btrfs_block_group_cache *cache;
|
2013-04-26 03:55:30 +08:00
|
|
|
int ret;
|
2011-11-01 08:52:39 +08:00
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
cache = btrfs_lookup_block_group(fs_info, bytenr);
|
2013-04-26 03:55:30 +08:00
|
|
|
if (!cache)
|
|
|
|
return -EINVAL;
|
2011-11-01 08:52:39 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* pull in the free space cache (if any) so that our pin
|
|
|
|
* removes the free space from the cache. We have load_only set
|
|
|
|
* to one because the slow code to read in the free extents does check
|
|
|
|
* the pinned extents.
|
|
|
|
*/
|
2012-12-27 17:01:18 +08:00
|
|
|
cache_block_group(cache, 1);
|
2011-11-01 08:52:39 +08:00
|
|
|
|
2019-03-20 19:12:32 +08:00
|
|
|
pin_down_extent(cache, bytenr, num_bytes, 0);
|
2011-11-01 08:52:39 +08:00
|
|
|
|
|
|
|
/* remove us from the free space cache (if we're there at all) */
|
2013-04-26 03:55:30 +08:00
|
|
|
ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
|
2011-11-01 08:52:39 +08:00
|
|
|
btrfs_put_block_group(cache);
|
2013-04-26 03:55:30 +08:00
|
|
|
return ret;
|
2011-11-01 08:52:39 +08:00
|
|
|
}
|
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
|
|
|
|
u64 start, u64 num_bytes)
|
2013-06-07 01:19:32 +08:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct btrfs_block_group_cache *block_group;
|
|
|
|
struct btrfs_caching_control *caching_ctl;
|
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
block_group = btrfs_lookup_block_group(fs_info, start);
|
2013-06-07 01:19:32 +08:00
|
|
|
if (!block_group)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
cache_block_group(block_group, 0);
|
|
|
|
caching_ctl = get_caching_control(block_group);
|
|
|
|
|
|
|
|
if (!caching_ctl) {
|
|
|
|
/* Logic error */
|
|
|
|
BUG_ON(!block_group_cache_done(block_group));
|
|
|
|
ret = btrfs_remove_free_space(block_group, start, num_bytes);
|
|
|
|
} else {
|
|
|
|
mutex_lock(&caching_ctl->mutex);
|
|
|
|
|
|
|
|
if (start >= caching_ctl->progress) {
|
2016-06-23 06:54:24 +08:00
|
|
|
ret = add_excluded_extent(fs_info, start, num_bytes);
|
2013-06-07 01:19:32 +08:00
|
|
|
} else if (start + num_bytes <= caching_ctl->progress) {
|
|
|
|
ret = btrfs_remove_free_space(block_group,
|
|
|
|
start, num_bytes);
|
|
|
|
} else {
|
|
|
|
num_bytes = caching_ctl->progress - start;
|
|
|
|
ret = btrfs_remove_free_space(block_group,
|
|
|
|
start, num_bytes);
|
|
|
|
if (ret)
|
|
|
|
goto out_lock;
|
|
|
|
|
|
|
|
num_bytes = (start + num_bytes) -
|
|
|
|
caching_ctl->progress;
|
|
|
|
start = caching_ctl->progress;
|
2016-06-23 06:54:24 +08:00
|
|
|
ret = add_excluded_extent(fs_info, start, num_bytes);
|
2013-06-07 01:19:32 +08:00
|
|
|
}
|
|
|
|
out_lock:
|
|
|
|
mutex_unlock(&caching_ctl->mutex);
|
|
|
|
put_caching_control(caching_ctl);
|
|
|
|
}
|
|
|
|
btrfs_put_block_group(block_group);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2019-03-20 19:14:33 +08:00
|
|
|
int btrfs_exclude_logged_extents(struct extent_buffer *eb)
|
2013-06-07 01:19:32 +08:00
|
|
|
{
|
2019-03-20 19:14:33 +08:00
|
|
|
struct btrfs_fs_info *fs_info = eb->fs_info;
|
2013-06-07 01:19:32 +08:00
|
|
|
struct btrfs_file_extent_item *item;
|
|
|
|
struct btrfs_key key;
|
|
|
|
int found_type;
|
|
|
|
int i;
|
2018-05-22 17:46:51 +08:00
|
|
|
int ret = 0;
|
2013-06-07 01:19:32 +08:00
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
|
2013-06-07 01:19:32 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
for (i = 0; i < btrfs_header_nritems(eb); i++) {
|
|
|
|
btrfs_item_key_to_cpu(eb, &key, i);
|
|
|
|
if (key.type != BTRFS_EXTENT_DATA_KEY)
|
|
|
|
continue;
|
|
|
|
item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
|
|
|
|
found_type = btrfs_file_extent_type(eb, item);
|
|
|
|
if (found_type == BTRFS_FILE_EXTENT_INLINE)
|
|
|
|
continue;
|
|
|
|
if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
|
|
|
|
continue;
|
|
|
|
key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
|
|
|
|
key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
|
2018-05-22 17:46:51 +08:00
|
|
|
ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
|
|
|
|
if (ret)
|
|
|
|
break;
|
2013-06-07 01:19:32 +08:00
|
|
|
}
|
|
|
|
|
2018-05-22 17:46:51 +08:00
|
|
|
return ret;
|
2013-06-07 01:19:32 +08:00
|
|
|
}
|
|
|
|
|
Btrfs: don't do unnecessary delalloc flushes when relocating
Before we start the actual relocation process of a block group, we do
calls to flush delalloc of all inodes and then wait for ordered extents
to complete. However we do these flush calls just to make sure we don't
race with concurrent tasks that have actually already started to run
delalloc and have allocated an extent from the block group we want to
relocate, right before we set it to readonly mode, but have not yet
created the respective ordered extents. The flush calls make us wait
for such concurrent tasks because they end up calling
filemap_fdatawrite_range() (through btrfs_start_delalloc_roots() ->
__start_delalloc_inodes() -> btrfs_alloc_delalloc_work() ->
btrfs_run_delalloc_work()) which ends up serializing us with those tasks
due to attempts to lock the same pages (and the delalloc flush procedure
calls the allocator and creates the ordered extents before unlocking the
pages).
These flushing calls not only make us waste time (cpu, IO) but also reduce
the chances of writing larger extents (applications might be writing to
contiguous ranges and we flush before they finish dirtying the whole
ranges).
So make sure we don't flush delalloc and just wait for concurrent tasks
that have already started flushing delalloc and have allocated an extent
from the block group we are about to relocate.
This change also ends up fixing a race with direct IO writes that makes
relocation not wait for direct IO ordered extents. This race is
illustrated by the following diagram:
CPU 1 CPU 2
btrfs_relocate_block_group(bg X)
starts direct IO write,
target inode currently has no
ordered extents ongoing nor
dirty pages (delalloc regions),
therefore the root for our inode
is not in the list
fs_info->ordered_roots
btrfs_direct_IO()
__blockdev_direct_IO()
btrfs_get_blocks_direct()
btrfs_lock_extent_direct()
locks range in the io tree
btrfs_new_extent_direct()
btrfs_reserve_extent()
--> extent allocated
from bg X
btrfs_inc_block_group_ro(bg X)
btrfs_start_delalloc_roots()
__start_delalloc_inodes()
--> does nothing, no dealloc ranges
in the inode's io tree so the
inode's root is not in the list
fs_info->delalloc_roots
btrfs_wait_ordered_roots()
--> does not find the inode's root in the
list fs_info->ordered_roots
--> ends up not waiting for the direct IO
write started by the task at CPU 2
relocate_block_group(rc->stage ==
MOVE_DATA_EXTENTS)
prepare_to_relocate()
btrfs_commit_transaction()
iterates the extent tree, using its
commit root and moves extents into new
locations
btrfs_add_ordered_extent_dio()
--> now a ordered extent is
created and added to the
list root->ordered_extents
and the root added to the
list fs_info->ordered_roots
--> this is too late and the
task at CPU 1 already
started the relocation
btrfs_commit_transaction()
btrfs_finish_ordered_io()
btrfs_alloc_reserved_file_extent()
--> adds delayed data reference
for the extent allocated
from bg X
relocate_block_group(rc->stage ==
UPDATE_DATA_PTRS)
prepare_to_relocate()
btrfs_commit_transaction()
--> delayed refs are run, so an extent
item for the allocated extent from
bg X is added to extent tree
--> commit roots are switched, so the
next scan in the extent tree will
see the extent item
sees the extent in the extent tree
When this happens the relocation produces the following warning when it
finishes:
[ 7260.832836] ------------[ cut here ]------------
[ 7260.834653] WARNING: CPU: 5 PID: 6765 at fs/btrfs/relocation.c:4318 btrfs_relocate_block_group+0x245/0x2a1 [btrfs]()
[ 7260.838268] Modules linked in: btrfs crc32c_generic xor ppdev raid6_pq psmouse sg acpi_cpufreq evdev i2c_piix4 tpm_tis serio_raw tpm i2c_core pcspkr parport_pc
[ 7260.850935] CPU: 5 PID: 6765 Comm: btrfs Not tainted 4.5.0-rc6-btrfs-next-28+ #1
[ 7260.852998] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[ 7260.852998] 0000000000000000 ffff88020bf57bc0 ffffffff812648b3 0000000000000000
[ 7260.852998] 0000000000000009 ffff88020bf57bf8 ffffffff81051608 ffffffffa03c1b2d
[ 7260.852998] ffff8800b2bbb800 0000000000000000 ffff8800b17bcc58 ffff8800399dd000
[ 7260.852998] Call Trace:
[ 7260.852998] [<ffffffff812648b3>] dump_stack+0x67/0x90
[ 7260.852998] [<ffffffff81051608>] warn_slowpath_common+0x99/0xb2
[ 7260.852998] [<ffffffffa03c1b2d>] ? btrfs_relocate_block_group+0x245/0x2a1 [btrfs]
[ 7260.852998] [<ffffffff810516d4>] warn_slowpath_null+0x1a/0x1c
[ 7260.852998] [<ffffffffa03c1b2d>] btrfs_relocate_block_group+0x245/0x2a1 [btrfs]
[ 7260.852998] [<ffffffffa039d9de>] btrfs_relocate_chunk.isra.29+0x66/0xdb [btrfs]
[ 7260.852998] [<ffffffffa039f314>] btrfs_balance+0xde1/0xe4e [btrfs]
[ 7260.852998] [<ffffffff8127d671>] ? debug_smp_processor_id+0x17/0x19
[ 7260.852998] [<ffffffffa03a9583>] btrfs_ioctl_balance+0x255/0x2d3 [btrfs]
[ 7260.852998] [<ffffffffa03ac96a>] btrfs_ioctl+0x11e0/0x1dff [btrfs]
[ 7260.852998] [<ffffffff811451df>] ? handle_mm_fault+0x443/0xd63
[ 7260.852998] [<ffffffff81491817>] ? _raw_spin_unlock+0x31/0x44
[ 7260.852998] [<ffffffff8108b36a>] ? arch_local_irq_save+0x9/0xc
[ 7260.852998] [<ffffffff811876ab>] vfs_ioctl+0x18/0x34
[ 7260.852998] [<ffffffff81187cb2>] do_vfs_ioctl+0x550/0x5be
[ 7260.852998] [<ffffffff81190c30>] ? __fget_light+0x4d/0x71
[ 7260.852998] [<ffffffff81187d77>] SyS_ioctl+0x57/0x79
[ 7260.852998] [<ffffffff81492017>] entry_SYSCALL_64_fastpath+0x12/0x6b
[ 7260.893268] ---[ end trace eb7803b24ebab8ad ]---
This is because at the end of the first stage, in relocate_block_group(),
we commit the current transaction, which makes delayed refs run, the
commit roots are switched and so the second stage will find the extent
item that the ordered extent added to the delayed refs. But this extent
was not moved (ordered extent completed after first stage finished), so
at the end of the relocation our block group item still has a positive
used bytes counter, triggering a warning at the end of
btrfs_relocate_block_group(). Later on when trying to read the extent
contents from disk we hit a BUG_ON() due to the inability to map a block
with a logical address that belongs to the block group we relocated and
is no longer valid, resulting in the following trace:
[ 7344.885290] BTRFS critical (device sdi): unable to find logical 12845056 len 4096
[ 7344.887518] ------------[ cut here ]------------
[ 7344.888431] kernel BUG at fs/btrfs/inode.c:1833!
[ 7344.888431] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[ 7344.888431] Modules linked in: btrfs crc32c_generic xor ppdev raid6_pq psmouse sg acpi_cpufreq evdev i2c_piix4 tpm_tis serio_raw tpm i2c_core pcspkr parport_pc
[ 7344.888431] CPU: 0 PID: 6831 Comm: od Tainted: G W 4.5.0-rc6-btrfs-next-28+ #1
[ 7344.888431] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[ 7344.888431] task: ffff880215818600 ti: ffff880204684000 task.ti: ffff880204684000
[ 7344.888431] RIP: 0010:[<ffffffffa037c88c>] [<ffffffffa037c88c>] btrfs_merge_bio_hook+0x54/0x6b [btrfs]
[ 7344.888431] RSP: 0018:ffff8802046878f0 EFLAGS: 00010282
[ 7344.888431] RAX: 00000000ffffffea RBX: 0000000000001000 RCX: 0000000000000001
[ 7344.888431] RDX: ffff88023ec0f950 RSI: ffffffff8183b638 RDI: 00000000ffffffff
[ 7344.888431] RBP: ffff880204687908 R08: 0000000000000001 R09: 0000000000000000
[ 7344.888431] R10: ffff880204687770 R11: ffffffff82f2d52d R12: 0000000000001000
[ 7344.888431] R13: ffff88021afbfee8 R14: 0000000000006208 R15: ffff88006cd199b0
[ 7344.888431] FS: 00007f1f9e1d6700(0000) GS:ffff88023ec00000(0000) knlGS:0000000000000000
[ 7344.888431] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 7344.888431] CR2: 00007f1f9dc8cb60 CR3: 000000023e3b6000 CR4: 00000000000006f0
[ 7344.888431] Stack:
[ 7344.888431] 0000000000001000 0000000000001000 ffff880204687b98 ffff880204687950
[ 7344.888431] ffffffffa0395c8f ffffea0004d64d48 0000000000000000 0000000000001000
[ 7344.888431] ffffea0004d64d48 0000000000001000 0000000000000000 0000000000000000
[ 7344.888431] Call Trace:
[ 7344.888431] [<ffffffffa0395c8f>] submit_extent_page+0xf5/0x16f [btrfs]
[ 7344.888431] [<ffffffffa03970ac>] __do_readpage+0x4a0/0x4f1 [btrfs]
[ 7344.888431] [<ffffffffa039680d>] ? btrfs_create_repair_bio+0xcb/0xcb [btrfs]
[ 7344.888431] [<ffffffffa037eeb4>] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs]
[ 7344.888431] [<ffffffff8108df55>] ? trace_hardirqs_on+0xd/0xf
[ 7344.888431] [<ffffffffa039728c>] __do_contiguous_readpages.constprop.26+0xc2/0xe4 [btrfs]
[ 7344.888431] [<ffffffffa037eeb4>] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs]
[ 7344.888431] [<ffffffffa039739b>] __extent_readpages.constprop.25+0xed/0x100 [btrfs]
[ 7344.888431] [<ffffffff81129d24>] ? lru_cache_add+0xe/0x10
[ 7344.888431] [<ffffffffa0397ea8>] extent_readpages+0x160/0x1aa [btrfs]
[ 7344.888431] [<ffffffffa037eeb4>] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs]
[ 7344.888431] [<ffffffff8115daad>] ? alloc_pages_current+0xa9/0xcd
[ 7344.888431] [<ffffffffa037cdc9>] btrfs_readpages+0x1f/0x21 [btrfs]
[ 7344.888431] [<ffffffff81128316>] __do_page_cache_readahead+0x168/0x1fc
[ 7344.888431] [<ffffffff811285a0>] ondemand_readahead+0x1f6/0x207
[ 7344.888431] [<ffffffff811285a0>] ? ondemand_readahead+0x1f6/0x207
[ 7344.888431] [<ffffffff8111cf34>] ? pagecache_get_page+0x2b/0x154
[ 7344.888431] [<ffffffff8112870e>] page_cache_sync_readahead+0x3d/0x3f
[ 7344.888431] [<ffffffff8111dbf7>] generic_file_read_iter+0x197/0x4e1
[ 7344.888431] [<ffffffff8117773a>] __vfs_read+0x79/0x9d
[ 7344.888431] [<ffffffff81178050>] vfs_read+0x8f/0xd2
[ 7344.888431] [<ffffffff81178a38>] SyS_read+0x50/0x7e
[ 7344.888431] [<ffffffff81492017>] entry_SYSCALL_64_fastpath+0x12/0x6b
[ 7344.888431] Code: 8d 4d e8 45 31 c9 45 31 c0 48 8b 00 48 c1 e2 09 48 8b 80 80 fc ff ff 4c 89 65 e8 48 8b b8 f0 01 00 00 e8 1d 42 02 00 85 c0 79 02 <0f> 0b 4c 0
[ 7344.888431] RIP [<ffffffffa037c88c>] btrfs_merge_bio_hook+0x54/0x6b [btrfs]
[ 7344.888431] RSP <ffff8802046878f0>
[ 7344.970544] ---[ end trace eb7803b24ebab8ae ]---
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
2016-04-26 22:39:32 +08:00
|
|
|
static void
|
|
|
|
btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
|
|
|
|
{
|
|
|
|
atomic_inc(&bg->reservations);
|
|
|
|
}
|
|
|
|
|
|
|
|
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
|
|
|
|
const u64 start)
|
|
|
|
{
|
|
|
|
struct btrfs_block_group_cache *bg;
|
|
|
|
|
|
|
|
bg = btrfs_lookup_block_group(fs_info, start);
|
|
|
|
ASSERT(bg);
|
|
|
|
if (atomic_dec_and_test(&bg->reservations))
|
2018-03-15 18:43:08 +08:00
|
|
|
wake_up_var(&bg->reservations);
|
Btrfs: don't do unnecessary delalloc flushes when relocating
Before we start the actual relocation process of a block group, we do
calls to flush delalloc of all inodes and then wait for ordered extents
to complete. However we do these flush calls just to make sure we don't
race with concurrent tasks that have actually already started to run
delalloc and have allocated an extent from the block group we want to
relocate, right before we set it to readonly mode, but have not yet
created the respective ordered extents. The flush calls make us wait
for such concurrent tasks because they end up calling
filemap_fdatawrite_range() (through btrfs_start_delalloc_roots() ->
__start_delalloc_inodes() -> btrfs_alloc_delalloc_work() ->
btrfs_run_delalloc_work()) which ends up serializing us with those tasks
due to attempts to lock the same pages (and the delalloc flush procedure
calls the allocator and creates the ordered extents before unlocking the
pages).
These flushing calls not only make us waste time (cpu, IO) but also reduce
the chances of writing larger extents (applications might be writing to
contiguous ranges and we flush before they finish dirtying the whole
ranges).
So make sure we don't flush delalloc and just wait for concurrent tasks
that have already started flushing delalloc and have allocated an extent
from the block group we are about to relocate.
This change also ends up fixing a race with direct IO writes that makes
relocation not wait for direct IO ordered extents. This race is
illustrated by the following diagram:
CPU 1 CPU 2
btrfs_relocate_block_group(bg X)
starts direct IO write,
target inode currently has no
ordered extents ongoing nor
dirty pages (delalloc regions),
therefore the root for our inode
is not in the list
fs_info->ordered_roots
btrfs_direct_IO()
__blockdev_direct_IO()
btrfs_get_blocks_direct()
btrfs_lock_extent_direct()
locks range in the io tree
btrfs_new_extent_direct()
btrfs_reserve_extent()
--> extent allocated
from bg X
btrfs_inc_block_group_ro(bg X)
btrfs_start_delalloc_roots()
__start_delalloc_inodes()
--> does nothing, no dealloc ranges
in the inode's io tree so the
inode's root is not in the list
fs_info->delalloc_roots
btrfs_wait_ordered_roots()
--> does not find the inode's root in the
list fs_info->ordered_roots
--> ends up not waiting for the direct IO
write started by the task at CPU 2
relocate_block_group(rc->stage ==
MOVE_DATA_EXTENTS)
prepare_to_relocate()
btrfs_commit_transaction()
iterates the extent tree, using its
commit root and moves extents into new
locations
btrfs_add_ordered_extent_dio()
--> now a ordered extent is
created and added to the
list root->ordered_extents
and the root added to the
list fs_info->ordered_roots
--> this is too late and the
task at CPU 1 already
started the relocation
btrfs_commit_transaction()
btrfs_finish_ordered_io()
btrfs_alloc_reserved_file_extent()
--> adds delayed data reference
for the extent allocated
from bg X
relocate_block_group(rc->stage ==
UPDATE_DATA_PTRS)
prepare_to_relocate()
btrfs_commit_transaction()
--> delayed refs are run, so an extent
item for the allocated extent from
bg X is added to extent tree
--> commit roots are switched, so the
next scan in the extent tree will
see the extent item
sees the extent in the extent tree
When this happens the relocation produces the following warning when it
finishes:
[ 7260.832836] ------------[ cut here ]------------
[ 7260.834653] WARNING: CPU: 5 PID: 6765 at fs/btrfs/relocation.c:4318 btrfs_relocate_block_group+0x245/0x2a1 [btrfs]()
[ 7260.838268] Modules linked in: btrfs crc32c_generic xor ppdev raid6_pq psmouse sg acpi_cpufreq evdev i2c_piix4 tpm_tis serio_raw tpm i2c_core pcspkr parport_pc
[ 7260.850935] CPU: 5 PID: 6765 Comm: btrfs Not tainted 4.5.0-rc6-btrfs-next-28+ #1
[ 7260.852998] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[ 7260.852998] 0000000000000000 ffff88020bf57bc0 ffffffff812648b3 0000000000000000
[ 7260.852998] 0000000000000009 ffff88020bf57bf8 ffffffff81051608 ffffffffa03c1b2d
[ 7260.852998] ffff8800b2bbb800 0000000000000000 ffff8800b17bcc58 ffff8800399dd000
[ 7260.852998] Call Trace:
[ 7260.852998] [<ffffffff812648b3>] dump_stack+0x67/0x90
[ 7260.852998] [<ffffffff81051608>] warn_slowpath_common+0x99/0xb2
[ 7260.852998] [<ffffffffa03c1b2d>] ? btrfs_relocate_block_group+0x245/0x2a1 [btrfs]
[ 7260.852998] [<ffffffff810516d4>] warn_slowpath_null+0x1a/0x1c
[ 7260.852998] [<ffffffffa03c1b2d>] btrfs_relocate_block_group+0x245/0x2a1 [btrfs]
[ 7260.852998] [<ffffffffa039d9de>] btrfs_relocate_chunk.isra.29+0x66/0xdb [btrfs]
[ 7260.852998] [<ffffffffa039f314>] btrfs_balance+0xde1/0xe4e [btrfs]
[ 7260.852998] [<ffffffff8127d671>] ? debug_smp_processor_id+0x17/0x19
[ 7260.852998] [<ffffffffa03a9583>] btrfs_ioctl_balance+0x255/0x2d3 [btrfs]
[ 7260.852998] [<ffffffffa03ac96a>] btrfs_ioctl+0x11e0/0x1dff [btrfs]
[ 7260.852998] [<ffffffff811451df>] ? handle_mm_fault+0x443/0xd63
[ 7260.852998] [<ffffffff81491817>] ? _raw_spin_unlock+0x31/0x44
[ 7260.852998] [<ffffffff8108b36a>] ? arch_local_irq_save+0x9/0xc
[ 7260.852998] [<ffffffff811876ab>] vfs_ioctl+0x18/0x34
[ 7260.852998] [<ffffffff81187cb2>] do_vfs_ioctl+0x550/0x5be
[ 7260.852998] [<ffffffff81190c30>] ? __fget_light+0x4d/0x71
[ 7260.852998] [<ffffffff81187d77>] SyS_ioctl+0x57/0x79
[ 7260.852998] [<ffffffff81492017>] entry_SYSCALL_64_fastpath+0x12/0x6b
[ 7260.893268] ---[ end trace eb7803b24ebab8ad ]---
This is because at the end of the first stage, in relocate_block_group(),
we commit the current transaction, which makes delayed refs run, the
commit roots are switched and so the second stage will find the extent
item that the ordered extent added to the delayed refs. But this extent
was not moved (ordered extent completed after first stage finished), so
at the end of the relocation our block group item still has a positive
used bytes counter, triggering a warning at the end of
btrfs_relocate_block_group(). Later on when trying to read the extent
contents from disk we hit a BUG_ON() due to the inability to map a block
with a logical address that belongs to the block group we relocated and
is no longer valid, resulting in the following trace:
[ 7344.885290] BTRFS critical (device sdi): unable to find logical 12845056 len 4096
[ 7344.887518] ------------[ cut here ]------------
[ 7344.888431] kernel BUG at fs/btrfs/inode.c:1833!
[ 7344.888431] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[ 7344.888431] Modules linked in: btrfs crc32c_generic xor ppdev raid6_pq psmouse sg acpi_cpufreq evdev i2c_piix4 tpm_tis serio_raw tpm i2c_core pcspkr parport_pc
[ 7344.888431] CPU: 0 PID: 6831 Comm: od Tainted: G W 4.5.0-rc6-btrfs-next-28+ #1
[ 7344.888431] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[ 7344.888431] task: ffff880215818600 ti: ffff880204684000 task.ti: ffff880204684000
[ 7344.888431] RIP: 0010:[<ffffffffa037c88c>] [<ffffffffa037c88c>] btrfs_merge_bio_hook+0x54/0x6b [btrfs]
[ 7344.888431] RSP: 0018:ffff8802046878f0 EFLAGS: 00010282
[ 7344.888431] RAX: 00000000ffffffea RBX: 0000000000001000 RCX: 0000000000000001
[ 7344.888431] RDX: ffff88023ec0f950 RSI: ffffffff8183b638 RDI: 00000000ffffffff
[ 7344.888431] RBP: ffff880204687908 R08: 0000000000000001 R09: 0000000000000000
[ 7344.888431] R10: ffff880204687770 R11: ffffffff82f2d52d R12: 0000000000001000
[ 7344.888431] R13: ffff88021afbfee8 R14: 0000000000006208 R15: ffff88006cd199b0
[ 7344.888431] FS: 00007f1f9e1d6700(0000) GS:ffff88023ec00000(0000) knlGS:0000000000000000
[ 7344.888431] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 7344.888431] CR2: 00007f1f9dc8cb60 CR3: 000000023e3b6000 CR4: 00000000000006f0
[ 7344.888431] Stack:
[ 7344.888431] 0000000000001000 0000000000001000 ffff880204687b98 ffff880204687950
[ 7344.888431] ffffffffa0395c8f ffffea0004d64d48 0000000000000000 0000000000001000
[ 7344.888431] ffffea0004d64d48 0000000000001000 0000000000000000 0000000000000000
[ 7344.888431] Call Trace:
[ 7344.888431] [<ffffffffa0395c8f>] submit_extent_page+0xf5/0x16f [btrfs]
[ 7344.888431] [<ffffffffa03970ac>] __do_readpage+0x4a0/0x4f1 [btrfs]
[ 7344.888431] [<ffffffffa039680d>] ? btrfs_create_repair_bio+0xcb/0xcb [btrfs]
[ 7344.888431] [<ffffffffa037eeb4>] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs]
[ 7344.888431] [<ffffffff8108df55>] ? trace_hardirqs_on+0xd/0xf
[ 7344.888431] [<ffffffffa039728c>] __do_contiguous_readpages.constprop.26+0xc2/0xe4 [btrfs]
[ 7344.888431] [<ffffffffa037eeb4>] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs]
[ 7344.888431] [<ffffffffa039739b>] __extent_readpages.constprop.25+0xed/0x100 [btrfs]
[ 7344.888431] [<ffffffff81129d24>] ? lru_cache_add+0xe/0x10
[ 7344.888431] [<ffffffffa0397ea8>] extent_readpages+0x160/0x1aa [btrfs]
[ 7344.888431] [<ffffffffa037eeb4>] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs]
[ 7344.888431] [<ffffffff8115daad>] ? alloc_pages_current+0xa9/0xcd
[ 7344.888431] [<ffffffffa037cdc9>] btrfs_readpages+0x1f/0x21 [btrfs]
[ 7344.888431] [<ffffffff81128316>] __do_page_cache_readahead+0x168/0x1fc
[ 7344.888431] [<ffffffff811285a0>] ondemand_readahead+0x1f6/0x207
[ 7344.888431] [<ffffffff811285a0>] ? ondemand_readahead+0x1f6/0x207
[ 7344.888431] [<ffffffff8111cf34>] ? pagecache_get_page+0x2b/0x154
[ 7344.888431] [<ffffffff8112870e>] page_cache_sync_readahead+0x3d/0x3f
[ 7344.888431] [<ffffffff8111dbf7>] generic_file_read_iter+0x197/0x4e1
[ 7344.888431] [<ffffffff8117773a>] __vfs_read+0x79/0x9d
[ 7344.888431] [<ffffffff81178050>] vfs_read+0x8f/0xd2
[ 7344.888431] [<ffffffff81178a38>] SyS_read+0x50/0x7e
[ 7344.888431] [<ffffffff81492017>] entry_SYSCALL_64_fastpath+0x12/0x6b
[ 7344.888431] Code: 8d 4d e8 45 31 c9 45 31 c0 48 8b 00 48 c1 e2 09 48 8b 80 80 fc ff ff 4c 89 65 e8 48 8b b8 f0 01 00 00 e8 1d 42 02 00 85 c0 79 02 <0f> 0b 4c 0
[ 7344.888431] RIP [<ffffffffa037c88c>] btrfs_merge_bio_hook+0x54/0x6b [btrfs]
[ 7344.888431] RSP <ffff8802046878f0>
[ 7344.970544] ---[ end trace eb7803b24ebab8ae ]---
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
2016-04-26 22:39:32 +08:00
|
|
|
btrfs_put_block_group(bg);
|
|
|
|
}
|
|
|
|
|
|
|
|
void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
|
|
|
|
{
|
|
|
|
struct btrfs_space_info *space_info = bg->space_info;
|
|
|
|
|
|
|
|
ASSERT(bg->ro);
|
|
|
|
|
|
|
|
if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Our block group is read only but before we set it to read only,
|
|
|
|
* some task might have had allocated an extent from it already, but it
|
|
|
|
* has not yet created a respective ordered extent (and added it to a
|
|
|
|
* root's list of ordered extents).
|
|
|
|
* Therefore wait for any task currently allocating extents, since the
|
|
|
|
* block group's reservations counter is incremented while a read lock
|
|
|
|
* on the groups' semaphore is held and decremented after releasing
|
|
|
|
* the read access on that semaphore and creating the ordered extent.
|
|
|
|
*/
|
|
|
|
down_write(&space_info->groups_sem);
|
|
|
|
up_write(&space_info->groups_sem);
|
|
|
|
|
2018-03-15 18:43:08 +08:00
|
|
|
wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
|
Btrfs: don't do unnecessary delalloc flushes when relocating
Before we start the actual relocation process of a block group, we do
calls to flush delalloc of all inodes and then wait for ordered extents
to complete. However we do these flush calls just to make sure we don't
race with concurrent tasks that have actually already started to run
delalloc and have allocated an extent from the block group we want to
relocate, right before we set it to readonly mode, but have not yet
created the respective ordered extents. The flush calls make us wait
for such concurrent tasks because they end up calling
filemap_fdatawrite_range() (through btrfs_start_delalloc_roots() ->
__start_delalloc_inodes() -> btrfs_alloc_delalloc_work() ->
btrfs_run_delalloc_work()) which ends up serializing us with those tasks
due to attempts to lock the same pages (and the delalloc flush procedure
calls the allocator and creates the ordered extents before unlocking the
pages).
These flushing calls not only make us waste time (cpu, IO) but also reduce
the chances of writing larger extents (applications might be writing to
contiguous ranges and we flush before they finish dirtying the whole
ranges).
So make sure we don't flush delalloc and just wait for concurrent tasks
that have already started flushing delalloc and have allocated an extent
from the block group we are about to relocate.
This change also ends up fixing a race with direct IO writes that makes
relocation not wait for direct IO ordered extents. This race is
illustrated by the following diagram:
CPU 1 CPU 2
btrfs_relocate_block_group(bg X)
starts direct IO write,
target inode currently has no
ordered extents ongoing nor
dirty pages (delalloc regions),
therefore the root for our inode
is not in the list
fs_info->ordered_roots
btrfs_direct_IO()
__blockdev_direct_IO()
btrfs_get_blocks_direct()
btrfs_lock_extent_direct()
locks range in the io tree
btrfs_new_extent_direct()
btrfs_reserve_extent()
--> extent allocated
from bg X
btrfs_inc_block_group_ro(bg X)
btrfs_start_delalloc_roots()
__start_delalloc_inodes()
--> does nothing, no dealloc ranges
in the inode's io tree so the
inode's root is not in the list
fs_info->delalloc_roots
btrfs_wait_ordered_roots()
--> does not find the inode's root in the
list fs_info->ordered_roots
--> ends up not waiting for the direct IO
write started by the task at CPU 2
relocate_block_group(rc->stage ==
MOVE_DATA_EXTENTS)
prepare_to_relocate()
btrfs_commit_transaction()
iterates the extent tree, using its
commit root and moves extents into new
locations
btrfs_add_ordered_extent_dio()
--> now a ordered extent is
created and added to the
list root->ordered_extents
and the root added to the
list fs_info->ordered_roots
--> this is too late and the
task at CPU 1 already
started the relocation
btrfs_commit_transaction()
btrfs_finish_ordered_io()
btrfs_alloc_reserved_file_extent()
--> adds delayed data reference
for the extent allocated
from bg X
relocate_block_group(rc->stage ==
UPDATE_DATA_PTRS)
prepare_to_relocate()
btrfs_commit_transaction()
--> delayed refs are run, so an extent
item for the allocated extent from
bg X is added to extent tree
--> commit roots are switched, so the
next scan in the extent tree will
see the extent item
sees the extent in the extent tree
When this happens the relocation produces the following warning when it
finishes:
[ 7260.832836] ------------[ cut here ]------------
[ 7260.834653] WARNING: CPU: 5 PID: 6765 at fs/btrfs/relocation.c:4318 btrfs_relocate_block_group+0x245/0x2a1 [btrfs]()
[ 7260.838268] Modules linked in: btrfs crc32c_generic xor ppdev raid6_pq psmouse sg acpi_cpufreq evdev i2c_piix4 tpm_tis serio_raw tpm i2c_core pcspkr parport_pc
[ 7260.850935] CPU: 5 PID: 6765 Comm: btrfs Not tainted 4.5.0-rc6-btrfs-next-28+ #1
[ 7260.852998] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[ 7260.852998] 0000000000000000 ffff88020bf57bc0 ffffffff812648b3 0000000000000000
[ 7260.852998] 0000000000000009 ffff88020bf57bf8 ffffffff81051608 ffffffffa03c1b2d
[ 7260.852998] ffff8800b2bbb800 0000000000000000 ffff8800b17bcc58 ffff8800399dd000
[ 7260.852998] Call Trace:
[ 7260.852998] [<ffffffff812648b3>] dump_stack+0x67/0x90
[ 7260.852998] [<ffffffff81051608>] warn_slowpath_common+0x99/0xb2
[ 7260.852998] [<ffffffffa03c1b2d>] ? btrfs_relocate_block_group+0x245/0x2a1 [btrfs]
[ 7260.852998] [<ffffffff810516d4>] warn_slowpath_null+0x1a/0x1c
[ 7260.852998] [<ffffffffa03c1b2d>] btrfs_relocate_block_group+0x245/0x2a1 [btrfs]
[ 7260.852998] [<ffffffffa039d9de>] btrfs_relocate_chunk.isra.29+0x66/0xdb [btrfs]
[ 7260.852998] [<ffffffffa039f314>] btrfs_balance+0xde1/0xe4e [btrfs]
[ 7260.852998] [<ffffffff8127d671>] ? debug_smp_processor_id+0x17/0x19
[ 7260.852998] [<ffffffffa03a9583>] btrfs_ioctl_balance+0x255/0x2d3 [btrfs]
[ 7260.852998] [<ffffffffa03ac96a>] btrfs_ioctl+0x11e0/0x1dff [btrfs]
[ 7260.852998] [<ffffffff811451df>] ? handle_mm_fault+0x443/0xd63
[ 7260.852998] [<ffffffff81491817>] ? _raw_spin_unlock+0x31/0x44
[ 7260.852998] [<ffffffff8108b36a>] ? arch_local_irq_save+0x9/0xc
[ 7260.852998] [<ffffffff811876ab>] vfs_ioctl+0x18/0x34
[ 7260.852998] [<ffffffff81187cb2>] do_vfs_ioctl+0x550/0x5be
[ 7260.852998] [<ffffffff81190c30>] ? __fget_light+0x4d/0x71
[ 7260.852998] [<ffffffff81187d77>] SyS_ioctl+0x57/0x79
[ 7260.852998] [<ffffffff81492017>] entry_SYSCALL_64_fastpath+0x12/0x6b
[ 7260.893268] ---[ end trace eb7803b24ebab8ad ]---
This is because at the end of the first stage, in relocate_block_group(),
we commit the current transaction, which makes delayed refs run, the
commit roots are switched and so the second stage will find the extent
item that the ordered extent added to the delayed refs. But this extent
was not moved (ordered extent completed after first stage finished), so
at the end of the relocation our block group item still has a positive
used bytes counter, triggering a warning at the end of
btrfs_relocate_block_group(). Later on when trying to read the extent
contents from disk we hit a BUG_ON() due to the inability to map a block
with a logical address that belongs to the block group we relocated and
is no longer valid, resulting in the following trace:
[ 7344.885290] BTRFS critical (device sdi): unable to find logical 12845056 len 4096
[ 7344.887518] ------------[ cut here ]------------
[ 7344.888431] kernel BUG at fs/btrfs/inode.c:1833!
[ 7344.888431] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[ 7344.888431] Modules linked in: btrfs crc32c_generic xor ppdev raid6_pq psmouse sg acpi_cpufreq evdev i2c_piix4 tpm_tis serio_raw tpm i2c_core pcspkr parport_pc
[ 7344.888431] CPU: 0 PID: 6831 Comm: od Tainted: G W 4.5.0-rc6-btrfs-next-28+ #1
[ 7344.888431] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[ 7344.888431] task: ffff880215818600 ti: ffff880204684000 task.ti: ffff880204684000
[ 7344.888431] RIP: 0010:[<ffffffffa037c88c>] [<ffffffffa037c88c>] btrfs_merge_bio_hook+0x54/0x6b [btrfs]
[ 7344.888431] RSP: 0018:ffff8802046878f0 EFLAGS: 00010282
[ 7344.888431] RAX: 00000000ffffffea RBX: 0000000000001000 RCX: 0000000000000001
[ 7344.888431] RDX: ffff88023ec0f950 RSI: ffffffff8183b638 RDI: 00000000ffffffff
[ 7344.888431] RBP: ffff880204687908 R08: 0000000000000001 R09: 0000000000000000
[ 7344.888431] R10: ffff880204687770 R11: ffffffff82f2d52d R12: 0000000000001000
[ 7344.888431] R13: ffff88021afbfee8 R14: 0000000000006208 R15: ffff88006cd199b0
[ 7344.888431] FS: 00007f1f9e1d6700(0000) GS:ffff88023ec00000(0000) knlGS:0000000000000000
[ 7344.888431] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 7344.888431] CR2: 00007f1f9dc8cb60 CR3: 000000023e3b6000 CR4: 00000000000006f0
[ 7344.888431] Stack:
[ 7344.888431] 0000000000001000 0000000000001000 ffff880204687b98 ffff880204687950
[ 7344.888431] ffffffffa0395c8f ffffea0004d64d48 0000000000000000 0000000000001000
[ 7344.888431] ffffea0004d64d48 0000000000001000 0000000000000000 0000000000000000
[ 7344.888431] Call Trace:
[ 7344.888431] [<ffffffffa0395c8f>] submit_extent_page+0xf5/0x16f [btrfs]
[ 7344.888431] [<ffffffffa03970ac>] __do_readpage+0x4a0/0x4f1 [btrfs]
[ 7344.888431] [<ffffffffa039680d>] ? btrfs_create_repair_bio+0xcb/0xcb [btrfs]
[ 7344.888431] [<ffffffffa037eeb4>] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs]
[ 7344.888431] [<ffffffff8108df55>] ? trace_hardirqs_on+0xd/0xf
[ 7344.888431] [<ffffffffa039728c>] __do_contiguous_readpages.constprop.26+0xc2/0xe4 [btrfs]
[ 7344.888431] [<ffffffffa037eeb4>] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs]
[ 7344.888431] [<ffffffffa039739b>] __extent_readpages.constprop.25+0xed/0x100 [btrfs]
[ 7344.888431] [<ffffffff81129d24>] ? lru_cache_add+0xe/0x10
[ 7344.888431] [<ffffffffa0397ea8>] extent_readpages+0x160/0x1aa [btrfs]
[ 7344.888431] [<ffffffffa037eeb4>] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs]
[ 7344.888431] [<ffffffff8115daad>] ? alloc_pages_current+0xa9/0xcd
[ 7344.888431] [<ffffffffa037cdc9>] btrfs_readpages+0x1f/0x21 [btrfs]
[ 7344.888431] [<ffffffff81128316>] __do_page_cache_readahead+0x168/0x1fc
[ 7344.888431] [<ffffffff811285a0>] ondemand_readahead+0x1f6/0x207
[ 7344.888431] [<ffffffff811285a0>] ? ondemand_readahead+0x1f6/0x207
[ 7344.888431] [<ffffffff8111cf34>] ? pagecache_get_page+0x2b/0x154
[ 7344.888431] [<ffffffff8112870e>] page_cache_sync_readahead+0x3d/0x3f
[ 7344.888431] [<ffffffff8111dbf7>] generic_file_read_iter+0x197/0x4e1
[ 7344.888431] [<ffffffff8117773a>] __vfs_read+0x79/0x9d
[ 7344.888431] [<ffffffff81178050>] vfs_read+0x8f/0xd2
[ 7344.888431] [<ffffffff81178a38>] SyS_read+0x50/0x7e
[ 7344.888431] [<ffffffff81492017>] entry_SYSCALL_64_fastpath+0x12/0x6b
[ 7344.888431] Code: 8d 4d e8 45 31 c9 45 31 c0 48 8b 00 48 c1 e2 09 48 8b 80 80 fc ff ff 4c 89 65 e8 48 8b b8 f0 01 00 00 e8 1d 42 02 00 85 c0 79 02 <0f> 0b 4c 0
[ 7344.888431] RIP [<ffffffffa037c88c>] btrfs_merge_bio_hook+0x54/0x6b [btrfs]
[ 7344.888431] RSP <ffff8802046878f0>
[ 7344.970544] ---[ end trace eb7803b24ebab8ae ]---
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
2016-04-26 22:39:32 +08:00
|
|
|
}
|
|
|
|
|
2011-07-27 05:00:46 +08:00
|
|
|
/**
|
2016-07-25 15:51:39 +08:00
|
|
|
* btrfs_add_reserved_bytes - update the block_group and space info counters
|
2011-07-27 05:00:46 +08:00
|
|
|
* @cache: The cache we are manipulating
|
btrfs: update btrfs_space_info's bytes_may_use timely
This patch can fix some false ENOSPC errors, below test script can
reproduce one false ENOSPC error:
#!/bin/bash
dd if=/dev/zero of=fs.img bs=$((1024*1024)) count=128
dev=$(losetup --show -f fs.img)
mkfs.btrfs -f -M $dev
mkdir /tmp/mntpoint
mount $dev /tmp/mntpoint
cd /tmp/mntpoint
xfs_io -f -c "falloc 0 $((64*1024*1024))" testfile
Above script will fail for ENOSPC reason, but indeed fs still has free
space to satisfy this request. Please see call graph:
btrfs_fallocate()
|-> btrfs_alloc_data_chunk_ondemand()
| bytes_may_use += 64M
|-> btrfs_prealloc_file_range()
|-> btrfs_reserve_extent()
|-> btrfs_add_reserved_bytes()
| alloc_type is RESERVE_ALLOC_NO_ACCOUNT, so it does not
| change bytes_may_use, and bytes_reserved += 64M. Now
| bytes_may_use + bytes_reserved == 128M, which is greater
| than btrfs_space_info's total_bytes, false enospc occurs.
| Note, the bytes_may_use decrease operation will be done in
| end of btrfs_fallocate(), which is too late.
Here is another simple case for buffered write:
CPU 1 | CPU 2
|
|-> cow_file_range() |-> __btrfs_buffered_write()
|-> btrfs_reserve_extent() | |
| | |
| | |
| ..... | |-> btrfs_check_data_free_space()
| |
| |
|-> extent_clear_unlock_delalloc() |
In CPU 1, btrfs_reserve_extent()->find_free_extent()->
btrfs_add_reserved_bytes() do not decrease bytes_may_use, the decrease
operation will be delayed to be done in extent_clear_unlock_delalloc().
Assume in this case, btrfs_reserve_extent() reserved 128MB data, CPU2's
btrfs_check_data_free_space() tries to reserve 100MB data space.
If
100MB > data_sinfo->total_bytes - data_sinfo->bytes_used -
data_sinfo->bytes_reserved - data_sinfo->bytes_pinned -
data_sinfo->bytes_readonly - data_sinfo->bytes_may_use
btrfs_check_data_free_space() will try to allcate new data chunk or call
btrfs_start_delalloc_roots(), or commit current transaction in order to
reserve some free space, obviously a lot of work. But indeed it's not
necessary as long as decreasing bytes_may_use timely, we still have
free space, decreasing 128M from bytes_may_use.
To fix this issue, this patch chooses to update bytes_may_use for both
data and metadata in btrfs_add_reserved_bytes(). For compress path, real
extent length may not be equal to file content length, so introduce a
ram_bytes argument for btrfs_reserve_extent(), find_free_extent() and
btrfs_add_reserved_bytes(), it's becasue bytes_may_use is increased by
file content length. Then compress path can update bytes_may_use
correctly. Also now we can discard RESERVE_ALLOC_NO_ACCOUNT, RESERVE_ALLOC
and RESERVE_FREE.
As we know, usually EXTENT_DO_ACCOUNTING is used for error path. In
run_delalloc_nocow(), for inode marked as NODATACOW or extent marked as
PREALLOC, we also need to update bytes_may_use, but can not pass
EXTENT_DO_ACCOUNTING, because it also clears metadata reservation, so
here we introduce EXTENT_CLEAR_DATA_RESV flag to indicate btrfs_clear_bit_hook()
to update btrfs_space_info's bytes_may_use.
Meanwhile __btrfs_prealloc_file_range() will call
btrfs_free_reserved_data_space() internally for both sucessful and failed
path, btrfs_prealloc_file_range()'s callers does not need to call
btrfs_free_reserved_data_space() any more.
Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-07-25 15:51:40 +08:00
|
|
|
* @ram_bytes: The number of bytes of file content, and will be same to
|
|
|
|
* @num_bytes except for the compress path.
|
2011-07-27 05:00:46 +08:00
|
|
|
* @num_bytes: The number of bytes in question
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
* @delalloc: The blocks are allocated for the delalloc write
|
2011-07-27 05:00:46 +08:00
|
|
|
*
|
2016-09-23 12:38:50 +08:00
|
|
|
* This is called by the allocator when it reserves space. If this is a
|
|
|
|
* reservation and the block group has become read only we cannot make the
|
|
|
|
* reservation and return -EAGAIN, otherwise this function always succeeds.
|
2010-05-16 22:46:25 +08:00
|
|
|
*/
|
2016-07-25 15:51:39 +08:00
|
|
|
static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
|
btrfs: update btrfs_space_info's bytes_may_use timely
This patch can fix some false ENOSPC errors, below test script can
reproduce one false ENOSPC error:
#!/bin/bash
dd if=/dev/zero of=fs.img bs=$((1024*1024)) count=128
dev=$(losetup --show -f fs.img)
mkfs.btrfs -f -M $dev
mkdir /tmp/mntpoint
mount $dev /tmp/mntpoint
cd /tmp/mntpoint
xfs_io -f -c "falloc 0 $((64*1024*1024))" testfile
Above script will fail for ENOSPC reason, but indeed fs still has free
space to satisfy this request. Please see call graph:
btrfs_fallocate()
|-> btrfs_alloc_data_chunk_ondemand()
| bytes_may_use += 64M
|-> btrfs_prealloc_file_range()
|-> btrfs_reserve_extent()
|-> btrfs_add_reserved_bytes()
| alloc_type is RESERVE_ALLOC_NO_ACCOUNT, so it does not
| change bytes_may_use, and bytes_reserved += 64M. Now
| bytes_may_use + bytes_reserved == 128M, which is greater
| than btrfs_space_info's total_bytes, false enospc occurs.
| Note, the bytes_may_use decrease operation will be done in
| end of btrfs_fallocate(), which is too late.
Here is another simple case for buffered write:
CPU 1 | CPU 2
|
|-> cow_file_range() |-> __btrfs_buffered_write()
|-> btrfs_reserve_extent() | |
| | |
| | |
| ..... | |-> btrfs_check_data_free_space()
| |
| |
|-> extent_clear_unlock_delalloc() |
In CPU 1, btrfs_reserve_extent()->find_free_extent()->
btrfs_add_reserved_bytes() do not decrease bytes_may_use, the decrease
operation will be delayed to be done in extent_clear_unlock_delalloc().
Assume in this case, btrfs_reserve_extent() reserved 128MB data, CPU2's
btrfs_check_data_free_space() tries to reserve 100MB data space.
If
100MB > data_sinfo->total_bytes - data_sinfo->bytes_used -
data_sinfo->bytes_reserved - data_sinfo->bytes_pinned -
data_sinfo->bytes_readonly - data_sinfo->bytes_may_use
btrfs_check_data_free_space() will try to allcate new data chunk or call
btrfs_start_delalloc_roots(), or commit current transaction in order to
reserve some free space, obviously a lot of work. But indeed it's not
necessary as long as decreasing bytes_may_use timely, we still have
free space, decreasing 128M from bytes_may_use.
To fix this issue, this patch chooses to update bytes_may_use for both
data and metadata in btrfs_add_reserved_bytes(). For compress path, real
extent length may not be equal to file content length, so introduce a
ram_bytes argument for btrfs_reserve_extent(), find_free_extent() and
btrfs_add_reserved_bytes(), it's becasue bytes_may_use is increased by
file content length. Then compress path can update bytes_may_use
correctly. Also now we can discard RESERVE_ALLOC_NO_ACCOUNT, RESERVE_ALLOC
and RESERVE_FREE.
As we know, usually EXTENT_DO_ACCOUNTING is used for error path. In
run_delalloc_nocow(), for inode marked as NODATACOW or extent marked as
PREALLOC, we also need to update bytes_may_use, but can not pass
EXTENT_DO_ACCOUNTING, because it also clears metadata reservation, so
here we introduce EXTENT_CLEAR_DATA_RESV flag to indicate btrfs_clear_bit_hook()
to update btrfs_space_info's bytes_may_use.
Meanwhile __btrfs_prealloc_file_range() will call
btrfs_free_reserved_data_space() internally for both sucessful and failed
path, btrfs_prealloc_file_range()'s callers does not need to call
btrfs_free_reserved_data_space() any more.
Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-07-25 15:51:40 +08:00
|
|
|
u64 ram_bytes, u64 num_bytes, int delalloc)
|
2009-09-12 04:11:19 +08:00
|
|
|
{
|
2011-07-27 05:00:46 +08:00
|
|
|
struct btrfs_space_info *space_info = cache->space_info;
|
2010-05-16 22:46:25 +08:00
|
|
|
int ret = 0;
|
2012-03-12 23:03:00 +08:00
|
|
|
|
2011-07-27 05:00:46 +08:00
|
|
|
spin_lock(&space_info->lock);
|
|
|
|
spin_lock(&cache->lock);
|
2016-07-25 15:51:39 +08:00
|
|
|
if (cache->ro) {
|
|
|
|
ret = -EAGAIN;
|
2011-07-27 05:00:46 +08:00
|
|
|
} else {
|
2016-07-25 15:51:39 +08:00
|
|
|
cache->reserved += num_bytes;
|
|
|
|
space_info->bytes_reserved += num_bytes;
|
2019-06-19 04:09:21 +08:00
|
|
|
btrfs_space_info_update_bytes_may_use(cache->fs_info,
|
|
|
|
space_info, -ram_bytes);
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
if (delalloc)
|
2016-07-25 15:51:39 +08:00
|
|
|
cache->delalloc_bytes += num_bytes;
|
2007-11-17 03:57:08 +08:00
|
|
|
}
|
2011-07-27 05:00:46 +08:00
|
|
|
spin_unlock(&cache->lock);
|
|
|
|
spin_unlock(&space_info->lock);
|
2010-05-16 22:46:25 +08:00
|
|
|
return ret;
|
2007-11-17 03:57:08 +08:00
|
|
|
}
|
2007-04-27 04:46:15 +08:00
|
|
|
|
2016-07-25 15:51:39 +08:00
|
|
|
/**
|
|
|
|
* btrfs_free_reserved_bytes - update the block_group and space info counters
|
|
|
|
* @cache: The cache we are manipulating
|
|
|
|
* @num_bytes: The number of bytes in question
|
|
|
|
* @delalloc: The blocks are allocated for the delalloc write
|
|
|
|
*
|
|
|
|
* This is called by somebody who is freeing space that was never actually used
|
|
|
|
* on disk. For example if you reserve some space for a new leaf in transaction
|
|
|
|
* A and before transaction A commits you free that leaf, you call this with
|
|
|
|
* reserve set to 0 in order to clear the reservation.
|
|
|
|
*/
|
|
|
|
|
2018-08-17 00:37:14 +08:00
|
|
|
static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
|
|
|
|
u64 num_bytes, int delalloc)
|
2016-07-25 15:51:39 +08:00
|
|
|
{
|
|
|
|
struct btrfs_space_info *space_info = cache->space_info;
|
|
|
|
|
|
|
|
spin_lock(&space_info->lock);
|
|
|
|
spin_lock(&cache->lock);
|
|
|
|
if (cache->ro)
|
|
|
|
space_info->bytes_readonly += num_bytes;
|
|
|
|
cache->reserved -= num_bytes;
|
|
|
|
space_info->bytes_reserved -= num_bytes;
|
2018-10-12 03:54:03 +08:00
|
|
|
space_info->max_extent_size = 0;
|
2016-07-25 15:51:39 +08:00
|
|
|
|
|
|
|
if (delalloc)
|
|
|
|
cache->delalloc_bytes -= num_bytes;
|
|
|
|
spin_unlock(&cache->lock);
|
|
|
|
spin_unlock(&space_info->lock);
|
|
|
|
}
|
2017-02-11 02:20:56 +08:00
|
|
|
void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
|
2008-09-26 22:05:48 +08:00
|
|
|
{
|
2009-09-12 04:11:19 +08:00
|
|
|
struct btrfs_caching_control *next;
|
|
|
|
struct btrfs_caching_control *caching_ctl;
|
|
|
|
struct btrfs_block_group_cache *cache;
|
2008-09-26 22:05:48 +08:00
|
|
|
|
2014-03-14 03:42:13 +08:00
|
|
|
down_write(&fs_info->commit_root_sem);
|
Btrfs: nuke fs wide allocation mutex V2
This patch removes the giant fs_info->alloc_mutex and replaces it with a bunch
of little locks.
There is now a pinned_mutex, which is used when messing with the pinned_extents
extent io tree, and the extent_ins_mutex which is used with the pending_del and
extent_ins extent io trees.
The locking for the extent tree stuff was inspired by a patch that Yan Zheng
wrote to fix a race condition, I cleaned it up some and changed the locking
around a little bit, but the idea remains the same. Basically instead of
holding the extent_ins_mutex throughout the processing of an extent on the
extent_ins or pending_del trees, we just hold it while we're searching and when
we clear the bits on those trees, and lock the extent for the duration of the
operations on the extent.
Also to keep from getting hung up waiting to lock an extent, I've added a
try_lock_extent so if we cannot lock the extent, move on to the next one in the
tree and we'll come back to that one. I have tested this heavily and it does
not appear to break anything. This has to be applied on top of my
find_free_extent redo patch.
I tested this patch on top of Yan's space reblancing code and it worked fine.
The only thing that has changed since the last version is I pulled out all my
debugging stuff, apparently I forgot to run guilt refresh before I sent the
last patch out. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
2008-10-30 02:49:05 +08:00
|
|
|
|
2009-09-12 04:11:19 +08:00
|
|
|
list_for_each_entry_safe(caching_ctl, next,
|
|
|
|
&fs_info->caching_block_groups, list) {
|
|
|
|
cache = caching_ctl->block_group;
|
|
|
|
if (block_group_cache_done(cache)) {
|
|
|
|
cache->last_byte_to_unpin = (u64)-1;
|
|
|
|
list_del_init(&caching_ctl->list);
|
|
|
|
put_caching_control(caching_ctl);
|
2008-09-26 22:05:48 +08:00
|
|
|
} else {
|
2009-09-12 04:11:19 +08:00
|
|
|
cache->last_byte_to_unpin = caching_ctl->progress;
|
2008-09-26 22:05:48 +08:00
|
|
|
}
|
|
|
|
}
|
2009-09-12 04:11:19 +08:00
|
|
|
|
|
|
|
if (fs_info->pinned_extents == &fs_info->freed_extents[0])
|
|
|
|
fs_info->pinned_extents = &fs_info->freed_extents[1];
|
|
|
|
else
|
|
|
|
fs_info->pinned_extents = &fs_info->freed_extents[0];
|
|
|
|
|
2014-03-14 03:42:13 +08:00
|
|
|
up_write(&fs_info->commit_root_sem);
|
2010-05-16 22:49:58 +08:00
|
|
|
|
2019-06-20 01:47:23 +08:00
|
|
|
btrfs_update_global_block_rsv(fs_info);
|
2008-09-26 22:05:48 +08:00
|
|
|
}
|
|
|
|
|
2015-10-03 03:25:10 +08:00
|
|
|
/*
|
|
|
|
* Returns the free cluster for the given space info and sets empty_cluster to
|
|
|
|
* what it should be based on the mount options.
|
|
|
|
*/
|
|
|
|
static struct btrfs_free_cluster *
|
2016-06-23 06:54:24 +08:00
|
|
|
fetch_cluster_info(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_space_info *space_info, u64 *empty_cluster)
|
2015-10-03 03:25:10 +08:00
|
|
|
{
|
|
|
|
struct btrfs_free_cluster *ret = NULL;
|
|
|
|
|
|
|
|
*empty_cluster = 0;
|
|
|
|
if (btrfs_mixed_space_info(space_info))
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
|
2016-06-23 06:54:23 +08:00
|
|
|
ret = &fs_info->meta_alloc_cluster;
|
btrfs: Do not use data_alloc_cluster in ssd mode
This patch provides a band aid to improve the 'out of the box'
behaviour of btrfs for disks that are detected as being an ssd. In a
general purpose mixed workload scenario, the current ssd mode causes
overallocation of available raw disk space for data, while leaving
behind increasing amounts of unused fragmented free space. This
situation leads to early ENOSPC problems which are harming user
experience and adoption of btrfs as a general purpose filesystem.
This patch modifies the data extent allocation behaviour of the ssd mode
to make it behave identical to nossd mode. The metadata behaviour and
additional ssd_spread option stay untouched so far.
Recommendations for future development are to reconsider the current
oversimplified nossd / ssd distinction and the broken detection
mechanism based on the rotational attribute in sysfs and provide
experienced users with a more flexible way to choose allocator behaviour
for data and metadata, optimized for certain use cases, while keeping
sane 'out of the box' default settings. The internals of the current
btrfs code have more potential than what currently gets exposed to the
user to choose from.
The SSD story...
In the first year of btrfs development, around early 2008, btrfs
gained a mount option which enables specific functionality for
filesystems on solid state devices. The first occurance of this
functionality is in commit e18e4809, labeled "Add mount -o ssd, which
includes optimizations for seek free storage".
The effect on allocating free space for doing (data) writes is to
'cluster' writes together, writing them out in contiguous space, as
opposed to a 'tetris' way of putting all separate writes into any free
space fragment that fits (which is what the -o nossd behaviour does).
A somewhat simplified explanation of what happens is that, when for
example, the 'cluster' size is set to 2MiB, when we do some writes, the
data allocator will search for a free space block that is 2MiB big, and
put the writes in there. The ssd mode itself might allow a 2MiB cluster
to be composed of multiple free space extents with some existing data in
between, while the additional ssd_spread mount option kills off this
option and requires fully free space.
The idea behind this is (commit 536ac8ae): "The [...] clusters make it
more likely a given IO will completely overwrite the ssd block, so it
doesn't have to do an internal rwm cycle."; ssd block meaning nand erase
block. So, effectively this means applying a "locality based algorithm"
and trying to outsmart the actual ssd.
Since then, various changes have been made to the involved code, but the
basic idea is still present, and gets activated whenever the ssd mount
option is active. This also happens by default, when the rotational flag
as seen at /sys/block/<device>/queue/rotational is set to 0.
However, there's a number of problems with this approach.
First, what the optimization is trying to do is outsmart the ssd by
assuming there is a relation between the physical address space of the
block device as seen by btrfs and the actual physical storage of the
ssd, and then adjusting data placement. However, since the introduction
of the Flash Translation Layer (FTL) which is a part of the internal
controller of an ssd, these attempts are futile. The use of good quality
FTL in consumer ssd products might have been limited in 2008, but this
situation has changed drastically soon after that time. Today, even the
flash memory in your automatic cat feeding machine or your grandma's
wheelchair has a full featured one.
Second, the behaviour as described above results in the filesystem being
filled up with badly fragmented free space extents because of relatively
small pieces of space that are freed up by deletes, but not selected
again as part of a 'cluster'. Since the algorithm prefers allocating a
new chunk over going back to tetris mode, the end result is a filesystem
in which all raw space is allocated, but which is composed of
underutilized chunks with a 'shotgun blast' pattern of fragmented free
space. Usually, the next problematic thing that happens is the
filesystem wanting to allocate new space for metadata, which causes the
filesystem to fail in spectacular ways.
Third, the default mount options you get for an ssd ('ssd' mode enabled,
'discard' not enabled), in combination with spreading out writes over
the full address space and ignoring freed up space leads to worst case
behaviour in providing information to the ssd itself, since it will
never learn that all the free space left behind is actually free. There
are two ways to let an ssd know previously written data does not have to
be preserved, which are sending explicit signals using discard or
fstrim, or by simply overwriting the space with new data. The worst
case behaviour is the btrfs ssd_spread mount option in combination with
not having discard enabled. It has a side effect of minimizing the reuse
of free space previously written in.
Fourth, the rotational flag in /sys/ does not reliably indicate if the
device is a locally attached ssd. For example, iSCSI or NBD displays as
non-rotational, while a loop device on an ssd shows up as rotational.
The combination of the second and third problem effectively means that
despite all the good intentions, the btrfs ssd mode reliably causes the
ssd hardware and the filesystem structures and performance to be choked
to death. The clickbait version of the title of this story would have
been "Btrfs ssd optimizations considered harmful for ssds".
The current nossd 'tetris' mode (even still without discard) allows a
pattern of overwriting much more previously used space, causing many
more implicit discards to happen because of the overwrite information
the ssd gets. The actual location in the physical address space, as seen
from the point of view of btrfs is irrelevant, because the actual writes
to the low level flash are reordered anyway thanks to the FTL.
Changes made in the code
1. Make ssd mode data allocation identical to tetris mode, like nossd.
2. Adjust and clean up filesystem mount messages so that we can easily
identify if a kernel has this patch applied or not, when providing
support to end users. Also, make better use of the *_and_info helpers to
only trigger messages on actual state changes.
Backporting notes
Notes for whoever wants to backport this patch to their 4.9 LTS kernel:
* First apply commit 951e7966 "btrfs: drop the nossd flag when
remounting with -o ssd", or fixup the differences manually.
* The rest of the conflicts are because of the fs_info refactoring. So,
for example, instead of using fs_info, it's root->fs_info in
extent-tree.c
Signed-off-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-07-28 14:31:28 +08:00
|
|
|
if (btrfs_test_opt(fs_info, SSD))
|
|
|
|
*empty_cluster = SZ_2M;
|
|
|
|
else
|
2015-12-15 00:42:10 +08:00
|
|
|
*empty_cluster = SZ_64K;
|
btrfs: Do not use data_alloc_cluster in ssd mode
This patch provides a band aid to improve the 'out of the box'
behaviour of btrfs for disks that are detected as being an ssd. In a
general purpose mixed workload scenario, the current ssd mode causes
overallocation of available raw disk space for data, while leaving
behind increasing amounts of unused fragmented free space. This
situation leads to early ENOSPC problems which are harming user
experience and adoption of btrfs as a general purpose filesystem.
This patch modifies the data extent allocation behaviour of the ssd mode
to make it behave identical to nossd mode. The metadata behaviour and
additional ssd_spread option stay untouched so far.
Recommendations for future development are to reconsider the current
oversimplified nossd / ssd distinction and the broken detection
mechanism based on the rotational attribute in sysfs and provide
experienced users with a more flexible way to choose allocator behaviour
for data and metadata, optimized for certain use cases, while keeping
sane 'out of the box' default settings. The internals of the current
btrfs code have more potential than what currently gets exposed to the
user to choose from.
The SSD story...
In the first year of btrfs development, around early 2008, btrfs
gained a mount option which enables specific functionality for
filesystems on solid state devices. The first occurance of this
functionality is in commit e18e4809, labeled "Add mount -o ssd, which
includes optimizations for seek free storage".
The effect on allocating free space for doing (data) writes is to
'cluster' writes together, writing them out in contiguous space, as
opposed to a 'tetris' way of putting all separate writes into any free
space fragment that fits (which is what the -o nossd behaviour does).
A somewhat simplified explanation of what happens is that, when for
example, the 'cluster' size is set to 2MiB, when we do some writes, the
data allocator will search for a free space block that is 2MiB big, and
put the writes in there. The ssd mode itself might allow a 2MiB cluster
to be composed of multiple free space extents with some existing data in
between, while the additional ssd_spread mount option kills off this
option and requires fully free space.
The idea behind this is (commit 536ac8ae): "The [...] clusters make it
more likely a given IO will completely overwrite the ssd block, so it
doesn't have to do an internal rwm cycle."; ssd block meaning nand erase
block. So, effectively this means applying a "locality based algorithm"
and trying to outsmart the actual ssd.
Since then, various changes have been made to the involved code, but the
basic idea is still present, and gets activated whenever the ssd mount
option is active. This also happens by default, when the rotational flag
as seen at /sys/block/<device>/queue/rotational is set to 0.
However, there's a number of problems with this approach.
First, what the optimization is trying to do is outsmart the ssd by
assuming there is a relation between the physical address space of the
block device as seen by btrfs and the actual physical storage of the
ssd, and then adjusting data placement. However, since the introduction
of the Flash Translation Layer (FTL) which is a part of the internal
controller of an ssd, these attempts are futile. The use of good quality
FTL in consumer ssd products might have been limited in 2008, but this
situation has changed drastically soon after that time. Today, even the
flash memory in your automatic cat feeding machine or your grandma's
wheelchair has a full featured one.
Second, the behaviour as described above results in the filesystem being
filled up with badly fragmented free space extents because of relatively
small pieces of space that are freed up by deletes, but not selected
again as part of a 'cluster'. Since the algorithm prefers allocating a
new chunk over going back to tetris mode, the end result is a filesystem
in which all raw space is allocated, but which is composed of
underutilized chunks with a 'shotgun blast' pattern of fragmented free
space. Usually, the next problematic thing that happens is the
filesystem wanting to allocate new space for metadata, which causes the
filesystem to fail in spectacular ways.
Third, the default mount options you get for an ssd ('ssd' mode enabled,
'discard' not enabled), in combination with spreading out writes over
the full address space and ignoring freed up space leads to worst case
behaviour in providing information to the ssd itself, since it will
never learn that all the free space left behind is actually free. There
are two ways to let an ssd know previously written data does not have to
be preserved, which are sending explicit signals using discard or
fstrim, or by simply overwriting the space with new data. The worst
case behaviour is the btrfs ssd_spread mount option in combination with
not having discard enabled. It has a side effect of minimizing the reuse
of free space previously written in.
Fourth, the rotational flag in /sys/ does not reliably indicate if the
device is a locally attached ssd. For example, iSCSI or NBD displays as
non-rotational, while a loop device on an ssd shows up as rotational.
The combination of the second and third problem effectively means that
despite all the good intentions, the btrfs ssd mode reliably causes the
ssd hardware and the filesystem structures and performance to be choked
to death. The clickbait version of the title of this story would have
been "Btrfs ssd optimizations considered harmful for ssds".
The current nossd 'tetris' mode (even still without discard) allows a
pattern of overwriting much more previously used space, causing many
more implicit discards to happen because of the overwrite information
the ssd gets. The actual location in the physical address space, as seen
from the point of view of btrfs is irrelevant, because the actual writes
to the low level flash are reordered anyway thanks to the FTL.
Changes made in the code
1. Make ssd mode data allocation identical to tetris mode, like nossd.
2. Adjust and clean up filesystem mount messages so that we can easily
identify if a kernel has this patch applied or not, when providing
support to end users. Also, make better use of the *_and_info helpers to
only trigger messages on actual state changes.
Backporting notes
Notes for whoever wants to backport this patch to their 4.9 LTS kernel:
* First apply commit 951e7966 "btrfs: drop the nossd flag when
remounting with -o ssd", or fixup the differences manually.
* The rest of the conflicts are because of the fs_info refactoring. So,
for example, instead of using fs_info, it's root->fs_info in
extent-tree.c
Signed-off-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-07-28 14:31:28 +08:00
|
|
|
} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
|
|
|
|
btrfs_test_opt(fs_info, SSD_SPREAD)) {
|
|
|
|
*empty_cluster = SZ_2M;
|
2016-06-23 06:54:23 +08:00
|
|
|
ret = &fs_info->data_alloc_cluster;
|
2015-10-03 03:25:10 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
static int unpin_extent_range(struct btrfs_fs_info *fs_info,
|
|
|
|
u64 start, u64 end,
|
Btrfs: fix fs corruption on transaction abort if device supports discard
When we abort a transaction we iterate over all the ranges marked as dirty
in fs_info->freed_extents[0] and fs_info->freed_extents[1], clear them
from those trees, add them back (unpin) to the free space caches and, if
the fs was mounted with "-o discard", perform a discard on those regions.
Also, after adding the regions to the free space caches, a fitrim ioctl call
can see those ranges in a block group's free space cache and perform a discard
on the ranges, so the same issue can happen without "-o discard" as well.
This causes corruption, affecting one or multiple btree nodes (in the worst
case leaving the fs unmountable) because some of those ranges (the ones in
the fs_info->pinned_extents tree) correspond to btree nodes/leafs that are
referred by the last committed super block - breaking the rule that anything
that was committed by a transaction is untouched until the next transaction
commits successfully.
I ran into this while running in a loop (for several hours) the fstest that
I recently submitted:
[PATCH] fstests: add btrfs test to stress chunk allocation/removal and fstrim
The corruption always happened when a transaction aborted and then fsck complained
like this:
_check_btrfs_filesystem: filesystem on /dev/sdc is inconsistent
*** fsck.btrfs output ***
Check tree block failed, want=94945280, have=0
Check tree block failed, want=94945280, have=0
Check tree block failed, want=94945280, have=0
Check tree block failed, want=94945280, have=0
Check tree block failed, want=94945280, have=0
read block failed check_tree_block
Couldn't open file system
In this case 94945280 corresponded to the root of a tree.
Using frace what I observed was the following sequence of steps happened:
1) transaction N started, fs_info->pinned_extents pointed to
fs_info->freed_extents[0];
2) node/eb 94945280 is created;
3) eb is persisted to disk;
4) transaction N commit starts, fs_info->pinned_extents now points to
fs_info->freed_extents[1], and transaction N completes;
5) transaction N + 1 starts;
6) eb is COWed, and btrfs_free_tree_block() called for this eb;
7) eb range (94945280 to 94945280 + 16Kb) is added to
fs_info->pinned_extents (fs_info->freed_extents[1]);
8) Something goes wrong in transaction N + 1, like hitting ENOSPC
for example, and the transaction is aborted, turning the fs into
readonly mode. The stack trace I got for example:
[112065.253935] [<ffffffff8140c7b6>] dump_stack+0x4d/0x66
[112065.254271] [<ffffffff81042984>] warn_slowpath_common+0x7f/0x98
[112065.254567] [<ffffffffa0325990>] ? __btrfs_abort_transaction+0x50/0x10b [btrfs]
[112065.261674] [<ffffffff810429e5>] warn_slowpath_fmt+0x48/0x50
[112065.261922] [<ffffffffa032949e>] ? btrfs_free_path+0x26/0x29 [btrfs]
[112065.262211] [<ffffffffa0325990>] __btrfs_abort_transaction+0x50/0x10b [btrfs]
[112065.262545] [<ffffffffa036b1d6>] btrfs_remove_chunk+0x537/0x58b [btrfs]
[112065.262771] [<ffffffffa033840f>] btrfs_delete_unused_bgs+0x1de/0x21b [btrfs]
[112065.263105] [<ffffffffa0343106>] cleaner_kthread+0x100/0x12f [btrfs]
(...)
[112065.264493] ---[ end trace dd7903a975a31a08 ]---
[112065.264673] BTRFS: error (device sdc) in btrfs_remove_chunk:2625: errno=-28 No space left
[112065.264997] BTRFS info (device sdc): forced readonly
9) The clear kthread sees that the BTRFS_FS_STATE_ERROR bit is set in
fs_info->fs_state and calls btrfs_cleanup_transaction(), which in
turn calls btrfs_destroy_pinned_extent();
10) Then btrfs_destroy_pinned_extent() iterates over all the ranges
marked as dirty in fs_info->freed_extents[], and for each one
it calls discard, if the fs was mounted with "-o discard", and
adds the range to the free space cache of the respective block
group;
11) btrfs_trim_block_group(), invoked from the fitrim ioctl code path,
sees the free space entries and performs a discard;
12) After an umount and mount (or fsck), our eb's location on disk was full
of zeroes, and it should have been untouched, because it was marked as
dirty in the fs_info->pinned_extents tree, and therefore used by the
trees that the last committed superblock points to.
Fix this by not performing a discard and not adding the ranges to the free space
caches - it's useless from this point since the fs is now in readonly mode and
we won't write free space caches to disk anymore (otherwise we would leak space)
nor any new superblock. By not adding the ranges to the free space caches, it
prevents other code paths from allocating that space and write to it as well,
therefore being safer and simpler.
This isn't a new problem, as it's been present since 2011 (git commit
acce952b0263825da32cf10489413dec78053347).
Cc: stable@vger.kernel.org # any kernel released after 2011-01-06
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-12-08 05:31:47 +08:00
|
|
|
const bool return_free_space)
|
2007-06-29 03:57:36 +08:00
|
|
|
{
|
2009-09-12 04:11:19 +08:00
|
|
|
struct btrfs_block_group_cache *cache = NULL;
|
2012-10-23 03:52:28 +08:00
|
|
|
struct btrfs_space_info *space_info;
|
|
|
|
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
|
2015-10-03 03:25:10 +08:00
|
|
|
struct btrfs_free_cluster *cluster = NULL;
|
2009-09-12 04:11:19 +08:00
|
|
|
u64 len;
|
2015-10-03 03:25:10 +08:00
|
|
|
u64 total_unpinned = 0;
|
|
|
|
u64 empty_cluster = 0;
|
2012-10-23 03:52:28 +08:00
|
|
|
bool readonly;
|
2007-06-29 03:57:36 +08:00
|
|
|
|
2009-09-12 04:11:19 +08:00
|
|
|
while (start <= end) {
|
2012-10-23 03:52:28 +08:00
|
|
|
readonly = false;
|
2009-09-12 04:11:19 +08:00
|
|
|
if (!cache ||
|
|
|
|
start >= cache->key.objectid + cache->key.offset) {
|
|
|
|
if (cache)
|
|
|
|
btrfs_put_block_group(cache);
|
2015-10-03 03:25:10 +08:00
|
|
|
total_unpinned = 0;
|
2009-09-12 04:11:19 +08:00
|
|
|
cache = btrfs_lookup_block_group(fs_info, start);
|
2012-03-12 23:03:00 +08:00
|
|
|
BUG_ON(!cache); /* Logic error */
|
2015-10-03 03:25:10 +08:00
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
cluster = fetch_cluster_info(fs_info,
|
2015-10-03 03:25:10 +08:00
|
|
|
cache->space_info,
|
|
|
|
&empty_cluster);
|
|
|
|
empty_cluster <<= 1;
|
2009-09-12 04:11:19 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
len = cache->key.objectid + cache->key.offset - start;
|
|
|
|
len = min(len, end + 1 - start);
|
|
|
|
|
|
|
|
if (start < cache->last_byte_to_unpin) {
|
|
|
|
len = min(len, cache->last_byte_to_unpin - start);
|
Btrfs: fix fs corruption on transaction abort if device supports discard
When we abort a transaction we iterate over all the ranges marked as dirty
in fs_info->freed_extents[0] and fs_info->freed_extents[1], clear them
from those trees, add them back (unpin) to the free space caches and, if
the fs was mounted with "-o discard", perform a discard on those regions.
Also, after adding the regions to the free space caches, a fitrim ioctl call
can see those ranges in a block group's free space cache and perform a discard
on the ranges, so the same issue can happen without "-o discard" as well.
This causes corruption, affecting one or multiple btree nodes (in the worst
case leaving the fs unmountable) because some of those ranges (the ones in
the fs_info->pinned_extents tree) correspond to btree nodes/leafs that are
referred by the last committed super block - breaking the rule that anything
that was committed by a transaction is untouched until the next transaction
commits successfully.
I ran into this while running in a loop (for several hours) the fstest that
I recently submitted:
[PATCH] fstests: add btrfs test to stress chunk allocation/removal and fstrim
The corruption always happened when a transaction aborted and then fsck complained
like this:
_check_btrfs_filesystem: filesystem on /dev/sdc is inconsistent
*** fsck.btrfs output ***
Check tree block failed, want=94945280, have=0
Check tree block failed, want=94945280, have=0
Check tree block failed, want=94945280, have=0
Check tree block failed, want=94945280, have=0
Check tree block failed, want=94945280, have=0
read block failed check_tree_block
Couldn't open file system
In this case 94945280 corresponded to the root of a tree.
Using frace what I observed was the following sequence of steps happened:
1) transaction N started, fs_info->pinned_extents pointed to
fs_info->freed_extents[0];
2) node/eb 94945280 is created;
3) eb is persisted to disk;
4) transaction N commit starts, fs_info->pinned_extents now points to
fs_info->freed_extents[1], and transaction N completes;
5) transaction N + 1 starts;
6) eb is COWed, and btrfs_free_tree_block() called for this eb;
7) eb range (94945280 to 94945280 + 16Kb) is added to
fs_info->pinned_extents (fs_info->freed_extents[1]);
8) Something goes wrong in transaction N + 1, like hitting ENOSPC
for example, and the transaction is aborted, turning the fs into
readonly mode. The stack trace I got for example:
[112065.253935] [<ffffffff8140c7b6>] dump_stack+0x4d/0x66
[112065.254271] [<ffffffff81042984>] warn_slowpath_common+0x7f/0x98
[112065.254567] [<ffffffffa0325990>] ? __btrfs_abort_transaction+0x50/0x10b [btrfs]
[112065.261674] [<ffffffff810429e5>] warn_slowpath_fmt+0x48/0x50
[112065.261922] [<ffffffffa032949e>] ? btrfs_free_path+0x26/0x29 [btrfs]
[112065.262211] [<ffffffffa0325990>] __btrfs_abort_transaction+0x50/0x10b [btrfs]
[112065.262545] [<ffffffffa036b1d6>] btrfs_remove_chunk+0x537/0x58b [btrfs]
[112065.262771] [<ffffffffa033840f>] btrfs_delete_unused_bgs+0x1de/0x21b [btrfs]
[112065.263105] [<ffffffffa0343106>] cleaner_kthread+0x100/0x12f [btrfs]
(...)
[112065.264493] ---[ end trace dd7903a975a31a08 ]---
[112065.264673] BTRFS: error (device sdc) in btrfs_remove_chunk:2625: errno=-28 No space left
[112065.264997] BTRFS info (device sdc): forced readonly
9) The clear kthread sees that the BTRFS_FS_STATE_ERROR bit is set in
fs_info->fs_state and calls btrfs_cleanup_transaction(), which in
turn calls btrfs_destroy_pinned_extent();
10) Then btrfs_destroy_pinned_extent() iterates over all the ranges
marked as dirty in fs_info->freed_extents[], and for each one
it calls discard, if the fs was mounted with "-o discard", and
adds the range to the free space cache of the respective block
group;
11) btrfs_trim_block_group(), invoked from the fitrim ioctl code path,
sees the free space entries and performs a discard;
12) After an umount and mount (or fsck), our eb's location on disk was full
of zeroes, and it should have been untouched, because it was marked as
dirty in the fs_info->pinned_extents tree, and therefore used by the
trees that the last committed superblock points to.
Fix this by not performing a discard and not adding the ranges to the free space
caches - it's useless from this point since the fs is now in readonly mode and
we won't write free space caches to disk anymore (otherwise we would leak space)
nor any new superblock. By not adding the ranges to the free space caches, it
prevents other code paths from allocating that space and write to it as well,
therefore being safer and simpler.
This isn't a new problem, as it's been present since 2011 (git commit
acce952b0263825da32cf10489413dec78053347).
Cc: stable@vger.kernel.org # any kernel released after 2011-01-06
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-12-08 05:31:47 +08:00
|
|
|
if (return_free_space)
|
|
|
|
btrfs_add_free_space(cache, start, len);
|
2009-09-12 04:11:19 +08:00
|
|
|
}
|
|
|
|
|
2010-05-16 22:46:25 +08:00
|
|
|
start += len;
|
2015-10-03 03:25:10 +08:00
|
|
|
total_unpinned += len;
|
2012-10-23 03:52:28 +08:00
|
|
|
space_info = cache->space_info;
|
2010-05-16 22:46:25 +08:00
|
|
|
|
2015-10-03 03:25:10 +08:00
|
|
|
/*
|
|
|
|
* If this space cluster has been marked as fragmented and we've
|
|
|
|
* unpinned enough in this block group to potentially allow a
|
|
|
|
* cluster to be created inside of it go ahead and clear the
|
|
|
|
* fragmented check.
|
|
|
|
*/
|
|
|
|
if (cluster && cluster->fragmented &&
|
|
|
|
total_unpinned > empty_cluster) {
|
|
|
|
spin_lock(&cluster->lock);
|
|
|
|
cluster->fragmented = 0;
|
|
|
|
spin_unlock(&cluster->lock);
|
|
|
|
}
|
|
|
|
|
2012-10-23 03:52:28 +08:00
|
|
|
spin_lock(&space_info->lock);
|
2009-09-12 04:11:19 +08:00
|
|
|
spin_lock(&cache->lock);
|
|
|
|
cache->pinned -= len;
|
2019-06-19 04:09:21 +08:00
|
|
|
btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
|
2016-03-26 01:25:54 +08:00
|
|
|
|
|
|
|
trace_btrfs_space_reservation(fs_info, "pinned",
|
|
|
|
space_info->flags, len, 0);
|
2015-09-29 23:40:47 +08:00
|
|
|
space_info->max_extent_size = 0;
|
btrfs: use customized batch size for total_bytes_pinned
In commit b150a4f10d878 ("Btrfs: use a percpu to keep track of possibly
pinned bytes") we use total_bytes_pinned to track how many bytes we are
going to free in this transaction. When we are close to ENOSPC, we check it
and know if we can make the allocation by commit the current transaction.
For every data/metadata extent we are going to free, we add
total_bytes_pinned in btrfs_free_extent() and btrfs_free_tree_block(), and
release it in unpin_extent_range() when we finish the transaction. So this
is a variable we frequently update but rarely read - just the suitable
use of percpu_counter. But in previous commit we update total_bytes_pinned
by default 32 batch size, making every update essentially a spin lock
protected update. Since every spin lock/unlock operation involves syncing
a globally used variable and some kind of barrier in a SMP system, this is
more expensive than using total_bytes_pinned as a simple atomic64_t.
So fix this by using a customized batch size. Since we only read
total_bytes_pinned when we are close to ENOSPC and fail to allocate new
chunk, we can use a really large batch size and have nearly no penalty
in most cases.
[Test]
We tested the patch on a 4-cores x86 machine:
1. fallocate a 16GiB size test file
2. take snapshot (so all following writes will be COW)
3. run a 180 sec, 4 jobs, 4K random write fio on test file
We also added a temporary lockdep class on percpu_counter's spin lock
used by total_bytes_pinned to track it by lock_stat.
[Results]
unpatched:
lock_stat version 0.4
-----------------------------------------------------------------------
class name con-bounces contentions
waittime-min waittime-max waittime-total waittime-avg acq-bounces
acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
total_bytes_pinned_percpu: 82 82
0.21 0.61 29.46 0.36 298340
635973 0.09 11.01 173476.25 0.27
patched:
lock_stat version 0.4
-----------------------------------------------------------------------
class name con-bounces contentions
waittime-min waittime-max waittime-total waittime-avg acq-bounces
acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
total_bytes_pinned_percpu: 1 1
0.62 0.62 0.62 0.62 13601
31542 0.14 9.61 11016.90 0.35
[Analysis]
Since the spin lock only protects a single in-memory variable, the
contentions (number of lock acquisitions that had to wait) in both
unpatched and patched version are low. But when we see acquisitions and
acq-bounces, we get much lower counts in patched version. Here the most
important metric is acq-bounces. It means how many times the lock gets
transferred between different cpus, so the patch can really reduce
cacheline bouncing of spin lock (also the global counter of percpu_counter)
in a SMP system.
Fixes: b150a4f10d878 ("Btrfs: use a percpu to keep track of possibly pinned bytes")
Signed-off-by: Ethan Lien <ethanlien@synology.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-07-13 16:50:42 +08:00
|
|
|
percpu_counter_add_batch(&space_info->total_bytes_pinned,
|
|
|
|
-len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
|
2012-10-23 03:52:28 +08:00
|
|
|
if (cache->ro) {
|
|
|
|
space_info->bytes_readonly += len;
|
|
|
|
readonly = true;
|
|
|
|
}
|
2009-09-12 04:11:19 +08:00
|
|
|
spin_unlock(&cache->lock);
|
2016-05-18 01:30:55 +08:00
|
|
|
if (!readonly && return_free_space &&
|
|
|
|
global_rsv->space_info == space_info) {
|
|
|
|
u64 to_add = len;
|
2017-08-17 15:52:28 +08:00
|
|
|
|
2012-10-23 03:52:28 +08:00
|
|
|
spin_lock(&global_rsv->lock);
|
|
|
|
if (!global_rsv->full) {
|
2016-05-18 01:30:55 +08:00
|
|
|
to_add = min(len, global_rsv->size -
|
|
|
|
global_rsv->reserved);
|
|
|
|
global_rsv->reserved += to_add;
|
2019-06-19 04:09:21 +08:00
|
|
|
btrfs_space_info_update_bytes_may_use(fs_info,
|
|
|
|
space_info, to_add);
|
2012-10-23 03:52:28 +08:00
|
|
|
if (global_rsv->reserved >= global_rsv->size)
|
|
|
|
global_rsv->full = 1;
|
2016-05-18 01:30:55 +08:00
|
|
|
trace_btrfs_space_reservation(fs_info,
|
|
|
|
"space_info",
|
|
|
|
space_info->flags,
|
|
|
|
to_add, 1);
|
|
|
|
len -= to_add;
|
2012-10-23 03:52:28 +08:00
|
|
|
}
|
|
|
|
spin_unlock(&global_rsv->lock);
|
2016-05-18 01:30:55 +08:00
|
|
|
/* Add to any tickets we may have */
|
|
|
|
if (len)
|
2019-06-19 04:09:18 +08:00
|
|
|
btrfs_space_info_add_new_bytes(fs_info,
|
|
|
|
space_info, len);
|
2012-10-23 03:52:28 +08:00
|
|
|
}
|
|
|
|
spin_unlock(&space_info->lock);
|
2007-06-29 03:57:36 +08:00
|
|
|
}
|
2009-09-12 04:11:19 +08:00
|
|
|
|
|
|
|
if (cache)
|
|
|
|
btrfs_put_block_group(cache);
|
2007-06-29 03:57:36 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-03-15 22:00:26 +08:00
|
|
|
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
|
2007-03-07 09:08:01 +08:00
|
|
|
{
|
2018-03-15 22:00:26 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2015-06-15 21:41:19 +08:00
|
|
|
struct btrfs_block_group_cache *block_group, *tmp;
|
|
|
|
struct list_head *deleted_bgs;
|
2009-09-12 04:11:19 +08:00
|
|
|
struct extent_io_tree *unpin;
|
2007-10-16 04:15:26 +08:00
|
|
|
u64 start;
|
|
|
|
u64 end;
|
2007-03-07 09:08:01 +08:00
|
|
|
int ret;
|
|
|
|
|
2009-09-12 04:11:19 +08:00
|
|
|
if (fs_info->pinned_extents == &fs_info->freed_extents[0])
|
|
|
|
unpin = &fs_info->freed_extents[1];
|
|
|
|
else
|
|
|
|
unpin = &fs_info->freed_extents[0];
|
|
|
|
|
2015-06-15 21:41:19 +08:00
|
|
|
while (!trans->aborted) {
|
2018-11-16 21:04:44 +08:00
|
|
|
struct extent_state *cached_state = NULL;
|
|
|
|
|
2015-01-30 03:18:25 +08:00
|
|
|
mutex_lock(&fs_info->unused_bg_unpin_mutex);
|
2007-10-16 04:15:26 +08:00
|
|
|
ret = find_first_extent_bit(unpin, 0, &start, &end,
|
2018-11-16 21:04:44 +08:00
|
|
|
EXTENT_DIRTY, &cached_state);
|
2015-01-30 03:18:25 +08:00
|
|
|
if (ret) {
|
|
|
|
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
|
2007-03-07 09:08:01 +08:00
|
|
|
break;
|
2015-01-30 03:18:25 +08:00
|
|
|
}
|
Btrfs: Fix free block discard calls down to the block layer
This is a patch to fix discard semantic to make Btrfs work with FTL and SSD.
We can improve FTL's performance by telling it which sectors are freed by file
system. But if we don't tell FTL the information of free sectors in proper
time, the transaction mechanism of Btrfs will be destroyed and Btrfs could not
roll back the previous transaction under the power loss condition.
There are some problems in the old implementation:
1, In __free_extent(), the pinned down extents should not be discarded.
2, In free_extents(), the free extents are all pinned, so they need to
be discarded in transaction committing time instead of free_extents().
3, The reserved extent used by log tree should be discard too.
This patch change discard behavior as follows:
1, For the extents which need to be free at once,
we discard them in update_block_group().
2, Delay discarding the pinned extent in btrfs_finish_extent_commit()
when committing transaction.
3, Remove discarding from free_extents() and __free_extent()
4, Add discard interface into btrfs_free_reserved_extent()
5, Discard sectors before updating the free space cache, otherwise,
FTL will destroy file system data.
2009-01-06 04:57:51 +08:00
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
if (btrfs_test_opt(fs_info, DISCARD))
|
2016-06-23 06:54:24 +08:00
|
|
|
ret = btrfs_discard_extent(fs_info, start,
|
2011-03-24 18:24:27 +08:00
|
|
|
end + 1 - start, NULL);
|
Btrfs: Fix free block discard calls down to the block layer
This is a patch to fix discard semantic to make Btrfs work with FTL and SSD.
We can improve FTL's performance by telling it which sectors are freed by file
system. But if we don't tell FTL the information of free sectors in proper
time, the transaction mechanism of Btrfs will be destroyed and Btrfs could not
roll back the previous transaction under the power loss condition.
There are some problems in the old implementation:
1, In __free_extent(), the pinned down extents should not be discarded.
2, In free_extents(), the free extents are all pinned, so they need to
be discarded in transaction committing time instead of free_extents().
3, The reserved extent used by log tree should be discard too.
This patch change discard behavior as follows:
1, For the extents which need to be free at once,
we discard them in update_block_group().
2, Delay discarding the pinned extent in btrfs_finish_extent_commit()
when committing transaction.
3, Remove discarding from free_extents() and __free_extent()
4, Add discard interface into btrfs_free_reserved_extent()
5, Discard sectors before updating the free space cache, otherwise,
FTL will destroy file system data.
2009-01-06 04:57:51 +08:00
|
|
|
|
2018-11-16 21:04:44 +08:00
|
|
|
clear_extent_dirty(unpin, start, end, &cached_state);
|
2016-06-23 06:54:24 +08:00
|
|
|
unpin_extent_range(fs_info, start, end, true);
|
2015-01-30 03:18:25 +08:00
|
|
|
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
|
2018-11-16 21:04:44 +08:00
|
|
|
free_extent_state(cached_state);
|
2009-03-13 23:00:37 +08:00
|
|
|
cond_resched();
|
2007-03-07 09:08:01 +08:00
|
|
|
}
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
|
2015-06-15 21:41:19 +08:00
|
|
|
/*
|
|
|
|
* Transaction is finished. We don't need the lock anymore. We
|
|
|
|
* do need to clean up the block groups in case of a transaction
|
|
|
|
* abort.
|
|
|
|
*/
|
|
|
|
deleted_bgs = &trans->transaction->deleted_bgs;
|
|
|
|
list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
|
|
|
|
u64 trimmed = 0;
|
|
|
|
|
|
|
|
ret = -EROFS;
|
|
|
|
if (!trans->aborted)
|
2016-06-23 06:54:24 +08:00
|
|
|
ret = btrfs_discard_extent(fs_info,
|
2015-06-15 21:41:19 +08:00
|
|
|
block_group->key.objectid,
|
|
|
|
block_group->key.offset,
|
|
|
|
&trimmed);
|
|
|
|
|
|
|
|
list_del_init(&block_group->bg_list);
|
|
|
|
btrfs_put_block_group_trimming(block_group);
|
|
|
|
btrfs_put_block_group(block_group);
|
|
|
|
|
|
|
|
if (ret) {
|
|
|
|
const char *errstr = btrfs_decode_error(ret);
|
|
|
|
btrfs_warn(fs_info,
|
2017-07-13 21:32:18 +08:00
|
|
|
"discard failed while removing blockgroup: errno=%d %s",
|
2015-06-15 21:41:19 +08:00
|
|
|
ret, errstr);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2007-03-23 00:13:20 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
|
2018-06-20 20:48:57 +08:00
|
|
|
struct btrfs_delayed_ref_node *node, u64 parent,
|
|
|
|
u64 root_objectid, u64 owner_objectid,
|
|
|
|
u64 owner_offset, int refs_to_drop,
|
|
|
|
struct btrfs_delayed_extent_op *extent_op)
|
2007-03-07 09:08:01 +08:00
|
|
|
{
|
2018-06-20 20:48:57 +08:00
|
|
|
struct btrfs_fs_info *info = trans->fs_info;
|
2007-03-13 04:22:34 +08:00
|
|
|
struct btrfs_key key;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
struct btrfs_path *path;
|
2007-03-21 08:35:03 +08:00
|
|
|
struct btrfs_root *extent_root = info->extent_root;
|
2007-10-16 04:14:19 +08:00
|
|
|
struct extent_buffer *leaf;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
struct btrfs_extent_item *ei;
|
|
|
|
struct btrfs_extent_inline_ref *iref;
|
2007-03-07 09:08:01 +08:00
|
|
|
int ret;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
int is_data;
|
2008-02-19 05:33:44 +08:00
|
|
|
int extent_slot = 0;
|
|
|
|
int found_extent = 0;
|
|
|
|
int num_to_del = 1;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
u32 item_size;
|
|
|
|
u64 refs;
|
2015-03-17 16:59:47 +08:00
|
|
|
u64 bytenr = node->bytenr;
|
|
|
|
u64 num_bytes = node->num_bytes;
|
2014-05-14 08:30:47 +08:00
|
|
|
int last_ref = 0;
|
2016-06-23 06:54:23 +08:00
|
|
|
bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
|
2007-03-08 00:50:24 +08:00
|
|
|
|
2007-04-02 23:20:42 +08:00
|
|
|
path = btrfs_alloc_path();
|
2007-06-23 02:16:25 +08:00
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
2007-04-05 22:38:44 +08:00
|
|
|
|
2015-11-27 23:31:35 +08:00
|
|
|
path->reada = READA_FORWARD;
|
2009-03-13 23:00:37 +08:00
|
|
|
path->leave_spinning = 1;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
|
|
|
is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
|
|
|
|
BUG_ON(!is_data && refs_to_drop != 1);
|
|
|
|
|
2013-03-08 03:22:04 +08:00
|
|
|
if (is_data)
|
2017-10-07 22:02:21 +08:00
|
|
|
skinny_metadata = false;
|
2013-03-08 03:22:04 +08:00
|
|
|
|
2018-06-20 20:48:52 +08:00
|
|
|
ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
|
|
|
|
parent, root_objectid, owner_objectid,
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
owner_offset);
|
2007-12-11 22:25:06 +08:00
|
|
|
if (ret == 0) {
|
2008-02-19 05:33:44 +08:00
|
|
|
extent_slot = path->slots[0];
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
while (extent_slot >= 0) {
|
|
|
|
btrfs_item_key_to_cpu(path->nodes[0], &key,
|
2008-02-19 05:33:44 +08:00
|
|
|
extent_slot);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (key.objectid != bytenr)
|
2008-02-19 05:33:44 +08:00
|
|
|
break;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (key.type == BTRFS_EXTENT_ITEM_KEY &&
|
|
|
|
key.offset == num_bytes) {
|
2008-02-19 05:33:44 +08:00
|
|
|
found_extent = 1;
|
|
|
|
break;
|
|
|
|
}
|
2013-03-08 03:22:04 +08:00
|
|
|
if (key.type == BTRFS_METADATA_ITEM_KEY &&
|
|
|
|
key.offset == owner_objectid) {
|
|
|
|
found_extent = 1;
|
|
|
|
break;
|
|
|
|
}
|
2008-02-19 05:33:44 +08:00
|
|
|
if (path->slots[0] - extent_slot > 5)
|
|
|
|
break;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
extent_slot--;
|
2008-02-19 05:33:44 +08:00
|
|
|
}
|
2018-06-21 14:45:00 +08:00
|
|
|
|
2008-09-24 01:14:14 +08:00
|
|
|
if (!found_extent) {
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
BUG_ON(iref);
|
2018-06-20 20:49:12 +08:00
|
|
|
ret = remove_extent_backref(trans, path, NULL,
|
2017-02-16 05:28:27 +08:00
|
|
|
refs_to_drop,
|
2014-05-14 08:30:47 +08:00
|
|
|
is_data, &last_ref);
|
2012-09-18 21:52:32 +08:00
|
|
|
if (ret) {
|
2016-06-11 06:19:25 +08:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-09-18 21:52:32 +08:00
|
|
|
goto out;
|
|
|
|
}
|
2011-04-21 07:20:15 +08:00
|
|
|
btrfs_release_path(path);
|
2009-03-13 23:00:37 +08:00
|
|
|
path->leave_spinning = 1;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
|
|
|
key.objectid = bytenr;
|
|
|
|
key.type = BTRFS_EXTENT_ITEM_KEY;
|
|
|
|
key.offset = num_bytes;
|
|
|
|
|
2013-03-08 03:22:04 +08:00
|
|
|
if (!is_data && skinny_metadata) {
|
|
|
|
key.type = BTRFS_METADATA_ITEM_KEY;
|
|
|
|
key.offset = owner_objectid;
|
|
|
|
}
|
|
|
|
|
2008-09-24 01:14:14 +08:00
|
|
|
ret = btrfs_search_slot(trans, extent_root,
|
|
|
|
&key, path, -1, 1);
|
2013-03-08 03:22:04 +08:00
|
|
|
if (ret > 0 && skinny_metadata && path->slots[0]) {
|
|
|
|
/*
|
|
|
|
* Couldn't find our skinny metadata item,
|
|
|
|
* see if we have ye olde extent item.
|
|
|
|
*/
|
|
|
|
path->slots[0]--;
|
|
|
|
btrfs_item_key_to_cpu(path->nodes[0], &key,
|
|
|
|
path->slots[0]);
|
|
|
|
if (key.objectid == bytenr &&
|
|
|
|
key.type == BTRFS_EXTENT_ITEM_KEY &&
|
|
|
|
key.offset == num_bytes)
|
|
|
|
ret = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ret > 0 && skinny_metadata) {
|
|
|
|
skinny_metadata = false;
|
2014-04-24 22:15:28 +08:00
|
|
|
key.objectid = bytenr;
|
2013-03-08 03:22:04 +08:00
|
|
|
key.type = BTRFS_EXTENT_ITEM_KEY;
|
|
|
|
key.offset = num_bytes;
|
|
|
|
btrfs_release_path(path);
|
|
|
|
ret = btrfs_search_slot(trans, extent_root,
|
|
|
|
&key, path, -1, 1);
|
|
|
|
}
|
|
|
|
|
2008-11-13 03:19:50 +08:00
|
|
|
if (ret) {
|
2016-09-20 22:05:00 +08:00
|
|
|
btrfs_err(info,
|
|
|
|
"umm, got %d back from search, was looking for %llu",
|
|
|
|
ret, bytenr);
|
2011-07-13 23:03:50 +08:00
|
|
|
if (ret > 0)
|
2017-06-30 00:37:49 +08:00
|
|
|
btrfs_print_leaf(path->nodes[0]);
|
2008-11-13 03:19:50 +08:00
|
|
|
}
|
2012-09-18 21:52:32 +08:00
|
|
|
if (ret < 0) {
|
2016-06-11 06:19:25 +08:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-09-18 21:52:32 +08:00
|
|
|
goto out;
|
|
|
|
}
|
2008-09-24 01:14:14 +08:00
|
|
|
extent_slot = path->slots[0];
|
|
|
|
}
|
2013-10-31 13:00:08 +08:00
|
|
|
} else if (WARN_ON(ret == -ENOENT)) {
|
2017-06-30 00:37:49 +08:00
|
|
|
btrfs_print_leaf(path->nodes[0]);
|
2013-03-20 06:41:23 +08:00
|
|
|
btrfs_err(info,
|
|
|
|
"unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
|
2013-08-20 19:20:07 +08:00
|
|
|
bytenr, parent, root_objectid, owner_objectid,
|
|
|
|
owner_offset);
|
2016-06-11 06:19:25 +08:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2014-03-15 04:36:53 +08:00
|
|
|
goto out;
|
2012-03-12 23:03:00 +08:00
|
|
|
} else {
|
2016-06-11 06:19:25 +08:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-09-18 21:52:32 +08:00
|
|
|
goto out;
|
2007-12-11 22:25:06 +08:00
|
|
|
}
|
2007-10-16 04:14:19 +08:00
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
item_size = btrfs_item_size_nr(leaf, extent_slot);
|
2018-06-26 22:20:59 +08:00
|
|
|
if (unlikely(item_size < sizeof(*ei))) {
|
2018-06-26 21:57:36 +08:00
|
|
|
ret = -EINVAL;
|
|
|
|
btrfs_print_v0_err(info);
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
goto out;
|
|
|
|
}
|
2008-02-19 05:33:44 +08:00
|
|
|
ei = btrfs_item_ptr(leaf, extent_slot,
|
2007-03-15 02:14:43 +08:00
|
|
|
struct btrfs_extent_item);
|
2013-03-08 03:22:04 +08:00
|
|
|
if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
|
|
|
|
key.type == BTRFS_EXTENT_ITEM_KEY) {
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
struct btrfs_tree_block_info *bi;
|
|
|
|
BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
|
|
|
|
bi = (struct btrfs_tree_block_info *)(ei + 1);
|
|
|
|
WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
|
|
|
|
}
|
2009-03-13 22:10:06 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
refs = btrfs_extent_refs(leaf, ei);
|
2013-04-25 04:38:50 +08:00
|
|
|
if (refs < refs_to_drop) {
|
2016-09-20 22:05:00 +08:00
|
|
|
btrfs_err(info,
|
|
|
|
"trying to drop %d refs but we only have %Lu for bytenr %Lu",
|
|
|
|
refs_to_drop, refs, bytenr);
|
2013-04-25 04:38:50 +08:00
|
|
|
ret = -EINVAL;
|
2016-06-11 06:19:25 +08:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2013-04-25 04:38:50 +08:00
|
|
|
goto out;
|
|
|
|
}
|
2009-03-13 22:10:06 +08:00
|
|
|
refs -= refs_to_drop;
|
2007-10-16 04:14:19 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (refs > 0) {
|
|
|
|
if (extent_op)
|
|
|
|
__run_delayed_extent_op(extent_op, leaf, ei);
|
|
|
|
/*
|
|
|
|
* In the case of inline back ref, reference count will
|
|
|
|
* be updated by remove_extent_backref
|
2008-02-19 05:33:44 +08:00
|
|
|
*/
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (iref) {
|
|
|
|
BUG_ON(!found_extent);
|
|
|
|
} else {
|
|
|
|
btrfs_set_extent_refs(leaf, ei, refs);
|
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
|
|
|
}
|
|
|
|
if (found_extent) {
|
2018-06-20 20:49:12 +08:00
|
|
|
ret = remove_extent_backref(trans, path, iref,
|
|
|
|
refs_to_drop, is_data,
|
|
|
|
&last_ref);
|
2012-09-18 21:52:32 +08:00
|
|
|
if (ret) {
|
2016-06-11 06:19:25 +08:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-09-18 21:52:32 +08:00
|
|
|
goto out;
|
|
|
|
}
|
2008-02-19 05:33:44 +08:00
|
|
|
}
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
} else {
|
|
|
|
if (found_extent) {
|
|
|
|
BUG_ON(is_data && refs_to_drop !=
|
2015-08-06 22:16:24 +08:00
|
|
|
extent_data_ref_count(path, iref));
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (iref) {
|
|
|
|
BUG_ON(path->slots[0] != extent_slot);
|
|
|
|
} else {
|
|
|
|
BUG_ON(path->slots[0] != extent_slot + 1);
|
|
|
|
path->slots[0] = extent_slot;
|
|
|
|
num_to_del = 2;
|
|
|
|
}
|
2007-03-25 23:35:08 +08:00
|
|
|
}
|
2009-03-13 23:00:37 +08:00
|
|
|
|
2014-05-14 08:30:47 +08:00
|
|
|
last_ref = 1;
|
2008-02-19 05:33:44 +08:00
|
|
|
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
|
|
|
|
num_to_del);
|
2012-09-18 21:52:32 +08:00
|
|
|
if (ret) {
|
2016-06-11 06:19:25 +08:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-09-18 21:52:32 +08:00
|
|
|
goto out;
|
|
|
|
}
|
2011-04-21 07:20:15 +08:00
|
|
|
btrfs_release_path(path);
|
2008-08-12 21:13:26 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (is_data) {
|
2016-06-21 22:40:19 +08:00
|
|
|
ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
|
2012-09-18 21:52:32 +08:00
|
|
|
if (ret) {
|
2016-06-11 06:19:25 +08:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-09-18 21:52:32 +08:00
|
|
|
goto out;
|
|
|
|
}
|
2008-12-10 22:10:46 +08:00
|
|
|
}
|
|
|
|
|
2018-05-10 20:44:55 +08:00
|
|
|
ret = add_to_free_space_tree(trans, bytenr, num_bytes);
|
2015-09-30 11:50:37 +08:00
|
|
|
if (ret) {
|
2016-06-11 06:19:25 +08:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2015-09-30 11:50:37 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2019-03-20 19:10:15 +08:00
|
|
|
ret = update_block_group(trans, bytenr, num_bytes, 0);
|
2012-09-18 21:52:32 +08:00
|
|
|
if (ret) {
|
2016-06-11 06:19:25 +08:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-09-18 21:52:32 +08:00
|
|
|
goto out;
|
|
|
|
}
|
2007-03-07 09:08:01 +08:00
|
|
|
}
|
2014-05-14 08:30:47 +08:00
|
|
|
btrfs_release_path(path);
|
|
|
|
|
2012-03-12 23:03:00 +08:00
|
|
|
out:
|
2007-04-02 23:20:42 +08:00
|
|
|
btrfs_free_path(path);
|
2007-03-07 09:08:01 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2009-03-13 22:11:24 +08:00
|
|
|
/*
|
2010-05-16 22:46:25 +08:00
|
|
|
* when we free an block, it is possible (and likely) that we free the last
|
2009-03-13 22:11:24 +08:00
|
|
|
* delayed ref for that extent as well. This searches the delayed ref tree for
|
|
|
|
* a given extent, and if there are no other delayed refs to be processed, it
|
|
|
|
* removes it from the tree.
|
|
|
|
*/
|
|
|
|
static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
|
2016-06-23 06:54:24 +08:00
|
|
|
u64 bytenr)
|
2009-03-13 22:11:24 +08:00
|
|
|
{
|
|
|
|
struct btrfs_delayed_ref_head *head;
|
|
|
|
struct btrfs_delayed_ref_root *delayed_refs;
|
2010-05-16 22:46:25 +08:00
|
|
|
int ret = 0;
|
2009-03-13 22:11:24 +08:00
|
|
|
|
|
|
|
delayed_refs = &trans->transaction->delayed_refs;
|
|
|
|
spin_lock(&delayed_refs->lock);
|
2017-01-31 04:24:37 +08:00
|
|
|
head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
|
2009-03-13 22:11:24 +08:00
|
|
|
if (!head)
|
2014-01-29 23:02:40 +08:00
|
|
|
goto out_delayed_unlock;
|
2009-03-13 22:11:24 +08:00
|
|
|
|
2014-01-23 22:21:38 +08:00
|
|
|
spin_lock(&head->lock);
|
2018-08-23 03:51:50 +08:00
|
|
|
if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
|
2009-03-13 22:11:24 +08:00
|
|
|
goto out;
|
|
|
|
|
2018-12-03 23:20:31 +08:00
|
|
|
if (cleanup_extent_op(head) != NULL)
|
|
|
|
goto out;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
2009-03-13 22:11:24 +08:00
|
|
|
/*
|
|
|
|
* waiting for the lock here would deadlock. If someone else has it
|
|
|
|
* locked they are already in the process of dropping it anyway
|
|
|
|
*/
|
|
|
|
if (!mutex_trylock(&head->mutex))
|
|
|
|
goto out;
|
|
|
|
|
2018-12-03 23:20:29 +08:00
|
|
|
btrfs_delete_ref_head(delayed_refs, head);
|
2014-01-23 22:21:38 +08:00
|
|
|
head->processing = 0;
|
2018-12-03 23:20:29 +08:00
|
|
|
|
2014-01-23 22:21:38 +08:00
|
|
|
spin_unlock(&head->lock);
|
2009-03-13 22:11:24 +08:00
|
|
|
spin_unlock(&delayed_refs->lock);
|
|
|
|
|
2010-05-16 22:46:25 +08:00
|
|
|
BUG_ON(head->extent_op);
|
|
|
|
if (head->must_insert_reserved)
|
|
|
|
ret = 1;
|
|
|
|
|
2018-11-22 03:05:41 +08:00
|
|
|
btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
|
2010-05-16 22:46:25 +08:00
|
|
|
mutex_unlock(&head->mutex);
|
2017-09-30 03:43:57 +08:00
|
|
|
btrfs_put_delayed_ref_head(head);
|
2010-05-16 22:46:25 +08:00
|
|
|
return ret;
|
2009-03-13 22:11:24 +08:00
|
|
|
out:
|
2014-01-23 22:21:38 +08:00
|
|
|
spin_unlock(&head->lock);
|
2014-01-29 23:02:40 +08:00
|
|
|
|
|
|
|
out_delayed_unlock:
|
2009-03-13 22:11:24 +08:00
|
|
|
spin_unlock(&delayed_refs->lock);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2010-05-16 22:46:25 +08:00
|
|
|
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
|
|
|
struct extent_buffer *buf,
|
2012-05-16 23:04:52 +08:00
|
|
|
u64 parent, int last_ref)
|
2010-05-16 22:46:25 +08:00
|
|
|
{
|
2016-06-23 06:54:23 +08:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2019-04-04 14:45:31 +08:00
|
|
|
struct btrfs_ref generic_ref = { 0 };
|
2013-06-20 03:00:04 +08:00
|
|
|
int pin = 1;
|
2010-05-16 22:46:25 +08:00
|
|
|
int ret;
|
|
|
|
|
2019-04-04 14:45:31 +08:00
|
|
|
btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
|
|
|
|
buf->start, buf->len, parent);
|
|
|
|
btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
|
|
|
|
root->root_key.objectid);
|
|
|
|
|
2010-05-16 22:46:25 +08:00
|
|
|
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
|
2017-06-07 07:45:31 +08:00
|
|
|
int old_ref_mod, new_ref_mod;
|
|
|
|
|
2019-04-04 14:45:33 +08:00
|
|
|
btrfs_ref_tree_mod(fs_info, &generic_ref);
|
2019-04-04 14:45:31 +08:00
|
|
|
ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL,
|
2017-06-07 07:45:31 +08:00
|
|
|
&old_ref_mod, &new_ref_mod);
|
2012-03-12 23:03:00 +08:00
|
|
|
BUG_ON(ret); /* -ENOMEM */
|
2017-06-07 07:45:31 +08:00
|
|
|
pin = old_ref_mod >= 0 && new_ref_mod < 0;
|
2010-05-16 22:46:25 +08:00
|
|
|
}
|
|
|
|
|
2017-06-07 07:45:29 +08:00
|
|
|
if (last_ref && btrfs_header_generation(buf) == trans->transid) {
|
2015-01-07 04:18:45 +08:00
|
|
|
struct btrfs_block_group_cache *cache;
|
|
|
|
|
2010-05-16 22:46:25 +08:00
|
|
|
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
|
2016-06-23 06:54:24 +08:00
|
|
|
ret = check_ref_cleanup(trans, buf->start);
|
2010-05-16 22:46:25 +08:00
|
|
|
if (!ret)
|
2011-08-05 22:25:38 +08:00
|
|
|
goto out;
|
2010-05-16 22:46:25 +08:00
|
|
|
}
|
|
|
|
|
2017-06-07 07:45:28 +08:00
|
|
|
pin = 0;
|
2016-06-23 06:54:23 +08:00
|
|
|
cache = btrfs_lookup_block_group(fs_info, buf->start);
|
2015-01-07 04:18:45 +08:00
|
|
|
|
2010-05-16 22:46:25 +08:00
|
|
|
if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
|
2019-03-20 19:12:32 +08:00
|
|
|
pin_down_extent(cache, buf->start, buf->len, 1);
|
2015-01-07 04:18:45 +08:00
|
|
|
btrfs_put_block_group(cache);
|
2011-08-05 22:25:38 +08:00
|
|
|
goto out;
|
2010-05-16 22:46:25 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
|
|
|
|
|
|
|
|
btrfs_add_free_space(cache, buf->start, buf->len);
|
2016-07-25 15:51:39 +08:00
|
|
|
btrfs_free_reserved_bytes(cache, buf->len, 0);
|
2015-01-07 04:18:45 +08:00
|
|
|
btrfs_put_block_group(cache);
|
2016-09-07 04:00:42 +08:00
|
|
|
trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
|
2010-05-16 22:46:25 +08:00
|
|
|
}
|
|
|
|
out:
|
2013-06-20 03:00:04 +08:00
|
|
|
if (pin)
|
2019-05-15 07:33:48 +08:00
|
|
|
add_pinned_bytes(fs_info, &generic_ref);
|
2013-06-20 03:00:04 +08:00
|
|
|
|
2017-06-07 07:45:29 +08:00
|
|
|
if (last_ref) {
|
|
|
|
/*
|
|
|
|
* Deleting the buffer, clear the corrupt flag since it doesn't
|
|
|
|
* matter anymore.
|
|
|
|
*/
|
|
|
|
clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
|
|
|
|
}
|
2010-05-16 22:46:25 +08:00
|
|
|
}
|
|
|
|
|
2012-03-12 23:03:00 +08:00
|
|
|
/* Can return -ENOMEM */
|
2019-04-04 14:45:36 +08:00
|
|
|
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
|
2008-06-26 04:01:30 +08:00
|
|
|
{
|
2019-04-04 14:45:36 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2017-06-07 07:45:31 +08:00
|
|
|
int old_ref_mod, new_ref_mod;
|
2008-06-26 04:01:30 +08:00
|
|
|
int ret;
|
|
|
|
|
2016-06-21 21:52:41 +08:00
|
|
|
if (btrfs_is_testing(fs_info))
|
2014-05-08 05:06:09 +08:00
|
|
|
return 0;
|
2014-09-30 05:53:21 +08:00
|
|
|
|
2009-03-13 22:10:06 +08:00
|
|
|
/*
|
|
|
|
* tree log blocks never actually go into the extent allocation
|
|
|
|
* tree, just update pinning info and exit early.
|
|
|
|
*/
|
2019-04-04 14:45:36 +08:00
|
|
|
if ((ref->type == BTRFS_REF_METADATA &&
|
|
|
|
ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
|
|
|
|
(ref->type == BTRFS_REF_DATA &&
|
|
|
|
ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
|
2009-03-13 23:00:37 +08:00
|
|
|
/* unlocks the pinned mutex */
|
2019-04-04 14:45:36 +08:00
|
|
|
btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1);
|
2017-06-07 07:45:31 +08:00
|
|
|
old_ref_mod = new_ref_mod = 0;
|
2009-03-13 22:10:06 +08:00
|
|
|
ret = 0;
|
2019-04-04 14:45:36 +08:00
|
|
|
} else if (ref->type == BTRFS_REF_METADATA) {
|
|
|
|
ret = btrfs_add_delayed_tree_ref(trans, ref, NULL,
|
2017-06-07 07:45:31 +08:00
|
|
|
&old_ref_mod, &new_ref_mod);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
} else {
|
2019-04-04 14:45:36 +08:00
|
|
|
ret = btrfs_add_delayed_data_ref(trans, ref, 0,
|
2017-06-07 07:45:31 +08:00
|
|
|
&old_ref_mod, &new_ref_mod);
|
2009-03-13 22:10:06 +08:00
|
|
|
}
|
2017-06-07 07:45:31 +08:00
|
|
|
|
2019-04-04 14:45:36 +08:00
|
|
|
if (!((ref->type == BTRFS_REF_METADATA &&
|
|
|
|
ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
|
|
|
|
(ref->type == BTRFS_REF_DATA &&
|
|
|
|
ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
|
|
|
|
btrfs_ref_tree_mod(fs_info, ref);
|
2019-04-04 14:45:33 +08:00
|
|
|
|
2019-04-04 14:45:34 +08:00
|
|
|
if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
|
2019-05-15 07:33:48 +08:00
|
|
|
add_pinned_bytes(fs_info, ref);
|
2017-06-07 07:45:31 +08:00
|
|
|
|
2008-06-26 04:01:30 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
/*
|
|
|
|
* when we wait for progress in the block group caching, its because
|
|
|
|
* our allocation attempt failed at least once. So, we must sleep
|
|
|
|
* and let some progress happen before we try again.
|
|
|
|
*
|
|
|
|
* This function will sleep at least once waiting for new free space to
|
|
|
|
* show up, and then it will check the block group free space numbers
|
|
|
|
* for our min num_bytes. Another option is to have it go ahead
|
|
|
|
* and look in the rbtree for a free extent of a given size, but this
|
|
|
|
* is a good start.
|
2013-08-05 23:15:21 +08:00
|
|
|
*
|
|
|
|
* Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
|
|
|
|
* any of the information in this block group.
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
*/
|
2013-08-05 23:15:21 +08:00
|
|
|
static noinline void
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
|
|
|
|
u64 num_bytes)
|
|
|
|
{
|
2009-09-12 04:11:19 +08:00
|
|
|
struct btrfs_caching_control *caching_ctl;
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
|
2009-09-12 04:11:19 +08:00
|
|
|
caching_ctl = get_caching_control(cache);
|
|
|
|
if (!caching_ctl)
|
2013-08-05 23:15:21 +08:00
|
|
|
return;
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
|
2009-09-12 04:11:19 +08:00
|
|
|
wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
|
2011-03-29 13:46:06 +08:00
|
|
|
(cache->free_space_ctl->free_space >= num_bytes));
|
2009-09-12 04:11:19 +08:00
|
|
|
|
|
|
|
put_caching_control(caching_ctl);
|
|
|
|
}
|
|
|
|
|
|
|
|
static noinline int
|
|
|
|
wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
|
|
|
|
{
|
|
|
|
struct btrfs_caching_control *caching_ctl;
|
2013-08-05 23:15:21 +08:00
|
|
|
int ret = 0;
|
2009-09-12 04:11:19 +08:00
|
|
|
|
|
|
|
caching_ctl = get_caching_control(cache);
|
|
|
|
if (!caching_ctl)
|
2013-08-05 23:15:21 +08:00
|
|
|
return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
|
2009-09-12 04:11:19 +08:00
|
|
|
|
|
|
|
wait_event(caching_ctl->wait, block_group_cache_done(cache));
|
2013-08-05 23:15:21 +08:00
|
|
|
if (cache->cached == BTRFS_CACHE_ERROR)
|
|
|
|
ret = -EIO;
|
2009-09-12 04:11:19 +08:00
|
|
|
put_caching_control(caching_ctl);
|
2013-08-05 23:15:21 +08:00
|
|
|
return ret;
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
enum btrfs_loop_type {
|
2019-06-19 02:00:08 +08:00
|
|
|
LOOP_CACHING_NOWAIT,
|
|
|
|
LOOP_CACHING_WAIT,
|
|
|
|
LOOP_ALLOC_CHUNK,
|
|
|
|
LOOP_NO_EMPTY_SIZE,
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
};
|
|
|
|
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
static inline void
|
|
|
|
btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
|
|
|
|
int delalloc)
|
|
|
|
{
|
|
|
|
if (delalloc)
|
|
|
|
down_read(&cache->data_rwsem);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void
|
|
|
|
btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
|
|
|
|
int delalloc)
|
|
|
|
{
|
|
|
|
btrfs_get_block_group(cache);
|
|
|
|
if (delalloc)
|
|
|
|
down_read(&cache->data_rwsem);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct btrfs_block_group_cache *
|
|
|
|
btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
|
|
|
|
struct btrfs_free_cluster *cluster,
|
|
|
|
int delalloc)
|
|
|
|
{
|
2016-02-16 16:02:47 +08:00
|
|
|
struct btrfs_block_group_cache *used_bg = NULL;
|
2014-06-22 20:30:09 +08:00
|
|
|
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
spin_lock(&cluster->refill_lock);
|
2014-06-22 20:30:09 +08:00
|
|
|
while (1) {
|
|
|
|
used_bg = cluster->block_group;
|
|
|
|
if (!used_bg)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
if (used_bg == block_group)
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
return used_bg;
|
|
|
|
|
2014-06-22 20:30:09 +08:00
|
|
|
btrfs_get_block_group(used_bg);
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
|
2014-06-22 20:30:09 +08:00
|
|
|
if (!delalloc)
|
|
|
|
return used_bg;
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
|
2014-06-22 20:30:09 +08:00
|
|
|
if (down_read_trylock(&used_bg->data_rwsem))
|
|
|
|
return used_bg;
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
|
2014-06-22 20:30:09 +08:00
|
|
|
spin_unlock(&cluster->refill_lock);
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
|
2016-12-01 08:11:04 +08:00
|
|
|
/* We should only have one-level nested. */
|
|
|
|
down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
|
2014-06-22 20:30:09 +08:00
|
|
|
spin_lock(&cluster->refill_lock);
|
|
|
|
if (used_bg == cluster->block_group)
|
|
|
|
return used_bg;
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
|
2014-06-22 20:30:09 +08:00
|
|
|
up_read(&used_bg->data_rwsem);
|
|
|
|
btrfs_put_block_group(used_bg);
|
|
|
|
}
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void
|
|
|
|
btrfs_release_block_group(struct btrfs_block_group_cache *cache,
|
|
|
|
int delalloc)
|
|
|
|
{
|
|
|
|
if (delalloc)
|
|
|
|
up_read(&cache->data_rwsem);
|
|
|
|
btrfs_put_block_group(cache);
|
|
|
|
}
|
|
|
|
|
2018-11-02 09:39:47 +08:00
|
|
|
/*
|
|
|
|
* Structure used internally for find_free_extent() function. Wraps needed
|
|
|
|
* parameters.
|
|
|
|
*/
|
|
|
|
struct find_free_extent_ctl {
|
|
|
|
/* Basic allocation info */
|
|
|
|
u64 ram_bytes;
|
|
|
|
u64 num_bytes;
|
|
|
|
u64 empty_size;
|
|
|
|
u64 flags;
|
|
|
|
int delalloc;
|
|
|
|
|
|
|
|
/* Where to start the search inside the bg */
|
|
|
|
u64 search_start;
|
|
|
|
|
|
|
|
/* For clustered allocation */
|
|
|
|
u64 empty_cluster;
|
|
|
|
|
|
|
|
bool have_caching_bg;
|
|
|
|
bool orig_have_caching_bg;
|
|
|
|
|
|
|
|
/* RAID index, converted from flags */
|
|
|
|
int index;
|
|
|
|
|
2018-11-02 09:39:50 +08:00
|
|
|
/*
|
|
|
|
* Current loop number, check find_free_extent_update_loop() for details
|
|
|
|
*/
|
2018-11-02 09:39:47 +08:00
|
|
|
int loop;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Whether we're refilling a cluster, if true we need to re-search
|
|
|
|
* current block group but don't try to refill the cluster again.
|
|
|
|
*/
|
|
|
|
bool retry_clustered;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Whether we're updating free space cache, if true we need to re-search
|
|
|
|
* current block group but don't try updating free space cache again.
|
|
|
|
*/
|
|
|
|
bool retry_unclustered;
|
|
|
|
|
|
|
|
/* If current block group is cached */
|
|
|
|
int cached;
|
|
|
|
|
|
|
|
/* Max contiguous hole found */
|
|
|
|
u64 max_extent_size;
|
|
|
|
|
|
|
|
/* Total free space from free space cache, not always contiguous */
|
|
|
|
u64 total_free_space;
|
|
|
|
|
|
|
|
/* Found result */
|
|
|
|
u64 found_offset;
|
|
|
|
};
|
|
|
|
|
2018-11-02 09:39:48 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Helper function for find_free_extent().
|
|
|
|
*
|
|
|
|
* Return -ENOENT to inform caller that we need fallback to unclustered mode.
|
|
|
|
* Return -EAGAIN to inform caller that we need to re-search this block group
|
|
|
|
* Return >0 to inform caller that we find nothing
|
|
|
|
* Return 0 means we have found a location and set ffe_ctl->found_offset.
|
|
|
|
*/
|
|
|
|
static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
|
|
|
|
struct btrfs_free_cluster *last_ptr,
|
|
|
|
struct find_free_extent_ctl *ffe_ctl,
|
|
|
|
struct btrfs_block_group_cache **cluster_bg_ret)
|
|
|
|
{
|
|
|
|
struct btrfs_block_group_cache *cluster_bg;
|
|
|
|
u64 aligned_cluster;
|
|
|
|
u64 offset;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
|
|
|
|
if (!cluster_bg)
|
|
|
|
goto refill_cluster;
|
|
|
|
if (cluster_bg != bg && (cluster_bg->ro ||
|
|
|
|
!block_group_bits(cluster_bg, ffe_ctl->flags)))
|
|
|
|
goto release_cluster;
|
|
|
|
|
|
|
|
offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
|
|
|
|
ffe_ctl->num_bytes, cluster_bg->key.objectid,
|
|
|
|
&ffe_ctl->max_extent_size);
|
|
|
|
if (offset) {
|
|
|
|
/* We have a block, we're done */
|
|
|
|
spin_unlock(&last_ptr->refill_lock);
|
|
|
|
trace_btrfs_reserve_extent_cluster(cluster_bg,
|
|
|
|
ffe_ctl->search_start, ffe_ctl->num_bytes);
|
|
|
|
*cluster_bg_ret = cluster_bg;
|
|
|
|
ffe_ctl->found_offset = offset;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
WARN_ON(last_ptr->block_group != cluster_bg);
|
|
|
|
|
|
|
|
release_cluster:
|
|
|
|
/*
|
|
|
|
* If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so
|
|
|
|
* lets just skip it and let the allocator find whatever block it can
|
|
|
|
* find. If we reach this point, we will have tried the cluster
|
|
|
|
* allocator plenty of times and not have found anything, so we are
|
|
|
|
* likely way too fragmented for the clustering stuff to find anything.
|
|
|
|
*
|
|
|
|
* However, if the cluster is taken from the current block group,
|
|
|
|
* release the cluster first, so that we stand a better chance of
|
|
|
|
* succeeding in the unclustered allocation.
|
|
|
|
*/
|
|
|
|
if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
|
|
|
|
spin_unlock(&last_ptr->refill_lock);
|
|
|
|
btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
|
|
|
|
return -ENOENT;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* This cluster didn't work out, free it and start over */
|
|
|
|
btrfs_return_cluster_to_free_space(NULL, last_ptr);
|
|
|
|
|
|
|
|
if (cluster_bg != bg)
|
|
|
|
btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
|
|
|
|
|
|
|
|
refill_cluster:
|
|
|
|
if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
|
|
|
|
spin_unlock(&last_ptr->refill_lock);
|
|
|
|
return -ENOENT;
|
|
|
|
}
|
|
|
|
|
|
|
|
aligned_cluster = max_t(u64,
|
|
|
|
ffe_ctl->empty_cluster + ffe_ctl->empty_size,
|
|
|
|
bg->full_stripe_len);
|
2019-03-20 20:53:49 +08:00
|
|
|
ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
|
|
|
|
ffe_ctl->num_bytes, aligned_cluster);
|
2018-11-02 09:39:48 +08:00
|
|
|
if (ret == 0) {
|
|
|
|
/* Now pull our allocation out of this cluster */
|
|
|
|
offset = btrfs_alloc_from_cluster(bg, last_ptr,
|
|
|
|
ffe_ctl->num_bytes, ffe_ctl->search_start,
|
|
|
|
&ffe_ctl->max_extent_size);
|
|
|
|
if (offset) {
|
|
|
|
/* We found one, proceed */
|
|
|
|
spin_unlock(&last_ptr->refill_lock);
|
|
|
|
trace_btrfs_reserve_extent_cluster(bg,
|
|
|
|
ffe_ctl->search_start,
|
|
|
|
ffe_ctl->num_bytes);
|
|
|
|
ffe_ctl->found_offset = offset;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
} else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
|
|
|
|
!ffe_ctl->retry_clustered) {
|
|
|
|
spin_unlock(&last_ptr->refill_lock);
|
|
|
|
|
|
|
|
ffe_ctl->retry_clustered = true;
|
|
|
|
wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
|
|
|
|
ffe_ctl->empty_cluster + ffe_ctl->empty_size);
|
|
|
|
return -EAGAIN;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* At this point we either didn't find a cluster or we weren't able to
|
|
|
|
* allocate a block from our cluster. Free the cluster we've been
|
|
|
|
* trying to use, and go to the next block group.
|
|
|
|
*/
|
|
|
|
btrfs_return_cluster_to_free_space(NULL, last_ptr);
|
|
|
|
spin_unlock(&last_ptr->refill_lock);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2018-11-02 09:39:49 +08:00
|
|
|
/*
|
|
|
|
* Return >0 to inform caller that we find nothing
|
|
|
|
* Return 0 when we found an free extent and set ffe_ctrl->found_offset
|
|
|
|
* Return -EAGAIN to inform caller that we need to re-search this block group
|
|
|
|
*/
|
|
|
|
static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
|
|
|
|
struct btrfs_free_cluster *last_ptr,
|
|
|
|
struct find_free_extent_ctl *ffe_ctl)
|
|
|
|
{
|
|
|
|
u64 offset;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We are doing an unclustered allocation, set the fragmented flag so
|
|
|
|
* we don't bother trying to setup a cluster again until we get more
|
|
|
|
* space.
|
|
|
|
*/
|
|
|
|
if (unlikely(last_ptr)) {
|
|
|
|
spin_lock(&last_ptr->lock);
|
|
|
|
last_ptr->fragmented = 1;
|
|
|
|
spin_unlock(&last_ptr->lock);
|
|
|
|
}
|
|
|
|
if (ffe_ctl->cached) {
|
|
|
|
struct btrfs_free_space_ctl *free_space_ctl;
|
|
|
|
|
|
|
|
free_space_ctl = bg->free_space_ctl;
|
|
|
|
spin_lock(&free_space_ctl->tree_lock);
|
|
|
|
if (free_space_ctl->free_space <
|
|
|
|
ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
|
|
|
|
ffe_ctl->empty_size) {
|
|
|
|
ffe_ctl->total_free_space = max_t(u64,
|
|
|
|
ffe_ctl->total_free_space,
|
|
|
|
free_space_ctl->free_space);
|
|
|
|
spin_unlock(&free_space_ctl->tree_lock);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
spin_unlock(&free_space_ctl->tree_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
|
|
|
|
ffe_ctl->num_bytes, ffe_ctl->empty_size,
|
|
|
|
&ffe_ctl->max_extent_size);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we didn't find a chunk, and we haven't failed on this block group
|
|
|
|
* before, and this block group is in the middle of caching and we are
|
|
|
|
* ok with waiting, then go ahead and wait for progress to be made, and
|
|
|
|
* set @retry_unclustered to true.
|
|
|
|
*
|
|
|
|
* If @retry_unclustered is true then we've already waited on this
|
|
|
|
* block group once and should move on to the next block group.
|
|
|
|
*/
|
|
|
|
if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
|
|
|
|
ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
|
|
|
|
wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
|
|
|
|
ffe_ctl->empty_size);
|
|
|
|
ffe_ctl->retry_unclustered = true;
|
|
|
|
return -EAGAIN;
|
|
|
|
} else if (!offset) {
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
ffe_ctl->found_offset = offset;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-11-02 09:39:50 +08:00
|
|
|
/*
|
|
|
|
* Return >0 means caller needs to re-search for free extent
|
|
|
|
* Return 0 means we have the needed free extent.
|
|
|
|
* Return <0 means we failed to locate any free extent.
|
|
|
|
*/
|
|
|
|
static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_free_cluster *last_ptr,
|
|
|
|
struct btrfs_key *ins,
|
|
|
|
struct find_free_extent_ctl *ffe_ctl,
|
|
|
|
int full_search, bool use_cluster)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = fs_info->extent_root;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
|
|
|
|
ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
|
|
|
|
ffe_ctl->orig_have_caching_bg = true;
|
|
|
|
|
|
|
|
if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
|
|
|
|
ffe_ctl->have_caching_bg)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
if (ins->objectid) {
|
|
|
|
if (!use_cluster && last_ptr) {
|
|
|
|
spin_lock(&last_ptr->lock);
|
|
|
|
last_ptr->window_start = ins->objectid;
|
|
|
|
spin_unlock(&last_ptr->lock);
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
|
|
|
|
* caching kthreads as we move along
|
|
|
|
* LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
|
|
|
|
* LOOP_ALLOC_CHUNK, force a chunk allocation and try again
|
|
|
|
* LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
|
|
|
|
* again
|
|
|
|
*/
|
|
|
|
if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
|
|
|
|
ffe_ctl->index = 0;
|
|
|
|
if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
|
|
|
|
/*
|
|
|
|
* We want to skip the LOOP_CACHING_WAIT step if we
|
|
|
|
* don't have any uncached bgs and we've already done a
|
|
|
|
* full search through.
|
|
|
|
*/
|
|
|
|
if (ffe_ctl->orig_have_caching_bg || !full_search)
|
|
|
|
ffe_ctl->loop = LOOP_CACHING_WAIT;
|
|
|
|
else
|
|
|
|
ffe_ctl->loop = LOOP_ALLOC_CHUNK;
|
|
|
|
} else {
|
|
|
|
ffe_ctl->loop++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
int exist = 0;
|
|
|
|
|
|
|
|
trans = current->journal_info;
|
|
|
|
if (trans)
|
|
|
|
exist = 1;
|
|
|
|
else
|
|
|
|
trans = btrfs_join_transaction(root);
|
|
|
|
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2019-06-19 04:09:17 +08:00
|
|
|
ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
|
|
|
|
CHUNK_ALLOC_FORCE);
|
2018-11-02 09:39:50 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we can't allocate a new chunk we've already looped
|
|
|
|
* through at least once, move on to the NO_EMPTY_SIZE
|
|
|
|
* case.
|
|
|
|
*/
|
|
|
|
if (ret == -ENOSPC)
|
|
|
|
ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
|
|
|
|
|
|
|
|
/* Do not bail out on ENOSPC since we can do more. */
|
|
|
|
if (ret < 0 && ret != -ENOSPC)
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
else
|
|
|
|
ret = 0;
|
|
|
|
if (!exist)
|
|
|
|
btrfs_end_transaction(trans);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
|
|
|
|
/*
|
|
|
|
* Don't loop again if we already have no empty_size and
|
|
|
|
* no empty_cluster.
|
|
|
|
*/
|
|
|
|
if (ffe_ctl->empty_size == 0 &&
|
|
|
|
ffe_ctl->empty_cluster == 0)
|
|
|
|
return -ENOSPC;
|
|
|
|
ffe_ctl->empty_size = 0;
|
|
|
|
ffe_ctl->empty_cluster = 0;
|
|
|
|
}
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
return -ENOSPC;
|
|
|
|
}
|
|
|
|
|
2007-02-26 23:40:21 +08:00
|
|
|
/*
|
|
|
|
* walks the btree of allocated extents and find a hole of a given size.
|
|
|
|
* The key ins is changed to record the hole:
|
2013-09-09 13:19:42 +08:00
|
|
|
* ins->objectid == start position
|
2007-03-16 00:56:47 +08:00
|
|
|
* ins->flags = BTRFS_EXTENT_ITEM_KEY
|
2013-09-09 13:19:42 +08:00
|
|
|
* ins->offset == the size of the hole.
|
2007-02-26 23:40:21 +08:00
|
|
|
* Any available blocks before search_start are skipped.
|
2013-09-09 13:19:42 +08:00
|
|
|
*
|
|
|
|
* If there is no suitable free space, we will record the max size of
|
|
|
|
* the free space extent currently.
|
2018-11-02 09:39:50 +08:00
|
|
|
*
|
|
|
|
* The overall logic and call chain:
|
|
|
|
*
|
|
|
|
* find_free_extent()
|
|
|
|
* |- Iterate through all block groups
|
|
|
|
* | |- Get a valid block group
|
|
|
|
* | |- Try to do clustered allocation in that block group
|
|
|
|
* | |- Try to do unclustered allocation in that block group
|
|
|
|
* | |- Check if the result is valid
|
|
|
|
* | | |- If valid, then exit
|
|
|
|
* | |- Jump to next block group
|
|
|
|
* |
|
|
|
|
* |- Push harder to find free extents
|
|
|
|
* |- If not found, re-iterate all block groups
|
2007-02-26 23:40:21 +08:00
|
|
|
*/
|
2017-02-16 05:28:27 +08:00
|
|
|
static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
|
btrfs: update btrfs_space_info's bytes_may_use timely
This patch can fix some false ENOSPC errors, below test script can
reproduce one false ENOSPC error:
#!/bin/bash
dd if=/dev/zero of=fs.img bs=$((1024*1024)) count=128
dev=$(losetup --show -f fs.img)
mkfs.btrfs -f -M $dev
mkdir /tmp/mntpoint
mount $dev /tmp/mntpoint
cd /tmp/mntpoint
xfs_io -f -c "falloc 0 $((64*1024*1024))" testfile
Above script will fail for ENOSPC reason, but indeed fs still has free
space to satisfy this request. Please see call graph:
btrfs_fallocate()
|-> btrfs_alloc_data_chunk_ondemand()
| bytes_may_use += 64M
|-> btrfs_prealloc_file_range()
|-> btrfs_reserve_extent()
|-> btrfs_add_reserved_bytes()
| alloc_type is RESERVE_ALLOC_NO_ACCOUNT, so it does not
| change bytes_may_use, and bytes_reserved += 64M. Now
| bytes_may_use + bytes_reserved == 128M, which is greater
| than btrfs_space_info's total_bytes, false enospc occurs.
| Note, the bytes_may_use decrease operation will be done in
| end of btrfs_fallocate(), which is too late.
Here is another simple case for buffered write:
CPU 1 | CPU 2
|
|-> cow_file_range() |-> __btrfs_buffered_write()
|-> btrfs_reserve_extent() | |
| | |
| | |
| ..... | |-> btrfs_check_data_free_space()
| |
| |
|-> extent_clear_unlock_delalloc() |
In CPU 1, btrfs_reserve_extent()->find_free_extent()->
btrfs_add_reserved_bytes() do not decrease bytes_may_use, the decrease
operation will be delayed to be done in extent_clear_unlock_delalloc().
Assume in this case, btrfs_reserve_extent() reserved 128MB data, CPU2's
btrfs_check_data_free_space() tries to reserve 100MB data space.
If
100MB > data_sinfo->total_bytes - data_sinfo->bytes_used -
data_sinfo->bytes_reserved - data_sinfo->bytes_pinned -
data_sinfo->bytes_readonly - data_sinfo->bytes_may_use
btrfs_check_data_free_space() will try to allcate new data chunk or call
btrfs_start_delalloc_roots(), or commit current transaction in order to
reserve some free space, obviously a lot of work. But indeed it's not
necessary as long as decreasing bytes_may_use timely, we still have
free space, decreasing 128M from bytes_may_use.
To fix this issue, this patch chooses to update bytes_may_use for both
data and metadata in btrfs_add_reserved_bytes(). For compress path, real
extent length may not be equal to file content length, so introduce a
ram_bytes argument for btrfs_reserve_extent(), find_free_extent() and
btrfs_add_reserved_bytes(), it's becasue bytes_may_use is increased by
file content length. Then compress path can update bytes_may_use
correctly. Also now we can discard RESERVE_ALLOC_NO_ACCOUNT, RESERVE_ALLOC
and RESERVE_FREE.
As we know, usually EXTENT_DO_ACCOUNTING is used for error path. In
run_delalloc_nocow(), for inode marked as NODATACOW or extent marked as
PREALLOC, we also need to update bytes_may_use, but can not pass
EXTENT_DO_ACCOUNTING, because it also clears metadata reservation, so
here we introduce EXTENT_CLEAR_DATA_RESV flag to indicate btrfs_clear_bit_hook()
to update btrfs_space_info's bytes_may_use.
Meanwhile __btrfs_prealloc_file_range() will call
btrfs_free_reserved_data_space() internally for both sucessful and failed
path, btrfs_prealloc_file_range()'s callers does not need to call
btrfs_free_reserved_data_space() any more.
Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-07-25 15:51:40 +08:00
|
|
|
u64 ram_bytes, u64 num_bytes, u64 empty_size,
|
|
|
|
u64 hint_byte, struct btrfs_key *ins,
|
|
|
|
u64 flags, int delalloc)
|
2007-02-26 23:40:21 +08:00
|
|
|
{
|
Btrfs: fix enospc when there is plenty of space
So there is an odd case where we can possibly return -ENOSPC when there is in
fact space to be had. It only happens with Metadata writes, and happens _very_
infrequently. What has to happen is we have to allocate have allocated out of
the first logical byte on the disk, which would set last_alloc to
first_logical_byte(root, 0), so search_start == orig_search_start. We then
need to allocate for normal metadata, so BTRFS_BLOCK_GROUP_METADATA |
BTRFS_BLOCK_GROUP_DUP. We will do a block lookup for the given search_start,
block_group_bits() won't match and we'll go to choose another block group.
However because search_start matches orig_search_start we go to see if we can
allocate a chunk.
If we are in the situation that we cannot allocate a chunk, we fail and ENOSPC.
This is kind of a big flaw of the way find_free_extent works, as it along with
find_free_space loop through _all_ of the block groups, not just the ones that
we want to allocate out of. This patch completely kills find_free_space and
rolls it into find_free_extent. I've introduced a sort of state machine into
this, which will make it easier to get cache miss information out of the
allocator, and will work well with my locking changes.
The basic flow is this: We have the variable loop which is 0, meaning we are
in the hint phase. We lookup the block group for the hint, and lookup the
space_info for what we want to allocate out of. If the block group we were
pointed at by the hint either isn't of the correct type, or just doesn't have
the space we need, we set head to space_info->block_groups, so we start at the
beginning of the block groups for this particular space info, and loop through.
This is also where we add the empty_cluster to total_needed. At this point
loop is set to 1 and we just loop through all of the block groups for this
particular space_info looking for the space we need, just as find_free_space
would have done, except we only hit the block groups we want and not _all_ of
the block groups. If we come full circle we see if we can allocate a chunk.
If we cannot of course we exit with -ENOSPC and we are good. If not we start
over at space_info->block_groups and loop through again, with loop == 2. If we
come full circle and haven't found what we need then we exit with -ENOSPC.
I've been running this for a couple of days now and it seems stable, and I
haven't yet hit a -ENOSPC when there was plenty of space left.
Also I've made a groups_sem to handle the group list for the space_info. This
is part of my locking changes, but is relatively safe and seems better than
holding the space_info spinlock over that entire search time. Thanks,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
2008-10-30 02:49:05 +08:00
|
|
|
int ret = 0;
|
2009-04-03 21:47:43 +08:00
|
|
|
struct btrfs_free_cluster *last_ptr = NULL;
|
Btrfs: fix enospc when there is plenty of space
So there is an odd case where we can possibly return -ENOSPC when there is in
fact space to be had. It only happens with Metadata writes, and happens _very_
infrequently. What has to happen is we have to allocate have allocated out of
the first logical byte on the disk, which would set last_alloc to
first_logical_byte(root, 0), so search_start == orig_search_start. We then
need to allocate for normal metadata, so BTRFS_BLOCK_GROUP_METADATA |
BTRFS_BLOCK_GROUP_DUP. We will do a block lookup for the given search_start,
block_group_bits() won't match and we'll go to choose another block group.
However because search_start matches orig_search_start we go to see if we can
allocate a chunk.
If we are in the situation that we cannot allocate a chunk, we fail and ENOSPC.
This is kind of a big flaw of the way find_free_extent works, as it along with
find_free_space loop through _all_ of the block groups, not just the ones that
we want to allocate out of. This patch completely kills find_free_space and
rolls it into find_free_extent. I've introduced a sort of state machine into
this, which will make it easier to get cache miss information out of the
allocator, and will work well with my locking changes.
The basic flow is this: We have the variable loop which is 0, meaning we are
in the hint phase. We lookup the block group for the hint, and lookup the
space_info for what we want to allocate out of. If the block group we were
pointed at by the hint either isn't of the correct type, or just doesn't have
the space we need, we set head to space_info->block_groups, so we start at the
beginning of the block groups for this particular space info, and loop through.
This is also where we add the empty_cluster to total_needed. At this point
loop is set to 1 and we just loop through all of the block groups for this
particular space_info looking for the space we need, just as find_free_space
would have done, except we only hit the block groups we want and not _all_ of
the block groups. If we come full circle we see if we can allocate a chunk.
If we cannot of course we exit with -ENOSPC and we are good. If not we start
over at space_info->block_groups and loop through again, with loop == 2. If we
come full circle and haven't found what we need then we exit with -ENOSPC.
I've been running this for a couple of days now and it seems stable, and I
haven't yet hit a -ENOSPC when there was plenty of space left.
Also I've made a groups_sem to handle the group list for the space_info. This
is part of my locking changes, but is relatively safe and seems better than
holding the space_info spinlock over that entire search time. Thanks,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
2008-10-30 02:49:05 +08:00
|
|
|
struct btrfs_block_group_cache *block_group = NULL;
|
2018-11-02 09:39:47 +08:00
|
|
|
struct find_free_extent_ctl ffe_ctl = {0};
|
Btrfs: fix enospc when there is plenty of space
So there is an odd case where we can possibly return -ENOSPC when there is in
fact space to be had. It only happens with Metadata writes, and happens _very_
infrequently. What has to happen is we have to allocate have allocated out of
the first logical byte on the disk, which would set last_alloc to
first_logical_byte(root, 0), so search_start == orig_search_start. We then
need to allocate for normal metadata, so BTRFS_BLOCK_GROUP_METADATA |
BTRFS_BLOCK_GROUP_DUP. We will do a block lookup for the given search_start,
block_group_bits() won't match and we'll go to choose another block group.
However because search_start matches orig_search_start we go to see if we can
allocate a chunk.
If we are in the situation that we cannot allocate a chunk, we fail and ENOSPC.
This is kind of a big flaw of the way find_free_extent works, as it along with
find_free_space loop through _all_ of the block groups, not just the ones that
we want to allocate out of. This patch completely kills find_free_space and
rolls it into find_free_extent. I've introduced a sort of state machine into
this, which will make it easier to get cache miss information out of the
allocator, and will work well with my locking changes.
The basic flow is this: We have the variable loop which is 0, meaning we are
in the hint phase. We lookup the block group for the hint, and lookup the
space_info for what we want to allocate out of. If the block group we were
pointed at by the hint either isn't of the correct type, or just doesn't have
the space we need, we set head to space_info->block_groups, so we start at the
beginning of the block groups for this particular space info, and loop through.
This is also where we add the empty_cluster to total_needed. At this point
loop is set to 1 and we just loop through all of the block groups for this
particular space_info looking for the space we need, just as find_free_space
would have done, except we only hit the block groups we want and not _all_ of
the block groups. If we come full circle we see if we can allocate a chunk.
If we cannot of course we exit with -ENOSPC and we are good. If not we start
over at space_info->block_groups and loop through again, with loop == 2. If we
come full circle and haven't found what we need then we exit with -ENOSPC.
I've been running this for a couple of days now and it seems stable, and I
haven't yet hit a -ENOSPC when there was plenty of space left.
Also I've made a groups_sem to handle the group list for the space_info. This
is part of my locking changes, but is relatively safe and seems better than
holding the space_info spinlock over that entire search time. Thanks,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
2008-10-30 02:49:05 +08:00
|
|
|
struct btrfs_space_info *space_info;
|
2010-09-17 04:19:09 +08:00
|
|
|
bool use_cluster = true;
|
2015-10-02 02:54:10 +08:00
|
|
|
bool full_search = false;
|
2007-02-26 23:40:21 +08:00
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
WARN_ON(num_bytes < fs_info->sectorsize);
|
2018-11-02 09:39:47 +08:00
|
|
|
|
|
|
|
ffe_ctl.ram_bytes = ram_bytes;
|
|
|
|
ffe_ctl.num_bytes = num_bytes;
|
|
|
|
ffe_ctl.empty_size = empty_size;
|
|
|
|
ffe_ctl.flags = flags;
|
|
|
|
ffe_ctl.search_start = 0;
|
|
|
|
ffe_ctl.retry_clustered = false;
|
|
|
|
ffe_ctl.retry_unclustered = false;
|
|
|
|
ffe_ctl.delalloc = delalloc;
|
|
|
|
ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
|
|
|
|
ffe_ctl.have_caching_bg = false;
|
|
|
|
ffe_ctl.orig_have_caching_bg = false;
|
|
|
|
ffe_ctl.found_offset = 0;
|
|
|
|
|
2014-06-05 00:41:45 +08:00
|
|
|
ins->type = BTRFS_EXTENT_ITEM_KEY;
|
Btrfs: fix enospc when there is plenty of space
So there is an odd case where we can possibly return -ENOSPC when there is in
fact space to be had. It only happens with Metadata writes, and happens _very_
infrequently. What has to happen is we have to allocate have allocated out of
the first logical byte on the disk, which would set last_alloc to
first_logical_byte(root, 0), so search_start == orig_search_start. We then
need to allocate for normal metadata, so BTRFS_BLOCK_GROUP_METADATA |
BTRFS_BLOCK_GROUP_DUP. We will do a block lookup for the given search_start,
block_group_bits() won't match and we'll go to choose another block group.
However because search_start matches orig_search_start we go to see if we can
allocate a chunk.
If we are in the situation that we cannot allocate a chunk, we fail and ENOSPC.
This is kind of a big flaw of the way find_free_extent works, as it along with
find_free_space loop through _all_ of the block groups, not just the ones that
we want to allocate out of. This patch completely kills find_free_space and
rolls it into find_free_extent. I've introduced a sort of state machine into
this, which will make it easier to get cache miss information out of the
allocator, and will work well with my locking changes.
The basic flow is this: We have the variable loop which is 0, meaning we are
in the hint phase. We lookup the block group for the hint, and lookup the
space_info for what we want to allocate out of. If the block group we were
pointed at by the hint either isn't of the correct type, or just doesn't have
the space we need, we set head to space_info->block_groups, so we start at the
beginning of the block groups for this particular space info, and loop through.
This is also where we add the empty_cluster to total_needed. At this point
loop is set to 1 and we just loop through all of the block groups for this
particular space_info looking for the space we need, just as find_free_space
would have done, except we only hit the block groups we want and not _all_ of
the block groups. If we come full circle we see if we can allocate a chunk.
If we cannot of course we exit with -ENOSPC and we are good. If not we start
over at space_info->block_groups and loop through again, with loop == 2. If we
come full circle and haven't found what we need then we exit with -ENOSPC.
I've been running this for a couple of days now and it seems stable, and I
haven't yet hit a -ENOSPC when there was plenty of space left.
Also I've made a groups_sem to handle the group list for the space_info. This
is part of my locking changes, but is relatively safe and seems better than
holding the space_info spinlock over that entire search time. Thanks,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
2008-10-30 02:49:05 +08:00
|
|
|
ins->objectid = 0;
|
|
|
|
ins->offset = 0;
|
2007-04-05 03:27:52 +08:00
|
|
|
|
2016-09-07 04:00:42 +08:00
|
|
|
trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
|
2011-11-10 21:29:20 +08:00
|
|
|
|
2019-06-19 04:09:19 +08:00
|
|
|
space_info = btrfs_find_space_info(fs_info, flags);
|
2010-03-20 04:49:55 +08:00
|
|
|
if (!space_info) {
|
2016-06-23 06:54:23 +08:00
|
|
|
btrfs_err(fs_info, "No space info for %llu", flags);
|
2010-03-20 04:49:55 +08:00
|
|
|
return -ENOSPC;
|
|
|
|
}
|
2009-04-03 22:14:19 +08:00
|
|
|
|
2010-09-17 04:19:09 +08:00
|
|
|
/*
|
2015-09-29 23:40:47 +08:00
|
|
|
* If our free space is heavily fragmented we may not be able to make
|
|
|
|
* big contiguous allocations, so instead of doing the expensive search
|
|
|
|
* for free space, simply return ENOSPC with our max_extent_size so we
|
|
|
|
* can go ahead and search for a more manageable chunk.
|
|
|
|
*
|
|
|
|
* If our max_extent_size is large enough for our allocation simply
|
|
|
|
* disable clustering since we will likely not be able to find enough
|
|
|
|
* space to create a cluster and induce latency trying.
|
2010-09-17 04:19:09 +08:00
|
|
|
*/
|
2015-09-29 23:40:47 +08:00
|
|
|
if (unlikely(space_info->max_extent_size)) {
|
|
|
|
spin_lock(&space_info->lock);
|
|
|
|
if (space_info->max_extent_size &&
|
|
|
|
num_bytes > space_info->max_extent_size) {
|
|
|
|
ins->offset = space_info->max_extent_size;
|
|
|
|
spin_unlock(&space_info->lock);
|
|
|
|
return -ENOSPC;
|
|
|
|
} else if (space_info->max_extent_size) {
|
|
|
|
use_cluster = false;
|
|
|
|
}
|
|
|
|
spin_unlock(&space_info->lock);
|
2009-04-03 21:47:43 +08:00
|
|
|
}
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
|
2018-11-02 09:39:47 +08:00
|
|
|
last_ptr = fetch_cluster_info(fs_info, space_info,
|
|
|
|
&ffe_ctl.empty_cluster);
|
2008-03-25 03:02:07 +08:00
|
|
|
if (last_ptr) {
|
2009-04-03 21:47:43 +08:00
|
|
|
spin_lock(&last_ptr->lock);
|
|
|
|
if (last_ptr->block_group)
|
|
|
|
hint_byte = last_ptr->window_start;
|
2015-10-03 03:25:10 +08:00
|
|
|
if (last_ptr->fragmented) {
|
|
|
|
/*
|
|
|
|
* We still set window_start so we can keep track of the
|
|
|
|
* last place we found an allocation to try and save
|
|
|
|
* some time.
|
|
|
|
*/
|
|
|
|
hint_byte = last_ptr->window_start;
|
|
|
|
use_cluster = false;
|
|
|
|
}
|
2009-04-03 21:47:43 +08:00
|
|
|
spin_unlock(&last_ptr->lock);
|
2008-03-25 03:02:07 +08:00
|
|
|
}
|
2009-04-03 21:47:43 +08:00
|
|
|
|
2018-11-02 09:39:47 +08:00
|
|
|
ffe_ctl.search_start = max(ffe_ctl.search_start,
|
|
|
|
first_logical_byte(fs_info, 0));
|
|
|
|
ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
|
|
|
|
if (ffe_ctl.search_start == hint_byte) {
|
|
|
|
block_group = btrfs_lookup_block_group(fs_info,
|
|
|
|
ffe_ctl.search_start);
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
/*
|
|
|
|
* we don't want to use the block group if it doesn't match our
|
|
|
|
* allocation bits, or if its not cached.
|
Btrfs: find ideal block group for caching
This patch changes a few things. Hopefully the comments are helpfull, but
I'll try and be as verbose here.
Problem:
My fedora box was taking 1 minute and 21 seconds to boot with btrfs as root.
Part of this problem was we pick the first block group we can find and start
caching it, even if it may not have enough free space. The other problem is
we only search for cached block groups the first time around, which we won't
find any cached block groups because this is a newly mounted fs, so we end up
caching several block groups during bootup, which with alot of fragmentation
takes around 30-45 seconds to complete, which bogs down the system. So
Solution:
1) Don't cache block groups willy-nilly at first. Instead try and figure out
which block group has the most free, and therefore will take the least amount
of time to cache.
2) Don't be so picky about cached block groups. The other problem is once
we've filled up a cluster, if the block group isn't finished caching the next
time we try and do the allocation we'll completely ignore the cluster and
start searching from the beginning of the space, which makes us cache more
block groups, which slows us down even more. So instead of skipping block
groups that are not finished caching when we have a hint, only skip the block
group if it hasn't started caching yet.
There is one other tweak in here. Before if we allocated a chunk and still
couldn't find new space, we'd end up switching the space info to force another
chunk allocation. This could make us end up with way too many chunks, so keep
track of this particular case.
With this patch and my previous cluster fixes my fedora box now boots in 43
seconds, and according to the bootchart is not held up by our block group
caching at all.
Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-11-11 10:23:48 +08:00
|
|
|
*
|
|
|
|
* However if we are re-searching with an ideal block group
|
|
|
|
* picked out then we don't care that the block group is cached.
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
*/
|
2013-04-29 21:39:40 +08:00
|
|
|
if (block_group && block_group_bits(block_group, flags) &&
|
2012-01-14 04:27:45 +08:00
|
|
|
block_group->cached != BTRFS_CACHE_NO) {
|
2009-04-03 22:14:19 +08:00
|
|
|
down_read(&space_info->groups_sem);
|
2009-06-05 03:34:51 +08:00
|
|
|
if (list_empty(&block_group->list) ||
|
|
|
|
block_group->ro) {
|
|
|
|
/*
|
|
|
|
* someone is removing this block group,
|
|
|
|
* we can't jump into the have_block_group
|
|
|
|
* target because our list pointers are not
|
|
|
|
* valid
|
|
|
|
*/
|
|
|
|
btrfs_put_block_group(block_group);
|
|
|
|
up_read(&space_info->groups_sem);
|
Btrfs: find ideal block group for caching
This patch changes a few things. Hopefully the comments are helpfull, but
I'll try and be as verbose here.
Problem:
My fedora box was taking 1 minute and 21 seconds to boot with btrfs as root.
Part of this problem was we pick the first block group we can find and start
caching it, even if it may not have enough free space. The other problem is
we only search for cached block groups the first time around, which we won't
find any cached block groups because this is a newly mounted fs, so we end up
caching several block groups during bootup, which with alot of fragmentation
takes around 30-45 seconds to complete, which bogs down the system. So
Solution:
1) Don't cache block groups willy-nilly at first. Instead try and figure out
which block group has the most free, and therefore will take the least amount
of time to cache.
2) Don't be so picky about cached block groups. The other problem is once
we've filled up a cluster, if the block group isn't finished caching the next
time we try and do the allocation we'll completely ignore the cluster and
start searching from the beginning of the space, which makes us cache more
block groups, which slows us down even more. So instead of skipping block
groups that are not finished caching when we have a hint, only skip the block
group if it hasn't started caching yet.
There is one other tweak in here. Before if we allocated a chunk and still
couldn't find new space, we'd end up switching the space info to force another
chunk allocation. This could make us end up with way too many chunks, so keep
track of this particular case.
With this patch and my previous cluster fixes my fedora box now boots in 43
seconds, and according to the bootchart is not held up by our block group
caching at all.
Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-11-11 10:23:48 +08:00
|
|
|
} else {
|
2018-11-02 09:39:47 +08:00
|
|
|
ffe_ctl.index = btrfs_bg_flags_to_raid_index(
|
2018-01-30 18:20:45 +08:00
|
|
|
block_group->flags);
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
btrfs_lock_block_group(block_group, delalloc);
|
2009-06-05 03:34:51 +08:00
|
|
|
goto have_block_group;
|
Btrfs: find ideal block group for caching
This patch changes a few things. Hopefully the comments are helpfull, but
I'll try and be as verbose here.
Problem:
My fedora box was taking 1 minute and 21 seconds to boot with btrfs as root.
Part of this problem was we pick the first block group we can find and start
caching it, even if it may not have enough free space. The other problem is
we only search for cached block groups the first time around, which we won't
find any cached block groups because this is a newly mounted fs, so we end up
caching several block groups during bootup, which with alot of fragmentation
takes around 30-45 seconds to complete, which bogs down the system. So
Solution:
1) Don't cache block groups willy-nilly at first. Instead try and figure out
which block group has the most free, and therefore will take the least amount
of time to cache.
2) Don't be so picky about cached block groups. The other problem is once
we've filled up a cluster, if the block group isn't finished caching the next
time we try and do the allocation we'll completely ignore the cluster and
start searching from the beginning of the space, which makes us cache more
block groups, which slows us down even more. So instead of skipping block
groups that are not finished caching when we have a hint, only skip the block
group if it hasn't started caching yet.
There is one other tweak in here. Before if we allocated a chunk and still
couldn't find new space, we'd end up switching the space info to force another
chunk allocation. This could make us end up with way too many chunks, so keep
track of this particular case.
With this patch and my previous cluster fixes my fedora box now boots in 43
seconds, and according to the bootchart is not held up by our block group
caching at all.
Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-11-11 10:23:48 +08:00
|
|
|
}
|
2009-04-03 22:14:19 +08:00
|
|
|
} else if (block_group) {
|
2009-04-03 21:47:43 +08:00
|
|
|
btrfs_put_block_group(block_group);
|
2009-04-03 22:14:19 +08:00
|
|
|
}
|
2008-11-08 07:17:11 +08:00
|
|
|
}
|
2009-04-03 22:14:19 +08:00
|
|
|
search:
|
2018-11-02 09:39:47 +08:00
|
|
|
ffe_ctl.have_caching_bg = false;
|
|
|
|
if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
|
|
|
|
ffe_ctl.index == 0)
|
2015-10-02 02:54:10 +08:00
|
|
|
full_search = true;
|
Btrfs: fix enospc when there is plenty of space
So there is an odd case where we can possibly return -ENOSPC when there is in
fact space to be had. It only happens with Metadata writes, and happens _very_
infrequently. What has to happen is we have to allocate have allocated out of
the first logical byte on the disk, which would set last_alloc to
first_logical_byte(root, 0), so search_start == orig_search_start. We then
need to allocate for normal metadata, so BTRFS_BLOCK_GROUP_METADATA |
BTRFS_BLOCK_GROUP_DUP. We will do a block lookup for the given search_start,
block_group_bits() won't match and we'll go to choose another block group.
However because search_start matches orig_search_start we go to see if we can
allocate a chunk.
If we are in the situation that we cannot allocate a chunk, we fail and ENOSPC.
This is kind of a big flaw of the way find_free_extent works, as it along with
find_free_space loop through _all_ of the block groups, not just the ones that
we want to allocate out of. This patch completely kills find_free_space and
rolls it into find_free_extent. I've introduced a sort of state machine into
this, which will make it easier to get cache miss information out of the
allocator, and will work well with my locking changes.
The basic flow is this: We have the variable loop which is 0, meaning we are
in the hint phase. We lookup the block group for the hint, and lookup the
space_info for what we want to allocate out of. If the block group we were
pointed at by the hint either isn't of the correct type, or just doesn't have
the space we need, we set head to space_info->block_groups, so we start at the
beginning of the block groups for this particular space info, and loop through.
This is also where we add the empty_cluster to total_needed. At this point
loop is set to 1 and we just loop through all of the block groups for this
particular space_info looking for the space we need, just as find_free_space
would have done, except we only hit the block groups we want and not _all_ of
the block groups. If we come full circle we see if we can allocate a chunk.
If we cannot of course we exit with -ENOSPC and we are good. If not we start
over at space_info->block_groups and loop through again, with loop == 2. If we
come full circle and haven't found what we need then we exit with -ENOSPC.
I've been running this for a couple of days now and it seems stable, and I
haven't yet hit a -ENOSPC when there was plenty of space left.
Also I've made a groups_sem to handle the group list for the space_info. This
is part of my locking changes, but is relatively safe and seems better than
holding the space_info spinlock over that entire search time. Thanks,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
2008-10-30 02:49:05 +08:00
|
|
|
down_read(&space_info->groups_sem);
|
2018-11-02 09:39:47 +08:00
|
|
|
list_for_each_entry(block_group,
|
|
|
|
&space_info->block_groups[ffe_ctl.index], list) {
|
btrfs: fix lockup in find_free_extent with read-only block groups
If we have a block group that is all of the following:
1) uncached in memory
2) is read-only
3) has a disk cache state that indicates we need to recreate the cache
AND the file system has enough free space fragmentation such that the
request for an extent of a given size can't be honored;
AND have a single CPU core;
AND it's the block group with the highest starting offset such that
there are no opportunities (like reading from disk) for the loop to
yield the CPU;
We can end up with a lockup.
The root cause is simple. Once we're in the position that we've read in
all of the other block groups directly and none of those block groups
can honor the request, there are no more opportunities to sleep. We end
up trying to start a caching thread which never gets run if we only have
one core. This *should* present as a hung task waiting on the caching
thread to make some progress, but it doesn't. Instead, it degrades into
a busy loop because of the placement of the read-only check.
During the first pass through the loop, block_group->cached will be set
to BTRFS_CACHE_STARTED and have_caching_bg will be set. Then we hit the
read-only check and short circuit the loop. We're not yet in
LOOP_CACHING_WAIT, so we skip that loop back before going through the
loop again for other raid groups.
Then we move to LOOP_CACHING_WAIT state.
During the this pass through the loop, ->cached will still be
BTRFS_CACHE_STARTED, which means it's not cached, so we'll enter
cache_block_group, do a lot of nothing, and return, and also set
have_caching_bg again. Then we hit the read-only check and short circuit
the loop. The same thing happens as before except now we DO trigger
the LOOP_CACHING_WAIT && have_caching_bg check and loop back up to the
top. We do this forever.
There are two fixes in this patch since they address the same underlying
bug.
The first is to add a cond_resched to the end of the loop to ensure
that the caching thread always has an opportunity to run. This will
fix the soft lockup issue, but find_free_extent will still loop doing
nothing until the thread has completed.
The second is to move the read-only check to the top of the loop. We're
never going to return an allocation within a read-only block group so
we may as well skip it early. The check for ->cached == BTRFS_CACHE_ERROR
would cause the same problem except that BTRFS_CACHE_ERROR is considered
a "done" state and we won't re-set have_caching_bg again.
Many thanks to Stephan Kulow <coolo@suse.de> for his excellent help in
the testing process.
Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-07-20 11:25:51 +08:00
|
|
|
/* If the block group is read-only, we can skip it entirely. */
|
|
|
|
if (unlikely(block_group->ro))
|
|
|
|
continue;
|
|
|
|
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
btrfs_grab_block_group(block_group, delalloc);
|
2018-11-02 09:39:47 +08:00
|
|
|
ffe_ctl.search_start = block_group->key.objectid;
|
2008-11-08 07:17:11 +08:00
|
|
|
|
2010-12-14 04:06:46 +08:00
|
|
|
/*
|
|
|
|
* this can happen if we end up cycling through all the
|
|
|
|
* raid types, but we want to make sure we only allocate
|
|
|
|
* for the proper type.
|
|
|
|
*/
|
2013-04-29 21:39:40 +08:00
|
|
|
if (!block_group_bits(block_group, flags)) {
|
2018-06-21 01:03:31 +08:00
|
|
|
u64 extra = BTRFS_BLOCK_GROUP_DUP |
|
2019-05-31 21:39:31 +08:00
|
|
|
BTRFS_BLOCK_GROUP_RAID1_MASK |
|
2019-05-31 22:54:26 +08:00
|
|
|
BTRFS_BLOCK_GROUP_RAID56_MASK |
|
2010-12-14 04:06:46 +08:00
|
|
|
BTRFS_BLOCK_GROUP_RAID10;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* if they asked for extra copies and this block group
|
|
|
|
* doesn't provide them, bail. This does allow us to
|
|
|
|
* fill raid0 from raid1.
|
|
|
|
*/
|
2013-04-29 21:39:40 +08:00
|
|
|
if ((flags & extra) && !(block_group->flags & extra))
|
2010-12-14 04:06:46 +08:00
|
|
|
goto loop;
|
|
|
|
}
|
|
|
|
|
2009-04-03 22:14:19 +08:00
|
|
|
have_block_group:
|
2018-11-02 09:39:47 +08:00
|
|
|
ffe_ctl.cached = block_group_cache_done(block_group);
|
|
|
|
if (unlikely(!ffe_ctl.cached)) {
|
|
|
|
ffe_ctl.have_caching_bg = true;
|
2012-12-27 17:01:18 +08:00
|
|
|
ret = cache_block_group(block_group, 0);
|
2012-03-29 08:31:37 +08:00
|
|
|
BUG_ON(ret < 0);
|
|
|
|
ret = 0;
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
}
|
|
|
|
|
2013-08-05 23:15:21 +08:00
|
|
|
if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
|
|
|
|
goto loop;
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
|
2009-09-12 04:11:20 +08:00
|
|
|
/*
|
2011-12-08 08:50:42 +08:00
|
|
|
* Ok we want to try and use the cluster allocator, so
|
|
|
|
* lets look there
|
2009-09-12 04:11:20 +08:00
|
|
|
*/
|
2015-10-03 03:25:10 +08:00
|
|
|
if (last_ptr && use_cluster) {
|
2018-11-02 09:39:48 +08:00
|
|
|
struct btrfs_block_group_cache *cluster_bg = NULL;
|
2009-04-03 21:47:43 +08:00
|
|
|
|
2018-11-02 09:39:48 +08:00
|
|
|
ret = find_free_extent_clustered(block_group, last_ptr,
|
|
|
|
&ffe_ctl, &cluster_bg);
|
2011-12-08 08:50:42 +08:00
|
|
|
|
2009-04-03 21:47:43 +08:00
|
|
|
if (ret == 0) {
|
2018-11-02 09:39:48 +08:00
|
|
|
if (cluster_bg && cluster_bg != block_group) {
|
|
|
|
btrfs_release_block_group(block_group,
|
|
|
|
delalloc);
|
|
|
|
block_group = cluster_bg;
|
2009-04-03 21:47:43 +08:00
|
|
|
}
|
2018-11-02 09:39:48 +08:00
|
|
|
goto checks;
|
|
|
|
} else if (ret == -EAGAIN) {
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
goto have_block_group;
|
2018-11-02 09:39:48 +08:00
|
|
|
} else if (ret > 0) {
|
|
|
|
goto loop;
|
2009-04-03 21:47:43 +08:00
|
|
|
}
|
2018-11-02 09:39:48 +08:00
|
|
|
/* ret == -ENOENT case falls through */
|
2009-04-03 21:47:43 +08:00
|
|
|
}
|
|
|
|
|
2018-11-02 09:39:49 +08:00
|
|
|
ret = find_free_extent_unclustered(block_group, last_ptr,
|
|
|
|
&ffe_ctl);
|
|
|
|
if (ret == -EAGAIN)
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
goto have_block_group;
|
2018-11-02 09:39:49 +08:00
|
|
|
else if (ret > 0)
|
2009-10-06 22:04:28 +08:00
|
|
|
goto loop;
|
2018-11-02 09:39:49 +08:00
|
|
|
/* ret == 0 case falls through */
|
2009-04-03 21:47:43 +08:00
|
|
|
checks:
|
2018-11-02 09:39:47 +08:00
|
|
|
ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
|
|
|
|
fs_info->stripesize);
|
Btrfs: nuke fs wide allocation mutex V2
This patch removes the giant fs_info->alloc_mutex and replaces it with a bunch
of little locks.
There is now a pinned_mutex, which is used when messing with the pinned_extents
extent io tree, and the extent_ins_mutex which is used with the pending_del and
extent_ins extent io trees.
The locking for the extent tree stuff was inspired by a patch that Yan Zheng
wrote to fix a race condition, I cleaned it up some and changed the locking
around a little bit, but the idea remains the same. Basically instead of
holding the extent_ins_mutex throughout the processing of an extent on the
extent_ins or pending_del trees, we just hold it while we're searching and when
we clear the bits on those trees, and lock the extent for the duration of the
operations on the extent.
Also to keep from getting hung up waiting to lock an extent, I've added a
try_lock_extent so if we cannot lock the extent, move on to the next one in the
tree and we'll come back to that one. I have tested this heavily and it does
not appear to break anything. This has to be applied on top of my
find_free_extent redo patch.
I tested this patch on top of Yan's space reblancing code and it worked fine.
The only thing that has changed since the last version is I pulled out all my
debugging stuff, apparently I forgot to run guilt refresh before I sent the
last patch out. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
2008-10-30 02:49:05 +08:00
|
|
|
|
2009-04-03 22:14:19 +08:00
|
|
|
/* move on to the next group */
|
2018-11-02 09:39:47 +08:00
|
|
|
if (ffe_ctl.search_start + num_bytes >
|
2014-01-15 20:00:56 +08:00
|
|
|
block_group->key.objectid + block_group->key.offset) {
|
2018-11-02 09:39:47 +08:00
|
|
|
btrfs_add_free_space(block_group, ffe_ctl.found_offset,
|
|
|
|
num_bytes);
|
2009-04-03 22:14:19 +08:00
|
|
|
goto loop;
|
2009-04-03 22:14:18 +08:00
|
|
|
}
|
2008-11-11 00:47:09 +08:00
|
|
|
|
2018-11-02 09:39:47 +08:00
|
|
|
if (ffe_ctl.found_offset < ffe_ctl.search_start)
|
|
|
|
btrfs_add_free_space(block_group, ffe_ctl.found_offset,
|
|
|
|
ffe_ctl.search_start - ffe_ctl.found_offset);
|
2009-04-03 22:14:19 +08:00
|
|
|
|
btrfs: update btrfs_space_info's bytes_may_use timely
This patch can fix some false ENOSPC errors, below test script can
reproduce one false ENOSPC error:
#!/bin/bash
dd if=/dev/zero of=fs.img bs=$((1024*1024)) count=128
dev=$(losetup --show -f fs.img)
mkfs.btrfs -f -M $dev
mkdir /tmp/mntpoint
mount $dev /tmp/mntpoint
cd /tmp/mntpoint
xfs_io -f -c "falloc 0 $((64*1024*1024))" testfile
Above script will fail for ENOSPC reason, but indeed fs still has free
space to satisfy this request. Please see call graph:
btrfs_fallocate()
|-> btrfs_alloc_data_chunk_ondemand()
| bytes_may_use += 64M
|-> btrfs_prealloc_file_range()
|-> btrfs_reserve_extent()
|-> btrfs_add_reserved_bytes()
| alloc_type is RESERVE_ALLOC_NO_ACCOUNT, so it does not
| change bytes_may_use, and bytes_reserved += 64M. Now
| bytes_may_use + bytes_reserved == 128M, which is greater
| than btrfs_space_info's total_bytes, false enospc occurs.
| Note, the bytes_may_use decrease operation will be done in
| end of btrfs_fallocate(), which is too late.
Here is another simple case for buffered write:
CPU 1 | CPU 2
|
|-> cow_file_range() |-> __btrfs_buffered_write()
|-> btrfs_reserve_extent() | |
| | |
| | |
| ..... | |-> btrfs_check_data_free_space()
| |
| |
|-> extent_clear_unlock_delalloc() |
In CPU 1, btrfs_reserve_extent()->find_free_extent()->
btrfs_add_reserved_bytes() do not decrease bytes_may_use, the decrease
operation will be delayed to be done in extent_clear_unlock_delalloc().
Assume in this case, btrfs_reserve_extent() reserved 128MB data, CPU2's
btrfs_check_data_free_space() tries to reserve 100MB data space.
If
100MB > data_sinfo->total_bytes - data_sinfo->bytes_used -
data_sinfo->bytes_reserved - data_sinfo->bytes_pinned -
data_sinfo->bytes_readonly - data_sinfo->bytes_may_use
btrfs_check_data_free_space() will try to allcate new data chunk or call
btrfs_start_delalloc_roots(), or commit current transaction in order to
reserve some free space, obviously a lot of work. But indeed it's not
necessary as long as decreasing bytes_may_use timely, we still have
free space, decreasing 128M from bytes_may_use.
To fix this issue, this patch chooses to update bytes_may_use for both
data and metadata in btrfs_add_reserved_bytes(). For compress path, real
extent length may not be equal to file content length, so introduce a
ram_bytes argument for btrfs_reserve_extent(), find_free_extent() and
btrfs_add_reserved_bytes(), it's becasue bytes_may_use is increased by
file content length. Then compress path can update bytes_may_use
correctly. Also now we can discard RESERVE_ALLOC_NO_ACCOUNT, RESERVE_ALLOC
and RESERVE_FREE.
As we know, usually EXTENT_DO_ACCOUNTING is used for error path. In
run_delalloc_nocow(), for inode marked as NODATACOW or extent marked as
PREALLOC, we also need to update bytes_may_use, but can not pass
EXTENT_DO_ACCOUNTING, because it also clears metadata reservation, so
here we introduce EXTENT_CLEAR_DATA_RESV flag to indicate btrfs_clear_bit_hook()
to update btrfs_space_info's bytes_may_use.
Meanwhile __btrfs_prealloc_file_range() will call
btrfs_free_reserved_data_space() internally for both sucessful and failed
path, btrfs_prealloc_file_range()'s callers does not need to call
btrfs_free_reserved_data_space() any more.
Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-07-25 15:51:40 +08:00
|
|
|
ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
|
|
|
|
num_bytes, delalloc);
|
2010-05-16 22:46:25 +08:00
|
|
|
if (ret == -EAGAIN) {
|
2018-11-02 09:39:47 +08:00
|
|
|
btrfs_add_free_space(block_group, ffe_ctl.found_offset,
|
|
|
|
num_bytes);
|
2009-04-03 22:14:19 +08:00
|
|
|
goto loop;
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
}
|
Btrfs: don't do unnecessary delalloc flushes when relocating
Before we start the actual relocation process of a block group, we do
calls to flush delalloc of all inodes and then wait for ordered extents
to complete. However we do these flush calls just to make sure we don't
race with concurrent tasks that have actually already started to run
delalloc and have allocated an extent from the block group we want to
relocate, right before we set it to readonly mode, but have not yet
created the respective ordered extents. The flush calls make us wait
for such concurrent tasks because they end up calling
filemap_fdatawrite_range() (through btrfs_start_delalloc_roots() ->
__start_delalloc_inodes() -> btrfs_alloc_delalloc_work() ->
btrfs_run_delalloc_work()) which ends up serializing us with those tasks
due to attempts to lock the same pages (and the delalloc flush procedure
calls the allocator and creates the ordered extents before unlocking the
pages).
These flushing calls not only make us waste time (cpu, IO) but also reduce
the chances of writing larger extents (applications might be writing to
contiguous ranges and we flush before they finish dirtying the whole
ranges).
So make sure we don't flush delalloc and just wait for concurrent tasks
that have already started flushing delalloc and have allocated an extent
from the block group we are about to relocate.
This change also ends up fixing a race with direct IO writes that makes
relocation not wait for direct IO ordered extents. This race is
illustrated by the following diagram:
CPU 1 CPU 2
btrfs_relocate_block_group(bg X)
starts direct IO write,
target inode currently has no
ordered extents ongoing nor
dirty pages (delalloc regions),
therefore the root for our inode
is not in the list
fs_info->ordered_roots
btrfs_direct_IO()
__blockdev_direct_IO()
btrfs_get_blocks_direct()
btrfs_lock_extent_direct()
locks range in the io tree
btrfs_new_extent_direct()
btrfs_reserve_extent()
--> extent allocated
from bg X
btrfs_inc_block_group_ro(bg X)
btrfs_start_delalloc_roots()
__start_delalloc_inodes()
--> does nothing, no dealloc ranges
in the inode's io tree so the
inode's root is not in the list
fs_info->delalloc_roots
btrfs_wait_ordered_roots()
--> does not find the inode's root in the
list fs_info->ordered_roots
--> ends up not waiting for the direct IO
write started by the task at CPU 2
relocate_block_group(rc->stage ==
MOVE_DATA_EXTENTS)
prepare_to_relocate()
btrfs_commit_transaction()
iterates the extent tree, using its
commit root and moves extents into new
locations
btrfs_add_ordered_extent_dio()
--> now a ordered extent is
created and added to the
list root->ordered_extents
and the root added to the
list fs_info->ordered_roots
--> this is too late and the
task at CPU 1 already
started the relocation
btrfs_commit_transaction()
btrfs_finish_ordered_io()
btrfs_alloc_reserved_file_extent()
--> adds delayed data reference
for the extent allocated
from bg X
relocate_block_group(rc->stage ==
UPDATE_DATA_PTRS)
prepare_to_relocate()
btrfs_commit_transaction()
--> delayed refs are run, so an extent
item for the allocated extent from
bg X is added to extent tree
--> commit roots are switched, so the
next scan in the extent tree will
see the extent item
sees the extent in the extent tree
When this happens the relocation produces the following warning when it
finishes:
[ 7260.832836] ------------[ cut here ]------------
[ 7260.834653] WARNING: CPU: 5 PID: 6765 at fs/btrfs/relocation.c:4318 btrfs_relocate_block_group+0x245/0x2a1 [btrfs]()
[ 7260.838268] Modules linked in: btrfs crc32c_generic xor ppdev raid6_pq psmouse sg acpi_cpufreq evdev i2c_piix4 tpm_tis serio_raw tpm i2c_core pcspkr parport_pc
[ 7260.850935] CPU: 5 PID: 6765 Comm: btrfs Not tainted 4.5.0-rc6-btrfs-next-28+ #1
[ 7260.852998] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[ 7260.852998] 0000000000000000 ffff88020bf57bc0 ffffffff812648b3 0000000000000000
[ 7260.852998] 0000000000000009 ffff88020bf57bf8 ffffffff81051608 ffffffffa03c1b2d
[ 7260.852998] ffff8800b2bbb800 0000000000000000 ffff8800b17bcc58 ffff8800399dd000
[ 7260.852998] Call Trace:
[ 7260.852998] [<ffffffff812648b3>] dump_stack+0x67/0x90
[ 7260.852998] [<ffffffff81051608>] warn_slowpath_common+0x99/0xb2
[ 7260.852998] [<ffffffffa03c1b2d>] ? btrfs_relocate_block_group+0x245/0x2a1 [btrfs]
[ 7260.852998] [<ffffffff810516d4>] warn_slowpath_null+0x1a/0x1c
[ 7260.852998] [<ffffffffa03c1b2d>] btrfs_relocate_block_group+0x245/0x2a1 [btrfs]
[ 7260.852998] [<ffffffffa039d9de>] btrfs_relocate_chunk.isra.29+0x66/0xdb [btrfs]
[ 7260.852998] [<ffffffffa039f314>] btrfs_balance+0xde1/0xe4e [btrfs]
[ 7260.852998] [<ffffffff8127d671>] ? debug_smp_processor_id+0x17/0x19
[ 7260.852998] [<ffffffffa03a9583>] btrfs_ioctl_balance+0x255/0x2d3 [btrfs]
[ 7260.852998] [<ffffffffa03ac96a>] btrfs_ioctl+0x11e0/0x1dff [btrfs]
[ 7260.852998] [<ffffffff811451df>] ? handle_mm_fault+0x443/0xd63
[ 7260.852998] [<ffffffff81491817>] ? _raw_spin_unlock+0x31/0x44
[ 7260.852998] [<ffffffff8108b36a>] ? arch_local_irq_save+0x9/0xc
[ 7260.852998] [<ffffffff811876ab>] vfs_ioctl+0x18/0x34
[ 7260.852998] [<ffffffff81187cb2>] do_vfs_ioctl+0x550/0x5be
[ 7260.852998] [<ffffffff81190c30>] ? __fget_light+0x4d/0x71
[ 7260.852998] [<ffffffff81187d77>] SyS_ioctl+0x57/0x79
[ 7260.852998] [<ffffffff81492017>] entry_SYSCALL_64_fastpath+0x12/0x6b
[ 7260.893268] ---[ end trace eb7803b24ebab8ad ]---
This is because at the end of the first stage, in relocate_block_group(),
we commit the current transaction, which makes delayed refs run, the
commit roots are switched and so the second stage will find the extent
item that the ordered extent added to the delayed refs. But this extent
was not moved (ordered extent completed after first stage finished), so
at the end of the relocation our block group item still has a positive
used bytes counter, triggering a warning at the end of
btrfs_relocate_block_group(). Later on when trying to read the extent
contents from disk we hit a BUG_ON() due to the inability to map a block
with a logical address that belongs to the block group we relocated and
is no longer valid, resulting in the following trace:
[ 7344.885290] BTRFS critical (device sdi): unable to find logical 12845056 len 4096
[ 7344.887518] ------------[ cut here ]------------
[ 7344.888431] kernel BUG at fs/btrfs/inode.c:1833!
[ 7344.888431] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[ 7344.888431] Modules linked in: btrfs crc32c_generic xor ppdev raid6_pq psmouse sg acpi_cpufreq evdev i2c_piix4 tpm_tis serio_raw tpm i2c_core pcspkr parport_pc
[ 7344.888431] CPU: 0 PID: 6831 Comm: od Tainted: G W 4.5.0-rc6-btrfs-next-28+ #1
[ 7344.888431] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[ 7344.888431] task: ffff880215818600 ti: ffff880204684000 task.ti: ffff880204684000
[ 7344.888431] RIP: 0010:[<ffffffffa037c88c>] [<ffffffffa037c88c>] btrfs_merge_bio_hook+0x54/0x6b [btrfs]
[ 7344.888431] RSP: 0018:ffff8802046878f0 EFLAGS: 00010282
[ 7344.888431] RAX: 00000000ffffffea RBX: 0000000000001000 RCX: 0000000000000001
[ 7344.888431] RDX: ffff88023ec0f950 RSI: ffffffff8183b638 RDI: 00000000ffffffff
[ 7344.888431] RBP: ffff880204687908 R08: 0000000000000001 R09: 0000000000000000
[ 7344.888431] R10: ffff880204687770 R11: ffffffff82f2d52d R12: 0000000000001000
[ 7344.888431] R13: ffff88021afbfee8 R14: 0000000000006208 R15: ffff88006cd199b0
[ 7344.888431] FS: 00007f1f9e1d6700(0000) GS:ffff88023ec00000(0000) knlGS:0000000000000000
[ 7344.888431] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 7344.888431] CR2: 00007f1f9dc8cb60 CR3: 000000023e3b6000 CR4: 00000000000006f0
[ 7344.888431] Stack:
[ 7344.888431] 0000000000001000 0000000000001000 ffff880204687b98 ffff880204687950
[ 7344.888431] ffffffffa0395c8f ffffea0004d64d48 0000000000000000 0000000000001000
[ 7344.888431] ffffea0004d64d48 0000000000001000 0000000000000000 0000000000000000
[ 7344.888431] Call Trace:
[ 7344.888431] [<ffffffffa0395c8f>] submit_extent_page+0xf5/0x16f [btrfs]
[ 7344.888431] [<ffffffffa03970ac>] __do_readpage+0x4a0/0x4f1 [btrfs]
[ 7344.888431] [<ffffffffa039680d>] ? btrfs_create_repair_bio+0xcb/0xcb [btrfs]
[ 7344.888431] [<ffffffffa037eeb4>] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs]
[ 7344.888431] [<ffffffff8108df55>] ? trace_hardirqs_on+0xd/0xf
[ 7344.888431] [<ffffffffa039728c>] __do_contiguous_readpages.constprop.26+0xc2/0xe4 [btrfs]
[ 7344.888431] [<ffffffffa037eeb4>] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs]
[ 7344.888431] [<ffffffffa039739b>] __extent_readpages.constprop.25+0xed/0x100 [btrfs]
[ 7344.888431] [<ffffffff81129d24>] ? lru_cache_add+0xe/0x10
[ 7344.888431] [<ffffffffa0397ea8>] extent_readpages+0x160/0x1aa [btrfs]
[ 7344.888431] [<ffffffffa037eeb4>] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs]
[ 7344.888431] [<ffffffff8115daad>] ? alloc_pages_current+0xa9/0xcd
[ 7344.888431] [<ffffffffa037cdc9>] btrfs_readpages+0x1f/0x21 [btrfs]
[ 7344.888431] [<ffffffff81128316>] __do_page_cache_readahead+0x168/0x1fc
[ 7344.888431] [<ffffffff811285a0>] ondemand_readahead+0x1f6/0x207
[ 7344.888431] [<ffffffff811285a0>] ? ondemand_readahead+0x1f6/0x207
[ 7344.888431] [<ffffffff8111cf34>] ? pagecache_get_page+0x2b/0x154
[ 7344.888431] [<ffffffff8112870e>] page_cache_sync_readahead+0x3d/0x3f
[ 7344.888431] [<ffffffff8111dbf7>] generic_file_read_iter+0x197/0x4e1
[ 7344.888431] [<ffffffff8117773a>] __vfs_read+0x79/0x9d
[ 7344.888431] [<ffffffff81178050>] vfs_read+0x8f/0xd2
[ 7344.888431] [<ffffffff81178a38>] SyS_read+0x50/0x7e
[ 7344.888431] [<ffffffff81492017>] entry_SYSCALL_64_fastpath+0x12/0x6b
[ 7344.888431] Code: 8d 4d e8 45 31 c9 45 31 c0 48 8b 00 48 c1 e2 09 48 8b 80 80 fc ff ff 4c 89 65 e8 48 8b b8 f0 01 00 00 e8 1d 42 02 00 85 c0 79 02 <0f> 0b 4c 0
[ 7344.888431] RIP [<ffffffffa037c88c>] btrfs_merge_bio_hook+0x54/0x6b [btrfs]
[ 7344.888431] RSP <ffff8802046878f0>
[ 7344.970544] ---[ end trace eb7803b24ebab8ae ]---
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
2016-04-26 22:39:32 +08:00
|
|
|
btrfs_inc_block_group_reservations(block_group);
|
2008-03-25 03:01:56 +08:00
|
|
|
|
2010-05-16 22:46:25 +08:00
|
|
|
/* we are all good, lets return */
|
2018-11-02 09:39:47 +08:00
|
|
|
ins->objectid = ffe_ctl.search_start;
|
2009-04-03 22:14:19 +08:00
|
|
|
ins->offset = num_bytes;
|
2008-12-12 05:30:39 +08:00
|
|
|
|
2018-11-02 09:39:47 +08:00
|
|
|
trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
|
|
|
|
num_bytes);
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
btrfs_release_block_group(block_group, delalloc);
|
2009-04-03 22:14:19 +08:00
|
|
|
break;
|
|
|
|
loop:
|
2018-11-02 09:39:47 +08:00
|
|
|
ffe_ctl.retry_clustered = false;
|
|
|
|
ffe_ctl.retry_unclustered = false;
|
2018-01-30 18:20:45 +08:00
|
|
|
BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
|
2018-11-02 09:39:47 +08:00
|
|
|
ffe_ctl.index);
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
btrfs_release_block_group(block_group, delalloc);
|
btrfs: fix lockup in find_free_extent with read-only block groups
If we have a block group that is all of the following:
1) uncached in memory
2) is read-only
3) has a disk cache state that indicates we need to recreate the cache
AND the file system has enough free space fragmentation such that the
request for an extent of a given size can't be honored;
AND have a single CPU core;
AND it's the block group with the highest starting offset such that
there are no opportunities (like reading from disk) for the loop to
yield the CPU;
We can end up with a lockup.
The root cause is simple. Once we're in the position that we've read in
all of the other block groups directly and none of those block groups
can honor the request, there are no more opportunities to sleep. We end
up trying to start a caching thread which never gets run if we only have
one core. This *should* present as a hung task waiting on the caching
thread to make some progress, but it doesn't. Instead, it degrades into
a busy loop because of the placement of the read-only check.
During the first pass through the loop, block_group->cached will be set
to BTRFS_CACHE_STARTED and have_caching_bg will be set. Then we hit the
read-only check and short circuit the loop. We're not yet in
LOOP_CACHING_WAIT, so we skip that loop back before going through the
loop again for other raid groups.
Then we move to LOOP_CACHING_WAIT state.
During the this pass through the loop, ->cached will still be
BTRFS_CACHE_STARTED, which means it's not cached, so we'll enter
cache_block_group, do a lot of nothing, and return, and also set
have_caching_bg again. Then we hit the read-only check and short circuit
the loop. The same thing happens as before except now we DO trigger
the LOOP_CACHING_WAIT && have_caching_bg check and loop back up to the
top. We do this forever.
There are two fixes in this patch since they address the same underlying
bug.
The first is to add a cond_resched to the end of the loop to ensure
that the caching thread always has an opportunity to run. This will
fix the soft lockup issue, but find_free_extent will still loop doing
nothing until the thread has completed.
The second is to move the read-only check to the top of the loop. We're
never going to return an allocation within a read-only block group so
we may as well skip it early. The check for ->cached == BTRFS_CACHE_ERROR
would cause the same problem except that BTRFS_CACHE_ERROR is considered
a "done" state and we won't re-set have_caching_bg again.
Many thanks to Stephan Kulow <coolo@suse.de> for his excellent help in
the testing process.
Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-07-20 11:25:51 +08:00
|
|
|
cond_resched();
|
2009-04-03 22:14:19 +08:00
|
|
|
}
|
|
|
|
up_read(&space_info->groups_sem);
|
|
|
|
|
2018-11-02 09:39:50 +08:00
|
|
|
ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
|
|
|
|
full_search, use_cluster);
|
|
|
|
if (ret > 0)
|
2010-05-16 22:46:24 +08:00
|
|
|
goto search;
|
|
|
|
|
2015-09-29 23:40:47 +08:00
|
|
|
if (ret == -ENOSPC) {
|
2018-11-02 09:39:47 +08:00
|
|
|
/*
|
|
|
|
* Use ffe_ctl->total_free_space as fallback if we can't find
|
|
|
|
* any contiguous hole.
|
|
|
|
*/
|
|
|
|
if (!ffe_ctl.max_extent_size)
|
|
|
|
ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
|
2015-09-29 23:40:47 +08:00
|
|
|
spin_lock(&space_info->lock);
|
2018-11-02 09:39:47 +08:00
|
|
|
space_info->max_extent_size = ffe_ctl.max_extent_size;
|
2015-09-29 23:40:47 +08:00
|
|
|
spin_unlock(&space_info->lock);
|
2018-11-02 09:39:47 +08:00
|
|
|
ins->offset = ffe_ctl.max_extent_size;
|
2015-09-29 23:40:47 +08:00
|
|
|
}
|
2007-03-01 05:46:22 +08:00
|
|
|
return ret;
|
2007-02-26 23:40:21 +08:00
|
|
|
}
|
2008-04-29 03:29:52 +08:00
|
|
|
|
2018-03-13 18:22:32 +08:00
|
|
|
/*
|
|
|
|
* btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
|
|
|
|
* hole that is at least as big as @num_bytes.
|
|
|
|
*
|
|
|
|
* @root - The root that will contain this extent
|
|
|
|
*
|
|
|
|
* @ram_bytes - The amount of space in ram that @num_bytes take. This
|
|
|
|
* is used for accounting purposes. This value differs
|
|
|
|
* from @num_bytes only in the case of compressed extents.
|
|
|
|
*
|
|
|
|
* @num_bytes - Number of bytes to allocate on-disk.
|
|
|
|
*
|
|
|
|
* @min_alloc_size - Indicates the minimum amount of space that the
|
|
|
|
* allocator should try to satisfy. In some cases
|
|
|
|
* @num_bytes may be larger than what is required and if
|
|
|
|
* the filesystem is fragmented then allocation fails.
|
|
|
|
* However, the presence of @min_alloc_size gives a
|
|
|
|
* chance to try and satisfy the smaller allocation.
|
|
|
|
*
|
|
|
|
* @empty_size - A hint that you plan on doing more COW. This is the
|
|
|
|
* size in bytes the allocator should try to find free
|
|
|
|
* next to the block it returns. This is just a hint and
|
|
|
|
* may be ignored by the allocator.
|
|
|
|
*
|
|
|
|
* @hint_byte - Hint to the allocator to start searching above the byte
|
|
|
|
* address passed. It might be ignored.
|
|
|
|
*
|
|
|
|
* @ins - This key is modified to record the found hole. It will
|
|
|
|
* have the following values:
|
|
|
|
* ins->objectid == start position
|
|
|
|
* ins->flags = BTRFS_EXTENT_ITEM_KEY
|
|
|
|
* ins->offset == the size of the hole.
|
|
|
|
*
|
|
|
|
* @is_data - Boolean flag indicating whether an extent is
|
|
|
|
* allocated for data (true) or metadata (false)
|
|
|
|
*
|
|
|
|
* @delalloc - Boolean flag indicating whether this allocation is for
|
|
|
|
* delalloc or not. If 'true' data_rwsem of block groups
|
|
|
|
* is going to be acquired.
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* Returns 0 when an allocation succeeded or < 0 when an error occurred. In
|
|
|
|
* case -ENOSPC is returned then @ins->offset will contain the size of the
|
|
|
|
* largest available hole the allocator managed to find.
|
|
|
|
*/
|
btrfs: update btrfs_space_info's bytes_may_use timely
This patch can fix some false ENOSPC errors, below test script can
reproduce one false ENOSPC error:
#!/bin/bash
dd if=/dev/zero of=fs.img bs=$((1024*1024)) count=128
dev=$(losetup --show -f fs.img)
mkfs.btrfs -f -M $dev
mkdir /tmp/mntpoint
mount $dev /tmp/mntpoint
cd /tmp/mntpoint
xfs_io -f -c "falloc 0 $((64*1024*1024))" testfile
Above script will fail for ENOSPC reason, but indeed fs still has free
space to satisfy this request. Please see call graph:
btrfs_fallocate()
|-> btrfs_alloc_data_chunk_ondemand()
| bytes_may_use += 64M
|-> btrfs_prealloc_file_range()
|-> btrfs_reserve_extent()
|-> btrfs_add_reserved_bytes()
| alloc_type is RESERVE_ALLOC_NO_ACCOUNT, so it does not
| change bytes_may_use, and bytes_reserved += 64M. Now
| bytes_may_use + bytes_reserved == 128M, which is greater
| than btrfs_space_info's total_bytes, false enospc occurs.
| Note, the bytes_may_use decrease operation will be done in
| end of btrfs_fallocate(), which is too late.
Here is another simple case for buffered write:
CPU 1 | CPU 2
|
|-> cow_file_range() |-> __btrfs_buffered_write()
|-> btrfs_reserve_extent() | |
| | |
| | |
| ..... | |-> btrfs_check_data_free_space()
| |
| |
|-> extent_clear_unlock_delalloc() |
In CPU 1, btrfs_reserve_extent()->find_free_extent()->
btrfs_add_reserved_bytes() do not decrease bytes_may_use, the decrease
operation will be delayed to be done in extent_clear_unlock_delalloc().
Assume in this case, btrfs_reserve_extent() reserved 128MB data, CPU2's
btrfs_check_data_free_space() tries to reserve 100MB data space.
If
100MB > data_sinfo->total_bytes - data_sinfo->bytes_used -
data_sinfo->bytes_reserved - data_sinfo->bytes_pinned -
data_sinfo->bytes_readonly - data_sinfo->bytes_may_use
btrfs_check_data_free_space() will try to allcate new data chunk or call
btrfs_start_delalloc_roots(), or commit current transaction in order to
reserve some free space, obviously a lot of work. But indeed it's not
necessary as long as decreasing bytes_may_use timely, we still have
free space, decreasing 128M from bytes_may_use.
To fix this issue, this patch chooses to update bytes_may_use for both
data and metadata in btrfs_add_reserved_bytes(). For compress path, real
extent length may not be equal to file content length, so introduce a
ram_bytes argument for btrfs_reserve_extent(), find_free_extent() and
btrfs_add_reserved_bytes(), it's becasue bytes_may_use is increased by
file content length. Then compress path can update bytes_may_use
correctly. Also now we can discard RESERVE_ALLOC_NO_ACCOUNT, RESERVE_ALLOC
and RESERVE_FREE.
As we know, usually EXTENT_DO_ACCOUNTING is used for error path. In
run_delalloc_nocow(), for inode marked as NODATACOW or extent marked as
PREALLOC, we also need to update bytes_may_use, but can not pass
EXTENT_DO_ACCOUNTING, because it also clears metadata reservation, so
here we introduce EXTENT_CLEAR_DATA_RESV flag to indicate btrfs_clear_bit_hook()
to update btrfs_space_info's bytes_may_use.
Meanwhile __btrfs_prealloc_file_range() will call
btrfs_free_reserved_data_space() internally for both sucessful and failed
path, btrfs_prealloc_file_range()'s callers does not need to call
btrfs_free_reserved_data_space() any more.
Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-07-25 15:51:40 +08:00
|
|
|
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
|
2009-09-12 04:11:19 +08:00
|
|
|
u64 num_bytes, u64 min_alloc_size,
|
|
|
|
u64 empty_size, u64 hint_byte,
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
struct btrfs_key *ins, int is_data, int delalloc)
|
2007-02-26 23:40:21 +08:00
|
|
|
{
|
2016-09-20 22:05:02 +08:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2015-09-26 04:13:11 +08:00
|
|
|
bool final_tried = num_bytes == min_alloc_size;
|
2013-04-29 21:39:40 +08:00
|
|
|
u64 flags;
|
2007-02-26 23:40:21 +08:00
|
|
|
int ret;
|
2008-06-26 04:01:30 +08:00
|
|
|
|
2017-05-17 23:38:35 +08:00
|
|
|
flags = get_alloc_profile_by_root(root, is_data);
|
2008-04-14 21:46:10 +08:00
|
|
|
again:
|
2016-06-23 06:54:23 +08:00
|
|
|
WARN_ON(num_bytes < fs_info->sectorsize);
|
2017-02-16 05:28:27 +08:00
|
|
|
ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
|
btrfs: update btrfs_space_info's bytes_may_use timely
This patch can fix some false ENOSPC errors, below test script can
reproduce one false ENOSPC error:
#!/bin/bash
dd if=/dev/zero of=fs.img bs=$((1024*1024)) count=128
dev=$(losetup --show -f fs.img)
mkfs.btrfs -f -M $dev
mkdir /tmp/mntpoint
mount $dev /tmp/mntpoint
cd /tmp/mntpoint
xfs_io -f -c "falloc 0 $((64*1024*1024))" testfile
Above script will fail for ENOSPC reason, but indeed fs still has free
space to satisfy this request. Please see call graph:
btrfs_fallocate()
|-> btrfs_alloc_data_chunk_ondemand()
| bytes_may_use += 64M
|-> btrfs_prealloc_file_range()
|-> btrfs_reserve_extent()
|-> btrfs_add_reserved_bytes()
| alloc_type is RESERVE_ALLOC_NO_ACCOUNT, so it does not
| change bytes_may_use, and bytes_reserved += 64M. Now
| bytes_may_use + bytes_reserved == 128M, which is greater
| than btrfs_space_info's total_bytes, false enospc occurs.
| Note, the bytes_may_use decrease operation will be done in
| end of btrfs_fallocate(), which is too late.
Here is another simple case for buffered write:
CPU 1 | CPU 2
|
|-> cow_file_range() |-> __btrfs_buffered_write()
|-> btrfs_reserve_extent() | |
| | |
| | |
| ..... | |-> btrfs_check_data_free_space()
| |
| |
|-> extent_clear_unlock_delalloc() |
In CPU 1, btrfs_reserve_extent()->find_free_extent()->
btrfs_add_reserved_bytes() do not decrease bytes_may_use, the decrease
operation will be delayed to be done in extent_clear_unlock_delalloc().
Assume in this case, btrfs_reserve_extent() reserved 128MB data, CPU2's
btrfs_check_data_free_space() tries to reserve 100MB data space.
If
100MB > data_sinfo->total_bytes - data_sinfo->bytes_used -
data_sinfo->bytes_reserved - data_sinfo->bytes_pinned -
data_sinfo->bytes_readonly - data_sinfo->bytes_may_use
btrfs_check_data_free_space() will try to allcate new data chunk or call
btrfs_start_delalloc_roots(), or commit current transaction in order to
reserve some free space, obviously a lot of work. But indeed it's not
necessary as long as decreasing bytes_may_use timely, we still have
free space, decreasing 128M from bytes_may_use.
To fix this issue, this patch chooses to update bytes_may_use for both
data and metadata in btrfs_add_reserved_bytes(). For compress path, real
extent length may not be equal to file content length, so introduce a
ram_bytes argument for btrfs_reserve_extent(), find_free_extent() and
btrfs_add_reserved_bytes(), it's becasue bytes_may_use is increased by
file content length. Then compress path can update bytes_may_use
correctly. Also now we can discard RESERVE_ALLOC_NO_ACCOUNT, RESERVE_ALLOC
and RESERVE_FREE.
As we know, usually EXTENT_DO_ACCOUNTING is used for error path. In
run_delalloc_nocow(), for inode marked as NODATACOW or extent marked as
PREALLOC, we also need to update bytes_may_use, but can not pass
EXTENT_DO_ACCOUNTING, because it also clears metadata reservation, so
here we introduce EXTENT_CLEAR_DATA_RESV flag to indicate btrfs_clear_bit_hook()
to update btrfs_space_info's bytes_may_use.
Meanwhile __btrfs_prealloc_file_range() will call
btrfs_free_reserved_data_space() internally for both sucessful and failed
path, btrfs_prealloc_file_range()'s callers does not need to call
btrfs_free_reserved_data_space() any more.
Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-07-25 15:51:40 +08:00
|
|
|
hint_byte, ins, flags, delalloc);
|
Btrfs: don't do unnecessary delalloc flushes when relocating
Before we start the actual relocation process of a block group, we do
calls to flush delalloc of all inodes and then wait for ordered extents
to complete. However we do these flush calls just to make sure we don't
race with concurrent tasks that have actually already started to run
delalloc and have allocated an extent from the block group we want to
relocate, right before we set it to readonly mode, but have not yet
created the respective ordered extents. The flush calls make us wait
for such concurrent tasks because they end up calling
filemap_fdatawrite_range() (through btrfs_start_delalloc_roots() ->
__start_delalloc_inodes() -> btrfs_alloc_delalloc_work() ->
btrfs_run_delalloc_work()) which ends up serializing us with those tasks
due to attempts to lock the same pages (and the delalloc flush procedure
calls the allocator and creates the ordered extents before unlocking the
pages).
These flushing calls not only make us waste time (cpu, IO) but also reduce
the chances of writing larger extents (applications might be writing to
contiguous ranges and we flush before they finish dirtying the whole
ranges).
So make sure we don't flush delalloc and just wait for concurrent tasks
that have already started flushing delalloc and have allocated an extent
from the block group we are about to relocate.
This change also ends up fixing a race with direct IO writes that makes
relocation not wait for direct IO ordered extents. This race is
illustrated by the following diagram:
CPU 1 CPU 2
btrfs_relocate_block_group(bg X)
starts direct IO write,
target inode currently has no
ordered extents ongoing nor
dirty pages (delalloc regions),
therefore the root for our inode
is not in the list
fs_info->ordered_roots
btrfs_direct_IO()
__blockdev_direct_IO()
btrfs_get_blocks_direct()
btrfs_lock_extent_direct()
locks range in the io tree
btrfs_new_extent_direct()
btrfs_reserve_extent()
--> extent allocated
from bg X
btrfs_inc_block_group_ro(bg X)
btrfs_start_delalloc_roots()
__start_delalloc_inodes()
--> does nothing, no dealloc ranges
in the inode's io tree so the
inode's root is not in the list
fs_info->delalloc_roots
btrfs_wait_ordered_roots()
--> does not find the inode's root in the
list fs_info->ordered_roots
--> ends up not waiting for the direct IO
write started by the task at CPU 2
relocate_block_group(rc->stage ==
MOVE_DATA_EXTENTS)
prepare_to_relocate()
btrfs_commit_transaction()
iterates the extent tree, using its
commit root and moves extents into new
locations
btrfs_add_ordered_extent_dio()
--> now a ordered extent is
created and added to the
list root->ordered_extents
and the root added to the
list fs_info->ordered_roots
--> this is too late and the
task at CPU 1 already
started the relocation
btrfs_commit_transaction()
btrfs_finish_ordered_io()
btrfs_alloc_reserved_file_extent()
--> adds delayed data reference
for the extent allocated
from bg X
relocate_block_group(rc->stage ==
UPDATE_DATA_PTRS)
prepare_to_relocate()
btrfs_commit_transaction()
--> delayed refs are run, so an extent
item for the allocated extent from
bg X is added to extent tree
--> commit roots are switched, so the
next scan in the extent tree will
see the extent item
sees the extent in the extent tree
When this happens the relocation produces the following warning when it
finishes:
[ 7260.832836] ------------[ cut here ]------------
[ 7260.834653] WARNING: CPU: 5 PID: 6765 at fs/btrfs/relocation.c:4318 btrfs_relocate_block_group+0x245/0x2a1 [btrfs]()
[ 7260.838268] Modules linked in: btrfs crc32c_generic xor ppdev raid6_pq psmouse sg acpi_cpufreq evdev i2c_piix4 tpm_tis serio_raw tpm i2c_core pcspkr parport_pc
[ 7260.850935] CPU: 5 PID: 6765 Comm: btrfs Not tainted 4.5.0-rc6-btrfs-next-28+ #1
[ 7260.852998] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[ 7260.852998] 0000000000000000 ffff88020bf57bc0 ffffffff812648b3 0000000000000000
[ 7260.852998] 0000000000000009 ffff88020bf57bf8 ffffffff81051608 ffffffffa03c1b2d
[ 7260.852998] ffff8800b2bbb800 0000000000000000 ffff8800b17bcc58 ffff8800399dd000
[ 7260.852998] Call Trace:
[ 7260.852998] [<ffffffff812648b3>] dump_stack+0x67/0x90
[ 7260.852998] [<ffffffff81051608>] warn_slowpath_common+0x99/0xb2
[ 7260.852998] [<ffffffffa03c1b2d>] ? btrfs_relocate_block_group+0x245/0x2a1 [btrfs]
[ 7260.852998] [<ffffffff810516d4>] warn_slowpath_null+0x1a/0x1c
[ 7260.852998] [<ffffffffa03c1b2d>] btrfs_relocate_block_group+0x245/0x2a1 [btrfs]
[ 7260.852998] [<ffffffffa039d9de>] btrfs_relocate_chunk.isra.29+0x66/0xdb [btrfs]
[ 7260.852998] [<ffffffffa039f314>] btrfs_balance+0xde1/0xe4e [btrfs]
[ 7260.852998] [<ffffffff8127d671>] ? debug_smp_processor_id+0x17/0x19
[ 7260.852998] [<ffffffffa03a9583>] btrfs_ioctl_balance+0x255/0x2d3 [btrfs]
[ 7260.852998] [<ffffffffa03ac96a>] btrfs_ioctl+0x11e0/0x1dff [btrfs]
[ 7260.852998] [<ffffffff811451df>] ? handle_mm_fault+0x443/0xd63
[ 7260.852998] [<ffffffff81491817>] ? _raw_spin_unlock+0x31/0x44
[ 7260.852998] [<ffffffff8108b36a>] ? arch_local_irq_save+0x9/0xc
[ 7260.852998] [<ffffffff811876ab>] vfs_ioctl+0x18/0x34
[ 7260.852998] [<ffffffff81187cb2>] do_vfs_ioctl+0x550/0x5be
[ 7260.852998] [<ffffffff81190c30>] ? __fget_light+0x4d/0x71
[ 7260.852998] [<ffffffff81187d77>] SyS_ioctl+0x57/0x79
[ 7260.852998] [<ffffffff81492017>] entry_SYSCALL_64_fastpath+0x12/0x6b
[ 7260.893268] ---[ end trace eb7803b24ebab8ad ]---
This is because at the end of the first stage, in relocate_block_group(),
we commit the current transaction, which makes delayed refs run, the
commit roots are switched and so the second stage will find the extent
item that the ordered extent added to the delayed refs. But this extent
was not moved (ordered extent completed after first stage finished), so
at the end of the relocation our block group item still has a positive
used bytes counter, triggering a warning at the end of
btrfs_relocate_block_group(). Later on when trying to read the extent
contents from disk we hit a BUG_ON() due to the inability to map a block
with a logical address that belongs to the block group we relocated and
is no longer valid, resulting in the following trace:
[ 7344.885290] BTRFS critical (device sdi): unable to find logical 12845056 len 4096
[ 7344.887518] ------------[ cut here ]------------
[ 7344.888431] kernel BUG at fs/btrfs/inode.c:1833!
[ 7344.888431] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[ 7344.888431] Modules linked in: btrfs crc32c_generic xor ppdev raid6_pq psmouse sg acpi_cpufreq evdev i2c_piix4 tpm_tis serio_raw tpm i2c_core pcspkr parport_pc
[ 7344.888431] CPU: 0 PID: 6831 Comm: od Tainted: G W 4.5.0-rc6-btrfs-next-28+ #1
[ 7344.888431] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[ 7344.888431] task: ffff880215818600 ti: ffff880204684000 task.ti: ffff880204684000
[ 7344.888431] RIP: 0010:[<ffffffffa037c88c>] [<ffffffffa037c88c>] btrfs_merge_bio_hook+0x54/0x6b [btrfs]
[ 7344.888431] RSP: 0018:ffff8802046878f0 EFLAGS: 00010282
[ 7344.888431] RAX: 00000000ffffffea RBX: 0000000000001000 RCX: 0000000000000001
[ 7344.888431] RDX: ffff88023ec0f950 RSI: ffffffff8183b638 RDI: 00000000ffffffff
[ 7344.888431] RBP: ffff880204687908 R08: 0000000000000001 R09: 0000000000000000
[ 7344.888431] R10: ffff880204687770 R11: ffffffff82f2d52d R12: 0000000000001000
[ 7344.888431] R13: ffff88021afbfee8 R14: 0000000000006208 R15: ffff88006cd199b0
[ 7344.888431] FS: 00007f1f9e1d6700(0000) GS:ffff88023ec00000(0000) knlGS:0000000000000000
[ 7344.888431] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 7344.888431] CR2: 00007f1f9dc8cb60 CR3: 000000023e3b6000 CR4: 00000000000006f0
[ 7344.888431] Stack:
[ 7344.888431] 0000000000001000 0000000000001000 ffff880204687b98 ffff880204687950
[ 7344.888431] ffffffffa0395c8f ffffea0004d64d48 0000000000000000 0000000000001000
[ 7344.888431] ffffea0004d64d48 0000000000001000 0000000000000000 0000000000000000
[ 7344.888431] Call Trace:
[ 7344.888431] [<ffffffffa0395c8f>] submit_extent_page+0xf5/0x16f [btrfs]
[ 7344.888431] [<ffffffffa03970ac>] __do_readpage+0x4a0/0x4f1 [btrfs]
[ 7344.888431] [<ffffffffa039680d>] ? btrfs_create_repair_bio+0xcb/0xcb [btrfs]
[ 7344.888431] [<ffffffffa037eeb4>] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs]
[ 7344.888431] [<ffffffff8108df55>] ? trace_hardirqs_on+0xd/0xf
[ 7344.888431] [<ffffffffa039728c>] __do_contiguous_readpages.constprop.26+0xc2/0xe4 [btrfs]
[ 7344.888431] [<ffffffffa037eeb4>] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs]
[ 7344.888431] [<ffffffffa039739b>] __extent_readpages.constprop.25+0xed/0x100 [btrfs]
[ 7344.888431] [<ffffffff81129d24>] ? lru_cache_add+0xe/0x10
[ 7344.888431] [<ffffffffa0397ea8>] extent_readpages+0x160/0x1aa [btrfs]
[ 7344.888431] [<ffffffffa037eeb4>] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs]
[ 7344.888431] [<ffffffff8115daad>] ? alloc_pages_current+0xa9/0xcd
[ 7344.888431] [<ffffffffa037cdc9>] btrfs_readpages+0x1f/0x21 [btrfs]
[ 7344.888431] [<ffffffff81128316>] __do_page_cache_readahead+0x168/0x1fc
[ 7344.888431] [<ffffffff811285a0>] ondemand_readahead+0x1f6/0x207
[ 7344.888431] [<ffffffff811285a0>] ? ondemand_readahead+0x1f6/0x207
[ 7344.888431] [<ffffffff8111cf34>] ? pagecache_get_page+0x2b/0x154
[ 7344.888431] [<ffffffff8112870e>] page_cache_sync_readahead+0x3d/0x3f
[ 7344.888431] [<ffffffff8111dbf7>] generic_file_read_iter+0x197/0x4e1
[ 7344.888431] [<ffffffff8117773a>] __vfs_read+0x79/0x9d
[ 7344.888431] [<ffffffff81178050>] vfs_read+0x8f/0xd2
[ 7344.888431] [<ffffffff81178a38>] SyS_read+0x50/0x7e
[ 7344.888431] [<ffffffff81492017>] entry_SYSCALL_64_fastpath+0x12/0x6b
[ 7344.888431] Code: 8d 4d e8 45 31 c9 45 31 c0 48 8b 00 48 c1 e2 09 48 8b 80 80 fc ff ff 4c 89 65 e8 48 8b b8 f0 01 00 00 e8 1d 42 02 00 85 c0 79 02 <0f> 0b 4c 0
[ 7344.888431] RIP [<ffffffffa037c88c>] btrfs_merge_bio_hook+0x54/0x6b [btrfs]
[ 7344.888431] RSP <ffff8802046878f0>
[ 7344.970544] ---[ end trace eb7803b24ebab8ae ]---
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
2016-04-26 22:39:32 +08:00
|
|
|
if (!ret && !is_data) {
|
2016-09-20 22:05:02 +08:00
|
|
|
btrfs_dec_block_group_reservations(fs_info, ins->objectid);
|
Btrfs: don't do unnecessary delalloc flushes when relocating
Before we start the actual relocation process of a block group, we do
calls to flush delalloc of all inodes and then wait for ordered extents
to complete. However we do these flush calls just to make sure we don't
race with concurrent tasks that have actually already started to run
delalloc and have allocated an extent from the block group we want to
relocate, right before we set it to readonly mode, but have not yet
created the respective ordered extents. The flush calls make us wait
for such concurrent tasks because they end up calling
filemap_fdatawrite_range() (through btrfs_start_delalloc_roots() ->
__start_delalloc_inodes() -> btrfs_alloc_delalloc_work() ->
btrfs_run_delalloc_work()) which ends up serializing us with those tasks
due to attempts to lock the same pages (and the delalloc flush procedure
calls the allocator and creates the ordered extents before unlocking the
pages).
These flushing calls not only make us waste time (cpu, IO) but also reduce
the chances of writing larger extents (applications might be writing to
contiguous ranges and we flush before they finish dirtying the whole
ranges).
So make sure we don't flush delalloc and just wait for concurrent tasks
that have already started flushing delalloc and have allocated an extent
from the block group we are about to relocate.
This change also ends up fixing a race with direct IO writes that makes
relocation not wait for direct IO ordered extents. This race is
illustrated by the following diagram:
CPU 1 CPU 2
btrfs_relocate_block_group(bg X)
starts direct IO write,
target inode currently has no
ordered extents ongoing nor
dirty pages (delalloc regions),
therefore the root for our inode
is not in the list
fs_info->ordered_roots
btrfs_direct_IO()
__blockdev_direct_IO()
btrfs_get_blocks_direct()
btrfs_lock_extent_direct()
locks range in the io tree
btrfs_new_extent_direct()
btrfs_reserve_extent()
--> extent allocated
from bg X
btrfs_inc_block_group_ro(bg X)
btrfs_start_delalloc_roots()
__start_delalloc_inodes()
--> does nothing, no dealloc ranges
in the inode's io tree so the
inode's root is not in the list
fs_info->delalloc_roots
btrfs_wait_ordered_roots()
--> does not find the inode's root in the
list fs_info->ordered_roots
--> ends up not waiting for the direct IO
write started by the task at CPU 2
relocate_block_group(rc->stage ==
MOVE_DATA_EXTENTS)
prepare_to_relocate()
btrfs_commit_transaction()
iterates the extent tree, using its
commit root and moves extents into new
locations
btrfs_add_ordered_extent_dio()
--> now a ordered extent is
created and added to the
list root->ordered_extents
and the root added to the
list fs_info->ordered_roots
--> this is too late and the
task at CPU 1 already
started the relocation
btrfs_commit_transaction()
btrfs_finish_ordered_io()
btrfs_alloc_reserved_file_extent()
--> adds delayed data reference
for the extent allocated
from bg X
relocate_block_group(rc->stage ==
UPDATE_DATA_PTRS)
prepare_to_relocate()
btrfs_commit_transaction()
--> delayed refs are run, so an extent
item for the allocated extent from
bg X is added to extent tree
--> commit roots are switched, so the
next scan in the extent tree will
see the extent item
sees the extent in the extent tree
When this happens the relocation produces the following warning when it
finishes:
[ 7260.832836] ------------[ cut here ]------------
[ 7260.834653] WARNING: CPU: 5 PID: 6765 at fs/btrfs/relocation.c:4318 btrfs_relocate_block_group+0x245/0x2a1 [btrfs]()
[ 7260.838268] Modules linked in: btrfs crc32c_generic xor ppdev raid6_pq psmouse sg acpi_cpufreq evdev i2c_piix4 tpm_tis serio_raw tpm i2c_core pcspkr parport_pc
[ 7260.850935] CPU: 5 PID: 6765 Comm: btrfs Not tainted 4.5.0-rc6-btrfs-next-28+ #1
[ 7260.852998] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[ 7260.852998] 0000000000000000 ffff88020bf57bc0 ffffffff812648b3 0000000000000000
[ 7260.852998] 0000000000000009 ffff88020bf57bf8 ffffffff81051608 ffffffffa03c1b2d
[ 7260.852998] ffff8800b2bbb800 0000000000000000 ffff8800b17bcc58 ffff8800399dd000
[ 7260.852998] Call Trace:
[ 7260.852998] [<ffffffff812648b3>] dump_stack+0x67/0x90
[ 7260.852998] [<ffffffff81051608>] warn_slowpath_common+0x99/0xb2
[ 7260.852998] [<ffffffffa03c1b2d>] ? btrfs_relocate_block_group+0x245/0x2a1 [btrfs]
[ 7260.852998] [<ffffffff810516d4>] warn_slowpath_null+0x1a/0x1c
[ 7260.852998] [<ffffffffa03c1b2d>] btrfs_relocate_block_group+0x245/0x2a1 [btrfs]
[ 7260.852998] [<ffffffffa039d9de>] btrfs_relocate_chunk.isra.29+0x66/0xdb [btrfs]
[ 7260.852998] [<ffffffffa039f314>] btrfs_balance+0xde1/0xe4e [btrfs]
[ 7260.852998] [<ffffffff8127d671>] ? debug_smp_processor_id+0x17/0x19
[ 7260.852998] [<ffffffffa03a9583>] btrfs_ioctl_balance+0x255/0x2d3 [btrfs]
[ 7260.852998] [<ffffffffa03ac96a>] btrfs_ioctl+0x11e0/0x1dff [btrfs]
[ 7260.852998] [<ffffffff811451df>] ? handle_mm_fault+0x443/0xd63
[ 7260.852998] [<ffffffff81491817>] ? _raw_spin_unlock+0x31/0x44
[ 7260.852998] [<ffffffff8108b36a>] ? arch_local_irq_save+0x9/0xc
[ 7260.852998] [<ffffffff811876ab>] vfs_ioctl+0x18/0x34
[ 7260.852998] [<ffffffff81187cb2>] do_vfs_ioctl+0x550/0x5be
[ 7260.852998] [<ffffffff81190c30>] ? __fget_light+0x4d/0x71
[ 7260.852998] [<ffffffff81187d77>] SyS_ioctl+0x57/0x79
[ 7260.852998] [<ffffffff81492017>] entry_SYSCALL_64_fastpath+0x12/0x6b
[ 7260.893268] ---[ end trace eb7803b24ebab8ad ]---
This is because at the end of the first stage, in relocate_block_group(),
we commit the current transaction, which makes delayed refs run, the
commit roots are switched and so the second stage will find the extent
item that the ordered extent added to the delayed refs. But this extent
was not moved (ordered extent completed after first stage finished), so
at the end of the relocation our block group item still has a positive
used bytes counter, triggering a warning at the end of
btrfs_relocate_block_group(). Later on when trying to read the extent
contents from disk we hit a BUG_ON() due to the inability to map a block
with a logical address that belongs to the block group we relocated and
is no longer valid, resulting in the following trace:
[ 7344.885290] BTRFS critical (device sdi): unable to find logical 12845056 len 4096
[ 7344.887518] ------------[ cut here ]------------
[ 7344.888431] kernel BUG at fs/btrfs/inode.c:1833!
[ 7344.888431] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[ 7344.888431] Modules linked in: btrfs crc32c_generic xor ppdev raid6_pq psmouse sg acpi_cpufreq evdev i2c_piix4 tpm_tis serio_raw tpm i2c_core pcspkr parport_pc
[ 7344.888431] CPU: 0 PID: 6831 Comm: od Tainted: G W 4.5.0-rc6-btrfs-next-28+ #1
[ 7344.888431] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[ 7344.888431] task: ffff880215818600 ti: ffff880204684000 task.ti: ffff880204684000
[ 7344.888431] RIP: 0010:[<ffffffffa037c88c>] [<ffffffffa037c88c>] btrfs_merge_bio_hook+0x54/0x6b [btrfs]
[ 7344.888431] RSP: 0018:ffff8802046878f0 EFLAGS: 00010282
[ 7344.888431] RAX: 00000000ffffffea RBX: 0000000000001000 RCX: 0000000000000001
[ 7344.888431] RDX: ffff88023ec0f950 RSI: ffffffff8183b638 RDI: 00000000ffffffff
[ 7344.888431] RBP: ffff880204687908 R08: 0000000000000001 R09: 0000000000000000
[ 7344.888431] R10: ffff880204687770 R11: ffffffff82f2d52d R12: 0000000000001000
[ 7344.888431] R13: ffff88021afbfee8 R14: 0000000000006208 R15: ffff88006cd199b0
[ 7344.888431] FS: 00007f1f9e1d6700(0000) GS:ffff88023ec00000(0000) knlGS:0000000000000000
[ 7344.888431] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 7344.888431] CR2: 00007f1f9dc8cb60 CR3: 000000023e3b6000 CR4: 00000000000006f0
[ 7344.888431] Stack:
[ 7344.888431] 0000000000001000 0000000000001000 ffff880204687b98 ffff880204687950
[ 7344.888431] ffffffffa0395c8f ffffea0004d64d48 0000000000000000 0000000000001000
[ 7344.888431] ffffea0004d64d48 0000000000001000 0000000000000000 0000000000000000
[ 7344.888431] Call Trace:
[ 7344.888431] [<ffffffffa0395c8f>] submit_extent_page+0xf5/0x16f [btrfs]
[ 7344.888431] [<ffffffffa03970ac>] __do_readpage+0x4a0/0x4f1 [btrfs]
[ 7344.888431] [<ffffffffa039680d>] ? btrfs_create_repair_bio+0xcb/0xcb [btrfs]
[ 7344.888431] [<ffffffffa037eeb4>] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs]
[ 7344.888431] [<ffffffff8108df55>] ? trace_hardirqs_on+0xd/0xf
[ 7344.888431] [<ffffffffa039728c>] __do_contiguous_readpages.constprop.26+0xc2/0xe4 [btrfs]
[ 7344.888431] [<ffffffffa037eeb4>] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs]
[ 7344.888431] [<ffffffffa039739b>] __extent_readpages.constprop.25+0xed/0x100 [btrfs]
[ 7344.888431] [<ffffffff81129d24>] ? lru_cache_add+0xe/0x10
[ 7344.888431] [<ffffffffa0397ea8>] extent_readpages+0x160/0x1aa [btrfs]
[ 7344.888431] [<ffffffffa037eeb4>] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs]
[ 7344.888431] [<ffffffff8115daad>] ? alloc_pages_current+0xa9/0xcd
[ 7344.888431] [<ffffffffa037cdc9>] btrfs_readpages+0x1f/0x21 [btrfs]
[ 7344.888431] [<ffffffff81128316>] __do_page_cache_readahead+0x168/0x1fc
[ 7344.888431] [<ffffffff811285a0>] ondemand_readahead+0x1f6/0x207
[ 7344.888431] [<ffffffff811285a0>] ? ondemand_readahead+0x1f6/0x207
[ 7344.888431] [<ffffffff8111cf34>] ? pagecache_get_page+0x2b/0x154
[ 7344.888431] [<ffffffff8112870e>] page_cache_sync_readahead+0x3d/0x3f
[ 7344.888431] [<ffffffff8111dbf7>] generic_file_read_iter+0x197/0x4e1
[ 7344.888431] [<ffffffff8117773a>] __vfs_read+0x79/0x9d
[ 7344.888431] [<ffffffff81178050>] vfs_read+0x8f/0xd2
[ 7344.888431] [<ffffffff81178a38>] SyS_read+0x50/0x7e
[ 7344.888431] [<ffffffff81492017>] entry_SYSCALL_64_fastpath+0x12/0x6b
[ 7344.888431] Code: 8d 4d e8 45 31 c9 45 31 c0 48 8b 00 48 c1 e2 09 48 8b 80 80 fc ff ff 4c 89 65 e8 48 8b b8 f0 01 00 00 e8 1d 42 02 00 85 c0 79 02 <0f> 0b 4c 0
[ 7344.888431] RIP [<ffffffffa037c88c>] btrfs_merge_bio_hook+0x54/0x6b [btrfs]
[ 7344.888431] RSP <ffff8802046878f0>
[ 7344.970544] ---[ end trace eb7803b24ebab8ae ]---
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
2016-04-26 22:39:32 +08:00
|
|
|
} else if (ret == -ENOSPC) {
|
2013-09-09 13:19:42 +08:00
|
|
|
if (!final_tried && ins->offset) {
|
|
|
|
num_bytes = min(num_bytes >> 1, ins->offset);
|
2016-06-15 21:22:56 +08:00
|
|
|
num_bytes = round_down(num_bytes,
|
2016-06-23 06:54:23 +08:00
|
|
|
fs_info->sectorsize);
|
Btrfs: fix enospc error caused by wrong checks of the chunk
When we did sysbench test for inline files, enospc error happened easily though
there was lots of free disk space which could be allocated for new chunks.
Reproduce steps:
# mkfs.btrfs -b $((2 * 1024 * 1024 * 1024)) <test partition>
# mount <test partition> /mnt
# ulimit -n 102400
# cd /mnt
# sysbench --num-threads=1 --test=fileio --file-num=81920 \
> --file-total-size=80M --file-block-size=1K --file-io-mode=sync \
> --file-test-mode=seqwr prepare
# sysbench --num-threads=1 --test=fileio --file-num=81920 \
> --file-total-size=80M --file-block-size=1K --file-io-mode=sync \
> --file-test-mode=seqwr run
<soon later, BUG_ON() was triggered by enospc error>
The reason of this bug is:
Now, we can reserve space which is larger than the free space in the chunks if
we have enough free disk space which can be used for new chunks. By this way,
the space allocator should allocate a new chunk by force if there is no free
space in the free space cache. But there are two wrong checks which break this
operation.
One is
if (ret == -ENOSPC && num_bytes > min_alloc_size)
in btrfs_reserve_extent(), it is wrong, we should try to allocate a new chunk
even we fail to allocate free space by minimum allocable size.
The other is
if (space_info->force_alloc)
force = space_info->force_alloc;
in do_chunk_alloc(). It makes the allocator ignore CHUNK_ALLOC_FORCE If someone
sets ->force_alloc to CHUNK_ALLOC_LIMITED, and makes the enospc error happen.
Fix these two wrong checks. Especially the second one, we fix it by changing
the value of CHUNK_ALLOC_LIMITED and CHUNK_ALLOC_FORCE, and make
CHUNK_ALLOC_FORCE greater than CHUNK_ALLOC_LIMITED since CHUNK_ALLOC_FORCE has
higher priority. And if the value which is passed in by the caller is greater
than ->force_alloc, use the passed value.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2012-01-27 04:01:12 +08:00
|
|
|
num_bytes = max(num_bytes, min_alloc_size);
|
btrfs: update btrfs_space_info's bytes_may_use timely
This patch can fix some false ENOSPC errors, below test script can
reproduce one false ENOSPC error:
#!/bin/bash
dd if=/dev/zero of=fs.img bs=$((1024*1024)) count=128
dev=$(losetup --show -f fs.img)
mkfs.btrfs -f -M $dev
mkdir /tmp/mntpoint
mount $dev /tmp/mntpoint
cd /tmp/mntpoint
xfs_io -f -c "falloc 0 $((64*1024*1024))" testfile
Above script will fail for ENOSPC reason, but indeed fs still has free
space to satisfy this request. Please see call graph:
btrfs_fallocate()
|-> btrfs_alloc_data_chunk_ondemand()
| bytes_may_use += 64M
|-> btrfs_prealloc_file_range()
|-> btrfs_reserve_extent()
|-> btrfs_add_reserved_bytes()
| alloc_type is RESERVE_ALLOC_NO_ACCOUNT, so it does not
| change bytes_may_use, and bytes_reserved += 64M. Now
| bytes_may_use + bytes_reserved == 128M, which is greater
| than btrfs_space_info's total_bytes, false enospc occurs.
| Note, the bytes_may_use decrease operation will be done in
| end of btrfs_fallocate(), which is too late.
Here is another simple case for buffered write:
CPU 1 | CPU 2
|
|-> cow_file_range() |-> __btrfs_buffered_write()
|-> btrfs_reserve_extent() | |
| | |
| | |
| ..... | |-> btrfs_check_data_free_space()
| |
| |
|-> extent_clear_unlock_delalloc() |
In CPU 1, btrfs_reserve_extent()->find_free_extent()->
btrfs_add_reserved_bytes() do not decrease bytes_may_use, the decrease
operation will be delayed to be done in extent_clear_unlock_delalloc().
Assume in this case, btrfs_reserve_extent() reserved 128MB data, CPU2's
btrfs_check_data_free_space() tries to reserve 100MB data space.
If
100MB > data_sinfo->total_bytes - data_sinfo->bytes_used -
data_sinfo->bytes_reserved - data_sinfo->bytes_pinned -
data_sinfo->bytes_readonly - data_sinfo->bytes_may_use
btrfs_check_data_free_space() will try to allcate new data chunk or call
btrfs_start_delalloc_roots(), or commit current transaction in order to
reserve some free space, obviously a lot of work. But indeed it's not
necessary as long as decreasing bytes_may_use timely, we still have
free space, decreasing 128M from bytes_may_use.
To fix this issue, this patch chooses to update bytes_may_use for both
data and metadata in btrfs_add_reserved_bytes(). For compress path, real
extent length may not be equal to file content length, so introduce a
ram_bytes argument for btrfs_reserve_extent(), find_free_extent() and
btrfs_add_reserved_bytes(), it's becasue bytes_may_use is increased by
file content length. Then compress path can update bytes_may_use
correctly. Also now we can discard RESERVE_ALLOC_NO_ACCOUNT, RESERVE_ALLOC
and RESERVE_FREE.
As we know, usually EXTENT_DO_ACCOUNTING is used for error path. In
run_delalloc_nocow(), for inode marked as NODATACOW or extent marked as
PREALLOC, we also need to update bytes_may_use, but can not pass
EXTENT_DO_ACCOUNTING, because it also clears metadata reservation, so
here we introduce EXTENT_CLEAR_DATA_RESV flag to indicate btrfs_clear_bit_hook()
to update btrfs_space_info's bytes_may_use.
Meanwhile __btrfs_prealloc_file_range() will call
btrfs_free_reserved_data_space() internally for both sucessful and failed
path, btrfs_prealloc_file_range()'s callers does not need to call
btrfs_free_reserved_data_space() any more.
Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-07-25 15:51:40 +08:00
|
|
|
ram_bytes = num_bytes;
|
Btrfs: fix enospc error caused by wrong checks of the chunk
When we did sysbench test for inline files, enospc error happened easily though
there was lots of free disk space which could be allocated for new chunks.
Reproduce steps:
# mkfs.btrfs -b $((2 * 1024 * 1024 * 1024)) <test partition>
# mount <test partition> /mnt
# ulimit -n 102400
# cd /mnt
# sysbench --num-threads=1 --test=fileio --file-num=81920 \
> --file-total-size=80M --file-block-size=1K --file-io-mode=sync \
> --file-test-mode=seqwr prepare
# sysbench --num-threads=1 --test=fileio --file-num=81920 \
> --file-total-size=80M --file-block-size=1K --file-io-mode=sync \
> --file-test-mode=seqwr run
<soon later, BUG_ON() was triggered by enospc error>
The reason of this bug is:
Now, we can reserve space which is larger than the free space in the chunks if
we have enough free disk space which can be used for new chunks. By this way,
the space allocator should allocate a new chunk by force if there is no free
space in the free space cache. But there are two wrong checks which break this
operation.
One is
if (ret == -ENOSPC && num_bytes > min_alloc_size)
in btrfs_reserve_extent(), it is wrong, we should try to allocate a new chunk
even we fail to allocate free space by minimum allocable size.
The other is
if (space_info->force_alloc)
force = space_info->force_alloc;
in do_chunk_alloc(). It makes the allocator ignore CHUNK_ALLOC_FORCE If someone
sets ->force_alloc to CHUNK_ALLOC_LIMITED, and makes the enospc error happen.
Fix these two wrong checks. Especially the second one, we fix it by changing
the value of CHUNK_ALLOC_LIMITED and CHUNK_ALLOC_FORCE, and make
CHUNK_ALLOC_FORCE greater than CHUNK_ALLOC_LIMITED since CHUNK_ALLOC_FORCE has
higher priority. And if the value which is passed in by the caller is greater
than ->force_alloc, use the passed value.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2012-01-27 04:01:12 +08:00
|
|
|
if (num_bytes == min_alloc_size)
|
|
|
|
final_tried = true;
|
|
|
|
goto again;
|
2016-09-20 22:05:02 +08:00
|
|
|
} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
|
Btrfs: fix enospc error caused by wrong checks of the chunk
When we did sysbench test for inline files, enospc error happened easily though
there was lots of free disk space which could be allocated for new chunks.
Reproduce steps:
# mkfs.btrfs -b $((2 * 1024 * 1024 * 1024)) <test partition>
# mount <test partition> /mnt
# ulimit -n 102400
# cd /mnt
# sysbench --num-threads=1 --test=fileio --file-num=81920 \
> --file-total-size=80M --file-block-size=1K --file-io-mode=sync \
> --file-test-mode=seqwr prepare
# sysbench --num-threads=1 --test=fileio --file-num=81920 \
> --file-total-size=80M --file-block-size=1K --file-io-mode=sync \
> --file-test-mode=seqwr run
<soon later, BUG_ON() was triggered by enospc error>
The reason of this bug is:
Now, we can reserve space which is larger than the free space in the chunks if
we have enough free disk space which can be used for new chunks. By this way,
the space allocator should allocate a new chunk by force if there is no free
space in the free space cache. But there are two wrong checks which break this
operation.
One is
if (ret == -ENOSPC && num_bytes > min_alloc_size)
in btrfs_reserve_extent(), it is wrong, we should try to allocate a new chunk
even we fail to allocate free space by minimum allocable size.
The other is
if (space_info->force_alloc)
force = space_info->force_alloc;
in do_chunk_alloc(). It makes the allocator ignore CHUNK_ALLOC_FORCE If someone
sets ->force_alloc to CHUNK_ALLOC_LIMITED, and makes the enospc error happen.
Fix these two wrong checks. Especially the second one, we fix it by changing
the value of CHUNK_ALLOC_LIMITED and CHUNK_ALLOC_FORCE, and make
CHUNK_ALLOC_FORCE greater than CHUNK_ALLOC_LIMITED since CHUNK_ALLOC_FORCE has
higher priority. And if the value which is passed in by the caller is greater
than ->force_alloc, use the passed value.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2012-01-27 04:01:12 +08:00
|
|
|
struct btrfs_space_info *sinfo;
|
|
|
|
|
2019-06-19 04:09:19 +08:00
|
|
|
sinfo = btrfs_find_space_info(fs_info, flags);
|
2016-06-23 06:54:23 +08:00
|
|
|
btrfs_err(fs_info,
|
2016-09-20 22:05:00 +08:00
|
|
|
"allocation failed flags %llu, wanted %llu",
|
|
|
|
flags, num_bytes);
|
2012-03-01 21:56:28 +08:00
|
|
|
if (sinfo)
|
2019-06-19 04:09:24 +08:00
|
|
|
btrfs_dump_space_info(fs_info, sinfo,
|
|
|
|
num_bytes, 1);
|
Btrfs: fix enospc error caused by wrong checks of the chunk
When we did sysbench test for inline files, enospc error happened easily though
there was lots of free disk space which could be allocated for new chunks.
Reproduce steps:
# mkfs.btrfs -b $((2 * 1024 * 1024 * 1024)) <test partition>
# mount <test partition> /mnt
# ulimit -n 102400
# cd /mnt
# sysbench --num-threads=1 --test=fileio --file-num=81920 \
> --file-total-size=80M --file-block-size=1K --file-io-mode=sync \
> --file-test-mode=seqwr prepare
# sysbench --num-threads=1 --test=fileio --file-num=81920 \
> --file-total-size=80M --file-block-size=1K --file-io-mode=sync \
> --file-test-mode=seqwr run
<soon later, BUG_ON() was triggered by enospc error>
The reason of this bug is:
Now, we can reserve space which is larger than the free space in the chunks if
we have enough free disk space which can be used for new chunks. By this way,
the space allocator should allocate a new chunk by force if there is no free
space in the free space cache. But there are two wrong checks which break this
operation.
One is
if (ret == -ENOSPC && num_bytes > min_alloc_size)
in btrfs_reserve_extent(), it is wrong, we should try to allocate a new chunk
even we fail to allocate free space by minimum allocable size.
The other is
if (space_info->force_alloc)
force = space_info->force_alloc;
in do_chunk_alloc(). It makes the allocator ignore CHUNK_ALLOC_FORCE If someone
sets ->force_alloc to CHUNK_ALLOC_LIMITED, and makes the enospc error happen.
Fix these two wrong checks. Especially the second one, we fix it by changing
the value of CHUNK_ALLOC_LIMITED and CHUNK_ALLOC_FORCE, and make
CHUNK_ALLOC_FORCE greater than CHUNK_ALLOC_LIMITED since CHUNK_ALLOC_FORCE has
higher priority. And if the value which is passed in by the caller is greater
than ->force_alloc, use the passed value.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2012-01-27 04:01:12 +08:00
|
|
|
}
|
2008-06-26 04:01:30 +08:00
|
|
|
}
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
|
|
|
|
return ret;
|
2008-07-18 00:53:50 +08:00
|
|
|
}
|
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
u64 start, u64 len,
|
|
|
|
int pin, int delalloc)
|
2008-08-02 03:11:20 +08:00
|
|
|
{
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
struct btrfs_block_group_cache *cache;
|
Btrfs: Fix free block discard calls down to the block layer
This is a patch to fix discard semantic to make Btrfs work with FTL and SSD.
We can improve FTL's performance by telling it which sectors are freed by file
system. But if we don't tell FTL the information of free sectors in proper
time, the transaction mechanism of Btrfs will be destroyed and Btrfs could not
roll back the previous transaction under the power loss condition.
There are some problems in the old implementation:
1, In __free_extent(), the pinned down extents should not be discarded.
2, In free_extents(), the free extents are all pinned, so they need to
be discarded in transaction committing time instead of free_extents().
3, The reserved extent used by log tree should be discard too.
This patch change discard behavior as follows:
1, For the extents which need to be free at once,
we discard them in update_block_group().
2, Delay discarding the pinned extent in btrfs_finish_extent_commit()
when committing transaction.
3, Remove discarding from free_extents() and __free_extent()
4, Add discard interface into btrfs_free_reserved_extent()
5, Discard sectors before updating the free space cache, otherwise,
FTL will destroy file system data.
2009-01-06 04:57:51 +08:00
|
|
|
int ret = 0;
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
cache = btrfs_lookup_block_group(fs_info, start);
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
if (!cache) {
|
2016-06-23 06:54:23 +08:00
|
|
|
btrfs_err(fs_info, "Unable to find block group for %llu",
|
|
|
|
start);
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
return -ENOSPC;
|
|
|
|
}
|
Btrfs: Fix free block discard calls down to the block layer
This is a patch to fix discard semantic to make Btrfs work with FTL and SSD.
We can improve FTL's performance by telling it which sectors are freed by file
system. But if we don't tell FTL the information of free sectors in proper
time, the transaction mechanism of Btrfs will be destroyed and Btrfs could not
roll back the previous transaction under the power loss condition.
There are some problems in the old implementation:
1, In __free_extent(), the pinned down extents should not be discarded.
2, In free_extents(), the free extents are all pinned, so they need to
be discarded in transaction committing time instead of free_extents().
3, The reserved extent used by log tree should be discard too.
This patch change discard behavior as follows:
1, For the extents which need to be free at once,
we discard them in update_block_group().
2, Delay discarding the pinned extent in btrfs_finish_extent_commit()
when committing transaction.
3, Remove discarding from free_extents() and __free_extent()
4, Add discard interface into btrfs_free_reserved_extent()
5, Discard sectors before updating the free space cache, otherwise,
FTL will destroy file system data.
2009-01-06 04:57:51 +08:00
|
|
|
|
2011-11-01 08:52:39 +08:00
|
|
|
if (pin)
|
2019-03-20 19:12:32 +08:00
|
|
|
pin_down_extent(cache, start, len, 1);
|
2011-11-01 08:52:39 +08:00
|
|
|
else {
|
2016-06-23 06:54:23 +08:00
|
|
|
if (btrfs_test_opt(fs_info, DISCARD))
|
2016-06-23 06:54:24 +08:00
|
|
|
ret = btrfs_discard_extent(fs_info, start, len, NULL);
|
2011-11-01 08:52:39 +08:00
|
|
|
btrfs_add_free_space(cache, start, len);
|
2016-07-25 15:51:39 +08:00
|
|
|
btrfs_free_reserved_bytes(cache, len, delalloc);
|
2016-09-07 04:00:42 +08:00
|
|
|
trace_btrfs_reserved_extent_free(fs_info, start, len);
|
2011-11-01 08:52:39 +08:00
|
|
|
}
|
2014-12-12 16:44:35 +08:00
|
|
|
|
2009-04-03 21:47:43 +08:00
|
|
|
btrfs_put_block_group(cache);
|
2008-07-18 00:53:50 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
u64 start, u64 len, int delalloc)
|
2011-11-01 08:52:39 +08:00
|
|
|
{
|
2016-06-23 06:54:24 +08:00
|
|
|
return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
|
2011-11-01 08:52:39 +08:00
|
|
|
}
|
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
|
2011-11-01 08:52:39 +08:00
|
|
|
u64 start, u64 len)
|
|
|
|
{
|
2016-06-23 06:54:24 +08:00
|
|
|
return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
|
2011-11-01 08:52:39 +08:00
|
|
|
}
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
|
|
|
|
u64 parent, u64 root_objectid,
|
|
|
|
u64 flags, u64 owner, u64 offset,
|
|
|
|
struct btrfs_key *ins, int ref_mod)
|
2008-07-18 00:53:50 +08:00
|
|
|
{
|
2018-06-20 20:48:58 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2008-07-18 00:53:50 +08:00
|
|
|
int ret;
|
|
|
|
struct btrfs_extent_item *extent_item;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
struct btrfs_extent_inline_ref *iref;
|
2008-07-18 00:53:50 +08:00
|
|
|
struct btrfs_path *path;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
struct extent_buffer *leaf;
|
|
|
|
int type;
|
|
|
|
u32 size;
|
2007-08-09 08:17:12 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (parent > 0)
|
|
|
|
type = BTRFS_SHARED_DATA_REF_KEY;
|
|
|
|
else
|
|
|
|
type = BTRFS_EXTENT_DATA_REF_KEY;
|
2007-08-30 03:47:34 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
|
2007-12-11 22:25:06 +08:00
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
2011-03-23 16:14:16 +08:00
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
2008-02-02 03:51:59 +08:00
|
|
|
|
2009-03-13 23:00:37 +08:00
|
|
|
path->leave_spinning = 1;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
|
|
|
|
ins, size);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret) {
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
extent_item = btrfs_item_ptr(leaf, path->slots[0],
|
2008-02-02 03:51:59 +08:00
|
|
|
struct btrfs_extent_item);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
btrfs_set_extent_refs(leaf, extent_item, ref_mod);
|
|
|
|
btrfs_set_extent_generation(leaf, extent_item, trans->transid);
|
|
|
|
btrfs_set_extent_flags(leaf, extent_item,
|
|
|
|
flags | BTRFS_EXTENT_FLAG_DATA);
|
|
|
|
|
|
|
|
iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
|
|
|
|
btrfs_set_extent_inline_ref_type(leaf, iref, type);
|
|
|
|
if (parent > 0) {
|
|
|
|
struct btrfs_shared_data_ref *ref;
|
|
|
|
ref = (struct btrfs_shared_data_ref *)(iref + 1);
|
|
|
|
btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
|
|
|
|
btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
|
|
|
|
} else {
|
|
|
|
struct btrfs_extent_data_ref *ref;
|
|
|
|
ref = (struct btrfs_extent_data_ref *)(&iref->offset);
|
|
|
|
btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
|
|
|
|
btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
|
|
|
|
btrfs_set_extent_data_ref_offset(leaf, ref, offset);
|
|
|
|
btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
|
|
|
|
}
|
2008-02-02 03:51:59 +08:00
|
|
|
|
|
|
|
btrfs_mark_buffer_dirty(path->nodes[0]);
|
2007-12-11 22:25:06 +08:00
|
|
|
btrfs_free_path(path);
|
2007-10-16 04:14:48 +08:00
|
|
|
|
2018-05-10 20:44:54 +08:00
|
|
|
ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
|
2015-09-30 11:50:37 +08:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2019-03-20 19:10:15 +08:00
|
|
|
ret = update_block_group(trans, ins->objectid, ins->offset, 1);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret) { /* -ENOENT, logic error */
|
2013-03-20 06:41:23 +08:00
|
|
|
btrfs_err(fs_info, "update block group failed for %llu %llu",
|
2013-08-20 19:20:07 +08:00
|
|
|
ins->objectid, ins->offset);
|
2008-02-04 23:10:13 +08:00
|
|
|
BUG();
|
|
|
|
}
|
2016-09-07 04:00:42 +08:00
|
|
|
trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
|
2008-07-18 00:53:50 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
|
2018-05-21 17:27:21 +08:00
|
|
|
struct btrfs_delayed_ref_node *node,
|
2018-05-21 17:27:22 +08:00
|
|
|
struct btrfs_delayed_extent_op *extent_op)
|
2008-07-18 00:53:50 +08:00
|
|
|
{
|
2018-05-21 17:27:20 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2008-07-18 00:53:50 +08:00
|
|
|
int ret;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
struct btrfs_extent_item *extent_item;
|
2018-05-21 17:27:21 +08:00
|
|
|
struct btrfs_key extent_key;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
struct btrfs_tree_block_info *block_info;
|
|
|
|
struct btrfs_extent_inline_ref *iref;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct extent_buffer *leaf;
|
2018-05-21 17:27:21 +08:00
|
|
|
struct btrfs_delayed_tree_ref *ref;
|
2013-03-08 03:22:04 +08:00
|
|
|
u32 size = sizeof(*extent_item) + sizeof(*iref);
|
2018-05-21 17:27:21 +08:00
|
|
|
u64 num_bytes;
|
2018-05-21 17:27:22 +08:00
|
|
|
u64 flags = extent_op->flags_to_set;
|
2016-06-23 06:54:23 +08:00
|
|
|
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
|
2013-03-08 03:22:04 +08:00
|
|
|
|
2018-05-21 17:27:21 +08:00
|
|
|
ref = btrfs_delayed_node_to_tree_ref(node);
|
|
|
|
|
|
|
|
extent_key.objectid = node->bytenr;
|
|
|
|
if (skinny_metadata) {
|
|
|
|
extent_key.offset = ref->level;
|
|
|
|
extent_key.type = BTRFS_METADATA_ITEM_KEY;
|
|
|
|
num_bytes = fs_info->nodesize;
|
|
|
|
} else {
|
|
|
|
extent_key.offset = node->num_bytes;
|
|
|
|
extent_key.type = BTRFS_EXTENT_ITEM_KEY;
|
2013-03-08 03:22:04 +08:00
|
|
|
size += sizeof(*block_info);
|
2018-05-21 17:27:21 +08:00
|
|
|
num_bytes = node->num_bytes;
|
|
|
|
}
|
2008-09-24 01:14:13 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
path = btrfs_alloc_path();
|
2018-10-12 03:54:22 +08:00
|
|
|
if (!path)
|
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-14 01:38:47 +08:00
|
|
|
return -ENOMEM;
|
2009-03-13 22:10:06 +08:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
path->leave_spinning = 1;
|
|
|
|
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
|
2018-05-21 17:27:21 +08:00
|
|
|
&extent_key, size);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret) {
|
Btrfs: free and unlock our path before btrfs_free_and_pin_reserved_extent()
The error handling path for alloc_reserved_tree_block is calling
btrfs_free_and_pin_reserved_extent with a spinning tree lock held. This
might sleep as we allocate extent_state objects:
BUG: sleeping function called from invalid context at mm/slub.c:1268
in_atomic(): 1, irqs_disabled(): 0, pid: 11093, name: kworker/u4:7
5 locks held by kworker/u4:7/11093:
#0: ("%s-%s""btrfs", name){++++.+}, at: [<ffffffff81091d51>] process_one_work+0x151/0x520
#1: ((&work->normal_work)){+.+.+.}, at: [<ffffffff81091d51>] process_one_work+0x151/0x520
#2: (sb_internal){++++.+}, at: [<ffffffffa003a70e>] start_transaction+0x43e/0x590 [btrfs]
#3: (&head_ref->mutex){+.+...}, at: [<ffffffffa0089f8c>] btrfs_delayed_ref_lock+0x4c/0x240 [btrfs]
#4: (btrfs-extent-00){++++..}, at: [<ffffffffa007697b>] btrfs_clear_lock_blocking_rw+0x9b/0x150 [btrfs]
CPU: 0 PID: 11093 Comm: kworker/u4:7 Tainted: G W 4.0.0-rc6-default+ #246
Hardware name: Intel Corporation Santa Rosa platform/Matanzas, BIOS TSRSCRB1.86C.0047.B00.0610170821 10/17/06
Workqueue: btrfs-extent-refs btrfs_extent_refs_helper [btrfs]
00000000000004f4 ffff88006dd17848 ffffffff81ab0e3b ffff88006dd17848
ffff88007a944760 ffff88006dd17868 ffffffff8109d516 ffff88006dd17898
0000000000000000 ffff88006dd17898 ffffffff8109d5b2 ffffffff81aba2bb
Call Trace:
[<ffffffff81ab0e3b>] dump_stack+0x4f/0x6c
[<ffffffff8109d516>] ___might_sleep+0xf6/0x140
[<ffffffff8109d5b2>] __might_sleep+0x52/0x90
[<ffffffff81aba2bb>] ? ftrace_call+0x5/0x34
[<ffffffff81196363>] kmem_cache_alloc+0x163/0x1b0
[<ffffffffa0056f31>] ? alloc_extent_state+0x31/0x150 [btrfs]
[<ffffffffa0056f20>] ? alloc_extent_state+0x20/0x150 [btrfs]
[<ffffffffa0056f31>] alloc_extent_state+0x31/0x150 [btrfs]
[<ffffffffa005805b>] __set_extent_bit+0x37b/0x5d0 [btrfs]
[<ffffffff81aba2bb>] ? ftrace_call+0x5/0x34
[<ffffffffa005888d>] ? set_extent_bit+0xd/0x30 [btrfs]
[<ffffffffa00588a3>] set_extent_bit+0x23/0x30 [btrfs]
[<ffffffffa0058e80>] set_extent_dirty+0x20/0x30 [btrfs]
[<ffffffffa00195ba>] pin_down_extent+0xaa/0x170 [btrfs]
[<ffffffffa001d8ef>] __btrfs_free_reserved_extent+0xcf/0x160 [btrfs]
[<ffffffffa0023856>] btrfs_free_and_pin_reserved_extent+0x16/0x20 [btrfs]
[<ffffffffa002482a>] __btrfs_run_delayed_refs+0xfca/0x1290 [btrfs]
[<ffffffffa0026eae>] btrfs_run_delayed_refs+0x6e/0x2e0 [btrfs]
[<ffffffffa0027378>] delayed_ref_async_start+0x48/0xb0 [btrfs]
[<ffffffffa006c883>] normal_work_helper+0x83/0x350 [btrfs]
[<ffffffffa006cd79>] ? btrfs_extent_refs_helper+0x9/0x20 [btrfs]
[<ffffffffa006cd82>] btrfs_extent_refs_helper+0x12/0x20 [btrfs]
[<ffffffff81091dcb>] process_one_work+0x1cb/0x520
[<ffffffff81091d51>] ? process_one_work+0x151/0x520
[<ffffffff811c7abf>] ? seq_read+0x3f/0x400
[<ffffffff8109260b>] worker_thread+0x5b/0x4e0
[<ffffffff81097be2>] ? __kthread_parkme+0x12/0xa0
[<ffffffff810925b0>] ? rescuer_thread+0x450/0x450
[<ffffffff81098686>] kthread+0xf6/0x120
[<ffffffff81098590>] ? flush_kthread_worker+0x1b0/0x1b0
[<ffffffff81ab8088>] ret_from_fork+0x58/0x90
[<ffffffff81098590>] ? flush_kthread_worker+0x1b0/0x1b0
------------[ cut here ]------------
This changes things to free the path first, which will also unlock the
extent buffer.
Signed-off-by: Chris Mason <clm@fb.com>
Reported-by: Dave Sterba <dsterba@suse.cz>
Tested-by: Dave Sterba <dsterba@suse.cz>
2015-04-01 23:36:05 +08:00
|
|
|
btrfs_free_path(path);
|
2012-03-12 23:03:00 +08:00
|
|
|
return ret;
|
|
|
|
}
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
extent_item = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_extent_item);
|
|
|
|
btrfs_set_extent_refs(leaf, extent_item, 1);
|
|
|
|
btrfs_set_extent_generation(leaf, extent_item, trans->transid);
|
|
|
|
btrfs_set_extent_flags(leaf, extent_item,
|
|
|
|
flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
|
|
|
|
|
2013-03-08 03:22:04 +08:00
|
|
|
if (skinny_metadata) {
|
|
|
|
iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
|
|
|
|
} else {
|
|
|
|
block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
|
2018-05-21 17:27:22 +08:00
|
|
|
btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
|
2018-05-21 17:27:21 +08:00
|
|
|
btrfs_set_tree_block_level(leaf, block_info, ref->level);
|
2013-03-08 03:22:04 +08:00
|
|
|
iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
|
|
|
|
}
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
2018-05-21 17:27:23 +08:00
|
|
|
if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
|
|
|
|
btrfs_set_extent_inline_ref_type(leaf, iref,
|
|
|
|
BTRFS_SHARED_BLOCK_REF_KEY);
|
2018-05-21 17:27:23 +08:00
|
|
|
btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
} else {
|
|
|
|
btrfs_set_extent_inline_ref_type(leaf, iref,
|
|
|
|
BTRFS_TREE_BLOCK_REF_KEY);
|
2018-05-21 17:27:21 +08:00
|
|
|
btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
2018-05-21 17:27:21 +08:00
|
|
|
ret = remove_from_free_space_tree(trans, extent_key.objectid,
|
|
|
|
num_bytes);
|
2015-09-30 11:50:37 +08:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2019-03-20 19:10:15 +08:00
|
|
|
ret = update_block_group(trans, extent_key.objectid,
|
2016-06-23 06:54:22 +08:00
|
|
|
fs_info->nodesize, 1);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret) { /* -ENOENT, logic error */
|
2013-03-20 06:41:23 +08:00
|
|
|
btrfs_err(fs_info, "update block group failed for %llu %llu",
|
2018-05-21 17:27:21 +08:00
|
|
|
extent_key.objectid, extent_key.offset);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
BUG();
|
|
|
|
}
|
2013-10-08 03:18:52 +08:00
|
|
|
|
2018-05-21 17:27:21 +08:00
|
|
|
trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
|
2016-06-23 06:54:23 +08:00
|
|
|
fs_info->nodesize);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
|
2017-09-30 03:43:49 +08:00
|
|
|
struct btrfs_root *root, u64 owner,
|
2015-10-26 14:11:18 +08:00
|
|
|
u64 offset, u64 ram_bytes,
|
|
|
|
struct btrfs_key *ins)
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
{
|
2019-04-04 14:45:32 +08:00
|
|
|
struct btrfs_ref generic_ref = { 0 };
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
int ret;
|
|
|
|
|
2017-09-30 03:43:49 +08:00
|
|
|
BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
2019-04-04 14:45:32 +08:00
|
|
|
btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
|
|
|
|
ins->objectid, ins->offset, 0);
|
|
|
|
btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
|
2019-04-04 14:45:33 +08:00
|
|
|
btrfs_ref_tree_mod(root->fs_info, &generic_ref);
|
2019-04-04 14:45:32 +08:00
|
|
|
ret = btrfs_add_delayed_data_ref(trans, &generic_ref,
|
|
|
|
ram_bytes, NULL, NULL);
|
2008-07-18 00:53:50 +08:00
|
|
|
return ret;
|
|
|
|
}
|
2008-09-06 04:13:11 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* this is used by the tree logging recovery code. It records that
|
|
|
|
* an extent has been allocated and makes sure to clear the free
|
|
|
|
* space cache bits as well
|
|
|
|
*/
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
|
|
|
|
u64 root_objectid, u64 owner, u64 offset,
|
|
|
|
struct btrfs_key *ins)
|
2008-09-06 04:13:11 +08:00
|
|
|
{
|
2018-06-20 20:49:13 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2008-09-06 04:13:11 +08:00
|
|
|
int ret;
|
|
|
|
struct btrfs_block_group_cache *block_group;
|
2016-08-26 11:33:14 +08:00
|
|
|
struct btrfs_space_info *space_info;
|
2009-09-12 04:11:19 +08:00
|
|
|
|
2013-06-07 01:19:32 +08:00
|
|
|
/*
|
|
|
|
* Mixed block groups will exclude before processing the log so we only
|
2016-05-20 09:18:45 +08:00
|
|
|
* need to do the exclude dance if this fs isn't mixed.
|
2013-06-07 01:19:32 +08:00
|
|
|
*/
|
2016-06-23 06:54:23 +08:00
|
|
|
if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
|
2016-06-23 06:54:24 +08:00
|
|
|
ret = __exclude_logged_extent(fs_info, ins->objectid,
|
|
|
|
ins->offset);
|
2013-04-26 03:55:30 +08:00
|
|
|
if (ret)
|
2013-06-07 01:19:32 +08:00
|
|
|
return ret;
|
2009-09-12 04:11:19 +08:00
|
|
|
}
|
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
|
2013-06-07 01:19:32 +08:00
|
|
|
if (!block_group)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2016-08-26 11:33:14 +08:00
|
|
|
space_info = block_group->space_info;
|
|
|
|
spin_lock(&space_info->lock);
|
|
|
|
spin_lock(&block_group->lock);
|
|
|
|
space_info->bytes_reserved += ins->offset;
|
|
|
|
block_group->reserved += ins->offset;
|
|
|
|
spin_unlock(&block_group->lock);
|
|
|
|
spin_unlock(&space_info->lock);
|
|
|
|
|
2018-06-20 20:48:58 +08:00
|
|
|
ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
|
|
|
|
offset, ins, 1);
|
2013-04-26 03:55:30 +08:00
|
|
|
btrfs_put_block_group(block_group);
|
2008-09-06 04:13:11 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-04-26 04:41:01 +08:00
|
|
|
static struct extent_buffer *
|
|
|
|
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
|
2018-06-18 19:13:19 +08:00
|
|
|
u64 bytenr, int level, u64 owner)
|
2008-08-02 03:11:20 +08:00
|
|
|
{
|
2016-06-23 06:54:23 +08:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2008-08-02 03:11:20 +08:00
|
|
|
struct extent_buffer *buf;
|
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
buf = btrfs_find_create_tree_block(fs_info, bytenr);
|
2016-06-07 03:01:23 +08:00
|
|
|
if (IS_ERR(buf))
|
|
|
|
return buf;
|
|
|
|
|
btrfs: locking: Add extra check in btrfs_init_new_buffer() to avoid deadlock
[BUG]
For certain crafted image, whose csum root leaf has missing backref, if
we try to trigger write with data csum, it could cause deadlock with the
following kernel WARN_ON():
WARNING: CPU: 1 PID: 41 at fs/btrfs/locking.c:230 btrfs_tree_lock+0x3e2/0x400
CPU: 1 PID: 41 Comm: kworker/u4:1 Not tainted 4.18.0-rc1+ #8
Workqueue: btrfs-endio-write btrfs_endio_write_helper
RIP: 0010:btrfs_tree_lock+0x3e2/0x400
Call Trace:
btrfs_alloc_tree_block+0x39f/0x770
__btrfs_cow_block+0x285/0x9e0
btrfs_cow_block+0x191/0x2e0
btrfs_search_slot+0x492/0x1160
btrfs_lookup_csum+0xec/0x280
btrfs_csum_file_blocks+0x2be/0xa60
add_pending_csums+0xaf/0xf0
btrfs_finish_ordered_io+0x74b/0xc90
finish_ordered_fn+0x15/0x20
normal_work_helper+0xf6/0x500
btrfs_endio_write_helper+0x12/0x20
process_one_work+0x302/0x770
worker_thread+0x81/0x6d0
kthread+0x180/0x1d0
ret_from_fork+0x35/0x40
[CAUSE]
That crafted image has missing backref for csum tree root leaf. And
when we try to allocate new tree block, since there is no
EXTENT/METADATA_ITEM for csum tree root, btrfs consider it's free slot
and use it.
The extent tree of the image looks like:
Normal image | This fuzzed image
----------------------------------+--------------------------------
BG 29360128 | BG 29360128
One empty slot | One empty slot
29364224: backref to UUID tree | 29364224: backref to UUID tree
Two empty slots | Two empty slots
29376512: backref to CSUM tree | One empty slot (bad type) <<<
29380608: backref to D_RELOC tree | 29380608: backref to D_RELOC tree
... | ...
Since bytenr 29376512 has no METADATA/EXTENT_ITEM, when btrfs try to
alloc tree block, it's an valid slot for btrfs.
And for finish_ordered_write, when we need to insert csum, we try to CoW
csum tree root.
By accident, empty slots at bytenr BG_OFFSET, BG_OFFSET + 8K,
BG_OFFSET + 12K is already used by tree block COW for other trees, the
next empty slot is BG_OFFSET + 16K, which should be the backref for CSUM
tree.
But due to the bad type, btrfs can recognize it and still consider it as
an empty slot, and will try to use it for csum tree CoW.
Then in the following call trace, we will try to lock the new tree
block, which turns out to be the old csum tree root which is already
locked:
btrfs_search_slot() called on csum tree root, which is at 29376512
|- btrfs_cow_block()
|- btrfs_set_lock_block()
| |- Now locks tree block 29376512 (old csum tree root)
|- __btrfs_cow_block()
|- btrfs_alloc_tree_block()
|- btrfs_reserve_extent()
| Now it returns tree block 29376512, which extent tree
| shows its empty slot, but it's already hold by csum tree
|- btrfs_init_new_buffer()
|- btrfs_tree_lock()
| Triggers WARN_ON(eb->lock_owner == current->pid)
|- wait_event()
Wait lock owner to release the lock, but it's
locked by ourself, so it will deadlock
[FIX]
This patch will do the lock_owner and current->pid check at
btrfs_init_new_buffer().
So above deadlock can be avoided.
Since such problem can only happen in crafted image, we will still
trigger kernel warning for later aborted transaction, but with a little
more meaningful warning message.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=200405
Reported-by: Xu Wen <wen.xu@gatech.edu>
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-08-21 09:53:47 +08:00
|
|
|
/*
|
|
|
|
* Extra safety check in case the extent tree is corrupted and extent
|
|
|
|
* allocator chooses to use a tree block which is already used and
|
|
|
|
* locked.
|
|
|
|
*/
|
|
|
|
if (buf->lock_owner == current->pid) {
|
|
|
|
btrfs_err_rl(fs_info,
|
|
|
|
"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
|
|
|
|
buf->start, btrfs_header_owner(buf), current->pid);
|
|
|
|
free_extent_buffer(buf);
|
|
|
|
return ERR_PTR(-EUCLEAN);
|
|
|
|
}
|
|
|
|
|
2011-07-27 04:11:19 +08:00
|
|
|
btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
|
2008-08-02 03:11:20 +08:00
|
|
|
btrfs_tree_lock(buf);
|
2019-03-20 21:30:02 +08:00
|
|
|
btrfs_clean_tree_block(buf);
|
2012-03-10 05:01:49 +08:00
|
|
|
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
|
Btrfs: Change btree locking to use explicit blocking points
Most of the btrfs metadata operations can be protected by a spinlock,
but some operations still need to schedule.
So far, btrfs has been using a mutex along with a trylock loop,
most of the time it is able to avoid going for the full mutex, so
the trylock loop is a big performance gain.
This commit is step one for getting rid of the blocking locks entirely.
btrfs_tree_lock takes a spinlock, and the code explicitly switches
to a blocking lock when it starts an operation that can schedule.
We'll be able get rid of the blocking locks in smaller pieces over time.
Tracing allows us to find the most common cause of blocking, so we
can start with the hot spots first.
The basic idea is:
btrfs_tree_lock() returns with the spin lock held
btrfs_set_lock_blocking() sets the EXTENT_BUFFER_BLOCKING bit in
the extent buffer flags, and then drops the spin lock. The buffer is
still considered locked by all of the btrfs code.
If btrfs_tree_lock gets the spinlock but finds the blocking bit set, it drops
the spin lock and waits on a wait queue for the blocking bit to go away.
Much of the code that needs to set the blocking bit finishes without actually
blocking a good percentage of the time. So, an adaptive spin is still
used against the blocking bit to avoid very high context switch rates.
btrfs_clear_lock_blocking() clears the blocking bit and returns
with the spinlock held again.
btrfs_tree_unlock() can be called on either blocking or spinning locks,
it does the right thing based on the blocking bit.
ctree.c has a helper function to set/clear all the locked buffers in a
path as blocking.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-02-04 22:25:08 +08:00
|
|
|
|
2018-04-04 08:03:48 +08:00
|
|
|
btrfs_set_lock_blocking_write(buf);
|
2015-12-03 20:06:46 +08:00
|
|
|
set_extent_buffer_uptodate(buf);
|
Btrfs: Change btree locking to use explicit blocking points
Most of the btrfs metadata operations can be protected by a spinlock,
but some operations still need to schedule.
So far, btrfs has been using a mutex along with a trylock loop,
most of the time it is able to avoid going for the full mutex, so
the trylock loop is a big performance gain.
This commit is step one for getting rid of the blocking locks entirely.
btrfs_tree_lock takes a spinlock, and the code explicitly switches
to a blocking lock when it starts an operation that can schedule.
We'll be able get rid of the blocking locks in smaller pieces over time.
Tracing allows us to find the most common cause of blocking, so we
can start with the hot spots first.
The basic idea is:
btrfs_tree_lock() returns with the spin lock held
btrfs_set_lock_blocking() sets the EXTENT_BUFFER_BLOCKING bit in
the extent buffer flags, and then drops the spin lock. The buffer is
still considered locked by all of the btrfs code.
If btrfs_tree_lock gets the spinlock but finds the blocking bit set, it drops
the spin lock and waits on a wait queue for the blocking bit to go away.
Much of the code that needs to set the blocking bit finishes without actually
blocking a good percentage of the time. So, an adaptive spin is still
used against the blocking bit to avoid very high context switch rates.
btrfs_clear_lock_blocking() clears the blocking bit and returns
with the spinlock held again.
btrfs_tree_unlock() can be called on either blocking or spinning locks,
it does the right thing based on the blocking bit.
ctree.c has a helper function to set/clear all the locked buffers in a
path as blocking.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-02-04 22:25:08 +08:00
|
|
|
|
2018-06-18 19:13:19 +08:00
|
|
|
memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
|
|
|
|
btrfs_set_header_level(buf, level);
|
|
|
|
btrfs_set_header_bytenr(buf, buf->start);
|
|
|
|
btrfs_set_header_generation(buf, trans->transid);
|
|
|
|
btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
|
|
|
|
btrfs_set_header_owner(buf, owner);
|
2018-10-30 22:43:24 +08:00
|
|
|
write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
|
2018-06-18 19:13:19 +08:00
|
|
|
write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
|
2008-09-12 04:17:57 +08:00
|
|
|
if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
|
Btrfs: be aware of btree inode write errors to avoid fs corruption
While we have a transaction ongoing, the VM might decide at any time
to call btree_inode->i_mapping->a_ops->writepages(), which will start
writeback of dirty pages belonging to btree nodes/leafs. This call
might return an error or the writeback might finish with an error
before we attempt to commit the running transaction. If this happens,
we might have no way of knowing that such error happened when we are
committing the transaction - because the pages might no longer be
marked dirty nor tagged for writeback (if a subsequent modification
to the extent buffer didn't happen before the transaction commit) which
makes filemap_fdata[write|wait]_range unable to find such pages (even
if they're marked with SetPageError).
So if this happens we must abort the transaction, otherwise we commit
a super block with btree roots that point to btree nodes/leafs whose
content on disk is invalid - either garbage or the content of some
node/leaf from a past generation that got cowed or deleted and is no
longer valid (for this later case we end up getting error messages like
"parent transid verify failed on 10826481664 wanted 25748 found 29562"
when reading btree nodes/leafs from disk).
Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's
i_mapping would not be enough because we need to distinguish between
log tree extents (not fatal) vs non-log tree extents (fatal) and
because the next call to filemap_fdatawait_range() will catch and clear
such errors in the mapping - and that call might be from a log sync and
not from a transaction commit, which means we would not know about the
error at transaction commit time. Also, checking for the eb flag
EXTENT_BUFFER_IOERR at transaction commit time isn't done and would
not be completely reliable, as the eb might be removed from memory and
read back when trying to get it, which clears that flag right before
reading the eb's pages from disk, making us not know about the previous
write error.
Using the new 3 flags for the btree inode also makes us achieve the
goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
writeback for all dirty pages and before filemap_fdatawait_range() is
called, the writeback for all dirty pages had already finished with
errors - because we were not using AS_EIO/AS_ENOSPC,
filemap_fdatawait_range() would return success, as it could not know
that writeback errors happened (the pages were no longer tagged for
writeback).
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-09-26 19:25:56 +08:00
|
|
|
buf->log_index = root->log_transid % 2;
|
2009-11-12 17:33:26 +08:00
|
|
|
/*
|
|
|
|
* we allow two log transactions at a time, use different
|
2018-11-28 19:05:13 +08:00
|
|
|
* EXTENT bit to differentiate dirty pages.
|
2009-11-12 17:33:26 +08:00
|
|
|
*/
|
Btrfs: be aware of btree inode write errors to avoid fs corruption
While we have a transaction ongoing, the VM might decide at any time
to call btree_inode->i_mapping->a_ops->writepages(), which will start
writeback of dirty pages belonging to btree nodes/leafs. This call
might return an error or the writeback might finish with an error
before we attempt to commit the running transaction. If this happens,
we might have no way of knowing that such error happened when we are
committing the transaction - because the pages might no longer be
marked dirty nor tagged for writeback (if a subsequent modification
to the extent buffer didn't happen before the transaction commit) which
makes filemap_fdata[write|wait]_range unable to find such pages (even
if they're marked with SetPageError).
So if this happens we must abort the transaction, otherwise we commit
a super block with btree roots that point to btree nodes/leafs whose
content on disk is invalid - either garbage or the content of some
node/leaf from a past generation that got cowed or deleted and is no
longer valid (for this later case we end up getting error messages like
"parent transid verify failed on 10826481664 wanted 25748 found 29562"
when reading btree nodes/leafs from disk).
Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's
i_mapping would not be enough because we need to distinguish between
log tree extents (not fatal) vs non-log tree extents (fatal) and
because the next call to filemap_fdatawait_range() will catch and clear
such errors in the mapping - and that call might be from a log sync and
not from a transaction commit, which means we would not know about the
error at transaction commit time. Also, checking for the eb flag
EXTENT_BUFFER_IOERR at transaction commit time isn't done and would
not be completely reliable, as the eb might be removed from memory and
read back when trying to get it, which clears that flag right before
reading the eb's pages from disk, making us not know about the previous
write error.
Using the new 3 flags for the btree inode also makes us achieve the
goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
writeback for all dirty pages and before filemap_fdatawait_range() is
called, the writeback for all dirty pages had already finished with
errors - because we were not using AS_EIO/AS_ENOSPC,
filemap_fdatawait_range() would return success, as it could not know
that writeback errors happened (the pages were no longer tagged for
writeback).
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-09-26 19:25:56 +08:00
|
|
|
if (buf->log_index == 0)
|
2009-11-12 17:33:26 +08:00
|
|
|
set_extent_dirty(&root->dirty_log_pages, buf->start,
|
|
|
|
buf->start + buf->len - 1, GFP_NOFS);
|
|
|
|
else
|
|
|
|
set_extent_new(&root->dirty_log_pages, buf->start,
|
2016-04-27 05:54:39 +08:00
|
|
|
buf->start + buf->len - 1);
|
2008-09-12 04:17:57 +08:00
|
|
|
} else {
|
Btrfs: be aware of btree inode write errors to avoid fs corruption
While we have a transaction ongoing, the VM might decide at any time
to call btree_inode->i_mapping->a_ops->writepages(), which will start
writeback of dirty pages belonging to btree nodes/leafs. This call
might return an error or the writeback might finish with an error
before we attempt to commit the running transaction. If this happens,
we might have no way of knowing that such error happened when we are
committing the transaction - because the pages might no longer be
marked dirty nor tagged for writeback (if a subsequent modification
to the extent buffer didn't happen before the transaction commit) which
makes filemap_fdata[write|wait]_range unable to find such pages (even
if they're marked with SetPageError).
So if this happens we must abort the transaction, otherwise we commit
a super block with btree roots that point to btree nodes/leafs whose
content on disk is invalid - either garbage or the content of some
node/leaf from a past generation that got cowed or deleted and is no
longer valid (for this later case we end up getting error messages like
"parent transid verify failed on 10826481664 wanted 25748 found 29562"
when reading btree nodes/leafs from disk).
Note that setting and checking AS_EIO/AS_ENOSPC in the btree inode's
i_mapping would not be enough because we need to distinguish between
log tree extents (not fatal) vs non-log tree extents (fatal) and
because the next call to filemap_fdatawait_range() will catch and clear
such errors in the mapping - and that call might be from a log sync and
not from a transaction commit, which means we would not know about the
error at transaction commit time. Also, checking for the eb flag
EXTENT_BUFFER_IOERR at transaction commit time isn't done and would
not be completely reliable, as the eb might be removed from memory and
read back when trying to get it, which clears that flag right before
reading the eb's pages from disk, making us not know about the previous
write error.
Using the new 3 flags for the btree inode also makes us achieve the
goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
writeback for all dirty pages and before filemap_fdatawait_range() is
called, the writeback for all dirty pages had already finished with
errors - because we were not using AS_EIO/AS_ENOSPC,
filemap_fdatawait_range() would return success, as it could not know
that writeback errors happened (the pages were no longer tagged for
writeback).
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-09-26 19:25:56 +08:00
|
|
|
buf->log_index = -1;
|
2008-09-12 04:17:57 +08:00
|
|
|
set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
|
2008-08-02 03:11:20 +08:00
|
|
|
buf->start + buf->len - 1, GFP_NOFS);
|
2008-09-12 04:17:57 +08:00
|
|
|
}
|
2016-06-08 12:36:38 +08:00
|
|
|
trans->dirty = true;
|
Btrfs: Change btree locking to use explicit blocking points
Most of the btrfs metadata operations can be protected by a spinlock,
but some operations still need to schedule.
So far, btrfs has been using a mutex along with a trylock loop,
most of the time it is able to avoid going for the full mutex, so
the trylock loop is a big performance gain.
This commit is step one for getting rid of the blocking locks entirely.
btrfs_tree_lock takes a spinlock, and the code explicitly switches
to a blocking lock when it starts an operation that can schedule.
We'll be able get rid of the blocking locks in smaller pieces over time.
Tracing allows us to find the most common cause of blocking, so we
can start with the hot spots first.
The basic idea is:
btrfs_tree_lock() returns with the spin lock held
btrfs_set_lock_blocking() sets the EXTENT_BUFFER_BLOCKING bit in
the extent buffer flags, and then drops the spin lock. The buffer is
still considered locked by all of the btrfs code.
If btrfs_tree_lock gets the spinlock but finds the blocking bit set, it drops
the spin lock and waits on a wait queue for the blocking bit to go away.
Much of the code that needs to set the blocking bit finishes without actually
blocking a good percentage of the time. So, an adaptive spin is still
used against the blocking bit to avoid very high context switch rates.
btrfs_clear_lock_blocking() clears the blocking bit and returns
with the spinlock held again.
btrfs_tree_unlock() can be called on either blocking or spinning locks,
it does the right thing based on the blocking bit.
ctree.c has a helper function to set/clear all the locked buffers in a
path as blocking.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-02-04 22:25:08 +08:00
|
|
|
/* this returns a buffer locked for blocking */
|
2008-08-02 03:11:20 +08:00
|
|
|
return buf;
|
|
|
|
}
|
|
|
|
|
2007-02-26 23:40:21 +08:00
|
|
|
/*
|
2010-05-16 22:46:25 +08:00
|
|
|
* finds a free extent and does all the dirty work required for allocation
|
2015-02-24 18:47:04 +08:00
|
|
|
* returns the tree buffer or an ERR_PTR on error.
|
2007-02-26 23:40:21 +08:00
|
|
|
*/
|
2014-06-15 07:54:12 +08:00
|
|
|
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
|
2017-01-18 15:24:37 +08:00
|
|
|
struct btrfs_root *root,
|
|
|
|
u64 parent, u64 root_objectid,
|
|
|
|
const struct btrfs_disk_key *key,
|
|
|
|
int level, u64 hint,
|
|
|
|
u64 empty_size)
|
2007-02-26 23:40:21 +08:00
|
|
|
{
|
2016-06-23 06:54:23 +08:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2007-03-13 04:22:34 +08:00
|
|
|
struct btrfs_key ins;
|
2010-05-16 22:46:25 +08:00
|
|
|
struct btrfs_block_rsv *block_rsv;
|
2007-10-16 04:14:19 +08:00
|
|
|
struct extent_buffer *buf;
|
2015-02-24 18:47:04 +08:00
|
|
|
struct btrfs_delayed_extent_op *extent_op;
|
2019-04-04 14:45:31 +08:00
|
|
|
struct btrfs_ref generic_ref = { 0 };
|
2010-05-16 22:46:25 +08:00
|
|
|
u64 flags = 0;
|
|
|
|
int ret;
|
2016-06-23 06:54:23 +08:00
|
|
|
u32 blocksize = fs_info->nodesize;
|
|
|
|
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
|
2007-02-26 23:40:21 +08:00
|
|
|
|
2016-07-15 21:23:37 +08:00
|
|
|
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
|
2016-06-23 06:54:23 +08:00
|
|
|
if (btrfs_is_testing(fs_info)) {
|
2014-05-08 05:06:09 +08:00
|
|
|
buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
|
2018-06-18 19:13:19 +08:00
|
|
|
level, root_objectid);
|
2014-05-08 05:06:09 +08:00
|
|
|
if (!IS_ERR(buf))
|
|
|
|
root->alloc_bytenr += blocksize;
|
|
|
|
return buf;
|
|
|
|
}
|
2016-07-15 21:23:37 +08:00
|
|
|
#endif
|
2014-09-30 05:53:21 +08:00
|
|
|
|
2019-06-20 01:47:23 +08:00
|
|
|
block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
|
2010-05-16 22:46:25 +08:00
|
|
|
if (IS_ERR(block_rsv))
|
|
|
|
return ERR_CAST(block_rsv);
|
|
|
|
|
btrfs: update btrfs_space_info's bytes_may_use timely
This patch can fix some false ENOSPC errors, below test script can
reproduce one false ENOSPC error:
#!/bin/bash
dd if=/dev/zero of=fs.img bs=$((1024*1024)) count=128
dev=$(losetup --show -f fs.img)
mkfs.btrfs -f -M $dev
mkdir /tmp/mntpoint
mount $dev /tmp/mntpoint
cd /tmp/mntpoint
xfs_io -f -c "falloc 0 $((64*1024*1024))" testfile
Above script will fail for ENOSPC reason, but indeed fs still has free
space to satisfy this request. Please see call graph:
btrfs_fallocate()
|-> btrfs_alloc_data_chunk_ondemand()
| bytes_may_use += 64M
|-> btrfs_prealloc_file_range()
|-> btrfs_reserve_extent()
|-> btrfs_add_reserved_bytes()
| alloc_type is RESERVE_ALLOC_NO_ACCOUNT, so it does not
| change bytes_may_use, and bytes_reserved += 64M. Now
| bytes_may_use + bytes_reserved == 128M, which is greater
| than btrfs_space_info's total_bytes, false enospc occurs.
| Note, the bytes_may_use decrease operation will be done in
| end of btrfs_fallocate(), which is too late.
Here is another simple case for buffered write:
CPU 1 | CPU 2
|
|-> cow_file_range() |-> __btrfs_buffered_write()
|-> btrfs_reserve_extent() | |
| | |
| | |
| ..... | |-> btrfs_check_data_free_space()
| |
| |
|-> extent_clear_unlock_delalloc() |
In CPU 1, btrfs_reserve_extent()->find_free_extent()->
btrfs_add_reserved_bytes() do not decrease bytes_may_use, the decrease
operation will be delayed to be done in extent_clear_unlock_delalloc().
Assume in this case, btrfs_reserve_extent() reserved 128MB data, CPU2's
btrfs_check_data_free_space() tries to reserve 100MB data space.
If
100MB > data_sinfo->total_bytes - data_sinfo->bytes_used -
data_sinfo->bytes_reserved - data_sinfo->bytes_pinned -
data_sinfo->bytes_readonly - data_sinfo->bytes_may_use
btrfs_check_data_free_space() will try to allcate new data chunk or call
btrfs_start_delalloc_roots(), or commit current transaction in order to
reserve some free space, obviously a lot of work. But indeed it's not
necessary as long as decreasing bytes_may_use timely, we still have
free space, decreasing 128M from bytes_may_use.
To fix this issue, this patch chooses to update bytes_may_use for both
data and metadata in btrfs_add_reserved_bytes(). For compress path, real
extent length may not be equal to file content length, so introduce a
ram_bytes argument for btrfs_reserve_extent(), find_free_extent() and
btrfs_add_reserved_bytes(), it's becasue bytes_may_use is increased by
file content length. Then compress path can update bytes_may_use
correctly. Also now we can discard RESERVE_ALLOC_NO_ACCOUNT, RESERVE_ALLOC
and RESERVE_FREE.
As we know, usually EXTENT_DO_ACCOUNTING is used for error path. In
run_delalloc_nocow(), for inode marked as NODATACOW or extent marked as
PREALLOC, we also need to update bytes_may_use, but can not pass
EXTENT_DO_ACCOUNTING, because it also clears metadata reservation, so
here we introduce EXTENT_CLEAR_DATA_RESV flag to indicate btrfs_clear_bit_hook()
to update btrfs_space_info's bytes_may_use.
Meanwhile __btrfs_prealloc_file_range() will call
btrfs_free_reserved_data_space() internally for both sucessful and failed
path, btrfs_prealloc_file_range()'s callers does not need to call
btrfs_free_reserved_data_space() any more.
Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-07-25 15:51:40 +08:00
|
|
|
ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
empty_size, hint, &ins, 0, 0);
|
2015-02-24 18:47:04 +08:00
|
|
|
if (ret)
|
|
|
|
goto out_unuse;
|
2008-01-10 04:55:33 +08:00
|
|
|
|
2018-06-18 19:13:19 +08:00
|
|
|
buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
|
|
|
|
root_objectid);
|
2015-02-24 18:47:04 +08:00
|
|
|
if (IS_ERR(buf)) {
|
|
|
|
ret = PTR_ERR(buf);
|
|
|
|
goto out_free_reserved;
|
|
|
|
}
|
2010-05-16 22:46:25 +08:00
|
|
|
|
|
|
|
if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
|
|
|
|
if (parent == 0)
|
|
|
|
parent = ins.objectid;
|
|
|
|
flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
|
|
|
|
} else
|
|
|
|
BUG_ON(parent > 0);
|
|
|
|
|
|
|
|
if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
|
2012-11-21 10:21:28 +08:00
|
|
|
extent_op = btrfs_alloc_delayed_extent_op();
|
2015-02-24 18:47:04 +08:00
|
|
|
if (!extent_op) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out_free_buf;
|
|
|
|
}
|
2010-05-16 22:46:25 +08:00
|
|
|
if (key)
|
|
|
|
memcpy(&extent_op->key, key, sizeof(extent_op->key));
|
|
|
|
else
|
|
|
|
memset(&extent_op->key, 0, sizeof(extent_op->key));
|
|
|
|
extent_op->flags_to_set = flags;
|
2015-11-30 23:51:29 +08:00
|
|
|
extent_op->update_key = skinny_metadata ? false : true;
|
|
|
|
extent_op->update_flags = true;
|
|
|
|
extent_op->is_data = false;
|
2013-05-10 01:49:30 +08:00
|
|
|
extent_op->level = level;
|
2010-05-16 22:46:25 +08:00
|
|
|
|
2019-04-04 14:45:31 +08:00
|
|
|
btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
|
|
|
|
ins.objectid, ins.offset, parent);
|
|
|
|
generic_ref.real_root = root->root_key.objectid;
|
|
|
|
btrfs_init_tree_ref(&generic_ref, level, root_objectid);
|
2019-04-04 14:45:33 +08:00
|
|
|
btrfs_ref_tree_mod(fs_info, &generic_ref);
|
2019-04-04 14:45:31 +08:00
|
|
|
ret = btrfs_add_delayed_tree_ref(trans, &generic_ref,
|
2017-06-07 07:45:30 +08:00
|
|
|
extent_op, NULL, NULL);
|
2015-02-24 18:47:04 +08:00
|
|
|
if (ret)
|
|
|
|
goto out_free_delayed;
|
2010-05-16 22:46:25 +08:00
|
|
|
}
|
2007-02-26 23:40:21 +08:00
|
|
|
return buf;
|
2015-02-24 18:47:04 +08:00
|
|
|
|
|
|
|
out_free_delayed:
|
|
|
|
btrfs_free_delayed_extent_op(extent_op);
|
|
|
|
out_free_buf:
|
|
|
|
free_extent_buffer(buf);
|
|
|
|
out_free_reserved:
|
2016-06-23 06:54:24 +08:00
|
|
|
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
|
2015-02-24 18:47:04 +08:00
|
|
|
out_unuse:
|
2019-06-20 01:47:23 +08:00
|
|
|
btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
|
2015-02-24 18:47:04 +08:00
|
|
|
return ERR_PTR(ret);
|
2007-02-26 23:40:21 +08:00
|
|
|
}
|
2007-03-07 09:08:01 +08:00
|
|
|
|
2009-06-28 09:07:35 +08:00
|
|
|
struct walk_control {
|
|
|
|
u64 refs[BTRFS_MAX_LEVEL];
|
|
|
|
u64 flags[BTRFS_MAX_LEVEL];
|
|
|
|
struct btrfs_key update_progress;
|
btrfs: save drop_progress if we drop refs at all
Previously we only updated the drop_progress key if we were in the
DROP_REFERENCE stage of snapshot deletion. This is because the
UPDATE_BACKREF stage checks the flags of the blocks it's converting to
FULL_BACKREF, so if we go over a block we processed before it doesn't
matter, we just don't do anything.
The problem is in do_walk_down() we will go ahead and drop the roots
reference to any blocks that we know we won't need to walk into.
Given subvolume A and snapshot B. The root of B points to all of the
nodes that belong to A, so all of those nodes have a refcnt > 1. If B
did not modify those blocks it'll hit this condition in do_walk_down
if (!wc->update_ref ||
generation <= root->root_key.offset)
goto skip;
and in "goto skip" we simply do a btrfs_free_extent() for that bytenr
that we point at.
Now assume we modified some data in B, and then took a snapshot of B and
call it C. C points to all the nodes in B, making every node the root
of B points to have a refcnt > 1. This assumes the root level is 2 or
higher.
We delete snapshot B, which does the above work in do_walk_down,
free'ing our ref for nodes we share with A that we didn't modify. Now
we hit a node we _did_ modify, thus we own. We need to walk down into
this node and we set wc->stage == UPDATE_BACKREF. We walk down to level
0 which we also own because we modified data. We can't walk any further
down and thus now need to walk up and start the next part of the
deletion. Now walk_up_proc is supposed to put us back into
DROP_REFERENCE, but there's an exception to this
if (level < wc->shared_level)
goto out;
we are at level == 0, and our shared_level == 1. We skip out of this
one and go up to level 1. Since path->slots[1] < nritems we
path->slots[1]++ and break out of walk_up_tree to stop our transaction
and loop back around. Now in btrfs_drop_snapshot we have this snippet
if (wc->stage == DROP_REFERENCE) {
level = wc->level;
btrfs_node_key(path->nodes[level],
&root_item->drop_progress,
path->slots[level]);
root_item->drop_level = level;
}
our stage == UPDATE_BACKREF still, so we don't update the drop_progress
key. This is a problem because we would have done btrfs_free_extent()
for the nodes leading up to our current position. If we crash or
unmount here and go to remount we'll start over where we were before and
try to free our ref for blocks we've already freed, and thus abort()
out.
Fix this by keeping track of the last place we dropped a reference for
our block in do_walk_down. Then if wc->stage == UPDATE_BACKREF we know
we'll start over from a place we meant to, and otherwise things continue
to work as they did before.
I have a complicated reproducer for this problem, without this patch
we'll fail to fsck the fs when replaying the log writes log. With this
patch we can replay the whole log without any fsck or mount failures.
The steps to reproduce this easily are sort of tricky, I had to add a
couple of debug patches to the kernel in order to make it easy,
basically I just needed to make sure we did actually commit the
transaction every time we finished a walk_down_tree/walk_up_tree combo.
The reproducer:
1) Creates a base subvolume.
2) Creates 100k files in the subvolume.
3) Snapshots the base subvolume (snap1).
4) Touches files 5000-6000 in snap1.
5) Snapshots snap1 (snap2).
6) Deletes snap1.
I do this with dm-log-writes, and then replay to every FUA in the log
and fsck the fs.
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
[ copy reproducer steps ]
Signed-off-by: David Sterba <dsterba@suse.com>
2019-02-07 04:46:15 +08:00
|
|
|
struct btrfs_key drop_progress;
|
|
|
|
int drop_level;
|
2009-06-28 09:07:35 +08:00
|
|
|
int stage;
|
|
|
|
int level;
|
|
|
|
int shared_level;
|
|
|
|
int update_ref;
|
|
|
|
int keep_locks;
|
2009-09-22 03:55:59 +08:00
|
|
|
int reada_slot;
|
|
|
|
int reada_count;
|
2019-02-07 04:46:14 +08:00
|
|
|
int restarted;
|
2009-06-28 09:07:35 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
#define DROP_REFERENCE 1
|
|
|
|
#define UPDATE_BACKREF 2
|
|
|
|
|
2009-09-22 03:55:59 +08:00
|
|
|
static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
|
|
|
struct walk_control *wc,
|
|
|
|
struct btrfs_path *path)
|
2007-03-27 18:33:00 +08:00
|
|
|
{
|
2016-06-23 06:54:23 +08:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2009-09-22 03:55:59 +08:00
|
|
|
u64 bytenr;
|
|
|
|
u64 generation;
|
|
|
|
u64 refs;
|
2009-10-09 21:25:16 +08:00
|
|
|
u64 flags;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
u32 nritems;
|
2009-09-22 03:55:59 +08:00
|
|
|
struct btrfs_key key;
|
|
|
|
struct extent_buffer *eb;
|
2007-03-27 18:33:00 +08:00
|
|
|
int ret;
|
2009-09-22 03:55:59 +08:00
|
|
|
int slot;
|
|
|
|
int nread = 0;
|
2007-03-27 18:33:00 +08:00
|
|
|
|
2009-09-22 03:55:59 +08:00
|
|
|
if (path->slots[wc->level] < wc->reada_slot) {
|
|
|
|
wc->reada_count = wc->reada_count * 2 / 3;
|
|
|
|
wc->reada_count = max(wc->reada_count, 2);
|
|
|
|
} else {
|
|
|
|
wc->reada_count = wc->reada_count * 3 / 2;
|
|
|
|
wc->reada_count = min_t(int, wc->reada_count,
|
2016-06-23 06:54:23 +08:00
|
|
|
BTRFS_NODEPTRS_PER_BLOCK(fs_info));
|
2009-09-22 03:55:59 +08:00
|
|
|
}
|
2007-12-11 22:25:06 +08:00
|
|
|
|
2009-09-22 03:55:59 +08:00
|
|
|
eb = path->nodes[wc->level];
|
|
|
|
nritems = btrfs_header_nritems(eb);
|
2009-02-04 22:27:02 +08:00
|
|
|
|
2009-09-22 03:55:59 +08:00
|
|
|
for (slot = path->slots[wc->level]; slot < nritems; slot++) {
|
|
|
|
if (nread >= wc->reada_count)
|
|
|
|
break;
|
2009-02-04 22:27:02 +08:00
|
|
|
|
2008-08-04 20:20:15 +08:00
|
|
|
cond_resched();
|
2009-09-22 03:55:59 +08:00
|
|
|
bytenr = btrfs_node_blockptr(eb, slot);
|
|
|
|
generation = btrfs_node_ptr_generation(eb, slot);
|
2008-08-04 20:20:15 +08:00
|
|
|
|
2009-09-22 03:55:59 +08:00
|
|
|
if (slot == path->slots[wc->level])
|
|
|
|
goto reada;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
2009-09-22 03:55:59 +08:00
|
|
|
if (wc->stage == UPDATE_BACKREF &&
|
|
|
|
generation <= root->root_key.offset)
|
2009-02-04 22:27:02 +08:00
|
|
|
continue;
|
|
|
|
|
2009-10-09 21:25:16 +08:00
|
|
|
/* We don't lock the tree block, it's OK to be racy here */
|
2016-06-23 06:54:24 +08:00
|
|
|
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
|
2013-03-08 03:22:04 +08:00
|
|
|
wc->level - 1, 1, &refs,
|
|
|
|
&flags);
|
2012-03-12 23:03:00 +08:00
|
|
|
/* We don't care about errors in readahead. */
|
|
|
|
if (ret < 0)
|
|
|
|
continue;
|
2009-10-09 21:25:16 +08:00
|
|
|
BUG_ON(refs == 0);
|
|
|
|
|
2009-09-22 03:55:59 +08:00
|
|
|
if (wc->stage == DROP_REFERENCE) {
|
|
|
|
if (refs == 1)
|
|
|
|
goto reada;
|
2009-02-04 22:27:02 +08:00
|
|
|
|
2009-10-09 21:25:16 +08:00
|
|
|
if (wc->level == 1 &&
|
|
|
|
(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
|
|
|
|
continue;
|
2009-09-22 03:55:59 +08:00
|
|
|
if (!wc->update_ref ||
|
|
|
|
generation <= root->root_key.offset)
|
|
|
|
continue;
|
|
|
|
btrfs_node_key_to_cpu(eb, &key, slot);
|
|
|
|
ret = btrfs_comp_cpu_keys(&key,
|
|
|
|
&wc->update_progress);
|
|
|
|
if (ret < 0)
|
|
|
|
continue;
|
2009-10-09 21:25:16 +08:00
|
|
|
} else {
|
|
|
|
if (wc->level == 1 &&
|
|
|
|
(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
|
|
|
|
continue;
|
2007-03-27 18:33:00 +08:00
|
|
|
}
|
2009-09-22 03:55:59 +08:00
|
|
|
reada:
|
2016-06-23 06:54:24 +08:00
|
|
|
readahead_tree_block(fs_info, bytenr);
|
2009-09-22 03:55:59 +08:00
|
|
|
nread++;
|
2007-03-10 19:35:47 +08:00
|
|
|
}
|
2009-09-22 03:55:59 +08:00
|
|
|
wc->reada_slot = slot;
|
2007-03-10 19:35:47 +08:00
|
|
|
}
|
2009-06-28 09:07:35 +08:00
|
|
|
|
2008-10-30 02:49:05 +08:00
|
|
|
/*
|
2012-12-26 15:32:17 +08:00
|
|
|
* helper to process tree block while walking down the tree.
|
2009-06-28 09:07:35 +08:00
|
|
|
*
|
|
|
|
* when wc->stage == UPDATE_BACKREF, this function updates
|
|
|
|
* back refs for pointers in the block.
|
|
|
|
*
|
|
|
|
* NOTE: return value 1 means we should stop walking down.
|
2008-10-30 02:49:05 +08:00
|
|
|
*/
|
2009-06-28 09:07:35 +08:00
|
|
|
static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
struct btrfs_root *root,
|
2009-06-28 09:07:35 +08:00
|
|
|
struct btrfs_path *path,
|
2009-10-09 21:25:16 +08:00
|
|
|
struct walk_control *wc, int lookup_info)
|
2008-10-30 02:49:05 +08:00
|
|
|
{
|
2016-06-23 06:54:24 +08:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2009-06-28 09:07:35 +08:00
|
|
|
int level = wc->level;
|
|
|
|
struct extent_buffer *eb = path->nodes[level];
|
|
|
|
u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
|
2008-10-30 02:49:05 +08:00
|
|
|
int ret;
|
|
|
|
|
2009-06-28 09:07:35 +08:00
|
|
|
if (wc->stage == UPDATE_BACKREF &&
|
|
|
|
btrfs_header_owner(eb) != root->root_key.objectid)
|
|
|
|
return 1;
|
2008-10-30 02:49:05 +08:00
|
|
|
|
2009-06-28 09:07:35 +08:00
|
|
|
/*
|
|
|
|
* when reference count of tree block is 1, it won't increase
|
|
|
|
* again. once full backref flag is set, we never clear it.
|
|
|
|
*/
|
2009-10-09 21:25:16 +08:00
|
|
|
if (lookup_info &&
|
|
|
|
((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
|
|
|
|
(wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
|
2009-06-28 09:07:35 +08:00
|
|
|
BUG_ON(!path->locks[level]);
|
2016-06-23 06:54:24 +08:00
|
|
|
ret = btrfs_lookup_extent_info(trans, fs_info,
|
2013-03-08 03:22:04 +08:00
|
|
|
eb->start, level, 1,
|
2009-06-28 09:07:35 +08:00
|
|
|
&wc->refs[level],
|
|
|
|
&wc->flags[level]);
|
2012-03-12 23:03:00 +08:00
|
|
|
BUG_ON(ret == -ENOMEM);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
2009-06-28 09:07:35 +08:00
|
|
|
BUG_ON(wc->refs[level] == 0);
|
|
|
|
}
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
2009-06-28 09:07:35 +08:00
|
|
|
if (wc->stage == DROP_REFERENCE) {
|
|
|
|
if (wc->refs[level] > 1)
|
|
|
|
return 1;
|
2008-10-30 02:49:05 +08:00
|
|
|
|
2009-06-28 09:07:35 +08:00
|
|
|
if (path->locks[level] && !wc->keep_locks) {
|
2011-07-17 03:23:14 +08:00
|
|
|
btrfs_tree_unlock_rw(eb, path->locks[level]);
|
2009-06-28 09:07:35 +08:00
|
|
|
path->locks[level] = 0;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
2008-10-30 02:49:05 +08:00
|
|
|
|
2009-06-28 09:07:35 +08:00
|
|
|
/* wc->stage == UPDATE_BACKREF */
|
|
|
|
if (!(wc->flags[level] & flag)) {
|
|
|
|
BUG_ON(!path->locks[level]);
|
2014-07-03 01:54:25 +08:00
|
|
|
ret = btrfs_inc_ref(trans, root, eb, 1);
|
2012-03-12 23:03:00 +08:00
|
|
|
BUG_ON(ret); /* -ENOMEM */
|
2014-07-03 01:54:25 +08:00
|
|
|
ret = btrfs_dec_ref(trans, root, eb, 0);
|
2012-03-12 23:03:00 +08:00
|
|
|
BUG_ON(ret); /* -ENOMEM */
|
2019-03-20 18:43:36 +08:00
|
|
|
ret = btrfs_set_disk_extent_flags(trans, eb->start,
|
2013-05-10 01:49:30 +08:00
|
|
|
eb->len, flag,
|
|
|
|
btrfs_header_level(eb), 0);
|
2012-03-12 23:03:00 +08:00
|
|
|
BUG_ON(ret); /* -ENOMEM */
|
2009-06-28 09:07:35 +08:00
|
|
|
wc->flags[level] |= flag;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* the block is shared by multiple trees, so it's not good to
|
|
|
|
* keep the tree lock
|
|
|
|
*/
|
|
|
|
if (path->locks[level] && level > 0) {
|
2011-07-17 03:23:14 +08:00
|
|
|
btrfs_tree_unlock_rw(eb, path->locks[level]);
|
2009-06-28 09:07:35 +08:00
|
|
|
path->locks[level] = 0;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-02-07 04:46:14 +08:00
|
|
|
/*
|
|
|
|
* This is used to verify a ref exists for this root to deal with a bug where we
|
|
|
|
* would have a drop_progress key that hadn't been updated properly.
|
|
|
|
*/
|
|
|
|
static int check_ref_exists(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root, u64 bytenr, u64 parent,
|
|
|
|
int level)
|
|
|
|
{
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_extent_inline_ref *iref;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
ret = lookup_extent_backref(trans, path, &iref, bytenr,
|
|
|
|
root->fs_info->nodesize, parent,
|
|
|
|
root->root_key.objectid, level, 0);
|
|
|
|
btrfs_free_path(path);
|
|
|
|
if (ret == -ENOENT)
|
|
|
|
return 0;
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2009-09-22 03:55:59 +08:00
|
|
|
/*
|
2012-12-26 15:32:17 +08:00
|
|
|
* helper to process tree block pointer.
|
2009-09-22 03:55:59 +08:00
|
|
|
*
|
|
|
|
* when wc->stage == DROP_REFERENCE, this function checks
|
|
|
|
* reference count of the block pointed to. if the block
|
|
|
|
* is shared and we need update back refs for the subtree
|
|
|
|
* rooted at the block, this function changes wc->stage to
|
|
|
|
* UPDATE_BACKREF. if the block is shared and there is no
|
|
|
|
* need to update back, this function drops the reference
|
|
|
|
* to the block.
|
|
|
|
*
|
|
|
|
* NOTE: return value 1 means we should stop walking down.
|
|
|
|
*/
|
|
|
|
static noinline int do_walk_down(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
|
|
|
struct btrfs_path *path,
|
2009-10-09 21:25:16 +08:00
|
|
|
struct walk_control *wc, int *lookup_info)
|
2009-09-22 03:55:59 +08:00
|
|
|
{
|
2016-06-23 06:54:23 +08:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2009-09-22 03:55:59 +08:00
|
|
|
u64 bytenr;
|
|
|
|
u64 generation;
|
|
|
|
u64 parent;
|
|
|
|
struct btrfs_key key;
|
2018-03-29 09:08:11 +08:00
|
|
|
struct btrfs_key first_key;
|
2019-04-04 14:45:36 +08:00
|
|
|
struct btrfs_ref ref = { 0 };
|
2009-09-22 03:55:59 +08:00
|
|
|
struct extent_buffer *next;
|
|
|
|
int level = wc->level;
|
|
|
|
int reada = 0;
|
|
|
|
int ret = 0;
|
2014-07-18 03:39:01 +08:00
|
|
|
bool need_account = false;
|
2009-09-22 03:55:59 +08:00
|
|
|
|
|
|
|
generation = btrfs_node_ptr_generation(path->nodes[level],
|
|
|
|
path->slots[level]);
|
|
|
|
/*
|
|
|
|
* if the lower level block was created before the snapshot
|
|
|
|
* was created, we know there is no need to update back refs
|
|
|
|
* for the subtree
|
|
|
|
*/
|
|
|
|
if (wc->stage == UPDATE_BACKREF &&
|
2009-10-09 21:25:16 +08:00
|
|
|
generation <= root->root_key.offset) {
|
|
|
|
*lookup_info = 1;
|
2009-09-22 03:55:59 +08:00
|
|
|
return 1;
|
2009-10-09 21:25:16 +08:00
|
|
|
}
|
2009-09-22 03:55:59 +08:00
|
|
|
|
|
|
|
bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
|
2018-03-29 09:08:11 +08:00
|
|
|
btrfs_node_key_to_cpu(path->nodes[level], &first_key,
|
|
|
|
path->slots[level]);
|
2009-09-22 03:55:59 +08:00
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
next = find_extent_buffer(fs_info, bytenr);
|
2009-09-22 03:55:59 +08:00
|
|
|
if (!next) {
|
2016-06-23 06:54:24 +08:00
|
|
|
next = btrfs_find_create_tree_block(fs_info, bytenr);
|
2016-06-07 03:01:23 +08:00
|
|
|
if (IS_ERR(next))
|
|
|
|
return PTR_ERR(next);
|
|
|
|
|
2013-07-06 05:05:38 +08:00
|
|
|
btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
|
|
|
|
level - 1);
|
2009-09-22 03:55:59 +08:00
|
|
|
reada = 1;
|
|
|
|
}
|
|
|
|
btrfs_tree_lock(next);
|
2018-04-04 08:03:48 +08:00
|
|
|
btrfs_set_lock_blocking_write(next);
|
2009-09-22 03:55:59 +08:00
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
|
2009-10-09 21:25:16 +08:00
|
|
|
&wc->refs[level - 1],
|
|
|
|
&wc->flags[level - 1]);
|
2016-09-23 19:23:28 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out_unlock;
|
2012-03-12 23:03:00 +08:00
|
|
|
|
2013-03-20 06:41:23 +08:00
|
|
|
if (unlikely(wc->refs[level - 1] == 0)) {
|
2016-06-23 06:54:23 +08:00
|
|
|
btrfs_err(fs_info, "Missing references.");
|
2016-09-23 19:23:28 +08:00
|
|
|
ret = -EIO;
|
|
|
|
goto out_unlock;
|
2013-03-20 06:41:23 +08:00
|
|
|
}
|
2009-10-09 21:25:16 +08:00
|
|
|
*lookup_info = 0;
|
2009-09-22 03:55:59 +08:00
|
|
|
|
2009-10-09 21:25:16 +08:00
|
|
|
if (wc->stage == DROP_REFERENCE) {
|
2009-09-22 03:55:59 +08:00
|
|
|
if (wc->refs[level - 1] > 1) {
|
2014-07-18 03:39:01 +08:00
|
|
|
need_account = true;
|
2009-10-09 21:25:16 +08:00
|
|
|
if (level == 1 &&
|
|
|
|
(wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
|
|
|
|
goto skip;
|
|
|
|
|
2009-09-22 03:55:59 +08:00
|
|
|
if (!wc->update_ref ||
|
|
|
|
generation <= root->root_key.offset)
|
|
|
|
goto skip;
|
|
|
|
|
|
|
|
btrfs_node_key_to_cpu(path->nodes[level], &key,
|
|
|
|
path->slots[level]);
|
|
|
|
ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
|
|
|
|
if (ret < 0)
|
|
|
|
goto skip;
|
|
|
|
|
|
|
|
wc->stage = UPDATE_BACKREF;
|
|
|
|
wc->shared_level = level - 1;
|
|
|
|
}
|
2009-10-09 21:25:16 +08:00
|
|
|
} else {
|
|
|
|
if (level == 1 &&
|
|
|
|
(wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
|
|
|
|
goto skip;
|
2009-09-22 03:55:59 +08:00
|
|
|
}
|
|
|
|
|
2012-05-06 19:23:47 +08:00
|
|
|
if (!btrfs_buffer_uptodate(next, generation, 0)) {
|
2009-09-22 03:55:59 +08:00
|
|
|
btrfs_tree_unlock(next);
|
|
|
|
free_extent_buffer(next);
|
|
|
|
next = NULL;
|
2009-10-09 21:25:16 +08:00
|
|
|
*lookup_info = 1;
|
2009-09-22 03:55:59 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!next) {
|
|
|
|
if (reada && level == 1)
|
|
|
|
reada_walk_down(trans, root, wc, path);
|
2018-03-29 09:08:11 +08:00
|
|
|
next = read_tree_block(fs_info, bytenr, generation, level - 1,
|
|
|
|
&first_key);
|
2015-05-25 17:30:15 +08:00
|
|
|
if (IS_ERR(next)) {
|
|
|
|
return PTR_ERR(next);
|
|
|
|
} else if (!extent_buffer_uptodate(next)) {
|
2013-04-24 02:17:42 +08:00
|
|
|
free_extent_buffer(next);
|
2011-03-24 14:33:21 +08:00
|
|
|
return -EIO;
|
2013-04-24 02:17:42 +08:00
|
|
|
}
|
2009-09-22 03:55:59 +08:00
|
|
|
btrfs_tree_lock(next);
|
2018-04-04 08:03:48 +08:00
|
|
|
btrfs_set_lock_blocking_write(next);
|
2009-09-22 03:55:59 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
level--;
|
2016-09-23 19:23:28 +08:00
|
|
|
ASSERT(level == btrfs_header_level(next));
|
|
|
|
if (level != btrfs_header_level(next)) {
|
|
|
|
btrfs_err(root->fs_info, "mismatched level");
|
|
|
|
ret = -EIO;
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
2009-09-22 03:55:59 +08:00
|
|
|
path->nodes[level] = next;
|
|
|
|
path->slots[level] = 0;
|
2011-07-17 03:23:14 +08:00
|
|
|
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
|
2009-09-22 03:55:59 +08:00
|
|
|
wc->level = level;
|
|
|
|
if (wc->level == 1)
|
|
|
|
wc->reada_slot = 0;
|
|
|
|
return 0;
|
|
|
|
skip:
|
|
|
|
wc->refs[level - 1] = 0;
|
|
|
|
wc->flags[level - 1] = 0;
|
2009-10-09 21:25:16 +08:00
|
|
|
if (wc->stage == DROP_REFERENCE) {
|
|
|
|
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
|
|
|
|
parent = path->nodes[level]->start;
|
|
|
|
} else {
|
2016-09-23 19:23:28 +08:00
|
|
|
ASSERT(root->root_key.objectid ==
|
2009-10-09 21:25:16 +08:00
|
|
|
btrfs_header_owner(path->nodes[level]));
|
2016-09-23 19:23:28 +08:00
|
|
|
if (root->root_key.objectid !=
|
|
|
|
btrfs_header_owner(path->nodes[level])) {
|
|
|
|
btrfs_err(root->fs_info,
|
|
|
|
"mismatched block owner");
|
|
|
|
ret = -EIO;
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
2009-10-09 21:25:16 +08:00
|
|
|
parent = 0;
|
|
|
|
}
|
2009-09-22 03:55:59 +08:00
|
|
|
|
2019-02-07 04:46:14 +08:00
|
|
|
/*
|
|
|
|
* If we had a drop_progress we need to verify the refs are set
|
|
|
|
* as expected. If we find our ref then we know that from here
|
|
|
|
* on out everything should be correct, and we can clear the
|
|
|
|
* ->restarted flag.
|
|
|
|
*/
|
|
|
|
if (wc->restarted) {
|
|
|
|
ret = check_ref_exists(trans, root, bytenr, parent,
|
|
|
|
level - 1);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out_unlock;
|
|
|
|
if (ret == 0)
|
|
|
|
goto no_delete;
|
|
|
|
ret = 0;
|
|
|
|
wc->restarted = 0;
|
|
|
|
}
|
|
|
|
|
2018-09-27 14:42:33 +08:00
|
|
|
/*
|
|
|
|
* Reloc tree doesn't contribute to qgroup numbers, and we have
|
|
|
|
* already accounted them at merge time (replace_path),
|
|
|
|
* thus we could skip expensive subtree trace here.
|
|
|
|
*/
|
|
|
|
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
|
|
|
|
need_account) {
|
2018-07-18 14:45:38 +08:00
|
|
|
ret = btrfs_qgroup_trace_subtree(trans, next,
|
2016-10-18 09:31:28 +08:00
|
|
|
generation, level - 1);
|
2014-07-18 03:39:01 +08:00
|
|
|
if (ret) {
|
2016-06-23 06:54:23 +08:00
|
|
|
btrfs_err_rl(fs_info,
|
2016-09-20 22:05:00 +08:00
|
|
|
"Error %d accounting shared subtree. Quota is out of sync, rescan required.",
|
|
|
|
ret);
|
2014-07-18 03:39:01 +08:00
|
|
|
}
|
|
|
|
}
|
btrfs: save drop_progress if we drop refs at all
Previously we only updated the drop_progress key if we were in the
DROP_REFERENCE stage of snapshot deletion. This is because the
UPDATE_BACKREF stage checks the flags of the blocks it's converting to
FULL_BACKREF, so if we go over a block we processed before it doesn't
matter, we just don't do anything.
The problem is in do_walk_down() we will go ahead and drop the roots
reference to any blocks that we know we won't need to walk into.
Given subvolume A and snapshot B. The root of B points to all of the
nodes that belong to A, so all of those nodes have a refcnt > 1. If B
did not modify those blocks it'll hit this condition in do_walk_down
if (!wc->update_ref ||
generation <= root->root_key.offset)
goto skip;
and in "goto skip" we simply do a btrfs_free_extent() for that bytenr
that we point at.
Now assume we modified some data in B, and then took a snapshot of B and
call it C. C points to all the nodes in B, making every node the root
of B points to have a refcnt > 1. This assumes the root level is 2 or
higher.
We delete snapshot B, which does the above work in do_walk_down,
free'ing our ref for nodes we share with A that we didn't modify. Now
we hit a node we _did_ modify, thus we own. We need to walk down into
this node and we set wc->stage == UPDATE_BACKREF. We walk down to level
0 which we also own because we modified data. We can't walk any further
down and thus now need to walk up and start the next part of the
deletion. Now walk_up_proc is supposed to put us back into
DROP_REFERENCE, but there's an exception to this
if (level < wc->shared_level)
goto out;
we are at level == 0, and our shared_level == 1. We skip out of this
one and go up to level 1. Since path->slots[1] < nritems we
path->slots[1]++ and break out of walk_up_tree to stop our transaction
and loop back around. Now in btrfs_drop_snapshot we have this snippet
if (wc->stage == DROP_REFERENCE) {
level = wc->level;
btrfs_node_key(path->nodes[level],
&root_item->drop_progress,
path->slots[level]);
root_item->drop_level = level;
}
our stage == UPDATE_BACKREF still, so we don't update the drop_progress
key. This is a problem because we would have done btrfs_free_extent()
for the nodes leading up to our current position. If we crash or
unmount here and go to remount we'll start over where we were before and
try to free our ref for blocks we've already freed, and thus abort()
out.
Fix this by keeping track of the last place we dropped a reference for
our block in do_walk_down. Then if wc->stage == UPDATE_BACKREF we know
we'll start over from a place we meant to, and otherwise things continue
to work as they did before.
I have a complicated reproducer for this problem, without this patch
we'll fail to fsck the fs when replaying the log writes log. With this
patch we can replay the whole log without any fsck or mount failures.
The steps to reproduce this easily are sort of tricky, I had to add a
couple of debug patches to the kernel in order to make it easy,
basically I just needed to make sure we did actually commit the
transaction every time we finished a walk_down_tree/walk_up_tree combo.
The reproducer:
1) Creates a base subvolume.
2) Creates 100k files in the subvolume.
3) Snapshots the base subvolume (snap1).
4) Touches files 5000-6000 in snap1.
5) Snapshots snap1 (snap2).
6) Deletes snap1.
I do this with dm-log-writes, and then replay to every FUA in the log
and fsck the fs.
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
[ copy reproducer steps ]
Signed-off-by: David Sterba <dsterba@suse.com>
2019-02-07 04:46:15 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We need to update the next key in our walk control so we can
|
|
|
|
* update the drop_progress key accordingly. We don't care if
|
|
|
|
* find_next_key doesn't find a key because that means we're at
|
|
|
|
* the end and are going to clean up now.
|
|
|
|
*/
|
|
|
|
wc->drop_level = level;
|
|
|
|
find_next_key(path, level, &wc->drop_progress);
|
|
|
|
|
2019-04-04 14:45:36 +08:00
|
|
|
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
|
|
|
|
fs_info->nodesize, parent);
|
|
|
|
btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
|
|
|
|
ret = btrfs_free_extent(trans, &ref);
|
2016-09-23 19:23:28 +08:00
|
|
|
if (ret)
|
|
|
|
goto out_unlock;
|
2009-09-22 03:55:59 +08:00
|
|
|
}
|
2019-02-07 04:46:14 +08:00
|
|
|
no_delete:
|
2016-09-23 19:23:28 +08:00
|
|
|
*lookup_info = 1;
|
|
|
|
ret = 1;
|
|
|
|
|
|
|
|
out_unlock:
|
2009-09-22 03:55:59 +08:00
|
|
|
btrfs_tree_unlock(next);
|
|
|
|
free_extent_buffer(next);
|
2016-09-23 19:23:28 +08:00
|
|
|
|
|
|
|
return ret;
|
2009-09-22 03:55:59 +08:00
|
|
|
}
|
|
|
|
|
2009-06-28 09:07:35 +08:00
|
|
|
/*
|
2012-12-26 15:32:17 +08:00
|
|
|
* helper to process tree block while walking up the tree.
|
2009-06-28 09:07:35 +08:00
|
|
|
*
|
|
|
|
* when wc->stage == DROP_REFERENCE, this function drops
|
|
|
|
* reference count on the block.
|
|
|
|
*
|
|
|
|
* when wc->stage == UPDATE_BACKREF, this function changes
|
|
|
|
* wc->stage back to DROP_REFERENCE if we changed wc->stage
|
|
|
|
* to UPDATE_BACKREF previously while processing the block.
|
|
|
|
*
|
|
|
|
* NOTE: return value 1 means we should stop walking up.
|
|
|
|
*/
|
|
|
|
static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
|
|
|
struct btrfs_path *path,
|
|
|
|
struct walk_control *wc)
|
|
|
|
{
|
2016-06-23 06:54:23 +08:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2010-05-16 22:46:25 +08:00
|
|
|
int ret;
|
2009-06-28 09:07:35 +08:00
|
|
|
int level = wc->level;
|
|
|
|
struct extent_buffer *eb = path->nodes[level];
|
|
|
|
u64 parent = 0;
|
|
|
|
|
|
|
|
if (wc->stage == UPDATE_BACKREF) {
|
|
|
|
BUG_ON(wc->shared_level < level);
|
|
|
|
if (level < wc->shared_level)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = find_next_key(path, level + 1, &wc->update_progress);
|
|
|
|
if (ret > 0)
|
|
|
|
wc->update_ref = 0;
|
|
|
|
|
|
|
|
wc->stage = DROP_REFERENCE;
|
|
|
|
wc->shared_level = -1;
|
|
|
|
path->slots[level] = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* check reference count again if the block isn't locked.
|
|
|
|
* we should start walking down the tree again if reference
|
|
|
|
* count is one.
|
|
|
|
*/
|
|
|
|
if (!path->locks[level]) {
|
|
|
|
BUG_ON(level == 0);
|
|
|
|
btrfs_tree_lock(eb);
|
2018-04-04 08:03:48 +08:00
|
|
|
btrfs_set_lock_blocking_write(eb);
|
2011-07-17 03:23:14 +08:00
|
|
|
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
|
2009-06-28 09:07:35 +08:00
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
ret = btrfs_lookup_extent_info(trans, fs_info,
|
2013-03-08 03:22:04 +08:00
|
|
|
eb->start, level, 1,
|
2009-06-28 09:07:35 +08:00
|
|
|
&wc->refs[level],
|
|
|
|
&wc->flags[level]);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret < 0) {
|
|
|
|
btrfs_tree_unlock_rw(eb, path->locks[level]);
|
2012-12-28 17:33:19 +08:00
|
|
|
path->locks[level] = 0;
|
2012-03-12 23:03:00 +08:00
|
|
|
return ret;
|
|
|
|
}
|
2009-06-28 09:07:35 +08:00
|
|
|
BUG_ON(wc->refs[level] == 0);
|
|
|
|
if (wc->refs[level] == 1) {
|
2011-07-17 03:23:14 +08:00
|
|
|
btrfs_tree_unlock_rw(eb, path->locks[level]);
|
2012-12-28 17:33:19 +08:00
|
|
|
path->locks[level] = 0;
|
2009-06-28 09:07:35 +08:00
|
|
|
return 1;
|
|
|
|
}
|
2008-10-30 02:49:05 +08:00
|
|
|
}
|
2009-06-28 09:07:35 +08:00
|
|
|
}
|
2008-10-30 02:49:05 +08:00
|
|
|
|
2009-06-28 09:07:35 +08:00
|
|
|
/* wc->stage == DROP_REFERENCE */
|
|
|
|
BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
2009-06-28 09:07:35 +08:00
|
|
|
if (wc->refs[level] == 1) {
|
|
|
|
if (level == 0) {
|
|
|
|
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
|
2014-07-03 01:54:25 +08:00
|
|
|
ret = btrfs_dec_ref(trans, root, eb, 1);
|
2009-06-28 09:07:35 +08:00
|
|
|
else
|
2014-07-03 01:54:25 +08:00
|
|
|
ret = btrfs_dec_ref(trans, root, eb, 0);
|
2012-03-12 23:03:00 +08:00
|
|
|
BUG_ON(ret); /* -ENOMEM */
|
2019-04-04 14:45:37 +08:00
|
|
|
if (is_fstree(root->root_key.objectid)) {
|
|
|
|
ret = btrfs_qgroup_trace_leaf_items(trans, eb);
|
|
|
|
if (ret) {
|
|
|
|
btrfs_err_rl(fs_info,
|
|
|
|
"error %d accounting leaf items, quota is out of sync, rescan required",
|
2016-09-20 22:05:00 +08:00
|
|
|
ret);
|
2019-04-04 14:45:37 +08:00
|
|
|
}
|
2014-07-18 03:39:01 +08:00
|
|
|
}
|
2009-06-28 09:07:35 +08:00
|
|
|
}
|
2019-03-20 21:30:02 +08:00
|
|
|
/* make block locked assertion in btrfs_clean_tree_block happy */
|
2009-06-28 09:07:35 +08:00
|
|
|
if (!path->locks[level] &&
|
|
|
|
btrfs_header_generation(eb) == trans->transid) {
|
|
|
|
btrfs_tree_lock(eb);
|
2018-04-04 08:03:48 +08:00
|
|
|
btrfs_set_lock_blocking_write(eb);
|
2011-07-17 03:23:14 +08:00
|
|
|
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
|
2009-06-28 09:07:35 +08:00
|
|
|
}
|
2019-03-20 21:30:02 +08:00
|
|
|
btrfs_clean_tree_block(eb);
|
2009-06-28 09:07:35 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (eb == root->node) {
|
|
|
|
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
|
|
|
|
parent = eb->start;
|
2018-08-21 09:42:03 +08:00
|
|
|
else if (root->root_key.objectid != btrfs_header_owner(eb))
|
|
|
|
goto owner_mismatch;
|
2009-06-28 09:07:35 +08:00
|
|
|
} else {
|
|
|
|
if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
|
|
|
|
parent = path->nodes[level + 1]->start;
|
2018-08-21 09:42:03 +08:00
|
|
|
else if (root->root_key.objectid !=
|
|
|
|
btrfs_header_owner(path->nodes[level + 1]))
|
|
|
|
goto owner_mismatch;
|
2008-10-30 02:49:05 +08:00
|
|
|
}
|
|
|
|
|
2012-05-16 23:04:52 +08:00
|
|
|
btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
|
2009-06-28 09:07:35 +08:00
|
|
|
out:
|
|
|
|
wc->refs[level] = 0;
|
|
|
|
wc->flags[level] = 0;
|
2010-05-16 22:46:25 +08:00
|
|
|
return 0;
|
2018-08-21 09:42:03 +08:00
|
|
|
|
|
|
|
owner_mismatch:
|
|
|
|
btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
|
|
|
|
btrfs_header_owner(eb), root->root_key.objectid);
|
|
|
|
return -EUCLEAN;
|
2009-06-28 09:07:35 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
|
|
|
struct btrfs_path *path,
|
|
|
|
struct walk_control *wc)
|
|
|
|
{
|
|
|
|
int level = wc->level;
|
2009-10-09 21:25:16 +08:00
|
|
|
int lookup_info = 1;
|
2009-06-28 09:07:35 +08:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
while (level >= 0) {
|
2009-10-09 21:25:16 +08:00
|
|
|
ret = walk_down_proc(trans, root, path, wc, lookup_info);
|
2009-06-28 09:07:35 +08:00
|
|
|
if (ret > 0)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (level == 0)
|
|
|
|
break;
|
|
|
|
|
2010-02-01 10:41:17 +08:00
|
|
|
if (path->slots[level] >=
|
|
|
|
btrfs_header_nritems(path->nodes[level]))
|
|
|
|
break;
|
|
|
|
|
2009-10-09 21:25:16 +08:00
|
|
|
ret = do_walk_down(trans, root, path, wc, &lookup_info);
|
2009-09-22 03:55:59 +08:00
|
|
|
if (ret > 0) {
|
|
|
|
path->slots[level]++;
|
|
|
|
continue;
|
2010-03-25 20:37:12 +08:00
|
|
|
} else if (ret < 0)
|
|
|
|
return ret;
|
2009-09-22 03:55:59 +08:00
|
|
|
level = wc->level;
|
2008-10-30 02:49:05 +08:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-01-06 10:25:51 +08:00
|
|
|
static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
|
2008-01-03 23:01:48 +08:00
|
|
|
struct btrfs_root *root,
|
2008-10-30 02:49:05 +08:00
|
|
|
struct btrfs_path *path,
|
2009-06-28 09:07:35 +08:00
|
|
|
struct walk_control *wc, int max_level)
|
2007-03-10 19:35:47 +08:00
|
|
|
{
|
2009-06-28 09:07:35 +08:00
|
|
|
int level = wc->level;
|
2007-03-10 19:35:47 +08:00
|
|
|
int ret;
|
2007-08-08 03:52:19 +08:00
|
|
|
|
2009-06-28 09:07:35 +08:00
|
|
|
path->slots[level] = btrfs_header_nritems(path->nodes[level]);
|
|
|
|
while (level < max_level && path->nodes[level]) {
|
|
|
|
wc->level = level;
|
|
|
|
if (path->slots[level] + 1 <
|
|
|
|
btrfs_header_nritems(path->nodes[level])) {
|
|
|
|
path->slots[level]++;
|
2007-03-10 19:35:47 +08:00
|
|
|
return 0;
|
|
|
|
} else {
|
2009-06-28 09:07:35 +08:00
|
|
|
ret = walk_up_proc(trans, root, path, wc);
|
|
|
|
if (ret > 0)
|
|
|
|
return 0;
|
2018-08-21 09:42:03 +08:00
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
2009-02-04 22:27:02 +08:00
|
|
|
|
2009-06-28 09:07:35 +08:00
|
|
|
if (path->locks[level]) {
|
2011-07-17 03:23:14 +08:00
|
|
|
btrfs_tree_unlock_rw(path->nodes[level],
|
|
|
|
path->locks[level]);
|
2009-06-28 09:07:35 +08:00
|
|
|
path->locks[level] = 0;
|
2008-10-30 02:49:05 +08:00
|
|
|
}
|
2009-06-28 09:07:35 +08:00
|
|
|
free_extent_buffer(path->nodes[level]);
|
|
|
|
path->nodes[level] = NULL;
|
|
|
|
level++;
|
2007-03-10 19:35:47 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2007-03-13 23:09:37 +08:00
|
|
|
/*
|
2009-06-28 09:07:35 +08:00
|
|
|
* drop a subvolume tree.
|
|
|
|
*
|
|
|
|
* this function traverses the tree freeing any blocks that only
|
|
|
|
* referenced by the tree.
|
|
|
|
*
|
|
|
|
* when a shared tree block is found. this function decreases its
|
|
|
|
* reference count by one. if update_ref is true, this function
|
|
|
|
* also make sure backrefs for the shared block and all lower level
|
|
|
|
* blocks are properly updated.
|
2013-03-12 23:13:28 +08:00
|
|
|
*
|
|
|
|
* If called with for_reloc == 0, may exit early with -EAGAIN
|
2007-03-13 23:09:37 +08:00
|
|
|
*/
|
2011-10-04 11:22:41 +08:00
|
|
|
int btrfs_drop_snapshot(struct btrfs_root *root,
|
2011-09-12 21:26:38 +08:00
|
|
|
struct btrfs_block_rsv *block_rsv, int update_ref,
|
|
|
|
int for_reloc)
|
2007-03-10 19:35:47 +08:00
|
|
|
{
|
2016-09-20 22:05:02 +08:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2007-04-02 23:20:42 +08:00
|
|
|
struct btrfs_path *path;
|
2009-06-28 09:07:35 +08:00
|
|
|
struct btrfs_trans_handle *trans;
|
2016-09-20 22:05:02 +08:00
|
|
|
struct btrfs_root *tree_root = fs_info->tree_root;
|
2007-08-08 03:52:19 +08:00
|
|
|
struct btrfs_root_item *root_item = &root->root_item;
|
2009-06-28 09:07:35 +08:00
|
|
|
struct walk_control *wc;
|
|
|
|
struct btrfs_key key;
|
|
|
|
int err = 0;
|
|
|
|
int ret;
|
|
|
|
int level;
|
2013-07-18 07:30:20 +08:00
|
|
|
bool root_dropped = false;
|
2007-03-10 19:35:47 +08:00
|
|
|
|
2018-08-06 13:25:24 +08:00
|
|
|
btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
|
2014-07-18 03:39:01 +08:00
|
|
|
|
2007-04-02 23:20:42 +08:00
|
|
|
path = btrfs_alloc_path();
|
2011-08-09 15:11:13 +08:00
|
|
|
if (!path) {
|
|
|
|
err = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
2007-03-10 19:35:47 +08:00
|
|
|
|
2009-06-28 09:07:35 +08:00
|
|
|
wc = kzalloc(sizeof(*wc), GFP_NOFS);
|
2011-07-14 01:59:59 +08:00
|
|
|
if (!wc) {
|
|
|
|
btrfs_free_path(path);
|
2011-08-09 15:11:13 +08:00
|
|
|
err = -ENOMEM;
|
|
|
|
goto out;
|
2011-07-14 01:59:59 +08:00
|
|
|
}
|
2009-06-28 09:07:35 +08:00
|
|
|
|
2010-05-16 22:48:46 +08:00
|
|
|
trans = btrfs_start_transaction(tree_root, 0);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
err = PTR_ERR(trans);
|
|
|
|
goto out_free;
|
|
|
|
}
|
2011-01-20 14:19:37 +08:00
|
|
|
|
btrfs: run delayed items before dropping the snapshot
With my delayed refs patches in place we started seeing a large amount
of aborts in __btrfs_free_extent:
BTRFS error (device sdb1): unable to find ref byte nr 91947008 parent 0 root 35964 owner 1 offset 0
Call Trace:
? btrfs_merge_delayed_refs+0xaf/0x340
__btrfs_run_delayed_refs+0x6ea/0xfc0
? btrfs_set_path_blocking+0x31/0x60
btrfs_run_delayed_refs+0xeb/0x180
btrfs_commit_transaction+0x179/0x7f0
? btrfs_check_space_for_delayed_refs+0x30/0x50
? should_end_transaction.isra.19+0xe/0x40
btrfs_drop_snapshot+0x41c/0x7c0
btrfs_clean_one_deleted_snapshot+0xb5/0xd0
cleaner_kthread+0xf6/0x120
kthread+0xf8/0x130
? btree_invalidatepage+0x90/0x90
? kthread_bind+0x10/0x10
ret_from_fork+0x35/0x40
This was because btrfs_drop_snapshot depends on the root not being
modified while it's dropping the snapshot. It will unlock the root node
(and really every node) as it walks down the tree, only to re-lock it
when it needs to do something. This is a problem because if we modify
the tree we could cow a block in our path, which frees our reference to
that block. Then once we get back to that shared block we'll free our
reference to it again, and get ENOENT when trying to lookup our extent
reference to that block in __btrfs_free_extent.
This is ultimately happening because we have delayed items left to be
processed for our deleted snapshot _after_ all of the inodes are closed
for the snapshot. We only run the delayed inode item if we're deleting
the inode, and even then we do not run the delayed insertions or delayed
removals. These can be run at any point after our final inode does its
last iput, which is what triggers the snapshot deletion. We can end up
with the snapshot deletion happening and then have the delayed items run
on that file system, resulting in the above problem.
This problem has existed forever, however my patches made it much easier
to hit as I wake up the cleaner much more often to deal with delayed
iputs, which made us more likely to start the snapshot dropping work
before the transaction commits, which is when the delayed items would
generally be run. Before, generally speaking, we would run the delayed
items, commit the transaction, and wakeup the cleaner thread to start
deleting snapshots, which means we were less likely to hit this problem.
You could still hit it if you had multiple snapshots to be deleted and
ended up with lots of delayed items, but it was definitely harder.
Fix for now by simply running all the delayed items before starting to
drop the snapshot. We could make this smarter in the future by making
the delayed items per-root, and then simply drop any delayed items for
roots that we are going to delete. But for now just a quick and easy
solution is the safest.
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-01 00:52:14 +08:00
|
|
|
err = btrfs_run_delayed_items(trans);
|
|
|
|
if (err)
|
|
|
|
goto out_end_trans;
|
|
|
|
|
2010-05-16 22:49:59 +08:00
|
|
|
if (block_rsv)
|
|
|
|
trans->block_rsv = block_rsv;
|
2009-06-28 09:07:35 +08:00
|
|
|
|
2018-12-01 00:52:13 +08:00
|
|
|
/*
|
|
|
|
* This will help us catch people modifying the fs tree while we're
|
|
|
|
* dropping it. It is unsafe to mess with the fs tree while it's being
|
|
|
|
* dropped as we unlock the root node and parent nodes as we walk down
|
|
|
|
* the tree, assuming nothing will change. If something does change
|
|
|
|
* then we'll have stale information and drop references to blocks we've
|
|
|
|
* already dropped.
|
|
|
|
*/
|
|
|
|
set_bit(BTRFS_ROOT_DELETING, &root->state);
|
2007-08-08 03:52:19 +08:00
|
|
|
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
|
2009-06-28 09:07:35 +08:00
|
|
|
level = btrfs_header_level(root->node);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
path->nodes[level] = btrfs_lock_root_node(root);
|
2018-04-04 08:03:48 +08:00
|
|
|
btrfs_set_lock_blocking_write(path->nodes[level]);
|
2007-08-08 03:52:19 +08:00
|
|
|
path->slots[level] = 0;
|
2011-07-17 03:23:14 +08:00
|
|
|
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
|
2009-06-28 09:07:35 +08:00
|
|
|
memset(&wc->update_progress, 0,
|
|
|
|
sizeof(wc->update_progress));
|
2007-08-08 03:52:19 +08:00
|
|
|
} else {
|
|
|
|
btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
|
2009-06-28 09:07:35 +08:00
|
|
|
memcpy(&wc->update_progress, &key,
|
|
|
|
sizeof(wc->update_progress));
|
|
|
|
|
2007-08-08 04:15:09 +08:00
|
|
|
level = root_item->drop_level;
|
2009-06-28 09:07:35 +08:00
|
|
|
BUG_ON(level == 0);
|
2007-08-08 04:15:09 +08:00
|
|
|
path->lowest_level = level;
|
2009-06-28 09:07:35 +08:00
|
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
|
|
|
path->lowest_level = 0;
|
|
|
|
if (ret < 0) {
|
|
|
|
err = ret;
|
2012-03-12 23:03:00 +08:00
|
|
|
goto out_end_trans;
|
2007-08-08 03:52:19 +08:00
|
|
|
}
|
2009-09-22 03:55:59 +08:00
|
|
|
WARN_ON(ret > 0);
|
2009-06-28 09:07:35 +08:00
|
|
|
|
2008-07-09 02:19:17 +08:00
|
|
|
/*
|
|
|
|
* unlock our path, this is safe because only this
|
|
|
|
* function is allowed to delete this snapshot
|
|
|
|
*/
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
btrfs_unlock_up_safe(path, 0);
|
2009-06-28 09:07:35 +08:00
|
|
|
|
|
|
|
level = btrfs_header_level(root->node);
|
|
|
|
while (1) {
|
|
|
|
btrfs_tree_lock(path->nodes[level]);
|
2018-04-04 08:03:48 +08:00
|
|
|
btrfs_set_lock_blocking_write(path->nodes[level]);
|
2013-07-16 00:41:42 +08:00
|
|
|
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
|
2009-06-28 09:07:35 +08:00
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
ret = btrfs_lookup_extent_info(trans, fs_info,
|
2009-06-28 09:07:35 +08:00
|
|
|
path->nodes[level]->start,
|
2013-03-08 03:22:04 +08:00
|
|
|
level, 1, &wc->refs[level],
|
2009-06-28 09:07:35 +08:00
|
|
|
&wc->flags[level]);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret < 0) {
|
|
|
|
err = ret;
|
|
|
|
goto out_end_trans;
|
|
|
|
}
|
2009-06-28 09:07:35 +08:00
|
|
|
BUG_ON(wc->refs[level] == 0);
|
|
|
|
|
|
|
|
if (level == root_item->drop_level)
|
|
|
|
break;
|
|
|
|
|
|
|
|
btrfs_tree_unlock(path->nodes[level]);
|
2013-07-16 00:41:42 +08:00
|
|
|
path->locks[level] = 0;
|
2009-06-28 09:07:35 +08:00
|
|
|
WARN_ON(wc->refs[level] != 1);
|
|
|
|
level--;
|
|
|
|
}
|
2007-08-08 03:52:19 +08:00
|
|
|
}
|
2009-06-28 09:07:35 +08:00
|
|
|
|
2019-02-07 04:46:14 +08:00
|
|
|
wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
|
2009-06-28 09:07:35 +08:00
|
|
|
wc->level = level;
|
|
|
|
wc->shared_level = -1;
|
|
|
|
wc->stage = DROP_REFERENCE;
|
|
|
|
wc->update_ref = update_ref;
|
|
|
|
wc->keep_locks = 0;
|
2016-06-23 06:54:23 +08:00
|
|
|
wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
|
2009-06-28 09:07:35 +08:00
|
|
|
|
2009-01-06 10:25:51 +08:00
|
|
|
while (1) {
|
2013-03-12 23:13:28 +08:00
|
|
|
|
2009-06-28 09:07:35 +08:00
|
|
|
ret = walk_down_tree(trans, root, path, wc);
|
|
|
|
if (ret < 0) {
|
|
|
|
err = ret;
|
2007-03-10 19:35:47 +08:00
|
|
|
break;
|
2009-06-28 09:07:35 +08:00
|
|
|
}
|
2007-03-13 23:09:37 +08:00
|
|
|
|
2009-06-28 09:07:35 +08:00
|
|
|
ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
|
|
|
|
if (ret < 0) {
|
|
|
|
err = ret;
|
2007-03-10 19:35:47 +08:00
|
|
|
break;
|
2009-06-28 09:07:35 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (ret > 0) {
|
|
|
|
BUG_ON(wc->stage != DROP_REFERENCE);
|
2008-06-26 04:01:31 +08:00
|
|
|
break;
|
|
|
|
}
|
2009-06-28 09:07:35 +08:00
|
|
|
|
|
|
|
if (wc->stage == DROP_REFERENCE) {
|
btrfs: save drop_progress if we drop refs at all
Previously we only updated the drop_progress key if we were in the
DROP_REFERENCE stage of snapshot deletion. This is because the
UPDATE_BACKREF stage checks the flags of the blocks it's converting to
FULL_BACKREF, so if we go over a block we processed before it doesn't
matter, we just don't do anything.
The problem is in do_walk_down() we will go ahead and drop the roots
reference to any blocks that we know we won't need to walk into.
Given subvolume A and snapshot B. The root of B points to all of the
nodes that belong to A, so all of those nodes have a refcnt > 1. If B
did not modify those blocks it'll hit this condition in do_walk_down
if (!wc->update_ref ||
generation <= root->root_key.offset)
goto skip;
and in "goto skip" we simply do a btrfs_free_extent() for that bytenr
that we point at.
Now assume we modified some data in B, and then took a snapshot of B and
call it C. C points to all the nodes in B, making every node the root
of B points to have a refcnt > 1. This assumes the root level is 2 or
higher.
We delete snapshot B, which does the above work in do_walk_down,
free'ing our ref for nodes we share with A that we didn't modify. Now
we hit a node we _did_ modify, thus we own. We need to walk down into
this node and we set wc->stage == UPDATE_BACKREF. We walk down to level
0 which we also own because we modified data. We can't walk any further
down and thus now need to walk up and start the next part of the
deletion. Now walk_up_proc is supposed to put us back into
DROP_REFERENCE, but there's an exception to this
if (level < wc->shared_level)
goto out;
we are at level == 0, and our shared_level == 1. We skip out of this
one and go up to level 1. Since path->slots[1] < nritems we
path->slots[1]++ and break out of walk_up_tree to stop our transaction
and loop back around. Now in btrfs_drop_snapshot we have this snippet
if (wc->stage == DROP_REFERENCE) {
level = wc->level;
btrfs_node_key(path->nodes[level],
&root_item->drop_progress,
path->slots[level]);
root_item->drop_level = level;
}
our stage == UPDATE_BACKREF still, so we don't update the drop_progress
key. This is a problem because we would have done btrfs_free_extent()
for the nodes leading up to our current position. If we crash or
unmount here and go to remount we'll start over where we were before and
try to free our ref for blocks we've already freed, and thus abort()
out.
Fix this by keeping track of the last place we dropped a reference for
our block in do_walk_down. Then if wc->stage == UPDATE_BACKREF we know
we'll start over from a place we meant to, and otherwise things continue
to work as they did before.
I have a complicated reproducer for this problem, without this patch
we'll fail to fsck the fs when replaying the log writes log. With this
patch we can replay the whole log without any fsck or mount failures.
The steps to reproduce this easily are sort of tricky, I had to add a
couple of debug patches to the kernel in order to make it easy,
basically I just needed to make sure we did actually commit the
transaction every time we finished a walk_down_tree/walk_up_tree combo.
The reproducer:
1) Creates a base subvolume.
2) Creates 100k files in the subvolume.
3) Snapshots the base subvolume (snap1).
4) Touches files 5000-6000 in snap1.
5) Snapshots snap1 (snap2).
6) Deletes snap1.
I do this with dm-log-writes, and then replay to every FUA in the log
and fsck the fs.
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
[ copy reproducer steps ]
Signed-off-by: David Sterba <dsterba@suse.com>
2019-02-07 04:46:15 +08:00
|
|
|
wc->drop_level = wc->level;
|
|
|
|
btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
|
|
|
|
&wc->drop_progress,
|
|
|
|
path->slots[wc->drop_level]);
|
|
|
|
}
|
|
|
|
btrfs_cpu_key_to_disk(&root_item->drop_progress,
|
|
|
|
&wc->drop_progress);
|
|
|
|
root_item->drop_level = wc->drop_level;
|
2009-06-28 09:07:35 +08:00
|
|
|
|
|
|
|
BUG_ON(wc->level == 0);
|
2016-09-10 09:39:03 +08:00
|
|
|
if (btrfs_should_end_transaction(trans) ||
|
2016-06-23 06:54:24 +08:00
|
|
|
(!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
|
2009-06-28 09:07:35 +08:00
|
|
|
ret = btrfs_update_root(trans, tree_root,
|
|
|
|
&root->root_key,
|
|
|
|
root_item);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret) {
|
2016-06-11 06:19:25 +08:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-03-12 23:03:00 +08:00
|
|
|
err = ret;
|
|
|
|
goto out_end_trans;
|
|
|
|
}
|
2009-06-28 09:07:35 +08:00
|
|
|
|
2016-09-10 09:39:03 +08:00
|
|
|
btrfs_end_transaction_throttle(trans);
|
2016-06-23 06:54:24 +08:00
|
|
|
if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
|
2016-09-20 22:05:02 +08:00
|
|
|
btrfs_debug(fs_info,
|
|
|
|
"drop snapshot early exit");
|
2013-07-15 23:57:06 +08:00
|
|
|
err = -EAGAIN;
|
|
|
|
goto out_free;
|
|
|
|
}
|
|
|
|
|
2010-05-16 22:48:46 +08:00
|
|
|
trans = btrfs_start_transaction(tree_root, 0);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
err = PTR_ERR(trans);
|
|
|
|
goto out_free;
|
|
|
|
}
|
2010-05-16 22:49:59 +08:00
|
|
|
if (block_rsv)
|
|
|
|
trans->block_rsv = block_rsv;
|
2009-03-13 22:17:05 +08:00
|
|
|
}
|
2007-03-10 19:35:47 +08:00
|
|
|
}
|
2011-04-21 07:20:15 +08:00
|
|
|
btrfs_release_path(path);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (err)
|
|
|
|
goto out_end_trans;
|
2009-06-28 09:07:35 +08:00
|
|
|
|
2018-08-01 11:32:27 +08:00
|
|
|
ret = btrfs_del_root(trans, &root->root_key);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret) {
|
2016-06-11 06:19:25 +08:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2017-12-05 02:11:45 +08:00
|
|
|
err = ret;
|
2012-03-12 23:03:00 +08:00
|
|
|
goto out_end_trans;
|
|
|
|
}
|
2009-06-28 09:07:35 +08:00
|
|
|
|
2009-09-22 04:00:26 +08:00
|
|
|
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
|
2013-05-15 15:48:19 +08:00
|
|
|
ret = btrfs_find_root(tree_root, &root->root_key, path,
|
|
|
|
NULL, NULL);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret < 0) {
|
2016-06-11 06:19:25 +08:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-03-12 23:03:00 +08:00
|
|
|
err = ret;
|
|
|
|
goto out_end_trans;
|
|
|
|
} else if (ret > 0) {
|
2010-12-09 01:24:01 +08:00
|
|
|
/* if we fail to delete the orphan item this time
|
|
|
|
* around, it'll get picked up the next time.
|
|
|
|
*
|
|
|
|
* The most common failure here is just -ENOENT.
|
|
|
|
*/
|
|
|
|
btrfs_del_orphan_item(trans, tree_root,
|
|
|
|
root->root_key.objectid);
|
2009-09-22 04:00:26 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-04-02 19:51:05 +08:00
|
|
|
if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
|
2015-09-15 22:07:04 +08:00
|
|
|
btrfs_add_dropped_root(trans, root);
|
2009-09-22 04:00:26 +08:00
|
|
|
} else {
|
|
|
|
free_extent_buffer(root->node);
|
|
|
|
free_extent_buffer(root->commit_root);
|
2013-05-15 15:48:20 +08:00
|
|
|
btrfs_put_fs_root(root);
|
2009-09-22 04:00:26 +08:00
|
|
|
}
|
2013-07-18 07:30:20 +08:00
|
|
|
root_dropped = true;
|
2012-03-12 23:03:00 +08:00
|
|
|
out_end_trans:
|
2016-09-10 09:39:03 +08:00
|
|
|
btrfs_end_transaction_throttle(trans);
|
2012-03-12 23:03:00 +08:00
|
|
|
out_free:
|
2009-06-28 09:07:35 +08:00
|
|
|
kfree(wc);
|
2007-04-02 23:20:42 +08:00
|
|
|
btrfs_free_path(path);
|
2011-08-09 15:11:13 +08:00
|
|
|
out:
|
2013-07-18 07:30:20 +08:00
|
|
|
/*
|
|
|
|
* So if we need to stop dropping the snapshot for whatever reason we
|
|
|
|
* need to make sure to add it back to the dead root list so that we
|
|
|
|
* keep trying to do the work later. This also cleans up roots if we
|
|
|
|
* don't have it in the radix (like when we recover after a power fail
|
|
|
|
* or unmount) so we don't leak memory.
|
|
|
|
*/
|
2017-10-07 22:02:21 +08:00
|
|
|
if (!for_reloc && !root_dropped)
|
2013-07-18 07:30:20 +08:00
|
|
|
btrfs_add_dead_root(root);
|
2014-01-07 17:26:58 +08:00
|
|
|
if (err && err != -EAGAIN)
|
2016-09-20 22:05:02 +08:00
|
|
|
btrfs_handle_fs_error(fs_info, err, NULL);
|
2011-10-04 11:22:41 +08:00
|
|
|
return err;
|
2007-03-10 19:35:47 +08:00
|
|
|
}
|
2007-04-27 04:46:15 +08:00
|
|
|
|
2009-06-28 09:07:35 +08:00
|
|
|
/*
|
|
|
|
* drop subtree rooted at tree block 'node'.
|
|
|
|
*
|
|
|
|
* NOTE: this function will unlock and release tree block 'node'
|
2011-09-12 21:26:38 +08:00
|
|
|
* only used by relocation code
|
2009-06-28 09:07:35 +08:00
|
|
|
*/
|
2008-10-30 02:49:05 +08:00
|
|
|
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
|
|
|
struct extent_buffer *node,
|
|
|
|
struct extent_buffer *parent)
|
|
|
|
{
|
2016-06-23 06:54:23 +08:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2008-10-30 02:49:05 +08:00
|
|
|
struct btrfs_path *path;
|
2009-06-28 09:07:35 +08:00
|
|
|
struct walk_control *wc;
|
2008-10-30 02:49:05 +08:00
|
|
|
int level;
|
|
|
|
int parent_level;
|
|
|
|
int ret = 0;
|
|
|
|
int wret;
|
|
|
|
|
2009-06-28 09:07:35 +08:00
|
|
|
BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
|
|
|
|
|
2008-10-30 02:49:05 +08:00
|
|
|
path = btrfs_alloc_path();
|
2011-03-23 16:14:16 +08:00
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
2008-10-30 02:49:05 +08:00
|
|
|
|
2009-06-28 09:07:35 +08:00
|
|
|
wc = kzalloc(sizeof(*wc), GFP_NOFS);
|
2011-03-23 16:14:16 +08:00
|
|
|
if (!wc) {
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
2009-06-28 09:07:35 +08:00
|
|
|
|
2009-03-09 23:45:38 +08:00
|
|
|
btrfs_assert_tree_locked(parent);
|
2008-10-30 02:49:05 +08:00
|
|
|
parent_level = btrfs_header_level(parent);
|
|
|
|
extent_buffer_get(parent);
|
|
|
|
path->nodes[parent_level] = parent;
|
|
|
|
path->slots[parent_level] = btrfs_header_nritems(parent);
|
|
|
|
|
2009-03-09 23:45:38 +08:00
|
|
|
btrfs_assert_tree_locked(node);
|
2008-10-30 02:49:05 +08:00
|
|
|
level = btrfs_header_level(node);
|
|
|
|
path->nodes[level] = node;
|
|
|
|
path->slots[level] = 0;
|
2011-07-17 03:23:14 +08:00
|
|
|
path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
|
2009-06-28 09:07:35 +08:00
|
|
|
|
|
|
|
wc->refs[parent_level] = 1;
|
|
|
|
wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
|
|
|
|
wc->level = level;
|
|
|
|
wc->shared_level = -1;
|
|
|
|
wc->stage = DROP_REFERENCE;
|
|
|
|
wc->update_ref = 0;
|
|
|
|
wc->keep_locks = 1;
|
2016-06-23 06:54:23 +08:00
|
|
|
wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
|
2008-10-30 02:49:05 +08:00
|
|
|
|
|
|
|
while (1) {
|
2009-06-28 09:07:35 +08:00
|
|
|
wret = walk_down_tree(trans, root, path, wc);
|
|
|
|
if (wret < 0) {
|
2008-10-30 02:49:05 +08:00
|
|
|
ret = wret;
|
|
|
|
break;
|
2009-06-28 09:07:35 +08:00
|
|
|
}
|
2008-10-30 02:49:05 +08:00
|
|
|
|
2009-06-28 09:07:35 +08:00
|
|
|
wret = walk_up_tree(trans, root, path, wc, parent_level);
|
2008-10-30 02:49:05 +08:00
|
|
|
if (wret < 0)
|
|
|
|
ret = wret;
|
|
|
|
if (wret != 0)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2009-06-28 09:07:35 +08:00
|
|
|
kfree(wc);
|
2008-10-30 02:49:05 +08:00
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2016-06-23 06:54:22 +08:00
|
|
|
static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
|
2008-04-29 03:29:52 +08:00
|
|
|
{
|
|
|
|
u64 num_devices;
|
2012-03-27 22:09:17 +08:00
|
|
|
u64 stripped;
|
2012-01-17 04:04:48 +08:00
|
|
|
|
2012-03-27 22:09:17 +08:00
|
|
|
/*
|
|
|
|
* if restripe for this chunk_type is on pick target profile and
|
|
|
|
* return, otherwise do the usual balance
|
|
|
|
*/
|
2016-06-23 06:54:22 +08:00
|
|
|
stripped = get_restripe_target(fs_info, flags);
|
2012-03-27 22:09:17 +08:00
|
|
|
if (stripped)
|
|
|
|
return extended_to_chunk(stripped);
|
2012-01-17 04:04:48 +08:00
|
|
|
|
2016-06-23 06:54:22 +08:00
|
|
|
num_devices = fs_info->fs_devices->rw_devices;
|
2010-12-14 03:56:23 +08:00
|
|
|
|
2019-05-31 22:54:26 +08:00
|
|
|
stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK |
|
2019-05-31 21:39:31 +08:00
|
|
|
BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10;
|
2012-03-27 22:09:17 +08:00
|
|
|
|
2008-04-29 03:29:52 +08:00
|
|
|
if (num_devices == 1) {
|
|
|
|
stripped |= BTRFS_BLOCK_GROUP_DUP;
|
|
|
|
stripped = flags & ~stripped;
|
|
|
|
|
|
|
|
/* turn raid0 into single device chunks */
|
|
|
|
if (flags & BTRFS_BLOCK_GROUP_RAID0)
|
|
|
|
return stripped;
|
|
|
|
|
|
|
|
/* turn mirroring into duplication */
|
2019-05-31 21:39:31 +08:00
|
|
|
if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK |
|
2008-04-29 03:29:52 +08:00
|
|
|
BTRFS_BLOCK_GROUP_RAID10))
|
|
|
|
return stripped | BTRFS_BLOCK_GROUP_DUP;
|
|
|
|
} else {
|
|
|
|
/* they already had raid on here, just return */
|
|
|
|
if (flags & stripped)
|
|
|
|
return flags;
|
|
|
|
|
|
|
|
stripped |= BTRFS_BLOCK_GROUP_DUP;
|
|
|
|
stripped = flags & ~stripped;
|
|
|
|
|
|
|
|
/* switch duplicated blocks with raid1 */
|
|
|
|
if (flags & BTRFS_BLOCK_GROUP_DUP)
|
|
|
|
return stripped | BTRFS_BLOCK_GROUP_RAID1;
|
|
|
|
|
2012-03-27 22:09:16 +08:00
|
|
|
/* this is drive concat, leave it alone */
|
2008-04-29 03:29:52 +08:00
|
|
|
}
|
2012-03-27 22:09:16 +08:00
|
|
|
|
2008-04-29 03:29:52 +08:00
|
|
|
return flags;
|
|
|
|
}
|
|
|
|
|
2015-08-05 16:43:27 +08:00
|
|
|
static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
|
2008-05-25 02:04:53 +08:00
|
|
|
{
|
2010-05-16 22:46:25 +08:00
|
|
|
struct btrfs_space_info *sinfo = cache->space_info;
|
|
|
|
u64 num_bytes;
|
2019-01-30 13:07:51 +08:00
|
|
|
u64 sinfo_used;
|
2011-07-15 18:34:36 +08:00
|
|
|
u64 min_allocable_bytes;
|
2010-05-16 22:46:25 +08:00
|
|
|
int ret = -ENOSPC;
|
2008-05-25 02:04:53 +08:00
|
|
|
|
2011-07-15 18:34:36 +08:00
|
|
|
/*
|
|
|
|
* We need some metadata space and system metadata space for
|
|
|
|
* allocating chunks in some corner cases until we force to set
|
|
|
|
* it to be readonly.
|
|
|
|
*/
|
|
|
|
if ((sinfo->flags &
|
|
|
|
(BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
|
|
|
|
!force)
|
2015-12-15 00:42:10 +08:00
|
|
|
min_allocable_bytes = SZ_1M;
|
2011-07-15 18:34:36 +08:00
|
|
|
else
|
|
|
|
min_allocable_bytes = 0;
|
|
|
|
|
2010-05-16 22:46:25 +08:00
|
|
|
spin_lock(&sinfo->lock);
|
|
|
|
spin_lock(&cache->lock);
|
2011-07-26 11:30:11 +08:00
|
|
|
|
|
|
|
if (cache->ro) {
|
2015-08-05 16:43:27 +08:00
|
|
|
cache->ro++;
|
2011-07-26 11:30:11 +08:00
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2010-05-16 22:46:25 +08:00
|
|
|
num_bytes = cache->key.offset - cache->reserved - cache->pinned -
|
|
|
|
cache->bytes_super - btrfs_block_group_used(&cache->item);
|
2019-01-30 13:07:51 +08:00
|
|
|
sinfo_used = btrfs_space_info_used(sinfo, true);
|
2010-05-16 22:46:25 +08:00
|
|
|
|
2019-01-30 13:07:51 +08:00
|
|
|
if (sinfo_used + num_bytes + min_allocable_bytes <=
|
|
|
|
sinfo->total_bytes) {
|
2010-05-16 22:46:25 +08:00
|
|
|
sinfo->bytes_readonly += num_bytes;
|
2015-08-05 16:43:27 +08:00
|
|
|
cache->ro++;
|
2014-10-31 21:49:34 +08:00
|
|
|
list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
|
2010-05-16 22:46:25 +08:00
|
|
|
ret = 0;
|
|
|
|
}
|
2011-07-26 11:30:11 +08:00
|
|
|
out:
|
2010-05-16 22:46:25 +08:00
|
|
|
spin_unlock(&cache->lock);
|
|
|
|
spin_unlock(&sinfo->lock);
|
2019-01-30 13:07:51 +08:00
|
|
|
if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
|
|
|
|
btrfs_info(cache->fs_info,
|
|
|
|
"unable to make block group %llu ro",
|
|
|
|
cache->key.objectid);
|
|
|
|
btrfs_info(cache->fs_info,
|
|
|
|
"sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
|
|
|
|
sinfo_used, num_bytes, min_allocable_bytes);
|
2019-06-19 04:09:24 +08:00
|
|
|
btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
|
2019-01-30 13:07:51 +08:00
|
|
|
}
|
2010-05-16 22:46:25 +08:00
|
|
|
return ret;
|
|
|
|
}
|
2008-07-09 02:19:17 +08:00
|
|
|
|
2018-06-20 20:49:14 +08:00
|
|
|
int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache)
|
2008-07-23 11:06:41 +08:00
|
|
|
|
2010-05-16 22:46:25 +08:00
|
|
|
{
|
2018-06-20 20:49:14 +08:00
|
|
|
struct btrfs_fs_info *fs_info = cache->fs_info;
|
2010-05-16 22:46:25 +08:00
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
u64 alloc_flags;
|
|
|
|
int ret;
|
2008-07-09 02:19:17 +08:00
|
|
|
|
2015-04-07 03:46:08 +08:00
|
|
|
again:
|
2017-02-16 05:28:29 +08:00
|
|
|
trans = btrfs_join_transaction(fs_info->extent_root);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (IS_ERR(trans))
|
|
|
|
return PTR_ERR(trans);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
2015-04-07 03:46:08 +08:00
|
|
|
/*
|
|
|
|
* we're not allowed to set block groups readonly after the dirty
|
|
|
|
* block groups cache has started writing. If it already started,
|
|
|
|
* back off and let this transaction commit
|
|
|
|
*/
|
2016-06-23 06:54:23 +08:00
|
|
|
mutex_lock(&fs_info->ro_block_group_mutex);
|
2015-09-24 22:46:10 +08:00
|
|
|
if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
|
2015-04-07 03:46:08 +08:00
|
|
|
u64 transid = trans->transid;
|
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
mutex_unlock(&fs_info->ro_block_group_mutex);
|
2016-09-10 09:39:03 +08:00
|
|
|
btrfs_end_transaction(trans);
|
2015-04-07 03:46:08 +08:00
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
ret = btrfs_wait_for_commit(fs_info, transid);
|
2015-04-07 03:46:08 +08:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
|
2015-05-20 09:54:41 +08:00
|
|
|
/*
|
|
|
|
* if we are changing raid levels, try to allocate a corresponding
|
|
|
|
* block group with the new raid level.
|
|
|
|
*/
|
2016-06-23 06:54:23 +08:00
|
|
|
alloc_flags = update_block_group_flags(fs_info, cache->flags);
|
2015-05-20 09:54:41 +08:00
|
|
|
if (alloc_flags != cache->flags) {
|
2019-06-19 04:09:17 +08:00
|
|
|
ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
|
2015-05-20 09:54:41 +08:00
|
|
|
/*
|
|
|
|
* ENOSPC is allowed here, we may have enough space
|
|
|
|
* already allocated at the new raid level to
|
|
|
|
* carry on
|
|
|
|
*/
|
|
|
|
if (ret == -ENOSPC)
|
|
|
|
ret = 0;
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
2015-04-07 03:46:08 +08:00
|
|
|
|
2015-08-05 16:43:27 +08:00
|
|
|
ret = inc_block_group_ro(cache, 0);
|
2010-05-16 22:46:25 +08:00
|
|
|
if (!ret)
|
|
|
|
goto out;
|
2016-06-23 06:54:24 +08:00
|
|
|
alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
|
2019-06-19 04:09:17 +08:00
|
|
|
ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
|
2010-05-16 22:46:25 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2015-08-05 16:43:27 +08:00
|
|
|
ret = inc_block_group_ro(cache, 0);
|
2010-05-16 22:46:25 +08:00
|
|
|
out:
|
2015-01-10 02:40:15 +08:00
|
|
|
if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
|
2016-06-23 06:54:23 +08:00
|
|
|
alloc_flags = update_block_group_flags(fs_info, cache->flags);
|
2016-10-05 01:34:27 +08:00
|
|
|
mutex_lock(&fs_info->chunk_mutex);
|
2018-06-20 20:49:07 +08:00
|
|
|
check_system_chunk(trans, alloc_flags);
|
2016-10-05 01:34:27 +08:00
|
|
|
mutex_unlock(&fs_info->chunk_mutex);
|
2015-01-10 02:40:15 +08:00
|
|
|
}
|
2016-06-23 06:54:23 +08:00
|
|
|
mutex_unlock(&fs_info->ro_block_group_mutex);
|
2015-01-10 02:40:15 +08:00
|
|
|
|
2016-09-10 09:39:03 +08:00
|
|
|
btrfs_end_transaction(trans);
|
2010-05-16 22:46:25 +08:00
|
|
|
return ret;
|
|
|
|
}
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
2018-06-20 20:49:15 +08:00
|
|
|
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
|
2011-02-17 02:57:04 +08:00
|
|
|
{
|
2018-06-20 20:49:15 +08:00
|
|
|
u64 alloc_flags = get_alloc_profile(trans->fs_info, type);
|
2016-06-23 06:54:24 +08:00
|
|
|
|
2019-06-19 04:09:17 +08:00
|
|
|
return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
|
2011-02-17 02:57:04 +08:00
|
|
|
}
|
|
|
|
|
btrfs: fix wrong free space information of btrfs
When we store data by raid profile in btrfs with two or more different size
disks, df command shows there is some free space in the filesystem, but the
user can not write any data in fact, df command shows the wrong free space
information of btrfs.
# mkfs.btrfs -d raid1 /dev/sda9 /dev/sda10
# btrfs-show
Label: none uuid: a95cd49e-6e33-45b8-8741-a36153ce4b64
Total devices 2 FS bytes used 28.00KB
devid 1 size 5.01GB used 2.03GB path /dev/sda9
devid 2 size 10.00GB used 2.01GB path /dev/sda10
# btrfs device scan /dev/sda9 /dev/sda10
# mount /dev/sda9 /mnt
# dd if=/dev/zero of=tmpfile0 bs=4K count=9999999999
(fill the filesystem)
# sync
# df -TH
Filesystem Type Size Used Avail Use% Mounted on
/dev/sda9 btrfs 17G 8.6G 5.4G 62% /mnt
# btrfs-show
Label: none uuid: a95cd49e-6e33-45b8-8741-a36153ce4b64
Total devices 2 FS bytes used 3.99GB
devid 1 size 5.01GB used 5.01GB path /dev/sda9
devid 2 size 10.00GB used 4.99GB path /dev/sda10
It is because btrfs cannot allocate chunks when one of the pairing disks has
no space, the free space on the other disks can not be used for ever, and should
be subtracted from the total space, but btrfs doesn't subtract this space from
the total. It is strange to the user.
This patch fixes it by calcing the free space that can be used to allocate
chunks.
Implementation:
1. get all the devices free space, and align them by stripe length.
2. sort the devices by the free space.
3. check the free space of the devices,
3.1. if it is not zero, and then check the number of the devices that has
more free space than this device,
if the number of the devices is beyond the min stripe number, the free
space can be used, and add into total free space.
if the number of the devices is below the min stripe number, we can not
use the free space, the check ends.
3.2. if the free space is zero, check the next devices, goto 3.1
This implementation is just likely fake chunk allocation.
After appling this patch, df can show correct space information:
# df -TH
Filesystem Type Size Used Avail Use% Mounted on
/dev/sda9 btrfs 17G 8.6G 0 100% /mnt
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-01-05 18:07:31 +08:00
|
|
|
/*
|
|
|
|
* helper to account the unused space of all the readonly block group in the
|
2014-10-31 21:49:34 +08:00
|
|
|
* space_info. takes mirrors into account.
|
btrfs: fix wrong free space information of btrfs
When we store data by raid profile in btrfs with two or more different size
disks, df command shows there is some free space in the filesystem, but the
user can not write any data in fact, df command shows the wrong free space
information of btrfs.
# mkfs.btrfs -d raid1 /dev/sda9 /dev/sda10
# btrfs-show
Label: none uuid: a95cd49e-6e33-45b8-8741-a36153ce4b64
Total devices 2 FS bytes used 28.00KB
devid 1 size 5.01GB used 2.03GB path /dev/sda9
devid 2 size 10.00GB used 2.01GB path /dev/sda10
# btrfs device scan /dev/sda9 /dev/sda10
# mount /dev/sda9 /mnt
# dd if=/dev/zero of=tmpfile0 bs=4K count=9999999999
(fill the filesystem)
# sync
# df -TH
Filesystem Type Size Used Avail Use% Mounted on
/dev/sda9 btrfs 17G 8.6G 5.4G 62% /mnt
# btrfs-show
Label: none uuid: a95cd49e-6e33-45b8-8741-a36153ce4b64
Total devices 2 FS bytes used 3.99GB
devid 1 size 5.01GB used 5.01GB path /dev/sda9
devid 2 size 10.00GB used 4.99GB path /dev/sda10
It is because btrfs cannot allocate chunks when one of the pairing disks has
no space, the free space on the other disks can not be used for ever, and should
be subtracted from the total space, but btrfs doesn't subtract this space from
the total. It is strange to the user.
This patch fixes it by calcing the free space that can be used to allocate
chunks.
Implementation:
1. get all the devices free space, and align them by stripe length.
2. sort the devices by the free space.
3. check the free space of the devices,
3.1. if it is not zero, and then check the number of the devices that has
more free space than this device,
if the number of the devices is beyond the min stripe number, the free
space can be used, and add into total free space.
if the number of the devices is below the min stripe number, we can not
use the free space, the check ends.
3.2. if the free space is zero, check the next devices, goto 3.1
This implementation is just likely fake chunk allocation.
After appling this patch, df can show correct space information:
# df -TH
Filesystem Type Size Used Avail Use% Mounted on
/dev/sda9 btrfs 17G 8.6G 0 100% /mnt
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-01-05 18:07:31 +08:00
|
|
|
*/
|
2014-10-31 21:49:34 +08:00
|
|
|
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
|
btrfs: fix wrong free space information of btrfs
When we store data by raid profile in btrfs with two or more different size
disks, df command shows there is some free space in the filesystem, but the
user can not write any data in fact, df command shows the wrong free space
information of btrfs.
# mkfs.btrfs -d raid1 /dev/sda9 /dev/sda10
# btrfs-show
Label: none uuid: a95cd49e-6e33-45b8-8741-a36153ce4b64
Total devices 2 FS bytes used 28.00KB
devid 1 size 5.01GB used 2.03GB path /dev/sda9
devid 2 size 10.00GB used 2.01GB path /dev/sda10
# btrfs device scan /dev/sda9 /dev/sda10
# mount /dev/sda9 /mnt
# dd if=/dev/zero of=tmpfile0 bs=4K count=9999999999
(fill the filesystem)
# sync
# df -TH
Filesystem Type Size Used Avail Use% Mounted on
/dev/sda9 btrfs 17G 8.6G 5.4G 62% /mnt
# btrfs-show
Label: none uuid: a95cd49e-6e33-45b8-8741-a36153ce4b64
Total devices 2 FS bytes used 3.99GB
devid 1 size 5.01GB used 5.01GB path /dev/sda9
devid 2 size 10.00GB used 4.99GB path /dev/sda10
It is because btrfs cannot allocate chunks when one of the pairing disks has
no space, the free space on the other disks can not be used for ever, and should
be subtracted from the total space, but btrfs doesn't subtract this space from
the total. It is strange to the user.
This patch fixes it by calcing the free space that can be used to allocate
chunks.
Implementation:
1. get all the devices free space, and align them by stripe length.
2. sort the devices by the free space.
3. check the free space of the devices,
3.1. if it is not zero, and then check the number of the devices that has
more free space than this device,
if the number of the devices is beyond the min stripe number, the free
space can be used, and add into total free space.
if the number of the devices is below the min stripe number, we can not
use the free space, the check ends.
3.2. if the free space is zero, check the next devices, goto 3.1
This implementation is just likely fake chunk allocation.
After appling this patch, df can show correct space information:
# df -TH
Filesystem Type Size Used Avail Use% Mounted on
/dev/sda9 btrfs 17G 8.6G 0 100% /mnt
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-01-05 18:07:31 +08:00
|
|
|
{
|
|
|
|
struct btrfs_block_group_cache *block_group;
|
|
|
|
u64 free_bytes = 0;
|
|
|
|
int factor;
|
|
|
|
|
2016-05-20 09:18:45 +08:00
|
|
|
/* It's df, we don't care if it's racy */
|
2014-10-31 21:49:34 +08:00
|
|
|
if (list_empty(&sinfo->ro_bgs))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
spin_lock(&sinfo->lock);
|
|
|
|
list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
|
btrfs: fix wrong free space information of btrfs
When we store data by raid profile in btrfs with two or more different size
disks, df command shows there is some free space in the filesystem, but the
user can not write any data in fact, df command shows the wrong free space
information of btrfs.
# mkfs.btrfs -d raid1 /dev/sda9 /dev/sda10
# btrfs-show
Label: none uuid: a95cd49e-6e33-45b8-8741-a36153ce4b64
Total devices 2 FS bytes used 28.00KB
devid 1 size 5.01GB used 2.03GB path /dev/sda9
devid 2 size 10.00GB used 2.01GB path /dev/sda10
# btrfs device scan /dev/sda9 /dev/sda10
# mount /dev/sda9 /mnt
# dd if=/dev/zero of=tmpfile0 bs=4K count=9999999999
(fill the filesystem)
# sync
# df -TH
Filesystem Type Size Used Avail Use% Mounted on
/dev/sda9 btrfs 17G 8.6G 5.4G 62% /mnt
# btrfs-show
Label: none uuid: a95cd49e-6e33-45b8-8741-a36153ce4b64
Total devices 2 FS bytes used 3.99GB
devid 1 size 5.01GB used 5.01GB path /dev/sda9
devid 2 size 10.00GB used 4.99GB path /dev/sda10
It is because btrfs cannot allocate chunks when one of the pairing disks has
no space, the free space on the other disks can not be used for ever, and should
be subtracted from the total space, but btrfs doesn't subtract this space from
the total. It is strange to the user.
This patch fixes it by calcing the free space that can be used to allocate
chunks.
Implementation:
1. get all the devices free space, and align them by stripe length.
2. sort the devices by the free space.
3. check the free space of the devices,
3.1. if it is not zero, and then check the number of the devices that has
more free space than this device,
if the number of the devices is beyond the min stripe number, the free
space can be used, and add into total free space.
if the number of the devices is below the min stripe number, we can not
use the free space, the check ends.
3.2. if the free space is zero, check the next devices, goto 3.1
This implementation is just likely fake chunk allocation.
After appling this patch, df can show correct space information:
# df -TH
Filesystem Type Size Used Avail Use% Mounted on
/dev/sda9 btrfs 17G 8.6G 0 100% /mnt
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-01-05 18:07:31 +08:00
|
|
|
spin_lock(&block_group->lock);
|
|
|
|
|
|
|
|
if (!block_group->ro) {
|
|
|
|
spin_unlock(&block_group->lock);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2018-07-14 02:46:30 +08:00
|
|
|
factor = btrfs_bg_type_to_factor(block_group->flags);
|
btrfs: fix wrong free space information of btrfs
When we store data by raid profile in btrfs with two or more different size
disks, df command shows there is some free space in the filesystem, but the
user can not write any data in fact, df command shows the wrong free space
information of btrfs.
# mkfs.btrfs -d raid1 /dev/sda9 /dev/sda10
# btrfs-show
Label: none uuid: a95cd49e-6e33-45b8-8741-a36153ce4b64
Total devices 2 FS bytes used 28.00KB
devid 1 size 5.01GB used 2.03GB path /dev/sda9
devid 2 size 10.00GB used 2.01GB path /dev/sda10
# btrfs device scan /dev/sda9 /dev/sda10
# mount /dev/sda9 /mnt
# dd if=/dev/zero of=tmpfile0 bs=4K count=9999999999
(fill the filesystem)
# sync
# df -TH
Filesystem Type Size Used Avail Use% Mounted on
/dev/sda9 btrfs 17G 8.6G 5.4G 62% /mnt
# btrfs-show
Label: none uuid: a95cd49e-6e33-45b8-8741-a36153ce4b64
Total devices 2 FS bytes used 3.99GB
devid 1 size 5.01GB used 5.01GB path /dev/sda9
devid 2 size 10.00GB used 4.99GB path /dev/sda10
It is because btrfs cannot allocate chunks when one of the pairing disks has
no space, the free space on the other disks can not be used for ever, and should
be subtracted from the total space, but btrfs doesn't subtract this space from
the total. It is strange to the user.
This patch fixes it by calcing the free space that can be used to allocate
chunks.
Implementation:
1. get all the devices free space, and align them by stripe length.
2. sort the devices by the free space.
3. check the free space of the devices,
3.1. if it is not zero, and then check the number of the devices that has
more free space than this device,
if the number of the devices is beyond the min stripe number, the free
space can be used, and add into total free space.
if the number of the devices is below the min stripe number, we can not
use the free space, the check ends.
3.2. if the free space is zero, check the next devices, goto 3.1
This implementation is just likely fake chunk allocation.
After appling this patch, df can show correct space information:
# df -TH
Filesystem Type Size Used Avail Use% Mounted on
/dev/sda9 btrfs 17G 8.6G 0 100% /mnt
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-01-05 18:07:31 +08:00
|
|
|
free_bytes += (block_group->key.offset -
|
|
|
|
btrfs_block_group_used(&block_group->item)) *
|
|
|
|
factor;
|
|
|
|
|
|
|
|
spin_unlock(&block_group->lock);
|
|
|
|
}
|
|
|
|
spin_unlock(&sinfo->lock);
|
|
|
|
|
|
|
|
return free_bytes;
|
|
|
|
}
|
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
{
|
2010-05-16 22:46:25 +08:00
|
|
|
struct btrfs_space_info *sinfo = cache->space_info;
|
|
|
|
u64 num_bytes;
|
|
|
|
|
|
|
|
BUG_ON(!cache->ro);
|
|
|
|
|
|
|
|
spin_lock(&sinfo->lock);
|
|
|
|
spin_lock(&cache->lock);
|
2015-08-05 16:43:27 +08:00
|
|
|
if (!--cache->ro) {
|
|
|
|
num_bytes = cache->key.offset - cache->reserved -
|
|
|
|
cache->pinned - cache->bytes_super -
|
|
|
|
btrfs_block_group_used(&cache->item);
|
|
|
|
sinfo->bytes_readonly -= num_bytes;
|
|
|
|
list_del_init(&cache->ro_list);
|
|
|
|
}
|
2010-05-16 22:46:25 +08:00
|
|
|
spin_unlock(&cache->lock);
|
|
|
|
spin_unlock(&sinfo->lock);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
}
|
|
|
|
|
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-12 04:11:19 +08:00
|
|
|
/*
|
2018-11-28 19:05:13 +08:00
|
|
|
* Checks to see if it's even possible to relocate this block group.
|
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-12 04:11:19 +08:00
|
|
|
*
|
|
|
|
* @return - -1 if it's not a good idea to relocate this block group, 0 if its
|
|
|
|
* ok to go ahead and try.
|
|
|
|
*/
|
2016-06-22 09:16:51 +08:00
|
|
|
int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
{
|
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-12 04:11:19 +08:00
|
|
|
struct btrfs_block_group_cache *block_group;
|
|
|
|
struct btrfs_space_info *space_info;
|
2016-06-23 06:54:23 +08:00
|
|
|
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
|
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-12 04:11:19 +08:00
|
|
|
struct btrfs_device *device;
|
2011-08-03 18:15:25 +08:00
|
|
|
u64 min_free;
|
2011-08-20 20:29:51 +08:00
|
|
|
u64 dev_min = 1;
|
|
|
|
u64 dev_nr = 0;
|
2012-03-27 22:09:17 +08:00
|
|
|
u64 target;
|
2016-03-23 11:38:17 +08:00
|
|
|
int debug;
|
2011-08-03 18:15:25 +08:00
|
|
|
int index;
|
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-12 04:11:19 +08:00
|
|
|
int full = 0;
|
|
|
|
int ret = 0;
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
|
2016-03-23 11:38:17 +08:00
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
block_group = btrfs_lookup_block_group(fs_info, bytenr);
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
|
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-12 04:11:19 +08:00
|
|
|
/* odd, couldn't find the block group, leave it alone */
|
2016-03-23 11:38:17 +08:00
|
|
|
if (!block_group) {
|
|
|
|
if (debug)
|
2016-06-23 06:54:23 +08:00
|
|
|
btrfs_warn(fs_info,
|
2016-03-23 11:38:17 +08:00
|
|
|
"can't find block group for bytenr %llu",
|
|
|
|
bytenr);
|
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-12 04:11:19 +08:00
|
|
|
return -1;
|
2016-03-23 11:38:17 +08:00
|
|
|
}
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
|
2011-08-03 18:15:25 +08:00
|
|
|
min_free = btrfs_block_group_used(&block_group->item);
|
|
|
|
|
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-12 04:11:19 +08:00
|
|
|
/* no bytes used, we're good */
|
2011-08-03 18:15:25 +08:00
|
|
|
if (!min_free)
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
goto out;
|
|
|
|
|
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-12 04:11:19 +08:00
|
|
|
space_info = block_group->space_info;
|
|
|
|
spin_lock(&space_info->lock);
|
2008-12-12 23:03:38 +08:00
|
|
|
|
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-12 04:11:19 +08:00
|
|
|
full = space_info->full;
|
2008-12-12 23:03:38 +08:00
|
|
|
|
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-12 04:11:19 +08:00
|
|
|
/*
|
|
|
|
* if this is the last block group we have in this space, we can't
|
2009-09-23 02:48:44 +08:00
|
|
|
* relocate it unless we're able to allocate a new chunk below.
|
|
|
|
*
|
|
|
|
* Otherwise, we need to make sure we have room in the space to handle
|
|
|
|
* all of the extents from this block group. If we can, we're good
|
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-12 04:11:19 +08:00
|
|
|
*/
|
2009-09-23 02:48:44 +08:00
|
|
|
if ((space_info->total_bytes != block_group->key.offset) &&
|
2017-02-14 07:42:21 +08:00
|
|
|
(btrfs_space_info_used(space_info, false) + min_free <
|
|
|
|
space_info->total_bytes)) {
|
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-12 04:11:19 +08:00
|
|
|
spin_unlock(&space_info->lock);
|
|
|
|
goto out;
|
2008-12-12 23:03:38 +08:00
|
|
|
}
|
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-12 04:11:19 +08:00
|
|
|
spin_unlock(&space_info->lock);
|
2008-08-05 11:17:27 +08:00
|
|
|
|
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-12 04:11:19 +08:00
|
|
|
/*
|
|
|
|
* ok we don't have enough space, but maybe we have free space on our
|
|
|
|
* devices to allocate new chunks for relocation, so loop through our
|
2012-03-27 22:09:17 +08:00
|
|
|
* alloc devices and guess if we have enough space. if this block
|
|
|
|
* group is going to be restriped, run checks against the target
|
|
|
|
* profile instead of the current one.
|
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-12 04:11:19 +08:00
|
|
|
*/
|
|
|
|
ret = -1;
|
2008-08-05 11:17:27 +08:00
|
|
|
|
2011-08-03 18:15:25 +08:00
|
|
|
/*
|
|
|
|
* index:
|
|
|
|
* 0: raid10
|
|
|
|
* 1: raid1
|
|
|
|
* 2: dup
|
|
|
|
* 3: raid0
|
|
|
|
* 4: single
|
|
|
|
*/
|
2016-06-23 06:54:23 +08:00
|
|
|
target = get_restripe_target(fs_info, block_group->flags);
|
2012-03-27 22:09:17 +08:00
|
|
|
if (target) {
|
2018-01-30 18:20:45 +08:00
|
|
|
index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
|
2012-03-27 22:09:17 +08:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* this is just a balance, so if we were marked as full
|
|
|
|
* we know there is no space for a new chunk
|
|
|
|
*/
|
2016-03-23 11:38:17 +08:00
|
|
|
if (full) {
|
|
|
|
if (debug)
|
2016-06-23 06:54:23 +08:00
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"no space to alloc new chunk for block group %llu",
|
|
|
|
block_group->key.objectid);
|
2012-03-27 22:09:17 +08:00
|
|
|
goto out;
|
2016-03-23 11:38:17 +08:00
|
|
|
}
|
2012-03-27 22:09:17 +08:00
|
|
|
|
2018-01-30 18:20:45 +08:00
|
|
|
index = btrfs_bg_flags_to_raid_index(block_group->flags);
|
2012-03-27 22:09:17 +08:00
|
|
|
}
|
|
|
|
|
2013-01-17 13:38:51 +08:00
|
|
|
if (index == BTRFS_RAID_RAID10) {
|
2011-08-03 18:15:25 +08:00
|
|
|
dev_min = 4;
|
2011-08-20 20:29:51 +08:00
|
|
|
/* Divide by 2 */
|
|
|
|
min_free >>= 1;
|
2013-01-17 13:38:51 +08:00
|
|
|
} else if (index == BTRFS_RAID_RAID1) {
|
2011-08-03 18:15:25 +08:00
|
|
|
dev_min = 2;
|
2013-01-17 13:38:51 +08:00
|
|
|
} else if (index == BTRFS_RAID_DUP) {
|
2011-08-20 20:29:51 +08:00
|
|
|
/* Multiply by 2 */
|
|
|
|
min_free <<= 1;
|
2013-01-17 13:38:51 +08:00
|
|
|
} else if (index == BTRFS_RAID_RAID0) {
|
2011-08-03 18:15:25 +08:00
|
|
|
dev_min = fs_devices->rw_devices;
|
2015-02-21 01:43:47 +08:00
|
|
|
min_free = div64_u64(min_free, dev_min);
|
2011-08-03 18:15:25 +08:00
|
|
|
}
|
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
mutex_lock(&fs_info->chunk_mutex);
|
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-12 04:11:19 +08:00
|
|
|
list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
|
2011-01-05 18:07:26 +08:00
|
|
|
u64 dev_offset;
|
2009-03-13 22:10:06 +08:00
|
|
|
|
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-12 04:11:19 +08:00
|
|
|
/*
|
|
|
|
* check to make sure we can actually find a chunk with enough
|
|
|
|
* space to fit our block group in.
|
|
|
|
*/
|
2012-11-06 01:29:28 +08:00
|
|
|
if (device->total_bytes > device->bytes_used + min_free &&
|
2017-12-04 12:54:55 +08:00
|
|
|
!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
|
2019-03-27 20:24:14 +08:00
|
|
|
ret = find_free_dev_extent(device, min_free,
|
2011-01-05 18:07:26 +08:00
|
|
|
&dev_offset, NULL);
|
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-12 04:11:19 +08:00
|
|
|
if (!ret)
|
2011-08-03 18:15:25 +08:00
|
|
|
dev_nr++;
|
|
|
|
|
|
|
|
if (dev_nr >= dev_min)
|
2008-01-04 03:14:39 +08:00
|
|
|
break;
|
2011-08-03 18:15:25 +08:00
|
|
|
|
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-12 04:11:19 +08:00
|
|
|
ret = -1;
|
2008-01-05 05:47:16 +08:00
|
|
|
}
|
2007-12-22 05:27:24 +08:00
|
|
|
}
|
2016-03-23 11:38:17 +08:00
|
|
|
if (debug && ret == -1)
|
2016-06-23 06:54:23 +08:00
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"no space to allocate a new chunk for block group %llu",
|
|
|
|
block_group->key.objectid);
|
|
|
|
mutex_unlock(&fs_info->chunk_mutex);
|
2007-12-22 05:27:24 +08:00
|
|
|
out:
|
Btrfs: make balance code choose more wisely when relocating
Currently, we can panic the box if the first block group we go to move is of a
type where there is no space left to move those extents. For example, if we
fill the disk up with data, and then we try to balance and we have no room to
move the data nor room to allocate new chunks, we will panic. Change this by
checking to see if we have room to move this chunk around, and if not, return
-ENOSPC and move on to the next chunk. This will make sure we remove block
groups that are moveable, like if we have alot of empty metadata block groups,
and then that way we make room to be able to balance our data chunks as well.
Tested this with an fs that would panic on btrfs-vol -b normally, but no longer
panics with this patch.
V1->V2:
-actually search for a free extent on the device to make sure we can allocate a
chunk if need be.
-fix btrfs_shrink_device to make sure we actually try to relocate all the
chunks, and then if we can't return -ENOSPC so if we are doing a btrfs-vol -r
we don't remove the device with data still on it.
-check to make sure the block group we are going to relocate isn't the last one
in that particular space
-fix a bug in btrfs_shrink_device where we would change the device's size and
not fix it if we fail to do our relocate
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-12 04:11:19 +08:00
|
|
|
btrfs_put_block_group(block_group);
|
2007-12-22 05:27:24 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2016-06-22 09:16:51 +08:00
|
|
|
static int find_first_block_group(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_path *path,
|
|
|
|
struct btrfs_key *key)
|
2008-03-25 03:01:56 +08:00
|
|
|
{
|
2016-06-22 09:16:51 +08:00
|
|
|
struct btrfs_root *root = fs_info->extent_root;
|
2008-06-26 04:01:30 +08:00
|
|
|
int ret = 0;
|
2008-03-25 03:01:56 +08:00
|
|
|
struct btrfs_key found_key;
|
|
|
|
struct extent_buffer *leaf;
|
2018-08-01 10:37:16 +08:00
|
|
|
struct btrfs_block_group_item bg;
|
|
|
|
u64 flags;
|
2008-03-25 03:01:56 +08:00
|
|
|
int slot;
|
2007-12-22 05:27:24 +08:00
|
|
|
|
2008-03-25 03:01:56 +08:00
|
|
|
ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
|
|
|
|
if (ret < 0)
|
2008-06-26 04:01:30 +08:00
|
|
|
goto out;
|
|
|
|
|
2009-01-06 10:25:51 +08:00
|
|
|
while (1) {
|
2008-03-25 03:01:56 +08:00
|
|
|
slot = path->slots[0];
|
2007-12-22 05:27:24 +08:00
|
|
|
leaf = path->nodes[0];
|
2008-03-25 03:01:56 +08:00
|
|
|
if (slot >= btrfs_header_nritems(leaf)) {
|
|
|
|
ret = btrfs_next_leaf(root, path);
|
|
|
|
if (ret == 0)
|
|
|
|
continue;
|
|
|
|
if (ret < 0)
|
2008-06-26 04:01:30 +08:00
|
|
|
goto out;
|
2008-03-25 03:01:56 +08:00
|
|
|
break;
|
2007-12-22 05:27:24 +08:00
|
|
|
}
|
2008-03-25 03:01:56 +08:00
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, slot);
|
2007-12-22 05:27:24 +08:00
|
|
|
|
2008-03-25 03:01:56 +08:00
|
|
|
if (found_key.objectid >= key->objectid &&
|
2008-06-26 04:01:30 +08:00
|
|
|
found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
|
2016-06-23 09:31:27 +08:00
|
|
|
struct extent_map_tree *em_tree;
|
|
|
|
struct extent_map *em;
|
|
|
|
|
2019-05-17 17:43:17 +08:00
|
|
|
em_tree = &root->fs_info->mapping_tree;
|
2016-06-23 09:31:27 +08:00
|
|
|
read_lock(&em_tree->lock);
|
|
|
|
em = lookup_extent_mapping(em_tree, found_key.objectid,
|
|
|
|
found_key.offset);
|
|
|
|
read_unlock(&em_tree->lock);
|
|
|
|
if (!em) {
|
2016-06-23 06:54:23 +08:00
|
|
|
btrfs_err(fs_info,
|
2016-06-23 09:31:27 +08:00
|
|
|
"logical %llu len %llu found bg but no related chunk",
|
|
|
|
found_key.objectid, found_key.offset);
|
|
|
|
ret = -ENOENT;
|
2018-08-01 10:37:16 +08:00
|
|
|
} else if (em->start != found_key.objectid ||
|
|
|
|
em->len != found_key.offset) {
|
|
|
|
btrfs_err(fs_info,
|
|
|
|
"block group %llu len %llu mismatch with chunk %llu len %llu",
|
|
|
|
found_key.objectid, found_key.offset,
|
|
|
|
em->start, em->len);
|
|
|
|
ret = -EUCLEAN;
|
2016-06-23 09:31:27 +08:00
|
|
|
} else {
|
2018-08-01 10:37:16 +08:00
|
|
|
read_extent_buffer(leaf, &bg,
|
|
|
|
btrfs_item_ptr_offset(leaf, slot),
|
|
|
|
sizeof(bg));
|
|
|
|
flags = btrfs_block_group_flags(&bg) &
|
|
|
|
BTRFS_BLOCK_GROUP_TYPE_MASK;
|
|
|
|
|
|
|
|
if (flags != (em->map_lookup->type &
|
|
|
|
BTRFS_BLOCK_GROUP_TYPE_MASK)) {
|
|
|
|
btrfs_err(fs_info,
|
|
|
|
"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
|
|
|
|
found_key.objectid,
|
|
|
|
found_key.offset, flags,
|
|
|
|
(BTRFS_BLOCK_GROUP_TYPE_MASK &
|
|
|
|
em->map_lookup->type));
|
|
|
|
ret = -EUCLEAN;
|
|
|
|
} else {
|
|
|
|
ret = 0;
|
|
|
|
}
|
2016-06-23 09:31:27 +08:00
|
|
|
}
|
2016-08-19 03:30:06 +08:00
|
|
|
free_extent_map(em);
|
2008-06-26 04:01:30 +08:00
|
|
|
goto out;
|
|
|
|
}
|
2008-03-25 03:01:56 +08:00
|
|
|
path->slots[0]++;
|
2007-12-22 05:27:24 +08:00
|
|
|
}
|
2008-06-26 04:01:30 +08:00
|
|
|
out:
|
2008-03-25 03:01:56 +08:00
|
|
|
return ret;
|
2007-12-22 05:27:24 +08:00
|
|
|
}
|
|
|
|
|
2010-06-22 02:48:16 +08:00
|
|
|
void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
|
|
|
|
{
|
|
|
|
struct btrfs_block_group_cache *block_group;
|
|
|
|
u64 last = 0;
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
struct inode *inode;
|
|
|
|
|
|
|
|
block_group = btrfs_lookup_first_block_group(info, last);
|
|
|
|
while (block_group) {
|
2018-09-12 22:45:45 +08:00
|
|
|
wait_block_group_cache_done(block_group);
|
2010-06-22 02:48:16 +08:00
|
|
|
spin_lock(&block_group->lock);
|
|
|
|
if (block_group->iref)
|
|
|
|
break;
|
|
|
|
spin_unlock(&block_group->lock);
|
2019-03-20 19:01:07 +08:00
|
|
|
block_group = next_block_group(block_group);
|
2010-06-22 02:48:16 +08:00
|
|
|
}
|
|
|
|
if (!block_group) {
|
|
|
|
if (last == 0)
|
|
|
|
break;
|
|
|
|
last = 0;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
inode = block_group->inode;
|
|
|
|
block_group->iref = 0;
|
|
|
|
block_group->inode = NULL;
|
|
|
|
spin_unlock(&block_group->lock);
|
2016-07-21 08:33:44 +08:00
|
|
|
ASSERT(block_group->io_ctl.inode == NULL);
|
2010-06-22 02:48:16 +08:00
|
|
|
iput(inode);
|
|
|
|
last = block_group->key.objectid + block_group->key.offset;
|
|
|
|
btrfs_put_block_group(block_group);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-02-02 06:39:50 +08:00
|
|
|
/*
|
|
|
|
* Must be called only after stopping all workers, since we could have block
|
|
|
|
* group caching kthreads running, and therefore they could race with us if we
|
|
|
|
* freed the block groups before stopping them.
|
|
|
|
*/
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
int btrfs_free_block_groups(struct btrfs_fs_info *info)
|
|
|
|
{
|
|
|
|
struct btrfs_block_group_cache *block_group;
|
2009-03-11 00:39:20 +08:00
|
|
|
struct btrfs_space_info *space_info;
|
2009-09-12 04:11:19 +08:00
|
|
|
struct btrfs_caching_control *caching_ctl;
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
struct rb_node *n;
|
|
|
|
|
2014-03-14 03:42:13 +08:00
|
|
|
down_write(&info->commit_root_sem);
|
2009-09-12 04:11:19 +08:00
|
|
|
while (!list_empty(&info->caching_block_groups)) {
|
|
|
|
caching_ctl = list_entry(info->caching_block_groups.next,
|
|
|
|
struct btrfs_caching_control, list);
|
|
|
|
list_del(&caching_ctl->list);
|
|
|
|
put_caching_control(caching_ctl);
|
|
|
|
}
|
2014-03-14 03:42:13 +08:00
|
|
|
up_write(&info->commit_root_sem);
|
2009-09-12 04:11:19 +08:00
|
|
|
|
2014-09-18 23:20:02 +08:00
|
|
|
spin_lock(&info->unused_bgs_lock);
|
|
|
|
while (!list_empty(&info->unused_bgs)) {
|
|
|
|
block_group = list_first_entry(&info->unused_bgs,
|
|
|
|
struct btrfs_block_group_cache,
|
|
|
|
bg_list);
|
|
|
|
list_del_init(&block_group->bg_list);
|
|
|
|
btrfs_put_block_group(block_group);
|
|
|
|
}
|
|
|
|
spin_unlock(&info->unused_bgs_lock);
|
|
|
|
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
spin_lock(&info->block_group_cache_lock);
|
|
|
|
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
|
|
|
|
block_group = rb_entry(n, struct btrfs_block_group_cache,
|
|
|
|
cache_node);
|
|
|
|
rb_erase(&block_group->cache_node,
|
|
|
|
&info->block_group_cache_tree);
|
2014-12-05 02:38:30 +08:00
|
|
|
RB_CLEAR_NODE(&block_group->cache_node);
|
2008-10-31 02:25:28 +08:00
|
|
|
spin_unlock(&info->block_group_cache_lock);
|
|
|
|
|
Btrfs: fix enospc when there is plenty of space
So there is an odd case where we can possibly return -ENOSPC when there is in
fact space to be had. It only happens with Metadata writes, and happens _very_
infrequently. What has to happen is we have to allocate have allocated out of
the first logical byte on the disk, which would set last_alloc to
first_logical_byte(root, 0), so search_start == orig_search_start. We then
need to allocate for normal metadata, so BTRFS_BLOCK_GROUP_METADATA |
BTRFS_BLOCK_GROUP_DUP. We will do a block lookup for the given search_start,
block_group_bits() won't match and we'll go to choose another block group.
However because search_start matches orig_search_start we go to see if we can
allocate a chunk.
If we are in the situation that we cannot allocate a chunk, we fail and ENOSPC.
This is kind of a big flaw of the way find_free_extent works, as it along with
find_free_space loop through _all_ of the block groups, not just the ones that
we want to allocate out of. This patch completely kills find_free_space and
rolls it into find_free_extent. I've introduced a sort of state machine into
this, which will make it easier to get cache miss information out of the
allocator, and will work well with my locking changes.
The basic flow is this: We have the variable loop which is 0, meaning we are
in the hint phase. We lookup the block group for the hint, and lookup the
space_info for what we want to allocate out of. If the block group we were
pointed at by the hint either isn't of the correct type, or just doesn't have
the space we need, we set head to space_info->block_groups, so we start at the
beginning of the block groups for this particular space info, and loop through.
This is also where we add the empty_cluster to total_needed. At this point
loop is set to 1 and we just loop through all of the block groups for this
particular space_info looking for the space we need, just as find_free_space
would have done, except we only hit the block groups we want and not _all_ of
the block groups. If we come full circle we see if we can allocate a chunk.
If we cannot of course we exit with -ENOSPC and we are good. If not we start
over at space_info->block_groups and loop through again, with loop == 2. If we
come full circle and haven't found what we need then we exit with -ENOSPC.
I've been running this for a couple of days now and it seems stable, and I
haven't yet hit a -ENOSPC when there was plenty of space left.
Also I've made a groups_sem to handle the group list for the space_info. This
is part of my locking changes, but is relatively safe and seems better than
holding the space_info spinlock over that entire search time. Thanks,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
2008-10-30 02:49:05 +08:00
|
|
|
down_write(&block_group->space_info->groups_sem);
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
list_del(&block_group->list);
|
Btrfs: fix enospc when there is plenty of space
So there is an odd case where we can possibly return -ENOSPC when there is in
fact space to be had. It only happens with Metadata writes, and happens _very_
infrequently. What has to happen is we have to allocate have allocated out of
the first logical byte on the disk, which would set last_alloc to
first_logical_byte(root, 0), so search_start == orig_search_start. We then
need to allocate for normal metadata, so BTRFS_BLOCK_GROUP_METADATA |
BTRFS_BLOCK_GROUP_DUP. We will do a block lookup for the given search_start,
block_group_bits() won't match and we'll go to choose another block group.
However because search_start matches orig_search_start we go to see if we can
allocate a chunk.
If we are in the situation that we cannot allocate a chunk, we fail and ENOSPC.
This is kind of a big flaw of the way find_free_extent works, as it along with
find_free_space loop through _all_ of the block groups, not just the ones that
we want to allocate out of. This patch completely kills find_free_space and
rolls it into find_free_extent. I've introduced a sort of state machine into
this, which will make it easier to get cache miss information out of the
allocator, and will work well with my locking changes.
The basic flow is this: We have the variable loop which is 0, meaning we are
in the hint phase. We lookup the block group for the hint, and lookup the
space_info for what we want to allocate out of. If the block group we were
pointed at by the hint either isn't of the correct type, or just doesn't have
the space we need, we set head to space_info->block_groups, so we start at the
beginning of the block groups for this particular space info, and loop through.
This is also where we add the empty_cluster to total_needed. At this point
loop is set to 1 and we just loop through all of the block groups for this
particular space_info looking for the space we need, just as find_free_space
would have done, except we only hit the block groups we want and not _all_ of
the block groups. If we come full circle we see if we can allocate a chunk.
If we cannot of course we exit with -ENOSPC and we are good. If not we start
over at space_info->block_groups and loop through again, with loop == 2. If we
come full circle and haven't found what we need then we exit with -ENOSPC.
I've been running this for a couple of days now and it seems stable, and I
haven't yet hit a -ENOSPC when there was plenty of space left.
Also I've made a groups_sem to handle the group list for the space_info. This
is part of my locking changes, but is relatively safe and seems better than
holding the space_info spinlock over that entire search time. Thanks,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
2008-10-30 02:49:05 +08:00
|
|
|
up_write(&block_group->space_info->groups_sem);
|
2008-12-12 05:30:39 +08:00
|
|
|
|
2011-02-02 23:53:47 +08:00
|
|
|
/*
|
|
|
|
* We haven't cached this block group, which means we could
|
|
|
|
* possibly have excluded extents on this block group.
|
|
|
|
*/
|
2013-08-05 23:15:21 +08:00
|
|
|
if (block_group->cached == BTRFS_CACHE_NO ||
|
|
|
|
block_group->cached == BTRFS_CACHE_ERROR)
|
2018-06-20 20:49:08 +08:00
|
|
|
free_excluded_extents(block_group);
|
2011-02-02 23:53:47 +08:00
|
|
|
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
btrfs_remove_free_space_cache(block_group);
|
2017-02-02 06:39:50 +08:00
|
|
|
ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
|
2016-07-21 08:33:44 +08:00
|
|
|
ASSERT(list_empty(&block_group->dirty_list));
|
|
|
|
ASSERT(list_empty(&block_group->io_list));
|
|
|
|
ASSERT(list_empty(&block_group->bg_list));
|
|
|
|
ASSERT(atomic_read(&block_group->count) == 1);
|
2009-11-14 04:12:59 +08:00
|
|
|
btrfs_put_block_group(block_group);
|
2008-10-31 02:25:28 +08:00
|
|
|
|
|
|
|
spin_lock(&info->block_group_cache_lock);
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
}
|
|
|
|
spin_unlock(&info->block_group_cache_lock);
|
2009-03-11 00:39:20 +08:00
|
|
|
|
|
|
|
/* now that all the block groups are freed, go through and
|
|
|
|
* free all the space_info structs. This is only called during
|
|
|
|
* the final stages of unmount, and so we know nobody is
|
|
|
|
* using them. We call synchronize_rcu() once before we start,
|
|
|
|
* just to be on the safe side.
|
|
|
|
*/
|
|
|
|
synchronize_rcu();
|
|
|
|
|
2019-06-20 01:47:23 +08:00
|
|
|
btrfs_release_global_block_rsv(info);
|
2010-05-16 22:49:58 +08:00
|
|
|
|
2013-10-31 13:03:04 +08:00
|
|
|
while (!list_empty(&info->space_info)) {
|
2013-11-02 01:07:04 +08:00
|
|
|
int i;
|
|
|
|
|
2009-03-11 00:39:20 +08:00
|
|
|
space_info = list_entry(info->space_info.next,
|
|
|
|
struct btrfs_space_info,
|
|
|
|
list);
|
2016-03-26 01:25:51 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Do not hide this behind enospc_debug, this is actually
|
|
|
|
* important and indicates a real bug if this happens.
|
|
|
|
*/
|
|
|
|
if (WARN_ON(space_info->bytes_pinned > 0 ||
|
2013-02-09 05:28:17 +08:00
|
|
|
space_info->bytes_reserved > 0 ||
|
2016-03-26 01:25:51 +08:00
|
|
|
space_info->bytes_may_use > 0))
|
2019-06-19 04:09:24 +08:00
|
|
|
btrfs_dump_space_info(info, space_info, 0, 0);
|
2009-03-11 00:39:20 +08:00
|
|
|
list_del(&space_info->list);
|
2013-11-02 01:07:04 +08:00
|
|
|
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
|
|
|
|
struct kobject *kobj;
|
2014-05-28 00:59:57 +08:00
|
|
|
kobj = space_info->block_group_kobjs[i];
|
|
|
|
space_info->block_group_kobjs[i] = NULL;
|
|
|
|
if (kobj) {
|
2013-11-02 01:07:04 +08:00
|
|
|
kobject_del(kobj);
|
|
|
|
kobject_put(kobj);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
kobject_del(&space_info->kobj);
|
|
|
|
kobject_put(&space_info->kobj);
|
2009-03-11 00:39:20 +08:00
|
|
|
}
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-08-21 17:43:50 +08:00
|
|
|
static void link_block_group(struct btrfs_block_group_cache *cache)
|
2010-05-16 22:46:24 +08:00
|
|
|
{
|
2017-08-21 17:43:50 +08:00
|
|
|
struct btrfs_space_info *space_info = cache->space_info;
|
2018-03-21 03:25:26 +08:00
|
|
|
struct btrfs_fs_info *fs_info = cache->fs_info;
|
2018-01-30 18:20:45 +08:00
|
|
|
int index = btrfs_bg_flags_to_raid_index(cache->flags);
|
2014-03-27 02:11:26 +08:00
|
|
|
bool first = false;
|
2010-05-16 22:46:24 +08:00
|
|
|
|
|
|
|
down_write(&space_info->groups_sem);
|
2014-03-27 02:11:26 +08:00
|
|
|
if (list_empty(&space_info->block_groups[index]))
|
|
|
|
first = true;
|
|
|
|
list_add_tail(&cache->list, &space_info->block_groups[index]);
|
|
|
|
up_write(&space_info->groups_sem);
|
|
|
|
|
|
|
|
if (first) {
|
Btrfs: fix sysfs warning and missing raid sysfs directories
In the 5.3 merge window, commit 7c7e301406d0a9 ("btrfs: sysfs: Replace
default_attrs in ktypes with groups"), we started using the member
"defaults_groups" for the kobject type "btrfs_raid_ktype". That leads
to a series of warnings when running some test cases of fstests, such
as btrfs/027, btrfs/124 and btrfs/176. The traces produced by those
warnings are like the following:
[116648.059212] kernfs: can not remove 'total_bytes', no directory
[116648.060112] WARNING: CPU: 3 PID: 28500 at fs/kernfs/dir.c:1504 kernfs_remove_by_name_ns+0x75/0x80
(...)
[116648.066482] CPU: 3 PID: 28500 Comm: umount Tainted: G W 5.3.0-rc3-btrfs-next-54 #1
(...)
[116648.069376] RIP: 0010:kernfs_remove_by_name_ns+0x75/0x80
(...)
[116648.072385] RSP: 0018:ffffabfd0090bd08 EFLAGS: 00010282
[116648.073437] RAX: 0000000000000000 RBX: ffffffffc0c11998 RCX: 0000000000000000
[116648.074201] RDX: ffff9fff603a7a00 RSI: ffff9fff603978a8 RDI: ffff9fff603978a8
[116648.074956] RBP: ffffffffc0b9ca2f R08: 0000000000000000 R09: 0000000000000001
[116648.075708] R10: ffff9ffe1f72e1c0 R11: 0000000000000000 R12: ffffffffc0b94120
[116648.076434] R13: ffffffffb3d9b4e0 R14: 0000000000000000 R15: dead000000000100
[116648.077143] FS: 00007f9cdc78a2c0(0000) GS:ffff9fff60380000(0000) knlGS:0000000000000000
[116648.077852] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[116648.078546] CR2: 00007f9fc4747ab4 CR3: 00000005c7832003 CR4: 00000000003606e0
[116648.079235] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[116648.079907] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[116648.080585] Call Trace:
[116648.081262] remove_files+0x31/0x70
[116648.081929] sysfs_remove_group+0x38/0x80
[116648.082596] sysfs_remove_groups+0x34/0x70
[116648.083258] kobject_del+0x20/0x60
[116648.083933] btrfs_free_block_groups+0x405/0x430 [btrfs]
[116648.084608] close_ctree+0x19a/0x380 [btrfs]
[116648.085278] generic_shutdown_super+0x6c/0x110
[116648.085951] kill_anon_super+0xe/0x30
[116648.086621] btrfs_kill_super+0x12/0xa0 [btrfs]
[116648.087289] deactivate_locked_super+0x3a/0x70
[116648.087956] cleanup_mnt+0xb4/0x160
[116648.088620] task_work_run+0x7e/0xc0
[116648.089285] exit_to_usermode_loop+0xfa/0x100
[116648.089933] do_syscall_64+0x1cb/0x220
[116648.090567] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[116648.091197] RIP: 0033:0x7f9cdc073b37
(...)
[116648.100046] ---[ end trace 22e24db328ccadf8 ]---
[116648.100618] ------------[ cut here ]------------
[116648.101175] kernfs: can not remove 'used_bytes', no directory
[116648.101731] WARNING: CPU: 3 PID: 28500 at fs/kernfs/dir.c:1504 kernfs_remove_by_name_ns+0x75/0x80
(...)
[116648.105649] CPU: 3 PID: 28500 Comm: umount Tainted: G W 5.3.0-rc3-btrfs-next-54 #1
(...)
[116648.107461] RIP: 0010:kernfs_remove_by_name_ns+0x75/0x80
(...)
[116648.109336] RSP: 0018:ffffabfd0090bd08 EFLAGS: 00010282
[116648.109979] RAX: 0000000000000000 RBX: ffffffffc0c119a0 RCX: 0000000000000000
[116648.110625] RDX: ffff9fff603a7a00 RSI: ffff9fff603978a8 RDI: ffff9fff603978a8
[116648.111283] RBP: ffffffffc0b9ca41 R08: 0000000000000000 R09: 0000000000000001
[116648.111940] R10: ffff9ffe1f72e1c0 R11: 0000000000000000 R12: ffffffffc0b94120
[116648.112603] R13: ffffffffb3d9b4e0 R14: 0000000000000000 R15: dead000000000100
[116648.113268] FS: 00007f9cdc78a2c0(0000) GS:ffff9fff60380000(0000) knlGS:0000000000000000
[116648.113939] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[116648.114607] CR2: 00007f9fc4747ab4 CR3: 00000005c7832003 CR4: 00000000003606e0
[116648.115286] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[116648.115966] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[116648.116649] Call Trace:
[116648.117326] remove_files+0x31/0x70
[116648.117997] sysfs_remove_group+0x38/0x80
[116648.118671] sysfs_remove_groups+0x34/0x70
[116648.119342] kobject_del+0x20/0x60
[116648.120022] btrfs_free_block_groups+0x405/0x430 [btrfs]
[116648.120707] close_ctree+0x19a/0x380 [btrfs]
[116648.121396] generic_shutdown_super+0x6c/0x110
[116648.122057] kill_anon_super+0xe/0x30
[116648.122702] btrfs_kill_super+0x12/0xa0 [btrfs]
[116648.123335] deactivate_locked_super+0x3a/0x70
[116648.123961] cleanup_mnt+0xb4/0x160
[116648.124586] task_work_run+0x7e/0xc0
[116648.125210] exit_to_usermode_loop+0xfa/0x100
[116648.125830] do_syscall_64+0x1cb/0x220
[116648.126463] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[116648.127080] RIP: 0033:0x7f9cdc073b37
(...)
[116648.135923] ---[ end trace 22e24db328ccadf9 ]---
These happen because, during the unmount path, we call kobject_del() for
raid kobjects that are not fully initialized, meaning that we set their
ktype (as btrfs_raid_ktype) through link_block_group() but we didn't set
their parent kobject, which is done through btrfs_add_raid_kobjects().
We have this split raid kobject setup since commit 75cb379d263521
("btrfs: defer adding raid type kobject until after chunk relocation") in
order to avoid triggering reclaim during contextes where we can not
(either we are holding a transaction handle or some lock required by
the transaction commit path), so that we do the calls to kobject_add(),
which triggers GFP_KERNEL allocations, through btrfs_add_raid_kobjects()
in contextes where it is safe to trigger reclaim. That change expected
that a new raid kobject can only be created either when mounting the
filesystem or after raid profile conversion through the relocation path.
However, we can have new raid kobject created in other two cases at least:
1) During device replace (or scrub) after adding a device a to the
filesystem. The replace procedure (and scrub) do calls to
btrfs_inc_block_group_ro() which can allocate a new block group
with a new raid profile (because we now have more devices). This
can be triggered by test cases btrfs/027 and btrfs/176.
2) During a degraded mount trough any write path. This can be triggered
by test case btrfs/124.
Fixing this by adding extra calls to btrfs_add_raid_kobjects(), not only
makes things more complex and fragile, can also introduce deadlocks with
reclaim the following way:
1) Calling btrfs_add_raid_kobjects() at btrfs_inc_block_group_ro() or
anywhere in the replace/scrub path will cause a deadlock with reclaim
because if reclaim happens and a transaction commit is triggered,
the transaction commit path will block at btrfs_scrub_pause().
2) During degraded mounts it is essentially impossible to figure out where
to add extra calls to btrfs_add_raid_kobjects(), because allocation of
a block group with a new raid profile can happen anywhere, which means
we can't safely figure out which contextes are safe for reclaim, as
we can either hold a transaction handle or some lock needed by the
transaction commit path.
So it is too complex and error prone to have this split setup of raid
kobjects. So fix the issue by consolidating the setup of the kobjects in a
single place, at link_block_group(), and setup a nofs context there in
order to prevent reclaim being triggered by the memory allocations done
through the call chain of kobject_add().
Besides fixing the sysfs warnings during kobject_del(), this also ensures
the sysfs directories for the new raid profiles end up created and visible
to users (a bug that existed before the 5.3 commit 7c7e301406d0a9
("btrfs: sysfs: Replace default_attrs in ktypes with groups")).
Fixes: 75cb379d263521 ("btrfs: defer adding raid type kobject until after chunk relocation")
Fixes: 7c7e301406d0a9 ("btrfs: sysfs: Replace default_attrs in ktypes with groups")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-08-07 18:21:46 +08:00
|
|
|
struct raid_kobject *rkobj;
|
|
|
|
unsigned int nofs_flag;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Setup a NOFS context because kobject_add(), deep in its call
|
|
|
|
* chain, does GFP_KERNEL allocations, and we are often called
|
|
|
|
* in a context where if reclaim is triggered we can deadlock
|
|
|
|
* (we are either holding a transaction handle or some lock
|
|
|
|
* required for a transaction commit).
|
|
|
|
*/
|
|
|
|
nofs_flag = memalloc_nofs_save();
|
|
|
|
rkobj = kzalloc(sizeof(*rkobj), GFP_KERNEL);
|
2018-03-21 03:25:26 +08:00
|
|
|
if (!rkobj) {
|
Btrfs: fix sysfs warning and missing raid sysfs directories
In the 5.3 merge window, commit 7c7e301406d0a9 ("btrfs: sysfs: Replace
default_attrs in ktypes with groups"), we started using the member
"defaults_groups" for the kobject type "btrfs_raid_ktype". That leads
to a series of warnings when running some test cases of fstests, such
as btrfs/027, btrfs/124 and btrfs/176. The traces produced by those
warnings are like the following:
[116648.059212] kernfs: can not remove 'total_bytes', no directory
[116648.060112] WARNING: CPU: 3 PID: 28500 at fs/kernfs/dir.c:1504 kernfs_remove_by_name_ns+0x75/0x80
(...)
[116648.066482] CPU: 3 PID: 28500 Comm: umount Tainted: G W 5.3.0-rc3-btrfs-next-54 #1
(...)
[116648.069376] RIP: 0010:kernfs_remove_by_name_ns+0x75/0x80
(...)
[116648.072385] RSP: 0018:ffffabfd0090bd08 EFLAGS: 00010282
[116648.073437] RAX: 0000000000000000 RBX: ffffffffc0c11998 RCX: 0000000000000000
[116648.074201] RDX: ffff9fff603a7a00 RSI: ffff9fff603978a8 RDI: ffff9fff603978a8
[116648.074956] RBP: ffffffffc0b9ca2f R08: 0000000000000000 R09: 0000000000000001
[116648.075708] R10: ffff9ffe1f72e1c0 R11: 0000000000000000 R12: ffffffffc0b94120
[116648.076434] R13: ffffffffb3d9b4e0 R14: 0000000000000000 R15: dead000000000100
[116648.077143] FS: 00007f9cdc78a2c0(0000) GS:ffff9fff60380000(0000) knlGS:0000000000000000
[116648.077852] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[116648.078546] CR2: 00007f9fc4747ab4 CR3: 00000005c7832003 CR4: 00000000003606e0
[116648.079235] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[116648.079907] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[116648.080585] Call Trace:
[116648.081262] remove_files+0x31/0x70
[116648.081929] sysfs_remove_group+0x38/0x80
[116648.082596] sysfs_remove_groups+0x34/0x70
[116648.083258] kobject_del+0x20/0x60
[116648.083933] btrfs_free_block_groups+0x405/0x430 [btrfs]
[116648.084608] close_ctree+0x19a/0x380 [btrfs]
[116648.085278] generic_shutdown_super+0x6c/0x110
[116648.085951] kill_anon_super+0xe/0x30
[116648.086621] btrfs_kill_super+0x12/0xa0 [btrfs]
[116648.087289] deactivate_locked_super+0x3a/0x70
[116648.087956] cleanup_mnt+0xb4/0x160
[116648.088620] task_work_run+0x7e/0xc0
[116648.089285] exit_to_usermode_loop+0xfa/0x100
[116648.089933] do_syscall_64+0x1cb/0x220
[116648.090567] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[116648.091197] RIP: 0033:0x7f9cdc073b37
(...)
[116648.100046] ---[ end trace 22e24db328ccadf8 ]---
[116648.100618] ------------[ cut here ]------------
[116648.101175] kernfs: can not remove 'used_bytes', no directory
[116648.101731] WARNING: CPU: 3 PID: 28500 at fs/kernfs/dir.c:1504 kernfs_remove_by_name_ns+0x75/0x80
(...)
[116648.105649] CPU: 3 PID: 28500 Comm: umount Tainted: G W 5.3.0-rc3-btrfs-next-54 #1
(...)
[116648.107461] RIP: 0010:kernfs_remove_by_name_ns+0x75/0x80
(...)
[116648.109336] RSP: 0018:ffffabfd0090bd08 EFLAGS: 00010282
[116648.109979] RAX: 0000000000000000 RBX: ffffffffc0c119a0 RCX: 0000000000000000
[116648.110625] RDX: ffff9fff603a7a00 RSI: ffff9fff603978a8 RDI: ffff9fff603978a8
[116648.111283] RBP: ffffffffc0b9ca41 R08: 0000000000000000 R09: 0000000000000001
[116648.111940] R10: ffff9ffe1f72e1c0 R11: 0000000000000000 R12: ffffffffc0b94120
[116648.112603] R13: ffffffffb3d9b4e0 R14: 0000000000000000 R15: dead000000000100
[116648.113268] FS: 00007f9cdc78a2c0(0000) GS:ffff9fff60380000(0000) knlGS:0000000000000000
[116648.113939] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[116648.114607] CR2: 00007f9fc4747ab4 CR3: 00000005c7832003 CR4: 00000000003606e0
[116648.115286] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[116648.115966] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[116648.116649] Call Trace:
[116648.117326] remove_files+0x31/0x70
[116648.117997] sysfs_remove_group+0x38/0x80
[116648.118671] sysfs_remove_groups+0x34/0x70
[116648.119342] kobject_del+0x20/0x60
[116648.120022] btrfs_free_block_groups+0x405/0x430 [btrfs]
[116648.120707] close_ctree+0x19a/0x380 [btrfs]
[116648.121396] generic_shutdown_super+0x6c/0x110
[116648.122057] kill_anon_super+0xe/0x30
[116648.122702] btrfs_kill_super+0x12/0xa0 [btrfs]
[116648.123335] deactivate_locked_super+0x3a/0x70
[116648.123961] cleanup_mnt+0xb4/0x160
[116648.124586] task_work_run+0x7e/0xc0
[116648.125210] exit_to_usermode_loop+0xfa/0x100
[116648.125830] do_syscall_64+0x1cb/0x220
[116648.126463] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[116648.127080] RIP: 0033:0x7f9cdc073b37
(...)
[116648.135923] ---[ end trace 22e24db328ccadf9 ]---
These happen because, during the unmount path, we call kobject_del() for
raid kobjects that are not fully initialized, meaning that we set their
ktype (as btrfs_raid_ktype) through link_block_group() but we didn't set
their parent kobject, which is done through btrfs_add_raid_kobjects().
We have this split raid kobject setup since commit 75cb379d263521
("btrfs: defer adding raid type kobject until after chunk relocation") in
order to avoid triggering reclaim during contextes where we can not
(either we are holding a transaction handle or some lock required by
the transaction commit path), so that we do the calls to kobject_add(),
which triggers GFP_KERNEL allocations, through btrfs_add_raid_kobjects()
in contextes where it is safe to trigger reclaim. That change expected
that a new raid kobject can only be created either when mounting the
filesystem or after raid profile conversion through the relocation path.
However, we can have new raid kobject created in other two cases at least:
1) During device replace (or scrub) after adding a device a to the
filesystem. The replace procedure (and scrub) do calls to
btrfs_inc_block_group_ro() which can allocate a new block group
with a new raid profile (because we now have more devices). This
can be triggered by test cases btrfs/027 and btrfs/176.
2) During a degraded mount trough any write path. This can be triggered
by test case btrfs/124.
Fixing this by adding extra calls to btrfs_add_raid_kobjects(), not only
makes things more complex and fragile, can also introduce deadlocks with
reclaim the following way:
1) Calling btrfs_add_raid_kobjects() at btrfs_inc_block_group_ro() or
anywhere in the replace/scrub path will cause a deadlock with reclaim
because if reclaim happens and a transaction commit is triggered,
the transaction commit path will block at btrfs_scrub_pause().
2) During degraded mounts it is essentially impossible to figure out where
to add extra calls to btrfs_add_raid_kobjects(), because allocation of
a block group with a new raid profile can happen anywhere, which means
we can't safely figure out which contextes are safe for reclaim, as
we can either hold a transaction handle or some lock needed by the
transaction commit path.
So it is too complex and error prone to have this split setup of raid
kobjects. So fix the issue by consolidating the setup of the kobjects in a
single place, at link_block_group(), and setup a nofs context there in
order to prevent reclaim being triggered by the memory allocations done
through the call chain of kobject_add().
Besides fixing the sysfs warnings during kobject_del(), this also ensures
the sysfs directories for the new raid profiles end up created and visible
to users (a bug that existed before the 5.3 commit 7c7e301406d0a9
("btrfs: sysfs: Replace default_attrs in ktypes with groups")).
Fixes: 75cb379d263521 ("btrfs: defer adding raid type kobject until after chunk relocation")
Fixes: 7c7e301406d0a9 ("btrfs: sysfs: Replace default_attrs in ktypes with groups")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-08-07 18:21:46 +08:00
|
|
|
memalloc_nofs_restore(nofs_flag);
|
2018-03-21 03:25:26 +08:00
|
|
|
btrfs_warn(cache->fs_info,
|
|
|
|
"couldn't alloc memory for raid level kobject");
|
|
|
|
return;
|
2013-11-02 01:07:04 +08:00
|
|
|
}
|
2018-03-21 03:25:26 +08:00
|
|
|
rkobj->flags = cache->flags;
|
|
|
|
kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
|
Btrfs: fix sysfs warning and missing raid sysfs directories
In the 5.3 merge window, commit 7c7e301406d0a9 ("btrfs: sysfs: Replace
default_attrs in ktypes with groups"), we started using the member
"defaults_groups" for the kobject type "btrfs_raid_ktype". That leads
to a series of warnings when running some test cases of fstests, such
as btrfs/027, btrfs/124 and btrfs/176. The traces produced by those
warnings are like the following:
[116648.059212] kernfs: can not remove 'total_bytes', no directory
[116648.060112] WARNING: CPU: 3 PID: 28500 at fs/kernfs/dir.c:1504 kernfs_remove_by_name_ns+0x75/0x80
(...)
[116648.066482] CPU: 3 PID: 28500 Comm: umount Tainted: G W 5.3.0-rc3-btrfs-next-54 #1
(...)
[116648.069376] RIP: 0010:kernfs_remove_by_name_ns+0x75/0x80
(...)
[116648.072385] RSP: 0018:ffffabfd0090bd08 EFLAGS: 00010282
[116648.073437] RAX: 0000000000000000 RBX: ffffffffc0c11998 RCX: 0000000000000000
[116648.074201] RDX: ffff9fff603a7a00 RSI: ffff9fff603978a8 RDI: ffff9fff603978a8
[116648.074956] RBP: ffffffffc0b9ca2f R08: 0000000000000000 R09: 0000000000000001
[116648.075708] R10: ffff9ffe1f72e1c0 R11: 0000000000000000 R12: ffffffffc0b94120
[116648.076434] R13: ffffffffb3d9b4e0 R14: 0000000000000000 R15: dead000000000100
[116648.077143] FS: 00007f9cdc78a2c0(0000) GS:ffff9fff60380000(0000) knlGS:0000000000000000
[116648.077852] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[116648.078546] CR2: 00007f9fc4747ab4 CR3: 00000005c7832003 CR4: 00000000003606e0
[116648.079235] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[116648.079907] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[116648.080585] Call Trace:
[116648.081262] remove_files+0x31/0x70
[116648.081929] sysfs_remove_group+0x38/0x80
[116648.082596] sysfs_remove_groups+0x34/0x70
[116648.083258] kobject_del+0x20/0x60
[116648.083933] btrfs_free_block_groups+0x405/0x430 [btrfs]
[116648.084608] close_ctree+0x19a/0x380 [btrfs]
[116648.085278] generic_shutdown_super+0x6c/0x110
[116648.085951] kill_anon_super+0xe/0x30
[116648.086621] btrfs_kill_super+0x12/0xa0 [btrfs]
[116648.087289] deactivate_locked_super+0x3a/0x70
[116648.087956] cleanup_mnt+0xb4/0x160
[116648.088620] task_work_run+0x7e/0xc0
[116648.089285] exit_to_usermode_loop+0xfa/0x100
[116648.089933] do_syscall_64+0x1cb/0x220
[116648.090567] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[116648.091197] RIP: 0033:0x7f9cdc073b37
(...)
[116648.100046] ---[ end trace 22e24db328ccadf8 ]---
[116648.100618] ------------[ cut here ]------------
[116648.101175] kernfs: can not remove 'used_bytes', no directory
[116648.101731] WARNING: CPU: 3 PID: 28500 at fs/kernfs/dir.c:1504 kernfs_remove_by_name_ns+0x75/0x80
(...)
[116648.105649] CPU: 3 PID: 28500 Comm: umount Tainted: G W 5.3.0-rc3-btrfs-next-54 #1
(...)
[116648.107461] RIP: 0010:kernfs_remove_by_name_ns+0x75/0x80
(...)
[116648.109336] RSP: 0018:ffffabfd0090bd08 EFLAGS: 00010282
[116648.109979] RAX: 0000000000000000 RBX: ffffffffc0c119a0 RCX: 0000000000000000
[116648.110625] RDX: ffff9fff603a7a00 RSI: ffff9fff603978a8 RDI: ffff9fff603978a8
[116648.111283] RBP: ffffffffc0b9ca41 R08: 0000000000000000 R09: 0000000000000001
[116648.111940] R10: ffff9ffe1f72e1c0 R11: 0000000000000000 R12: ffffffffc0b94120
[116648.112603] R13: ffffffffb3d9b4e0 R14: 0000000000000000 R15: dead000000000100
[116648.113268] FS: 00007f9cdc78a2c0(0000) GS:ffff9fff60380000(0000) knlGS:0000000000000000
[116648.113939] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[116648.114607] CR2: 00007f9fc4747ab4 CR3: 00000005c7832003 CR4: 00000000003606e0
[116648.115286] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[116648.115966] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[116648.116649] Call Trace:
[116648.117326] remove_files+0x31/0x70
[116648.117997] sysfs_remove_group+0x38/0x80
[116648.118671] sysfs_remove_groups+0x34/0x70
[116648.119342] kobject_del+0x20/0x60
[116648.120022] btrfs_free_block_groups+0x405/0x430 [btrfs]
[116648.120707] close_ctree+0x19a/0x380 [btrfs]
[116648.121396] generic_shutdown_super+0x6c/0x110
[116648.122057] kill_anon_super+0xe/0x30
[116648.122702] btrfs_kill_super+0x12/0xa0 [btrfs]
[116648.123335] deactivate_locked_super+0x3a/0x70
[116648.123961] cleanup_mnt+0xb4/0x160
[116648.124586] task_work_run+0x7e/0xc0
[116648.125210] exit_to_usermode_loop+0xfa/0x100
[116648.125830] do_syscall_64+0x1cb/0x220
[116648.126463] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[116648.127080] RIP: 0033:0x7f9cdc073b37
(...)
[116648.135923] ---[ end trace 22e24db328ccadf9 ]---
These happen because, during the unmount path, we call kobject_del() for
raid kobjects that are not fully initialized, meaning that we set their
ktype (as btrfs_raid_ktype) through link_block_group() but we didn't set
their parent kobject, which is done through btrfs_add_raid_kobjects().
We have this split raid kobject setup since commit 75cb379d263521
("btrfs: defer adding raid type kobject until after chunk relocation") in
order to avoid triggering reclaim during contextes where we can not
(either we are holding a transaction handle or some lock required by
the transaction commit path), so that we do the calls to kobject_add(),
which triggers GFP_KERNEL allocations, through btrfs_add_raid_kobjects()
in contextes where it is safe to trigger reclaim. That change expected
that a new raid kobject can only be created either when mounting the
filesystem or after raid profile conversion through the relocation path.
However, we can have new raid kobject created in other two cases at least:
1) During device replace (or scrub) after adding a device a to the
filesystem. The replace procedure (and scrub) do calls to
btrfs_inc_block_group_ro() which can allocate a new block group
with a new raid profile (because we now have more devices). This
can be triggered by test cases btrfs/027 and btrfs/176.
2) During a degraded mount trough any write path. This can be triggered
by test case btrfs/124.
Fixing this by adding extra calls to btrfs_add_raid_kobjects(), not only
makes things more complex and fragile, can also introduce deadlocks with
reclaim the following way:
1) Calling btrfs_add_raid_kobjects() at btrfs_inc_block_group_ro() or
anywhere in the replace/scrub path will cause a deadlock with reclaim
because if reclaim happens and a transaction commit is triggered,
the transaction commit path will block at btrfs_scrub_pause().
2) During degraded mounts it is essentially impossible to figure out where
to add extra calls to btrfs_add_raid_kobjects(), because allocation of
a block group with a new raid profile can happen anywhere, which means
we can't safely figure out which contextes are safe for reclaim, as
we can either hold a transaction handle or some lock needed by the
transaction commit path.
So it is too complex and error prone to have this split setup of raid
kobjects. So fix the issue by consolidating the setup of the kobjects in a
single place, at link_block_group(), and setup a nofs context there in
order to prevent reclaim being triggered by the memory allocations done
through the call chain of kobject_add().
Besides fixing the sysfs warnings during kobject_del(), this also ensures
the sysfs directories for the new raid profiles end up created and visible
to users (a bug that existed before the 5.3 commit 7c7e301406d0a9
("btrfs: sysfs: Replace default_attrs in ktypes with groups")).
Fixes: 75cb379d263521 ("btrfs: defer adding raid type kobject until after chunk relocation")
Fixes: 7c7e301406d0a9 ("btrfs: sysfs: Replace default_attrs in ktypes with groups")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-08-07 18:21:46 +08:00
|
|
|
ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s",
|
|
|
|
btrfs_bg_type_to_raid_name(rkobj->flags));
|
|
|
|
memalloc_nofs_restore(nofs_flag);
|
|
|
|
if (ret) {
|
|
|
|
kobject_put(&rkobj->kobj);
|
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"failed to add kobject for block cache, ignoring");
|
|
|
|
return;
|
|
|
|
}
|
2014-05-28 00:59:57 +08:00
|
|
|
space_info->block_group_kobjs[index] = &rkobj->kobj;
|
2013-11-02 01:07:04 +08:00
|
|
|
}
|
2010-05-16 22:46:24 +08:00
|
|
|
}
|
|
|
|
|
2014-01-15 20:00:55 +08:00
|
|
|
static struct btrfs_block_group_cache *
|
2016-06-23 06:54:24 +08:00
|
|
|
btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
|
|
|
|
u64 start, u64 size)
|
2014-01-15 20:00:55 +08:00
|
|
|
{
|
|
|
|
struct btrfs_block_group_cache *cache;
|
|
|
|
|
|
|
|
cache = kzalloc(sizeof(*cache), GFP_NOFS);
|
|
|
|
if (!cache)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
|
|
|
|
GFP_NOFS);
|
|
|
|
if (!cache->free_space_ctl) {
|
|
|
|
kfree(cache);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
cache->key.objectid = start;
|
|
|
|
cache->key.offset = size;
|
|
|
|
cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
|
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
cache->fs_info = fs_info;
|
2017-07-19 15:48:42 +08:00
|
|
|
cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
|
2015-09-30 11:50:37 +08:00
|
|
|
set_free_space_tree_thresholds(cache);
|
|
|
|
|
2014-01-15 20:00:55 +08:00
|
|
|
atomic_set(&cache->count, 1);
|
|
|
|
spin_lock_init(&cache->lock);
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
init_rwsem(&cache->data_rwsem);
|
2014-01-15 20:00:55 +08:00
|
|
|
INIT_LIST_HEAD(&cache->list);
|
|
|
|
INIT_LIST_HEAD(&cache->cluster_list);
|
2014-09-18 23:20:02 +08:00
|
|
|
INIT_LIST_HEAD(&cache->bg_list);
|
2014-10-31 21:49:34 +08:00
|
|
|
INIT_LIST_HEAD(&cache->ro_list);
|
2014-11-18 04:45:48 +08:00
|
|
|
INIT_LIST_HEAD(&cache->dirty_list);
|
2015-04-05 08:14:42 +08:00
|
|
|
INIT_LIST_HEAD(&cache->io_list);
|
2014-01-15 20:00:55 +08:00
|
|
|
btrfs_init_free_space_ctl(cache);
|
Btrfs: fix race between fs trimming and block group remove/allocation
Our fs trim operation, which is completely transactionless (doesn't start
or joins an existing transaction) consists of visiting all block groups
and then for each one to iterate its free space entries and perform a
discard operation against the space range represented by the free space
entries. However before performing a discard, the corresponding free space
entry is removed from the free space rbtree, and when the discard completes
it is added back to the free space rbtree.
If a block group remove operation happens while the discard is ongoing (or
before it starts and after a free space entry is hidden), we end up not
waiting for the discard to complete, remove the extent map that maps
logical address to physical addresses and the corresponding chunk metadata
from the the chunk and device trees. After that and before the discard
completes, the current running transaction can finish and a new one start,
allowing for new block groups that map to the same physical addresses to
be allocated and written to.
So fix this by keeping the extent map in memory until the discard completes
so that the same physical addresses aren't reused before it completes.
If the physical locations that are under a discard operation end up being
used for a new metadata block group for example, and dirty metadata extents
are written before the discard finishes (the VM might call writepages() of
our btree inode's i_mapping for example, or an fsync log commit happens) we
end up overwriting metadata with zeroes, which leads to errors from fsck
like the following:
checking extents
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
read block failed check_tree_block
owner ref check failed [833912832 16384]
Errors found in extent allocation tree or chunk allocation
checking free space cache
checking fs roots
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
read block failed check_tree_block
root 5 root dir 256 error
root 5 inode 260 errors 2001, no inode item, link count wrong
unresolved ref dir 256 index 0 namelen 8 name foobar_3 filetype 1 errors 6, no dir index, no inode ref
root 5 inode 262 errors 2001, no inode item, link count wrong
unresolved ref dir 256 index 0 namelen 8 name foobar_5 filetype 1 errors 6, no dir index, no inode ref
root 5 inode 263 errors 2001, no inode item, link count wrong
(...)
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-11-28 05:14:15 +08:00
|
|
|
atomic_set(&cache->trimming, 0);
|
2015-09-30 11:50:35 +08:00
|
|
|
mutex_init(&cache->free_space_lock);
|
2017-04-14 08:35:54 +08:00
|
|
|
btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
|
2014-01-15 20:00:55 +08:00
|
|
|
|
|
|
|
return cache;
|
|
|
|
}
|
|
|
|
|
2018-08-01 10:37:17 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Iterate all chunks and verify that each of them has the corresponding block
|
|
|
|
* group
|
|
|
|
*/
|
|
|
|
static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
2019-05-17 17:43:17 +08:00
|
|
|
struct extent_map_tree *map_tree = &fs_info->mapping_tree;
|
2018-08-01 10:37:17 +08:00
|
|
|
struct extent_map *em;
|
|
|
|
struct btrfs_block_group_cache *bg;
|
|
|
|
u64 start = 0;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
while (1) {
|
2019-05-17 17:43:17 +08:00
|
|
|
read_lock(&map_tree->lock);
|
2018-08-01 10:37:17 +08:00
|
|
|
/*
|
|
|
|
* lookup_extent_mapping will return the first extent map
|
|
|
|
* intersecting the range, so setting @len to 1 is enough to
|
|
|
|
* get the first chunk.
|
|
|
|
*/
|
2019-05-17 17:43:17 +08:00
|
|
|
em = lookup_extent_mapping(map_tree, start, 1);
|
|
|
|
read_unlock(&map_tree->lock);
|
2018-08-01 10:37:17 +08:00
|
|
|
if (!em)
|
|
|
|
break;
|
|
|
|
|
|
|
|
bg = btrfs_lookup_block_group(fs_info, em->start);
|
|
|
|
if (!bg) {
|
|
|
|
btrfs_err(fs_info,
|
|
|
|
"chunk start=%llu len=%llu doesn't have corresponding block group",
|
|
|
|
em->start, em->len);
|
|
|
|
ret = -EUCLEAN;
|
|
|
|
free_extent_map(em);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (bg->key.objectid != em->start ||
|
|
|
|
bg->key.offset != em->len ||
|
|
|
|
(bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
|
|
|
|
(em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
|
|
|
|
btrfs_err(fs_info,
|
|
|
|
"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
|
|
|
|
em->start, em->len,
|
|
|
|
em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
|
|
|
|
bg->key.objectid, bg->key.offset,
|
|
|
|
bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
|
|
|
|
ret = -EUCLEAN;
|
|
|
|
free_extent_map(em);
|
|
|
|
btrfs_put_block_group(bg);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
start = em->start + em->len;
|
|
|
|
free_extent_map(em);
|
|
|
|
btrfs_put_block_group(bg);
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2016-06-21 22:40:19 +08:00
|
|
|
int btrfs_read_block_groups(struct btrfs_fs_info *info)
|
2007-04-27 04:46:15 +08:00
|
|
|
{
|
|
|
|
struct btrfs_path *path;
|
|
|
|
int ret;
|
|
|
|
struct btrfs_block_group_cache *cache;
|
2008-03-25 03:01:59 +08:00
|
|
|
struct btrfs_space_info *space_info;
|
2007-04-27 04:46:15 +08:00
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_key found_key;
|
2007-10-16 04:14:19 +08:00
|
|
|
struct extent_buffer *leaf;
|
2010-06-22 02:48:16 +08:00
|
|
|
int need_clear = 0;
|
|
|
|
u64 cache_gen;
|
2016-08-26 09:08:27 +08:00
|
|
|
u64 feature;
|
|
|
|
int mixed;
|
|
|
|
|
|
|
|
feature = btrfs_super_incompat_flags(info->super_copy);
|
|
|
|
mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
|
2007-10-16 04:15:19 +08:00
|
|
|
|
2007-04-27 04:46:15 +08:00
|
|
|
key.objectid = 0;
|
2008-03-25 03:01:56 +08:00
|
|
|
key.offset = 0;
|
2014-06-05 00:41:45 +08:00
|
|
|
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
|
2007-04-27 04:46:15 +08:00
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
2015-11-27 23:31:35 +08:00
|
|
|
path->reada = READA_FORWARD;
|
2007-04-27 04:46:15 +08:00
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
cache_gen = btrfs_super_cache_generation(info->super_copy);
|
|
|
|
if (btrfs_test_opt(info, SPACE_CACHE) &&
|
|
|
|
btrfs_super_generation(info->super_copy) != cache_gen)
|
2010-06-22 02:48:16 +08:00
|
|
|
need_clear = 1;
|
2016-06-23 06:54:23 +08:00
|
|
|
if (btrfs_test_opt(info, CLEAR_CACHE))
|
2010-09-22 02:21:34 +08:00
|
|
|
need_clear = 1;
|
2010-06-22 02:48:16 +08:00
|
|
|
|
2009-01-06 10:25:51 +08:00
|
|
|
while (1) {
|
2016-06-22 09:16:51 +08:00
|
|
|
ret = find_first_block_group(info, path, &key);
|
2010-05-16 22:46:24 +08:00
|
|
|
if (ret > 0)
|
|
|
|
break;
|
2008-03-25 03:01:56 +08:00
|
|
|
if (ret != 0)
|
|
|
|
goto error;
|
2014-01-15 20:00:55 +08:00
|
|
|
|
2007-10-16 04:14:19 +08:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
|
2014-01-15 20:00:55 +08:00
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
cache = btrfs_create_block_group_cache(info, found_key.objectid,
|
2014-01-15 20:00:55 +08:00
|
|
|
found_key.offset);
|
2007-04-27 04:46:15 +08:00
|
|
|
if (!cache) {
|
2008-03-25 03:01:56 +08:00
|
|
|
ret = -ENOMEM;
|
2010-05-16 22:46:25 +08:00
|
|
|
goto error;
|
2007-04-27 04:46:15 +08:00
|
|
|
}
|
Btrfs: use hybrid extents+bitmap rb tree for free space
Currently btrfs has a problem where it can use a ridiculous amount of RAM simply
tracking free space. As free space gets fragmented, we end up with thousands of
entries on an rb-tree per block group, which usually spans 1 gig of area. Since
we currently don't ever flush free space cache back to disk this gets to be a
bit unweildly on large fs's with lots of fragmentation.
This patch solves this problem by using PAGE_SIZE bitmaps for parts of the free
space cache. Initially we calculate a threshold of extent entries we can
handle, which is however many extent entries we can cram into 16k of ram. The
maximum amount of RAM that should ever be used to track 1 gigabyte of diskspace
will be 32k of RAM, which scales much better than we did before.
Once we pass the extent threshold, we start adding bitmaps and using those
instead for tracking the free space. This patch also makes it so that any free
space thats less than 4 * sectorsize we go ahead and put into a bitmap. This is
nice since we try and allocate out of the front of a block group, so if the
front of a block group is heavily fragmented and then has a huge chunk of free
space at the end, we go ahead and add the fragmented areas to bitmaps and use a
normal extent entry to track the big chunk at the back of the block group.
I've also taken the opportunity to revamp how we search for free space.
Previously we indexed free space via an offset indexed rb tree and a bytes
indexed rb tree. I've dropped the bytes indexed rb tree and use only the offset
indexed rb tree. This cuts the number of tree operations we were doing
previously down by half, and gives us a little bit of a better allocation
pattern since we will always start from a specific offset and search forward
from there, instead of searching for the size we need and try and get it as
close as possible to the offset we want.
I've given this a healthy amount of testing pre-new format stuff, as well as
post-new format stuff. I've booted up my fedora box which is installed on btrfs
with this patch and ran with it for a few days without issues. I've not seen
any performance regressions in any of my tests.
Since the last patch Yan Zheng fixed a problem where we could have overlapping
entries, so updating their offset inline would cause problems. Thanks,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
|
Btrfs: fix a bug of writting free space cache during balance
Here is the whole story:
1)
A free space cache consists of two parts:
o free space cache inode, which is special becase it's stored in root tree.
o free space info, which is stored as the above inode's file data.
But we only build up another new inode and does not flush its free space info
onto disk when we _clear and setup_ free space cache, and this ends up with
that the block group cache's cache_state remains DC_SETUP instead of DC_WRITTEN.
And holding DC_SETUP means that we will not truncate this free space cache inode,
which means the disk offset of its file extent will remain _unchanged_ at least
until next transaction finishes committing itself.
2)
We can set a block group readonly when we relocate the block group.
However,
if the readonly block group covers the disk offset where our free space cache
inode is going to write, it will force the free space cache inode into
cow_file_range() and it'll end up hitting a BUG_ON.
3)
Due to the above analysis, we fix this bug by adding the missing dirty flag.
4)
However, it's not over, there is still another case, nospace_cache.
With nospace_cache, we do not want to set dirty flag, instead we just truncate
free space cache inode and bail out with setting cache state DC_WRITTEN.
We can benifit from it since it saves us another 'pre-allocation' part which
usually costs a lot.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-07-06 17:31:34 +08:00
|
|
|
if (need_clear) {
|
|
|
|
/*
|
|
|
|
* When we mount with old space cache, we need to
|
|
|
|
* set BTRFS_DC_CLEAR and set dirty flag.
|
|
|
|
*
|
|
|
|
* a) Setting 'BTRFS_DC_CLEAR' makes sure that we
|
|
|
|
* truncate the old free space cache inode and
|
|
|
|
* setup a new one.
|
|
|
|
* b) Setting 'dirty flag' makes sure that we flush
|
|
|
|
* the new space cache info onto disk.
|
|
|
|
*/
|
2016-06-23 06:54:23 +08:00
|
|
|
if (btrfs_test_opt(info, SPACE_CACHE))
|
2014-11-18 04:45:48 +08:00
|
|
|
cache->disk_cache_state = BTRFS_DC_CLEAR;
|
Btrfs: fix a bug of writting free space cache during balance
Here is the whole story:
1)
A free space cache consists of two parts:
o free space cache inode, which is special becase it's stored in root tree.
o free space info, which is stored as the above inode's file data.
But we only build up another new inode and does not flush its free space info
onto disk when we _clear and setup_ free space cache, and this ends up with
that the block group cache's cache_state remains DC_SETUP instead of DC_WRITTEN.
And holding DC_SETUP means that we will not truncate this free space cache inode,
which means the disk offset of its file extent will remain _unchanged_ at least
until next transaction finishes committing itself.
2)
We can set a block group readonly when we relocate the block group.
However,
if the readonly block group covers the disk offset where our free space cache
inode is going to write, it will force the free space cache inode into
cow_file_range() and it'll end up hitting a BUG_ON.
3)
Due to the above analysis, we fix this bug by adding the missing dirty flag.
4)
However, it's not over, there is still another case, nospace_cache.
With nospace_cache, we do not want to set dirty flag, instead we just truncate
free space cache inode and bail out with setting cache state DC_WRITTEN.
We can benifit from it since it saves us another 'pre-allocation' part which
usually costs a lot.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-07-06 17:31:34 +08:00
|
|
|
}
|
2010-06-22 02:48:16 +08:00
|
|
|
|
2007-10-16 04:14:19 +08:00
|
|
|
read_extent_buffer(leaf, &cache->item,
|
|
|
|
btrfs_item_ptr_offset(leaf, path->slots[0]),
|
|
|
|
sizeof(cache->item));
|
2014-01-15 20:00:55 +08:00
|
|
|
cache->flags = btrfs_block_group_flags(&cache->item);
|
2016-08-26 09:08:27 +08:00
|
|
|
if (!mixed &&
|
|
|
|
((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
|
|
|
|
(cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
|
|
|
|
btrfs_err(info,
|
|
|
|
"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
|
|
|
|
cache->key.objectid);
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto error;
|
|
|
|
}
|
2008-03-25 03:01:56 +08:00
|
|
|
|
2007-04-27 04:46:15 +08:00
|
|
|
key.objectid = found_key.objectid + found_key.offset;
|
2011-04-21 07:20:15 +08:00
|
|
|
btrfs_release_path(path);
|
2011-03-29 13:46:06 +08:00
|
|
|
|
2011-02-02 23:53:47 +08:00
|
|
|
/*
|
|
|
|
* We need to exclude the super stripes now so that the space
|
|
|
|
* info has super bytes accounted for, otherwise we'll think
|
|
|
|
* we have more space than we actually do.
|
|
|
|
*/
|
2018-06-20 20:49:09 +08:00
|
|
|
ret = exclude_super_stripes(cache);
|
2013-03-20 00:13:25 +08:00
|
|
|
if (ret) {
|
|
|
|
/*
|
|
|
|
* We may have excluded something, so call this just in
|
|
|
|
* case.
|
|
|
|
*/
|
2018-06-20 20:49:08 +08:00
|
|
|
free_excluded_extents(cache);
|
2014-01-15 20:00:55 +08:00
|
|
|
btrfs_put_block_group(cache);
|
2013-03-20 00:13:25 +08:00
|
|
|
goto error;
|
|
|
|
}
|
2011-02-02 23:53:47 +08:00
|
|
|
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
/*
|
|
|
|
* check for two cases, either we are full, and therefore
|
|
|
|
* don't need to bother with the caching work since we won't
|
|
|
|
* find any space, or we are empty, and we can just add all
|
2018-11-28 19:05:13 +08:00
|
|
|
* the space in and be done with it. This saves us _a_lot_ of
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
* time, particularly in the full case.
|
|
|
|
*/
|
|
|
|
if (found_key.offset == btrfs_block_group_used(&cache->item)) {
|
2009-09-12 04:11:19 +08:00
|
|
|
cache->last_byte_to_unpin = (u64)-1;
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
cache->cached = BTRFS_CACHE_FINISHED;
|
2018-06-20 20:49:08 +08:00
|
|
|
free_excluded_extents(cache);
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
} else if (btrfs_block_group_used(&cache->item) == 0) {
|
2009-09-12 04:11:19 +08:00
|
|
|
cache->last_byte_to_unpin = (u64)-1;
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
cache->cached = BTRFS_CACHE_FINISHED;
|
2018-05-10 20:44:45 +08:00
|
|
|
add_new_free_space(cache, found_key.objectid,
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
found_key.objectid +
|
|
|
|
found_key.offset);
|
2018-06-20 20:49:08 +08:00
|
|
|
free_excluded_extents(cache);
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
}
|
2007-10-16 04:15:19 +08:00
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
ret = btrfs_add_block_group_cache(info, cache);
|
2013-04-03 00:40:42 +08:00
|
|
|
if (ret) {
|
|
|
|
btrfs_remove_free_space_cache(cache);
|
|
|
|
btrfs_put_block_group(cache);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
trace_btrfs_add_block_group(info, cache, 0);
|
2019-06-19 04:09:19 +08:00
|
|
|
btrfs_update_space_info(info, cache->flags, found_key.offset,
|
|
|
|
btrfs_block_group_used(&cache->item),
|
|
|
|
cache->bytes_super, &space_info);
|
2013-04-03 00:40:42 +08:00
|
|
|
|
2008-03-25 03:01:59 +08:00
|
|
|
cache->space_info = space_info;
|
2009-09-12 04:11:20 +08:00
|
|
|
|
2017-08-21 17:43:50 +08:00
|
|
|
link_block_group(cache);
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
set_avail_alloc_bits(info, cache->flags);
|
2016-06-23 06:54:24 +08:00
|
|
|
if (btrfs_chunk_readonly(info, cache->key.objectid)) {
|
2015-08-05 16:43:27 +08:00
|
|
|
inc_block_group_ro(cache, 1);
|
2014-09-18 23:20:02 +08:00
|
|
|
} else if (btrfs_block_group_used(&cache->item) == 0) {
|
2018-05-22 16:43:47 +08:00
|
|
|
ASSERT(list_empty(&cache->bg_list));
|
|
|
|
btrfs_mark_bg_unused(cache);
|
2014-09-18 23:20:02 +08:00
|
|
|
}
|
2007-04-27 04:46:15 +08:00
|
|
|
}
|
2010-05-16 22:46:24 +08:00
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
list_for_each_entry_rcu(space_info, &info->space_info, list) {
|
2016-06-23 06:54:24 +08:00
|
|
|
if (!(get_alloc_profile(info, space_info->flags) &
|
2010-05-16 22:46:24 +08:00
|
|
|
(BTRFS_BLOCK_GROUP_RAID10 |
|
2019-05-31 21:39:31 +08:00
|
|
|
BTRFS_BLOCK_GROUP_RAID1_MASK |
|
2019-05-31 22:54:26 +08:00
|
|
|
BTRFS_BLOCK_GROUP_RAID56_MASK |
|
2010-05-16 22:46:24 +08:00
|
|
|
BTRFS_BLOCK_GROUP_DUP)))
|
|
|
|
continue;
|
|
|
|
/*
|
|
|
|
* avoid allocating from un-mirrored block group if there are
|
|
|
|
* mirrored block groups.
|
|
|
|
*/
|
2013-07-16 14:58:56 +08:00
|
|
|
list_for_each_entry(cache,
|
|
|
|
&space_info->block_groups[BTRFS_RAID_RAID0],
|
|
|
|
list)
|
2015-08-05 16:43:27 +08:00
|
|
|
inc_block_group_ro(cache, 1);
|
2013-07-16 14:58:56 +08:00
|
|
|
list_for_each_entry(cache,
|
|
|
|
&space_info->block_groups[BTRFS_RAID_SINGLE],
|
|
|
|
list)
|
2015-08-05 16:43:27 +08:00
|
|
|
inc_block_group_ro(cache, 1);
|
2007-04-27 04:46:15 +08:00
|
|
|
}
|
2010-05-16 22:46:25 +08:00
|
|
|
|
2019-06-20 01:47:23 +08:00
|
|
|
btrfs_init_global_block_rsv(info);
|
2018-08-01 10:37:17 +08:00
|
|
|
ret = check_chunk_block_group_mappings(info);
|
2008-03-25 03:01:56 +08:00
|
|
|
error:
|
2007-04-27 04:46:15 +08:00
|
|
|
btrfs_free_path(path);
|
2008-03-25 03:01:56 +08:00
|
|
|
return ret;
|
2007-04-27 04:46:15 +08:00
|
|
|
}
|
2008-03-25 03:01:59 +08:00
|
|
|
|
2018-02-07 23:55:40 +08:00
|
|
|
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
|
2012-09-12 04:57:25 +08:00
|
|
|
{
|
2018-02-07 23:55:40 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2018-09-28 19:18:02 +08:00
|
|
|
struct btrfs_block_group_cache *block_group;
|
2016-06-23 06:54:23 +08:00
|
|
|
struct btrfs_root *extent_root = fs_info->extent_root;
|
2012-09-12 04:57:25 +08:00
|
|
|
struct btrfs_block_group_item item;
|
|
|
|
struct btrfs_key key;
|
|
|
|
int ret = 0;
|
|
|
|
|
2018-10-12 17:03:55 +08:00
|
|
|
if (!trans->can_flush_pending_bgs)
|
|
|
|
return;
|
|
|
|
|
2018-09-28 19:18:02 +08:00
|
|
|
while (!list_empty(&trans->new_bgs)) {
|
|
|
|
block_group = list_first_entry(&trans->new_bgs,
|
|
|
|
struct btrfs_block_group_cache,
|
|
|
|
bg_list);
|
2012-09-12 04:57:25 +08:00
|
|
|
if (ret)
|
2014-11-26 23:28:55 +08:00
|
|
|
goto next;
|
2012-09-12 04:57:25 +08:00
|
|
|
|
|
|
|
spin_lock(&block_group->lock);
|
|
|
|
memcpy(&item, &block_group->item, sizeof(item));
|
|
|
|
memcpy(&key, &block_group->key, sizeof(key));
|
|
|
|
spin_unlock(&block_group->lock);
|
|
|
|
|
|
|
|
ret = btrfs_insert_item(trans, extent_root, &key, &item,
|
|
|
|
sizeof(item));
|
|
|
|
if (ret)
|
2016-06-11 06:19:25 +08:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2018-07-21 00:37:53 +08:00
|
|
|
ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
|
2013-06-28 01:22:46 +08:00
|
|
|
if (ret)
|
2016-06-11 06:19:25 +08:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2018-05-10 20:44:41 +08:00
|
|
|
add_block_group_free_space(trans, block_group);
|
2015-09-30 11:50:37 +08:00
|
|
|
/* already aborted the transaction if it failed. */
|
2014-11-26 23:28:55 +08:00
|
|
|
next:
|
btrfs: introduce delayed_refs_rsv
Traditionally we've had voodoo in btrfs to account for the space that
delayed refs may take up by having a global_block_rsv. This works most
of the time, except when it doesn't. We've had issues reported and seen
in production where sometimes the global reserve is exhausted during
transaction commit before we can run all of our delayed refs, resulting
in an aborted transaction. Because of this voodoo we have equally
dubious flushing semantics around throttling delayed refs which we often
get wrong.
So instead give them their own block_rsv. This way we can always know
exactly how much outstanding space we need for delayed refs. This
allows us to make sure we are constantly filling that reservation up
with space, and allows us to put more precise pressure on the enospc
system. Instead of doing math to see if its a good time to throttle,
the normal enospc code will be invoked if we have a lot of delayed refs
pending, and they will be run via the normal flushing mechanism.
For now the delayed_refs_rsv will hold the reservations for the delayed
refs, the block group updates, and deleting csums. We could have a
separate rsv for the block group updates, but the csum deletion stuff is
still handled via the delayed_refs so that will stay there.
Historical background:
The global reserve has grown to cover everything we don't reserve space
explicitly for, and we've grown a lot of weird ad-hoc heuristics to know
if we're running short on space and when it's time to force a commit. A
failure rate of 20-40 file systems when we run hundreds of thousands of
them isn't super high, but cleaning up this code will make things less
ugly and more predictible.
Thus the delayed refs rsv. We always know how many delayed refs we have
outstanding, and although running them generates more we can use the
global reserve for that spill over, which fits better into it's desired
use than a full blown reservation. This first approach is to simply
take how many times we're reserving space for and multiply that by 2 in
order to save enough space for the delayed refs that could be generated.
This is a niave approach and will probably evolve, but for now it works.
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com> # high-level review
[ added background notes from the cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-03 23:20:33 +08:00
|
|
|
btrfs_delayed_refs_rsv_release(fs_info, 1);
|
2014-11-26 23:28:55 +08:00
|
|
|
list_del_init(&block_group->bg_list);
|
2012-09-12 04:57:25 +08:00
|
|
|
}
|
2018-10-12 17:03:55 +08:00
|
|
|
btrfs_trans_release_chunk_metadata(trans);
|
2012-09-12 04:57:25 +08:00
|
|
|
}
|
|
|
|
|
2018-06-20 20:48:55 +08:00
|
|
|
int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
|
2017-07-27 19:22:11 +08:00
|
|
|
u64 type, u64 chunk_offset, u64 size)
|
2008-03-25 03:01:59 +08:00
|
|
|
{
|
2018-06-20 20:48:55 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2008-03-25 03:01:59 +08:00
|
|
|
struct btrfs_block_group_cache *cache;
|
2016-06-23 06:54:23 +08:00
|
|
|
int ret;
|
2008-03-25 03:01:59 +08:00
|
|
|
|
2019-03-20 20:28:05 +08:00
|
|
|
btrfs_set_log_full_commit(trans);
|
2008-09-06 04:13:11 +08:00
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
|
Btrfs: free space accounting redo
1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size. The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing. If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible. When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.
2) remove the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree thats tracks block group cache via offset. also
added a per space_info list that tracks the block group cache for the particular
space so we can lookup related block groups easily.
3) cleaned up the allocation code to make it a little easier to read and a
little less complicated. Basically there are 3 steps, first look from our
provided hint. If we couldn't find from that given hint, start back at our
original search start and look for space from there. If that fails try to
allocate space if we can and start looking again. If not we're screwed and need
to start over again.
4) small fixes. there were some issues in volumes.c where we wouldn't allocate
the rest of the disk. fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space. Generally with data allocations we don't track
where we last allocated from, so everytime we did a data allocation we'd search
through every block group that we have looking for free space. Now searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups. The alloc_hint has fixed
this slight degredation and made things semi-normal.
There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space. This only happens with metadata
allocations, and only when we are almost full. So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall. I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-24 01:14:11 +08:00
|
|
|
if (!cache)
|
|
|
|
return -ENOMEM;
|
2011-03-29 13:46:06 +08:00
|
|
|
|
2008-03-25 03:01:59 +08:00
|
|
|
btrfs_set_block_group_used(&cache->item, bytes_used);
|
2017-07-27 19:22:11 +08:00
|
|
|
btrfs_set_block_group_chunk_objectid(&cache->item,
|
|
|
|
BTRFS_FIRST_CHUNK_TREE_OBJECTID);
|
2008-03-25 03:01:59 +08:00
|
|
|
btrfs_set_block_group_flags(&cache->item, type);
|
|
|
|
|
2014-01-15 20:00:55 +08:00
|
|
|
cache->flags = type;
|
2009-09-12 04:11:19 +08:00
|
|
|
cache->last_byte_to_unpin = (u64)-1;
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
cache->cached = BTRFS_CACHE_FINISHED;
|
2015-09-30 11:50:37 +08:00
|
|
|
cache->needs_free_space = 1;
|
2018-06-20 20:49:09 +08:00
|
|
|
ret = exclude_super_stripes(cache);
|
2013-03-20 00:13:25 +08:00
|
|
|
if (ret) {
|
|
|
|
/*
|
|
|
|
* We may have excluded something, so call this just in
|
|
|
|
* case.
|
|
|
|
*/
|
2018-06-20 20:49:08 +08:00
|
|
|
free_excluded_extents(cache);
|
2014-01-15 20:00:55 +08:00
|
|
|
btrfs_put_block_group(cache);
|
2013-03-20 00:13:25 +08:00
|
|
|
return ret;
|
|
|
|
}
|
Btrfs: use hybrid extents+bitmap rb tree for free space
Currently btrfs has a problem where it can use a ridiculous amount of RAM simply
tracking free space. As free space gets fragmented, we end up with thousands of
entries on an rb-tree per block group, which usually spans 1 gig of area. Since
we currently don't ever flush free space cache back to disk this gets to be a
bit unweildly on large fs's with lots of fragmentation.
This patch solves this problem by using PAGE_SIZE bitmaps for parts of the free
space cache. Initially we calculate a threshold of extent entries we can
handle, which is however many extent entries we can cram into 16k of ram. The
maximum amount of RAM that should ever be used to track 1 gigabyte of diskspace
will be 32k of RAM, which scales much better than we did before.
Once we pass the extent threshold, we start adding bitmaps and using those
instead for tracking the free space. This patch also makes it so that any free
space thats less than 4 * sectorsize we go ahead and put into a bitmap. This is
nice since we try and allocate out of the front of a block group, so if the
front of a block group is heavily fragmented and then has a huge chunk of free
space at the end, we go ahead and add the fragmented areas to bitmaps and use a
normal extent entry to track the big chunk at the back of the block group.
I've also taken the opportunity to revamp how we search for free space.
Previously we indexed free space via an offset indexed rb tree and a bytes
indexed rb tree. I've dropped the bytes indexed rb tree and use only the offset
indexed rb tree. This cuts the number of tree operations we were doing
previously down by half, and gives us a little bit of a better allocation
pattern since we will always start from a specific offset and search forward
from there, instead of searching for the size we need and try and get it as
close as possible to the offset we want.
I've given this a healthy amount of testing pre-new format stuff, as well as
post-new format stuff. I've booted up my fedora box which is installed on btrfs
with this patch and ran with it for a few days without issues. I've not seen
any performance regressions in any of my tests.
Since the last patch Yan Zheng fixed a problem where we could have overlapping
entries, so updating their offset inline would cause problems. Thanks,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
|
2018-05-10 20:44:45 +08:00
|
|
|
add_new_free_space(cache, chunk_offset, chunk_offset + size);
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
|
2018-06-20 20:49:08 +08:00
|
|
|
free_excluded_extents(cache);
|
2009-09-12 04:11:19 +08:00
|
|
|
|
2015-09-24 02:54:14 +08:00
|
|
|
#ifdef CONFIG_BTRFS_DEBUG
|
2016-06-23 06:54:24 +08:00
|
|
|
if (btrfs_should_fragment_free_space(cache)) {
|
2015-09-24 02:54:14 +08:00
|
|
|
u64 new_bytes_used = size - bytes_used;
|
|
|
|
|
|
|
|
bytes_used += new_bytes_used >> 1;
|
2016-06-23 06:54:24 +08:00
|
|
|
fragment_free_space(cache);
|
2015-09-24 02:54:14 +08:00
|
|
|
}
|
|
|
|
#endif
|
2015-05-12 07:28:11 +08:00
|
|
|
/*
|
2017-05-22 14:35:49 +08:00
|
|
|
* Ensure the corresponding space_info object is created and
|
|
|
|
* assigned to our block group. We want our bg to be added to the rbtree
|
|
|
|
* with its ->space_info set.
|
2015-05-12 07:28:11 +08:00
|
|
|
*/
|
2019-06-19 04:09:19 +08:00
|
|
|
cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
|
2018-03-21 03:25:25 +08:00
|
|
|
ASSERT(cache->space_info);
|
2015-05-12 07:28:11 +08:00
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
ret = btrfs_add_block_group_cache(fs_info, cache);
|
2013-04-03 00:40:42 +08:00
|
|
|
if (ret) {
|
|
|
|
btrfs_remove_free_space_cache(cache);
|
|
|
|
btrfs_put_block_group(cache);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2015-05-12 07:28:11 +08:00
|
|
|
/*
|
|
|
|
* Now that our block group has its ->space_info set and is inserted in
|
|
|
|
* the rbtree, update the space info's counters.
|
|
|
|
*/
|
2016-06-23 06:54:23 +08:00
|
|
|
trace_btrfs_add_block_group(fs_info, cache, 1);
|
2019-06-19 04:09:19 +08:00
|
|
|
btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
|
2016-03-26 01:25:47 +08:00
|
|
|
cache->bytes_super, &cache->space_info);
|
2019-06-20 01:47:23 +08:00
|
|
|
btrfs_update_global_block_rsv(fs_info);
|
2009-09-12 04:11:20 +08:00
|
|
|
|
2017-08-21 17:43:50 +08:00
|
|
|
link_block_group(cache);
|
2008-03-25 03:01:59 +08:00
|
|
|
|
2014-09-18 23:20:02 +08:00
|
|
|
list_add_tail(&cache->bg_list, &trans->new_bgs);
|
btrfs: introduce delayed_refs_rsv
Traditionally we've had voodoo in btrfs to account for the space that
delayed refs may take up by having a global_block_rsv. This works most
of the time, except when it doesn't. We've had issues reported and seen
in production where sometimes the global reserve is exhausted during
transaction commit before we can run all of our delayed refs, resulting
in an aborted transaction. Because of this voodoo we have equally
dubious flushing semantics around throttling delayed refs which we often
get wrong.
So instead give them their own block_rsv. This way we can always know
exactly how much outstanding space we need for delayed refs. This
allows us to make sure we are constantly filling that reservation up
with space, and allows us to put more precise pressure on the enospc
system. Instead of doing math to see if its a good time to throttle,
the normal enospc code will be invoked if we have a lot of delayed refs
pending, and they will be run via the normal flushing mechanism.
For now the delayed_refs_rsv will hold the reservations for the delayed
refs, the block group updates, and deleting csums. We could have a
separate rsv for the block group updates, but the csum deletion stuff is
still handled via the delayed_refs so that will stay there.
Historical background:
The global reserve has grown to cover everything we don't reserve space
explicitly for, and we've grown a lot of weird ad-hoc heuristics to know
if we're running short on space and when it's time to force a commit. A
failure rate of 20-40 file systems when we run hundreds of thousands of
them isn't super high, but cleaning up this code will make things less
ugly and more predictible.
Thus the delayed refs rsv. We always know how many delayed refs we have
outstanding, and although running them generates more we can use the
global reserve for that spill over, which fits better into it's desired
use than a full blown reservation. This first approach is to simply
take how many times we're reserving space for and multiply that by 2 in
order to save enough space for the delayed refs that could be generated.
This is a niave approach and will probably evolve, but for now it works.
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com> # high-level review
[ added background notes from the cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-03 23:20:33 +08:00
|
|
|
trans->delayed_ref_updates++;
|
|
|
|
btrfs_update_delayed_refs_rsv(trans);
|
2008-03-25 03:01:59 +08:00
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
set_avail_alloc_bits(fs_info, type);
|
2008-03-25 03:01:59 +08:00
|
|
|
return 0;
|
|
|
|
}
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
|
2012-01-17 04:04:47 +08:00
|
|
|
static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
|
|
|
|
{
|
2012-03-27 22:09:16 +08:00
|
|
|
u64 extra_flags = chunk_to_extended(flags) &
|
|
|
|
BTRFS_EXTENDED_PROFILE_MASK;
|
2012-01-17 04:04:47 +08:00
|
|
|
|
2013-01-29 18:13:12 +08:00
|
|
|
write_seqlock(&fs_info->profiles_lock);
|
2012-01-17 04:04:47 +08:00
|
|
|
if (flags & BTRFS_BLOCK_GROUP_DATA)
|
|
|
|
fs_info->avail_data_alloc_bits &= ~extra_flags;
|
|
|
|
if (flags & BTRFS_BLOCK_GROUP_METADATA)
|
|
|
|
fs_info->avail_metadata_alloc_bits &= ~extra_flags;
|
|
|
|
if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
|
|
|
|
fs_info->avail_system_alloc_bits &= ~extra_flags;
|
2013-01-29 18:13:12 +08:00
|
|
|
write_sequnlock(&fs_info->profiles_lock);
|
2012-01-17 04:04:47 +08:00
|
|
|
}
|
|
|
|
|
2019-06-11 20:31:01 +08:00
|
|
|
/*
|
|
|
|
* Clear incompat bits for the following feature(s):
|
|
|
|
*
|
|
|
|
* - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
|
|
|
|
* in the whole filesystem
|
|
|
|
*/
|
|
|
|
static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
|
|
|
|
{
|
|
|
|
if (flags & BTRFS_BLOCK_GROUP_RAID56_MASK) {
|
|
|
|
struct list_head *head = &fs_info->space_info;
|
|
|
|
struct btrfs_space_info *sinfo;
|
|
|
|
|
|
|
|
list_for_each_entry_rcu(sinfo, head, list) {
|
|
|
|
bool found = false;
|
|
|
|
|
|
|
|
down_read(&sinfo->groups_sem);
|
|
|
|
if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
|
|
|
|
found = true;
|
|
|
|
if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
|
|
|
|
found = true;
|
|
|
|
up_read(&sinfo->groups_sem);
|
|
|
|
|
|
|
|
if (found)
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
btrfs_clear_fs_incompat(fs_info, RAID56);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
|
2018-06-20 20:48:56 +08:00
|
|
|
u64 group_start, struct extent_map *em)
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
{
|
2018-06-20 20:48:56 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2016-06-22 09:16:51 +08:00
|
|
|
struct btrfs_root *root = fs_info->extent_root;
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_block_group_cache *block_group;
|
2009-06-05 03:34:51 +08:00
|
|
|
struct btrfs_free_cluster *cluster;
|
2016-06-23 06:54:23 +08:00
|
|
|
struct btrfs_root *tree_root = fs_info->tree_root;
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
struct btrfs_key key;
|
2010-06-22 02:48:16 +08:00
|
|
|
struct inode *inode;
|
2014-05-28 00:59:57 +08:00
|
|
|
struct kobject *kobj = NULL;
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
int ret;
|
2012-01-17 04:04:47 +08:00
|
|
|
int index;
|
2010-10-15 02:52:27 +08:00
|
|
|
int factor;
|
2014-11-26 23:28:51 +08:00
|
|
|
struct btrfs_caching_control *caching_ctl = NULL;
|
Btrfs: fix race between fs trimming and block group remove/allocation
Our fs trim operation, which is completely transactionless (doesn't start
or joins an existing transaction) consists of visiting all block groups
and then for each one to iterate its free space entries and perform a
discard operation against the space range represented by the free space
entries. However before performing a discard, the corresponding free space
entry is removed from the free space rbtree, and when the discard completes
it is added back to the free space rbtree.
If a block group remove operation happens while the discard is ongoing (or
before it starts and after a free space entry is hidden), we end up not
waiting for the discard to complete, remove the extent map that maps
logical address to physical addresses and the corresponding chunk metadata
from the the chunk and device trees. After that and before the discard
completes, the current running transaction can finish and a new one start,
allowing for new block groups that map to the same physical addresses to
be allocated and written to.
So fix this by keeping the extent map in memory until the discard completes
so that the same physical addresses aren't reused before it completes.
If the physical locations that are under a discard operation end up being
used for a new metadata block group for example, and dirty metadata extents
are written before the discard finishes (the VM might call writepages() of
our btree inode's i_mapping for example, or an fsync log commit happens) we
end up overwriting metadata with zeroes, which leads to errors from fsck
like the following:
checking extents
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
read block failed check_tree_block
owner ref check failed [833912832 16384]
Errors found in extent allocation tree or chunk allocation
checking free space cache
checking fs roots
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
read block failed check_tree_block
root 5 root dir 256 error
root 5 inode 260 errors 2001, no inode item, link count wrong
unresolved ref dir 256 index 0 namelen 8 name foobar_3 filetype 1 errors 6, no dir index, no inode ref
root 5 inode 262 errors 2001, no inode item, link count wrong
unresolved ref dir 256 index 0 namelen 8 name foobar_5 filetype 1 errors 6, no dir index, no inode ref
root 5 inode 263 errors 2001, no inode item, link count wrong
(...)
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-11-28 05:14:15 +08:00
|
|
|
bool remove_em;
|
btrfs: introduce delayed_refs_rsv
Traditionally we've had voodoo in btrfs to account for the space that
delayed refs may take up by having a global_block_rsv. This works most
of the time, except when it doesn't. We've had issues reported and seen
in production where sometimes the global reserve is exhausted during
transaction commit before we can run all of our delayed refs, resulting
in an aborted transaction. Because of this voodoo we have equally
dubious flushing semantics around throttling delayed refs which we often
get wrong.
So instead give them their own block_rsv. This way we can always know
exactly how much outstanding space we need for delayed refs. This
allows us to make sure we are constantly filling that reservation up
with space, and allows us to put more precise pressure on the enospc
system. Instead of doing math to see if its a good time to throttle,
the normal enospc code will be invoked if we have a lot of delayed refs
pending, and they will be run via the normal flushing mechanism.
For now the delayed_refs_rsv will hold the reservations for the delayed
refs, the block group updates, and deleting csums. We could have a
separate rsv for the block group updates, but the csum deletion stuff is
still handled via the delayed_refs so that will stay there.
Historical background:
The global reserve has grown to cover everything we don't reserve space
explicitly for, and we've grown a lot of weird ad-hoc heuristics to know
if we're running short on space and when it's time to force a commit. A
failure rate of 20-40 file systems when we run hundreds of thousands of
them isn't super high, but cleaning up this code will make things less
ugly and more predictible.
Thus the delayed refs rsv. We always know how many delayed refs we have
outstanding, and although running them generates more we can use the
global reserve for that spill over, which fits better into it's desired
use than a full blown reservation. This first approach is to simply
take how many times we're reserving space for and multiply that by 2 in
order to save enough space for the delayed refs that could be generated.
This is a niave approach and will probably evolve, but for now it works.
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com> # high-level review
[ added background notes from the cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-03 23:20:33 +08:00
|
|
|
bool remove_rsv = false;
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
|
2016-06-22 09:16:51 +08:00
|
|
|
block_group = btrfs_lookup_block_group(fs_info, group_start);
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
BUG_ON(!block_group);
|
2008-11-13 03:34:12 +08:00
|
|
|
BUG_ON(!block_group->ro);
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
|
2018-04-26 17:17:20 +08:00
|
|
|
trace_btrfs_remove_block_group(block_group);
|
2011-03-07 10:13:33 +08:00
|
|
|
/*
|
|
|
|
* Free the reserved super bytes from this block group before
|
|
|
|
* remove it.
|
|
|
|
*/
|
2018-06-20 20:49:08 +08:00
|
|
|
free_excluded_extents(block_group);
|
2017-09-30 03:43:50 +08:00
|
|
|
btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
|
|
|
|
block_group->key.offset);
|
2011-03-07 10:13:33 +08:00
|
|
|
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
memcpy(&key, &block_group->key, sizeof(key));
|
2018-01-30 18:20:45 +08:00
|
|
|
index = btrfs_bg_flags_to_raid_index(block_group->flags);
|
2018-07-14 02:46:30 +08:00
|
|
|
factor = btrfs_bg_type_to_factor(block_group->flags);
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
|
2009-06-05 03:34:51 +08:00
|
|
|
/* make sure this block group isn't part of an allocation cluster */
|
2016-06-23 06:54:23 +08:00
|
|
|
cluster = &fs_info->data_alloc_cluster;
|
2009-06-05 03:34:51 +08:00
|
|
|
spin_lock(&cluster->refill_lock);
|
|
|
|
btrfs_return_cluster_to_free_space(block_group, cluster);
|
|
|
|
spin_unlock(&cluster->refill_lock);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* make sure this block group isn't part of a metadata
|
|
|
|
* allocation cluster
|
|
|
|
*/
|
2016-06-23 06:54:23 +08:00
|
|
|
cluster = &fs_info->meta_alloc_cluster;
|
2009-06-05 03:34:51 +08:00
|
|
|
spin_lock(&cluster->refill_lock);
|
|
|
|
btrfs_return_cluster_to_free_space(block_group, cluster);
|
|
|
|
spin_unlock(&cluster->refill_lock);
|
|
|
|
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
path = btrfs_alloc_path();
|
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-14 01:38:47 +08:00
|
|
|
if (!path) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
|
2015-04-07 03:46:08 +08:00
|
|
|
/*
|
|
|
|
* get the inode first so any iput calls done for the io_list
|
|
|
|
* aren't the final iput (no unlinks allowed now)
|
|
|
|
*/
|
2019-03-20 20:40:19 +08:00
|
|
|
inode = lookup_free_space_inode(block_group, path);
|
2015-04-07 03:46:08 +08:00
|
|
|
|
|
|
|
mutex_lock(&trans->transaction->cache_write_mutex);
|
|
|
|
/*
|
2018-11-28 19:05:13 +08:00
|
|
|
* Make sure our free space cache IO is done before removing the
|
2015-04-07 03:46:08 +08:00
|
|
|
* free space inode
|
|
|
|
*/
|
|
|
|
spin_lock(&trans->transaction->dirty_bgs_lock);
|
|
|
|
if (!list_empty(&block_group->io_list)) {
|
|
|
|
list_del_init(&block_group->io_list);
|
|
|
|
|
|
|
|
WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
|
|
|
|
|
|
|
|
spin_unlock(&trans->transaction->dirty_bgs_lock);
|
2016-09-10 00:09:35 +08:00
|
|
|
btrfs_wait_cache_io(trans, block_group, path);
|
2015-04-07 03:46:08 +08:00
|
|
|
btrfs_put_block_group(block_group);
|
|
|
|
spin_lock(&trans->transaction->dirty_bgs_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!list_empty(&block_group->dirty_list)) {
|
|
|
|
list_del_init(&block_group->dirty_list);
|
btrfs: introduce delayed_refs_rsv
Traditionally we've had voodoo in btrfs to account for the space that
delayed refs may take up by having a global_block_rsv. This works most
of the time, except when it doesn't. We've had issues reported and seen
in production where sometimes the global reserve is exhausted during
transaction commit before we can run all of our delayed refs, resulting
in an aborted transaction. Because of this voodoo we have equally
dubious flushing semantics around throttling delayed refs which we often
get wrong.
So instead give them their own block_rsv. This way we can always know
exactly how much outstanding space we need for delayed refs. This
allows us to make sure we are constantly filling that reservation up
with space, and allows us to put more precise pressure on the enospc
system. Instead of doing math to see if its a good time to throttle,
the normal enospc code will be invoked if we have a lot of delayed refs
pending, and they will be run via the normal flushing mechanism.
For now the delayed_refs_rsv will hold the reservations for the delayed
refs, the block group updates, and deleting csums. We could have a
separate rsv for the block group updates, but the csum deletion stuff is
still handled via the delayed_refs so that will stay there.
Historical background:
The global reserve has grown to cover everything we don't reserve space
explicitly for, and we've grown a lot of weird ad-hoc heuristics to know
if we're running short on space and when it's time to force a commit. A
failure rate of 20-40 file systems when we run hundreds of thousands of
them isn't super high, but cleaning up this code will make things less
ugly and more predictible.
Thus the delayed refs rsv. We always know how many delayed refs we have
outstanding, and although running them generates more we can use the
global reserve for that spill over, which fits better into it's desired
use than a full blown reservation. This first approach is to simply
take how many times we're reserving space for and multiply that by 2 in
order to save enough space for the delayed refs that could be generated.
This is a niave approach and will probably evolve, but for now it works.
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com> # high-level review
[ added background notes from the cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-03 23:20:33 +08:00
|
|
|
remove_rsv = true;
|
2015-04-07 03:46:08 +08:00
|
|
|
btrfs_put_block_group(block_group);
|
|
|
|
}
|
|
|
|
spin_unlock(&trans->transaction->dirty_bgs_lock);
|
|
|
|
mutex_unlock(&trans->transaction->cache_write_mutex);
|
|
|
|
|
2010-06-22 02:48:16 +08:00
|
|
|
if (!IS_ERR(inode)) {
|
2017-02-20 19:50:59 +08:00
|
|
|
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret) {
|
|
|
|
btrfs_add_delayed_iput(inode);
|
|
|
|
goto out;
|
|
|
|
}
|
2010-06-22 02:48:16 +08:00
|
|
|
clear_nlink(inode);
|
|
|
|
/* One for the block groups ref */
|
|
|
|
spin_lock(&block_group->lock);
|
|
|
|
if (block_group->iref) {
|
|
|
|
block_group->iref = 0;
|
|
|
|
block_group->inode = NULL;
|
|
|
|
spin_unlock(&block_group->lock);
|
|
|
|
iput(inode);
|
|
|
|
} else {
|
|
|
|
spin_unlock(&block_group->lock);
|
|
|
|
}
|
|
|
|
/* One for our lookup ref */
|
2011-09-20 00:26:24 +08:00
|
|
|
btrfs_add_delayed_iput(inode);
|
2010-06-22 02:48:16 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
|
|
|
|
key.offset = block_group->key.objectid;
|
|
|
|
key.type = 0;
|
|
|
|
|
|
|
|
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (ret > 0)
|
2011-04-21 07:20:15 +08:00
|
|
|
btrfs_release_path(path);
|
2010-06-22 02:48:16 +08:00
|
|
|
if (ret == 0) {
|
|
|
|
ret = btrfs_del_item(trans, tree_root, path);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
2011-04-21 07:20:15 +08:00
|
|
|
btrfs_release_path(path);
|
2010-06-22 02:48:16 +08:00
|
|
|
}
|
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
spin_lock(&fs_info->block_group_cache_lock);
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
rb_erase(&block_group->cache_node,
|
2016-06-23 06:54:23 +08:00
|
|
|
&fs_info->block_group_cache_tree);
|
2014-11-26 23:28:50 +08:00
|
|
|
RB_CLEAR_NODE(&block_group->cache_node);
|
2012-12-27 17:01:23 +08:00
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
if (fs_info->first_logical_byte == block_group->key.objectid)
|
|
|
|
fs_info->first_logical_byte = (u64)-1;
|
|
|
|
spin_unlock(&fs_info->block_group_cache_lock);
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
|
Btrfs: fix enospc when there is plenty of space
So there is an odd case where we can possibly return -ENOSPC when there is in
fact space to be had. It only happens with Metadata writes, and happens _very_
infrequently. What has to happen is we have to allocate have allocated out of
the first logical byte on the disk, which would set last_alloc to
first_logical_byte(root, 0), so search_start == orig_search_start. We then
need to allocate for normal metadata, so BTRFS_BLOCK_GROUP_METADATA |
BTRFS_BLOCK_GROUP_DUP. We will do a block lookup for the given search_start,
block_group_bits() won't match and we'll go to choose another block group.
However because search_start matches orig_search_start we go to see if we can
allocate a chunk.
If we are in the situation that we cannot allocate a chunk, we fail and ENOSPC.
This is kind of a big flaw of the way find_free_extent works, as it along with
find_free_space loop through _all_ of the block groups, not just the ones that
we want to allocate out of. This patch completely kills find_free_space and
rolls it into find_free_extent. I've introduced a sort of state machine into
this, which will make it easier to get cache miss information out of the
allocator, and will work well with my locking changes.
The basic flow is this: We have the variable loop which is 0, meaning we are
in the hint phase. We lookup the block group for the hint, and lookup the
space_info for what we want to allocate out of. If the block group we were
pointed at by the hint either isn't of the correct type, or just doesn't have
the space we need, we set head to space_info->block_groups, so we start at the
beginning of the block groups for this particular space info, and loop through.
This is also where we add the empty_cluster to total_needed. At this point
loop is set to 1 and we just loop through all of the block groups for this
particular space_info looking for the space we need, just as find_free_space
would have done, except we only hit the block groups we want and not _all_ of
the block groups. If we come full circle we see if we can allocate a chunk.
If we cannot of course we exit with -ENOSPC and we are good. If not we start
over at space_info->block_groups and loop through again, with loop == 2. If we
come full circle and haven't found what we need then we exit with -ENOSPC.
I've been running this for a couple of days now and it seems stable, and I
haven't yet hit a -ENOSPC when there was plenty of space left.
Also I've made a groups_sem to handle the group list for the space_info. This
is part of my locking changes, but is relatively safe and seems better than
holding the space_info spinlock over that entire search time. Thanks,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
2008-10-30 02:49:05 +08:00
|
|
|
down_write(&block_group->space_info->groups_sem);
|
2009-06-05 03:34:51 +08:00
|
|
|
/*
|
|
|
|
* we must use list_del_init so people can check to see if they
|
|
|
|
* are still on the list after taking the semaphore
|
|
|
|
*/
|
|
|
|
list_del_init(&block_group->list);
|
2013-11-02 01:07:04 +08:00
|
|
|
if (list_empty(&block_group->space_info->block_groups[index])) {
|
2014-05-28 00:59:57 +08:00
|
|
|
kobj = block_group->space_info->block_group_kobjs[index];
|
|
|
|
block_group->space_info->block_group_kobjs[index] = NULL;
|
2016-06-23 06:54:23 +08:00
|
|
|
clear_avail_alloc_bits(fs_info, block_group->flags);
|
2013-11-02 01:07:04 +08:00
|
|
|
}
|
Btrfs: fix enospc when there is plenty of space
So there is an odd case where we can possibly return -ENOSPC when there is in
fact space to be had. It only happens with Metadata writes, and happens _very_
infrequently. What has to happen is we have to allocate have allocated out of
the first logical byte on the disk, which would set last_alloc to
first_logical_byte(root, 0), so search_start == orig_search_start. We then
need to allocate for normal metadata, so BTRFS_BLOCK_GROUP_METADATA |
BTRFS_BLOCK_GROUP_DUP. We will do a block lookup for the given search_start,
block_group_bits() won't match and we'll go to choose another block group.
However because search_start matches orig_search_start we go to see if we can
allocate a chunk.
If we are in the situation that we cannot allocate a chunk, we fail and ENOSPC.
This is kind of a big flaw of the way find_free_extent works, as it along with
find_free_space loop through _all_ of the block groups, not just the ones that
we want to allocate out of. This patch completely kills find_free_space and
rolls it into find_free_extent. I've introduced a sort of state machine into
this, which will make it easier to get cache miss information out of the
allocator, and will work well with my locking changes.
The basic flow is this: We have the variable loop which is 0, meaning we are
in the hint phase. We lookup the block group for the hint, and lookup the
space_info for what we want to allocate out of. If the block group we were
pointed at by the hint either isn't of the correct type, or just doesn't have
the space we need, we set head to space_info->block_groups, so we start at the
beginning of the block groups for this particular space info, and loop through.
This is also where we add the empty_cluster to total_needed. At this point
loop is set to 1 and we just loop through all of the block groups for this
particular space_info looking for the space we need, just as find_free_space
would have done, except we only hit the block groups we want and not _all_ of
the block groups. If we come full circle we see if we can allocate a chunk.
If we cannot of course we exit with -ENOSPC and we are good. If not we start
over at space_info->block_groups and loop through again, with loop == 2. If we
come full circle and haven't found what we need then we exit with -ENOSPC.
I've been running this for a couple of days now and it seems stable, and I
haven't yet hit a -ENOSPC when there was plenty of space left.
Also I've made a groups_sem to handle the group list for the space_info. This
is part of my locking changes, but is relatively safe and seems better than
holding the space_info spinlock over that entire search time. Thanks,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
2008-10-30 02:49:05 +08:00
|
|
|
up_write(&block_group->space_info->groups_sem);
|
2019-06-11 20:31:01 +08:00
|
|
|
clear_incompat_bg_bits(fs_info, block_group->flags);
|
2014-05-28 00:59:57 +08:00
|
|
|
if (kobj) {
|
|
|
|
kobject_del(kobj);
|
|
|
|
kobject_put(kobj);
|
|
|
|
}
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
|
2014-11-26 23:28:51 +08:00
|
|
|
if (block_group->has_caching_ctl)
|
|
|
|
caching_ctl = get_caching_control(block_group);
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
if (block_group->cached == BTRFS_CACHE_STARTED)
|
2009-09-12 04:11:19 +08:00
|
|
|
wait_block_group_cache_done(block_group);
|
2014-11-26 23:28:51 +08:00
|
|
|
if (block_group->has_caching_ctl) {
|
2016-06-23 06:54:23 +08:00
|
|
|
down_write(&fs_info->commit_root_sem);
|
2014-11-26 23:28:51 +08:00
|
|
|
if (!caching_ctl) {
|
|
|
|
struct btrfs_caching_control *ctl;
|
|
|
|
|
|
|
|
list_for_each_entry(ctl,
|
2016-06-23 06:54:23 +08:00
|
|
|
&fs_info->caching_block_groups, list)
|
2014-11-26 23:28:51 +08:00
|
|
|
if (ctl->block_group == block_group) {
|
|
|
|
caching_ctl = ctl;
|
2017-03-03 16:55:14 +08:00
|
|
|
refcount_inc(&caching_ctl->count);
|
2014-11-26 23:28:51 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (caching_ctl)
|
|
|
|
list_del_init(&caching_ctl->list);
|
2016-06-23 06:54:23 +08:00
|
|
|
up_write(&fs_info->commit_root_sem);
|
2014-11-26 23:28:51 +08:00
|
|
|
if (caching_ctl) {
|
|
|
|
/* Once for the caching bgs list and once for us. */
|
|
|
|
put_caching_control(caching_ctl);
|
|
|
|
put_caching_control(caching_ctl);
|
|
|
|
}
|
|
|
|
}
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
|
2014-11-18 04:45:48 +08:00
|
|
|
spin_lock(&trans->transaction->dirty_bgs_lock);
|
2019-01-30 22:50:49 +08:00
|
|
|
WARN_ON(!list_empty(&block_group->dirty_list));
|
|
|
|
WARN_ON(!list_empty(&block_group->io_list));
|
2014-11-18 04:45:48 +08:00
|
|
|
spin_unlock(&trans->transaction->dirty_bgs_lock);
|
2019-01-30 22:50:49 +08:00
|
|
|
|
Btrfs: async block group caching
This patch moves the caching of the block group off to a kthread in order to
allow people to allocate sooner. Instead of blocking up behind the caching
mutex, we instead kick of the caching kthread, and then attempt to make an
allocation. If we cannot, we wait on the block groups caching waitqueue, which
the caching kthread will wake the waiting threads up everytime it finds 2 meg
worth of space, and then again when its finished caching. This is how I tested
the speedup from this
mkfs the disk
mount the disk
fill the disk up with fs_mark
unmount the disk
mount the disk
time touch /mnt/foo
Without my changes this took 11 seconds on my box, with these changes it now
takes 1 second.
Another change thats been put in place is we lock the super mirror's in the
pinned extent map in order to keep us from adding that stuff as free space when
caching the block group. This doesn't really change anything else as far as the
pinned extent map is concerned, since for actual pinned extents we use
EXTENT_DIRTY, but it does mean that when we unmount we have to go in and unlock
those extents to keep from leaking memory.
I've also added a check where when we are reading block groups from disk, if the
amount of space used == the size of the block group, we go ahead and mark the
block group as cached. This drastically reduces the amount of time it takes to
cache the block groups. Using the same test as above, except doing a dd to a
file and then unmounting, it used to take 33 seconds to umount, now it takes 3
seconds.
This version uses the commit_root in the caching kthread, and then keeps track
of how many async caching threads are running at any given time so if one of the
async threads is still running as we cross transactions we can wait until its
finished before handling the pinned extents. Thank you,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-07-14 09:29:25 +08:00
|
|
|
btrfs_remove_free_space_cache(block_group);
|
|
|
|
|
2008-11-13 03:34:12 +08:00
|
|
|
spin_lock(&block_group->space_info->lock);
|
2015-01-16 21:24:40 +08:00
|
|
|
list_del_init(&block_group->ro_list);
|
2015-02-24 20:07:44 +08:00
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
|
2015-02-24 20:07:44 +08:00
|
|
|
WARN_ON(block_group->space_info->total_bytes
|
|
|
|
< block_group->key.offset);
|
|
|
|
WARN_ON(block_group->space_info->bytes_readonly
|
|
|
|
< block_group->key.offset);
|
|
|
|
WARN_ON(block_group->space_info->disk_total
|
|
|
|
< block_group->key.offset * factor);
|
|
|
|
}
|
2008-11-13 03:34:12 +08:00
|
|
|
block_group->space_info->total_bytes -= block_group->key.offset;
|
|
|
|
block_group->space_info->bytes_readonly -= block_group->key.offset;
|
2010-10-15 02:52:27 +08:00
|
|
|
block_group->space_info->disk_total -= block_group->key.offset * factor;
|
2015-02-24 20:07:44 +08:00
|
|
|
|
2008-11-13 03:34:12 +08:00
|
|
|
spin_unlock(&block_group->space_info->lock);
|
2009-07-25 04:30:55 +08:00
|
|
|
|
2010-06-22 02:48:16 +08:00
|
|
|
memcpy(&key, &block_group->key, sizeof(key));
|
|
|
|
|
2016-10-05 01:34:27 +08:00
|
|
|
mutex_lock(&fs_info->chunk_mutex);
|
Btrfs: fix race between fs trimming and block group remove/allocation
Our fs trim operation, which is completely transactionless (doesn't start
or joins an existing transaction) consists of visiting all block groups
and then for each one to iterate its free space entries and perform a
discard operation against the space range represented by the free space
entries. However before performing a discard, the corresponding free space
entry is removed from the free space rbtree, and when the discard completes
it is added back to the free space rbtree.
If a block group remove operation happens while the discard is ongoing (or
before it starts and after a free space entry is hidden), we end up not
waiting for the discard to complete, remove the extent map that maps
logical address to physical addresses and the corresponding chunk metadata
from the the chunk and device trees. After that and before the discard
completes, the current running transaction can finish and a new one start,
allowing for new block groups that map to the same physical addresses to
be allocated and written to.
So fix this by keeping the extent map in memory until the discard completes
so that the same physical addresses aren't reused before it completes.
If the physical locations that are under a discard operation end up being
used for a new metadata block group for example, and dirty metadata extents
are written before the discard finishes (the VM might call writepages() of
our btree inode's i_mapping for example, or an fsync log commit happens) we
end up overwriting metadata with zeroes, which leads to errors from fsck
like the following:
checking extents
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
read block failed check_tree_block
owner ref check failed [833912832 16384]
Errors found in extent allocation tree or chunk allocation
checking free space cache
checking fs roots
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
read block failed check_tree_block
root 5 root dir 256 error
root 5 inode 260 errors 2001, no inode item, link count wrong
unresolved ref dir 256 index 0 namelen 8 name foobar_3 filetype 1 errors 6, no dir index, no inode ref
root 5 inode 262 errors 2001, no inode item, link count wrong
unresolved ref dir 256 index 0 namelen 8 name foobar_5 filetype 1 errors 6, no dir index, no inode ref
root 5 inode 263 errors 2001, no inode item, link count wrong
(...)
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-11-28 05:14:15 +08:00
|
|
|
spin_lock(&block_group->lock);
|
|
|
|
block_group->removed = 1;
|
|
|
|
/*
|
|
|
|
* At this point trimming can't start on this block group, because we
|
|
|
|
* removed the block group from the tree fs_info->block_group_cache_tree
|
|
|
|
* so no one can't find it anymore and even if someone already got this
|
|
|
|
* block group before we removed it from the rbtree, they have already
|
|
|
|
* incremented block_group->trimming - if they didn't, they won't find
|
|
|
|
* any free space entries because we already removed them all when we
|
|
|
|
* called btrfs_remove_free_space_cache().
|
|
|
|
*
|
|
|
|
* And we must not remove the extent map from the fs_info->mapping_tree
|
|
|
|
* to prevent the same logical address range and physical device space
|
|
|
|
* ranges from being reused for a new block group. This is because our
|
|
|
|
* fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
|
|
|
|
* completely transactionless, so while it is trimming a range the
|
|
|
|
* currently running transaction might finish and a new one start,
|
|
|
|
* allowing for new block groups to be created that can reuse the same
|
|
|
|
* physical device locations unless we take this special care.
|
2015-06-15 21:41:19 +08:00
|
|
|
*
|
|
|
|
* There may also be an implicit trim operation if the file system
|
|
|
|
* is mounted with -odiscard. The same protections must remain
|
|
|
|
* in place until the extents have been discarded completely when
|
|
|
|
* the transaction commit has completed.
|
Btrfs: fix race between fs trimming and block group remove/allocation
Our fs trim operation, which is completely transactionless (doesn't start
or joins an existing transaction) consists of visiting all block groups
and then for each one to iterate its free space entries and perform a
discard operation against the space range represented by the free space
entries. However before performing a discard, the corresponding free space
entry is removed from the free space rbtree, and when the discard completes
it is added back to the free space rbtree.
If a block group remove operation happens while the discard is ongoing (or
before it starts and after a free space entry is hidden), we end up not
waiting for the discard to complete, remove the extent map that maps
logical address to physical addresses and the corresponding chunk metadata
from the the chunk and device trees. After that and before the discard
completes, the current running transaction can finish and a new one start,
allowing for new block groups that map to the same physical addresses to
be allocated and written to.
So fix this by keeping the extent map in memory until the discard completes
so that the same physical addresses aren't reused before it completes.
If the physical locations that are under a discard operation end up being
used for a new metadata block group for example, and dirty metadata extents
are written before the discard finishes (the VM might call writepages() of
our btree inode's i_mapping for example, or an fsync log commit happens) we
end up overwriting metadata with zeroes, which leads to errors from fsck
like the following:
checking extents
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
read block failed check_tree_block
owner ref check failed [833912832 16384]
Errors found in extent allocation tree or chunk allocation
checking free space cache
checking fs roots
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
Check tree block failed, want=833912832, have=0
read block failed check_tree_block
root 5 root dir 256 error
root 5 inode 260 errors 2001, no inode item, link count wrong
unresolved ref dir 256 index 0 namelen 8 name foobar_3 filetype 1 errors 6, no dir index, no inode ref
root 5 inode 262 errors 2001, no inode item, link count wrong
unresolved ref dir 256 index 0 namelen 8 name foobar_5 filetype 1 errors 6, no dir index, no inode ref
root 5 inode 263 errors 2001, no inode item, link count wrong
(...)
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-11-28 05:14:15 +08:00
|
|
|
*/
|
|
|
|
remove_em = (atomic_read(&block_group->trimming) == 0);
|
|
|
|
spin_unlock(&block_group->lock);
|
|
|
|
|
2016-10-05 01:34:27 +08:00
|
|
|
mutex_unlock(&fs_info->chunk_mutex);
|
2014-12-03 02:07:49 +08:00
|
|
|
|
2018-05-10 20:44:46 +08:00
|
|
|
ret = remove_block_group_free_space(trans, block_group);
|
2015-09-30 11:50:37 +08:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
2009-04-03 21:47:43 +08:00
|
|
|
btrfs_put_block_group(block_group);
|
|
|
|
btrfs_put_block_group(block_group);
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
|
|
|
|
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
|
|
|
|
if (ret > 0)
|
|
|
|
ret = -EIO;
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = btrfs_del_item(trans, root, path);
|
Btrfs: fix race between block group removal and block group allocation
If a task is removing the block group that currently has the highest start
offset amongst all existing block groups, there is a short time window
where it races with a concurrent block group allocation, resulting in a
transaction abort with an error code of EEXIST.
The following diagram explains the race in detail:
Task A Task B
btrfs_remove_block_group(bg offset X)
remove_extent_mapping(em offset X)
-> removes extent map X from the
tree of extent maps
(fs_info->mapping_tree), so the
next call to find_next_chunk()
will return offset X
btrfs_alloc_chunk()
find_next_chunk()
--> returns offset X
__btrfs_alloc_chunk(offset X)
btrfs_make_block_group()
btrfs_create_block_group_cache()
--> creates btrfs_block_group_cache
object with a key corresponding
to the block group item in the
extent, the key is:
(offset X, BTRFS_BLOCK_GROUP_ITEM_KEY, 1G)
--> adds the btrfs_block_group_cache object
to the list new_bgs of the transaction
handle
btrfs_end_transaction(trans handle)
__btrfs_end_transaction()
btrfs_create_pending_block_groups()
--> sees the new btrfs_block_group_cache
in the new_bgs list of the transaction
handle
--> its call to btrfs_insert_item() fails
with -EEXIST when attempting to insert
the block group item key
(offset X, BTRFS_BLOCK_GROUP_ITEM_KEY, 1G)
because task A has not removed that key yet
--> aborts the running transaction with
error -EEXIST
btrfs_del_item()
-> removes the block group's key from
the extent tree, key is
(offset X, BTRFS_BLOCK_GROUP_ITEM_KEY, 1G)
A sample transaction abort trace:
[78912.403537] ------------[ cut here ]------------
[78912.403811] BTRFS: Transaction aborted (error -17)
[78912.404082] WARNING: CPU: 2 PID: 20465 at fs/btrfs/extent-tree.c:10551 btrfs_create_pending_block_groups+0x196/0x250 [btrfs]
(...)
[78912.405642] CPU: 2 PID: 20465 Comm: btrfs Tainted: G W 5.0.0-btrfs-next-46 #1
[78912.405941] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014
[78912.406586] RIP: 0010:btrfs_create_pending_block_groups+0x196/0x250 [btrfs]
(...)
[78912.407636] RSP: 0018:ffff9d3d4b7e3b08 EFLAGS: 00010282
[78912.407997] RAX: 0000000000000000 RBX: ffff90959a3796f0 RCX: 0000000000000006
[78912.408369] RDX: 0000000000000007 RSI: 0000000000000001 RDI: ffff909636b16860
[78912.408746] RBP: ffff909626758a58 R08: 0000000000000000 R09: 0000000000000000
[78912.409144] R10: ffff9095ff462400 R11: 0000000000000000 R12: ffff90959a379588
[78912.409521] R13: ffff909626758ab0 R14: ffff9095036c0000 R15: ffff9095299e1158
[78912.409899] FS: 00007f387f16f700(0000) GS:ffff909636b00000(0000) knlGS:0000000000000000
[78912.410285] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[78912.410673] CR2: 00007f429fc87cbc CR3: 000000014440a004 CR4: 00000000003606e0
[78912.411095] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[78912.411496] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[78912.411898] Call Trace:
[78912.412318] __btrfs_end_transaction+0x5b/0x1c0 [btrfs]
[78912.412746] btrfs_inc_block_group_ro+0xcf/0x160 [btrfs]
[78912.413179] scrub_enumerate_chunks+0x188/0x5b0 [btrfs]
[78912.413622] ? __mutex_unlock_slowpath+0x100/0x2a0
[78912.414078] btrfs_scrub_dev+0x2ef/0x720 [btrfs]
[78912.414535] ? __sb_start_write+0xd4/0x1c0
[78912.414963] ? mnt_want_write_file+0x24/0x50
[78912.415403] btrfs_ioctl+0x17fb/0x3120 [btrfs]
[78912.415832] ? lock_acquire+0xa6/0x190
[78912.416256] ? do_vfs_ioctl+0xa2/0x6f0
[78912.416685] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
[78912.417116] do_vfs_ioctl+0xa2/0x6f0
[78912.417534] ? __fget+0x113/0x200
[78912.417954] ksys_ioctl+0x70/0x80
[78912.418369] __x64_sys_ioctl+0x16/0x20
[78912.418812] do_syscall_64+0x60/0x1b0
[78912.419231] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[78912.419644] RIP: 0033:0x7f3880252dd7
(...)
[78912.420957] RSP: 002b:00007f387f16ed68 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[78912.421426] RAX: ffffffffffffffda RBX: 000055f5becc1df0 RCX: 00007f3880252dd7
[78912.421889] RDX: 000055f5becc1df0 RSI: 00000000c400941b RDI: 0000000000000003
[78912.422354] RBP: 0000000000000000 R08: 00007f387f16f700 R09: 0000000000000000
[78912.422790] R10: 00007f387f16f700 R11: 0000000000000246 R12: 0000000000000000
[78912.423202] R13: 00007ffda49c266f R14: 0000000000000000 R15: 00007f388145e040
[78912.425505] ---[ end trace eb9bfe7c426fc4d3 ]---
Fix this by calling remove_extent_mapping(), at btrfs_remove_block_group(),
only at the very end, after removing the block group item key from the
extent tree (and removing the free space tree entry if we are using the
free space tree feature).
Fixes: 04216820fe83d5 ("Btrfs: fix race between fs trimming and block group remove/allocation")
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-06-12 18:05:42 +08:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (remove_em) {
|
|
|
|
struct extent_map_tree *em_tree;
|
|
|
|
|
2019-05-17 17:43:17 +08:00
|
|
|
em_tree = &fs_info->mapping_tree;
|
Btrfs: fix race between block group removal and block group allocation
If a task is removing the block group that currently has the highest start
offset amongst all existing block groups, there is a short time window
where it races with a concurrent block group allocation, resulting in a
transaction abort with an error code of EEXIST.
The following diagram explains the race in detail:
Task A Task B
btrfs_remove_block_group(bg offset X)
remove_extent_mapping(em offset X)
-> removes extent map X from the
tree of extent maps
(fs_info->mapping_tree), so the
next call to find_next_chunk()
will return offset X
btrfs_alloc_chunk()
find_next_chunk()
--> returns offset X
__btrfs_alloc_chunk(offset X)
btrfs_make_block_group()
btrfs_create_block_group_cache()
--> creates btrfs_block_group_cache
object with a key corresponding
to the block group item in the
extent, the key is:
(offset X, BTRFS_BLOCK_GROUP_ITEM_KEY, 1G)
--> adds the btrfs_block_group_cache object
to the list new_bgs of the transaction
handle
btrfs_end_transaction(trans handle)
__btrfs_end_transaction()
btrfs_create_pending_block_groups()
--> sees the new btrfs_block_group_cache
in the new_bgs list of the transaction
handle
--> its call to btrfs_insert_item() fails
with -EEXIST when attempting to insert
the block group item key
(offset X, BTRFS_BLOCK_GROUP_ITEM_KEY, 1G)
because task A has not removed that key yet
--> aborts the running transaction with
error -EEXIST
btrfs_del_item()
-> removes the block group's key from
the extent tree, key is
(offset X, BTRFS_BLOCK_GROUP_ITEM_KEY, 1G)
A sample transaction abort trace:
[78912.403537] ------------[ cut here ]------------
[78912.403811] BTRFS: Transaction aborted (error -17)
[78912.404082] WARNING: CPU: 2 PID: 20465 at fs/btrfs/extent-tree.c:10551 btrfs_create_pending_block_groups+0x196/0x250 [btrfs]
(...)
[78912.405642] CPU: 2 PID: 20465 Comm: btrfs Tainted: G W 5.0.0-btrfs-next-46 #1
[78912.405941] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014
[78912.406586] RIP: 0010:btrfs_create_pending_block_groups+0x196/0x250 [btrfs]
(...)
[78912.407636] RSP: 0018:ffff9d3d4b7e3b08 EFLAGS: 00010282
[78912.407997] RAX: 0000000000000000 RBX: ffff90959a3796f0 RCX: 0000000000000006
[78912.408369] RDX: 0000000000000007 RSI: 0000000000000001 RDI: ffff909636b16860
[78912.408746] RBP: ffff909626758a58 R08: 0000000000000000 R09: 0000000000000000
[78912.409144] R10: ffff9095ff462400 R11: 0000000000000000 R12: ffff90959a379588
[78912.409521] R13: ffff909626758ab0 R14: ffff9095036c0000 R15: ffff9095299e1158
[78912.409899] FS: 00007f387f16f700(0000) GS:ffff909636b00000(0000) knlGS:0000000000000000
[78912.410285] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[78912.410673] CR2: 00007f429fc87cbc CR3: 000000014440a004 CR4: 00000000003606e0
[78912.411095] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[78912.411496] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[78912.411898] Call Trace:
[78912.412318] __btrfs_end_transaction+0x5b/0x1c0 [btrfs]
[78912.412746] btrfs_inc_block_group_ro+0xcf/0x160 [btrfs]
[78912.413179] scrub_enumerate_chunks+0x188/0x5b0 [btrfs]
[78912.413622] ? __mutex_unlock_slowpath+0x100/0x2a0
[78912.414078] btrfs_scrub_dev+0x2ef/0x720 [btrfs]
[78912.414535] ? __sb_start_write+0xd4/0x1c0
[78912.414963] ? mnt_want_write_file+0x24/0x50
[78912.415403] btrfs_ioctl+0x17fb/0x3120 [btrfs]
[78912.415832] ? lock_acquire+0xa6/0x190
[78912.416256] ? do_vfs_ioctl+0xa2/0x6f0
[78912.416685] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
[78912.417116] do_vfs_ioctl+0xa2/0x6f0
[78912.417534] ? __fget+0x113/0x200
[78912.417954] ksys_ioctl+0x70/0x80
[78912.418369] __x64_sys_ioctl+0x16/0x20
[78912.418812] do_syscall_64+0x60/0x1b0
[78912.419231] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[78912.419644] RIP: 0033:0x7f3880252dd7
(...)
[78912.420957] RSP: 002b:00007f387f16ed68 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[78912.421426] RAX: ffffffffffffffda RBX: 000055f5becc1df0 RCX: 00007f3880252dd7
[78912.421889] RDX: 000055f5becc1df0 RSI: 00000000c400941b RDI: 0000000000000003
[78912.422354] RBP: 0000000000000000 R08: 00007f387f16f700 R09: 0000000000000000
[78912.422790] R10: 00007f387f16f700 R11: 0000000000000246 R12: 0000000000000000
[78912.423202] R13: 00007ffda49c266f R14: 0000000000000000 R15: 00007f388145e040
[78912.425505] ---[ end trace eb9bfe7c426fc4d3 ]---
Fix this by calling remove_extent_mapping(), at btrfs_remove_block_group(),
only at the very end, after removing the block group item key from the
extent tree (and removing the free space tree entry if we are using the
free space tree feature).
Fixes: 04216820fe83d5 ("Btrfs: fix race between fs trimming and block group remove/allocation")
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-06-12 18:05:42 +08:00
|
|
|
write_lock(&em_tree->lock);
|
|
|
|
remove_extent_mapping(em_tree, em);
|
|
|
|
write_unlock(&em_tree->lock);
|
|
|
|
/* once for the tree */
|
|
|
|
free_extent_map(em);
|
|
|
|
}
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
out:
|
btrfs: introduce delayed_refs_rsv
Traditionally we've had voodoo in btrfs to account for the space that
delayed refs may take up by having a global_block_rsv. This works most
of the time, except when it doesn't. We've had issues reported and seen
in production where sometimes the global reserve is exhausted during
transaction commit before we can run all of our delayed refs, resulting
in an aborted transaction. Because of this voodoo we have equally
dubious flushing semantics around throttling delayed refs which we often
get wrong.
So instead give them their own block_rsv. This way we can always know
exactly how much outstanding space we need for delayed refs. This
allows us to make sure we are constantly filling that reservation up
with space, and allows us to put more precise pressure on the enospc
system. Instead of doing math to see if its a good time to throttle,
the normal enospc code will be invoked if we have a lot of delayed refs
pending, and they will be run via the normal flushing mechanism.
For now the delayed_refs_rsv will hold the reservations for the delayed
refs, the block group updates, and deleting csums. We could have a
separate rsv for the block group updates, but the csum deletion stuff is
still handled via the delayed_refs so that will stay there.
Historical background:
The global reserve has grown to cover everything we don't reserve space
explicitly for, and we've grown a lot of weird ad-hoc heuristics to know
if we're running short on space and when it's time to force a commit. A
failure rate of 20-40 file systems when we run hundreds of thousands of
them isn't super high, but cleaning up this code will make things less
ugly and more predictible.
Thus the delayed refs rsv. We always know how many delayed refs we have
outstanding, and although running them generates more we can use the
global reserve for that spill over, which fits better into it's desired
use than a full blown reservation. This first approach is to simply
take how many times we're reserving space for and multiply that by 2 in
order to save enough space for the delayed refs that could be generated.
This is a niave approach and will probably evolve, but for now it works.
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com> # high-level review
[ added background notes from the cover letter ]
Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-03 23:20:33 +08:00
|
|
|
if (remove_rsv)
|
|
|
|
btrfs_delayed_refs_rsv_release(fs_info, 1);
|
Btrfs: update space balancing code
This patch updates the space balancing code to utilize the new
backref format. Before, btrfs-vol -b would break any COW links
on data blocks or metadata. This was slow and caused the amount
of space used to explode if a large number of snapshots were present.
The new code can keeps the sharing of all data extents and
most of the tree blocks.
To maintain the sharing of data extents, the space balance code uses
a seperate inode hold data extent pointers, then updates the references
to point to the new location.
To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).
To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
same root key objectid, doing special handing for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block is required to COW again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-09-26 22:09:34 +08:00
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
2011-01-06 19:30:25 +08:00
|
|
|
|
2015-11-14 07:57:16 +08:00
|
|
|
struct btrfs_trans_handle *
|
Btrfs: fix the number of transaction units needed to remove a block group
We were using only 1 transaction unit when attempting to delete an unused
block group but in reality we need 3 + N units, where N corresponds to the
number of stripes. We were accounting only for the addition of the orphan
item (for the block group's free space cache inode) but we were not
accounting that we need to delete one block group item from the extent
tree, one free space item from the tree of tree roots and N device extent
items from the device tree.
While one unit is not enough, it worked most of the time because for each
single unit we are too pessimistic and assume an entire tree path, with
the highest possible heigth (8), needs to be COWed with eventual node
splits at every possible level in the tree, so there was usually enough
reserved space for removing all the items and adding the orphan item.
However after adding the orphan item, writepages() can by called by the VM
subsystem against the btree inode when we are under memory pressure, which
causes writeback to start for the nodes we COWed before, this forces the
operation to remove the free space item to COW again some (or all of) the
same nodes (in the tree of tree roots). Even without writepages() being
called, we could fail with ENOSPC because these items are located in
multiple trees and one of them might have a higher heigth and require
node/leaf splits at many levels, exhausting all the reserved space before
removing all the items and adding the orphan.
In the kernel 4.0 release, commit 3d84be799194 ("Btrfs: fix BUG_ON in
btrfs_orphan_add() when delete unused block group"), we attempted to fix
a BUG_ON due to ENOSPC when trying to add the orphan item by making the
cleaner kthread reserve one transaction unit before attempting to remove
the block group, but this was not enough. We had a couple user reports
still hitting the same BUG_ON after 4.0, like Stefan Priebe's report on
a 4.2-rc6 kernel for example:
http://www.spinics.net/lists/linux-btrfs/msg46070.html
So fix this by reserving all the necessary units of metadata.
Reported-by: Stefan Priebe <s.priebe@profihost.ag>
Fixes: 3d84be799194 ("Btrfs: fix BUG_ON in btrfs_orphan_add() when delete unused block group")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-11-14 07:57:17 +08:00
|
|
|
btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
|
|
|
|
const u64 chunk_offset)
|
2015-11-14 07:57:16 +08:00
|
|
|
{
|
2019-05-17 17:43:17 +08:00
|
|
|
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
|
Btrfs: fix the number of transaction units needed to remove a block group
We were using only 1 transaction unit when attempting to delete an unused
block group but in reality we need 3 + N units, where N corresponds to the
number of stripes. We were accounting only for the addition of the orphan
item (for the block group's free space cache inode) but we were not
accounting that we need to delete one block group item from the extent
tree, one free space item from the tree of tree roots and N device extent
items from the device tree.
While one unit is not enough, it worked most of the time because for each
single unit we are too pessimistic and assume an entire tree path, with
the highest possible heigth (8), needs to be COWed with eventual node
splits at every possible level in the tree, so there was usually enough
reserved space for removing all the items and adding the orphan item.
However after adding the orphan item, writepages() can by called by the VM
subsystem against the btree inode when we are under memory pressure, which
causes writeback to start for the nodes we COWed before, this forces the
operation to remove the free space item to COW again some (or all of) the
same nodes (in the tree of tree roots). Even without writepages() being
called, we could fail with ENOSPC because these items are located in
multiple trees and one of them might have a higher heigth and require
node/leaf splits at many levels, exhausting all the reserved space before
removing all the items and adding the orphan.
In the kernel 4.0 release, commit 3d84be799194 ("Btrfs: fix BUG_ON in
btrfs_orphan_add() when delete unused block group"), we attempted to fix
a BUG_ON due to ENOSPC when trying to add the orphan item by making the
cleaner kthread reserve one transaction unit before attempting to remove
the block group, but this was not enough. We had a couple user reports
still hitting the same BUG_ON after 4.0, like Stefan Priebe's report on
a 4.2-rc6 kernel for example:
http://www.spinics.net/lists/linux-btrfs/msg46070.html
So fix this by reserving all the necessary units of metadata.
Reported-by: Stefan Priebe <s.priebe@profihost.ag>
Fixes: 3d84be799194 ("Btrfs: fix BUG_ON in btrfs_orphan_add() when delete unused block group")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-11-14 07:57:17 +08:00
|
|
|
struct extent_map *em;
|
|
|
|
struct map_lookup *map;
|
|
|
|
unsigned int num_items;
|
|
|
|
|
|
|
|
read_lock(&em_tree->lock);
|
|
|
|
em = lookup_extent_mapping(em_tree, chunk_offset, 1);
|
|
|
|
read_unlock(&em_tree->lock);
|
|
|
|
ASSERT(em && em->start == chunk_offset);
|
|
|
|
|
2015-11-14 07:57:16 +08:00
|
|
|
/*
|
Btrfs: fix the number of transaction units needed to remove a block group
We were using only 1 transaction unit when attempting to delete an unused
block group but in reality we need 3 + N units, where N corresponds to the
number of stripes. We were accounting only for the addition of the orphan
item (for the block group's free space cache inode) but we were not
accounting that we need to delete one block group item from the extent
tree, one free space item from the tree of tree roots and N device extent
items from the device tree.
While one unit is not enough, it worked most of the time because for each
single unit we are too pessimistic and assume an entire tree path, with
the highest possible heigth (8), needs to be COWed with eventual node
splits at every possible level in the tree, so there was usually enough
reserved space for removing all the items and adding the orphan item.
However after adding the orphan item, writepages() can by called by the VM
subsystem against the btree inode when we are under memory pressure, which
causes writeback to start for the nodes we COWed before, this forces the
operation to remove the free space item to COW again some (or all of) the
same nodes (in the tree of tree roots). Even without writepages() being
called, we could fail with ENOSPC because these items are located in
multiple trees and one of them might have a higher heigth and require
node/leaf splits at many levels, exhausting all the reserved space before
removing all the items and adding the orphan.
In the kernel 4.0 release, commit 3d84be799194 ("Btrfs: fix BUG_ON in
btrfs_orphan_add() when delete unused block group"), we attempted to fix
a BUG_ON due to ENOSPC when trying to add the orphan item by making the
cleaner kthread reserve one transaction unit before attempting to remove
the block group, but this was not enough. We had a couple user reports
still hitting the same BUG_ON after 4.0, like Stefan Priebe's report on
a 4.2-rc6 kernel for example:
http://www.spinics.net/lists/linux-btrfs/msg46070.html
So fix this by reserving all the necessary units of metadata.
Reported-by: Stefan Priebe <s.priebe@profihost.ag>
Fixes: 3d84be799194 ("Btrfs: fix BUG_ON in btrfs_orphan_add() when delete unused block group")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-11-14 07:57:17 +08:00
|
|
|
* We need to reserve 3 + N units from the metadata space info in order
|
|
|
|
* to remove a block group (done at btrfs_remove_chunk() and at
|
|
|
|
* btrfs_remove_block_group()), which are used for:
|
|
|
|
*
|
2015-11-14 07:57:16 +08:00
|
|
|
* 1 unit for adding the free space inode's orphan (located in the tree
|
|
|
|
* of tree roots).
|
Btrfs: fix the number of transaction units needed to remove a block group
We were using only 1 transaction unit when attempting to delete an unused
block group but in reality we need 3 + N units, where N corresponds to the
number of stripes. We were accounting only for the addition of the orphan
item (for the block group's free space cache inode) but we were not
accounting that we need to delete one block group item from the extent
tree, one free space item from the tree of tree roots and N device extent
items from the device tree.
While one unit is not enough, it worked most of the time because for each
single unit we are too pessimistic and assume an entire tree path, with
the highest possible heigth (8), needs to be COWed with eventual node
splits at every possible level in the tree, so there was usually enough
reserved space for removing all the items and adding the orphan item.
However after adding the orphan item, writepages() can by called by the VM
subsystem against the btree inode when we are under memory pressure, which
causes writeback to start for the nodes we COWed before, this forces the
operation to remove the free space item to COW again some (or all of) the
same nodes (in the tree of tree roots). Even without writepages() being
called, we could fail with ENOSPC because these items are located in
multiple trees and one of them might have a higher heigth and require
node/leaf splits at many levels, exhausting all the reserved space before
removing all the items and adding the orphan.
In the kernel 4.0 release, commit 3d84be799194 ("Btrfs: fix BUG_ON in
btrfs_orphan_add() when delete unused block group"), we attempted to fix
a BUG_ON due to ENOSPC when trying to add the orphan item by making the
cleaner kthread reserve one transaction unit before attempting to remove
the block group, but this was not enough. We had a couple user reports
still hitting the same BUG_ON after 4.0, like Stefan Priebe's report on
a 4.2-rc6 kernel for example:
http://www.spinics.net/lists/linux-btrfs/msg46070.html
So fix this by reserving all the necessary units of metadata.
Reported-by: Stefan Priebe <s.priebe@profihost.ag>
Fixes: 3d84be799194 ("Btrfs: fix BUG_ON in btrfs_orphan_add() when delete unused block group")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-11-14 07:57:17 +08:00
|
|
|
* 1 unit for deleting the block group item (located in the extent
|
|
|
|
* tree).
|
|
|
|
* 1 unit for deleting the free space item (located in tree of tree
|
|
|
|
* roots).
|
|
|
|
* N units for deleting N device extent items corresponding to each
|
|
|
|
* stripe (located in the device tree).
|
|
|
|
*
|
|
|
|
* In order to remove a block group we also need to reserve units in the
|
|
|
|
* system space info in order to update the chunk tree (update one or
|
|
|
|
* more device items and remove one chunk item), but this is done at
|
|
|
|
* btrfs_remove_chunk() through a call to check_system_chunk().
|
2015-11-14 07:57:16 +08:00
|
|
|
*/
|
2015-06-03 22:55:48 +08:00
|
|
|
map = em->map_lookup;
|
Btrfs: fix the number of transaction units needed to remove a block group
We were using only 1 transaction unit when attempting to delete an unused
block group but in reality we need 3 + N units, where N corresponds to the
number of stripes. We were accounting only for the addition of the orphan
item (for the block group's free space cache inode) but we were not
accounting that we need to delete one block group item from the extent
tree, one free space item from the tree of tree roots and N device extent
items from the device tree.
While one unit is not enough, it worked most of the time because for each
single unit we are too pessimistic and assume an entire tree path, with
the highest possible heigth (8), needs to be COWed with eventual node
splits at every possible level in the tree, so there was usually enough
reserved space for removing all the items and adding the orphan item.
However after adding the orphan item, writepages() can by called by the VM
subsystem against the btree inode when we are under memory pressure, which
causes writeback to start for the nodes we COWed before, this forces the
operation to remove the free space item to COW again some (or all of) the
same nodes (in the tree of tree roots). Even without writepages() being
called, we could fail with ENOSPC because these items are located in
multiple trees and one of them might have a higher heigth and require
node/leaf splits at many levels, exhausting all the reserved space before
removing all the items and adding the orphan.
In the kernel 4.0 release, commit 3d84be799194 ("Btrfs: fix BUG_ON in
btrfs_orphan_add() when delete unused block group"), we attempted to fix
a BUG_ON due to ENOSPC when trying to add the orphan item by making the
cleaner kthread reserve one transaction unit before attempting to remove
the block group, but this was not enough. We had a couple user reports
still hitting the same BUG_ON after 4.0, like Stefan Priebe's report on
a 4.2-rc6 kernel for example:
http://www.spinics.net/lists/linux-btrfs/msg46070.html
So fix this by reserving all the necessary units of metadata.
Reported-by: Stefan Priebe <s.priebe@profihost.ag>
Fixes: 3d84be799194 ("Btrfs: fix BUG_ON in btrfs_orphan_add() when delete unused block group")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-11-14 07:57:17 +08:00
|
|
|
num_items = 3 + map->num_stripes;
|
|
|
|
free_extent_map(em);
|
|
|
|
|
2015-11-14 07:57:16 +08:00
|
|
|
return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
|
Btrfs: fix the number of transaction units needed to remove a block group
We were using only 1 transaction unit when attempting to delete an unused
block group but in reality we need 3 + N units, where N corresponds to the
number of stripes. We were accounting only for the addition of the orphan
item (for the block group's free space cache inode) but we were not
accounting that we need to delete one block group item from the extent
tree, one free space item from the tree of tree roots and N device extent
items from the device tree.
While one unit is not enough, it worked most of the time because for each
single unit we are too pessimistic and assume an entire tree path, with
the highest possible heigth (8), needs to be COWed with eventual node
splits at every possible level in the tree, so there was usually enough
reserved space for removing all the items and adding the orphan item.
However after adding the orphan item, writepages() can by called by the VM
subsystem against the btree inode when we are under memory pressure, which
causes writeback to start for the nodes we COWed before, this forces the
operation to remove the free space item to COW again some (or all of) the
same nodes (in the tree of tree roots). Even without writepages() being
called, we could fail with ENOSPC because these items are located in
multiple trees and one of them might have a higher heigth and require
node/leaf splits at many levels, exhausting all the reserved space before
removing all the items and adding the orphan.
In the kernel 4.0 release, commit 3d84be799194 ("Btrfs: fix BUG_ON in
btrfs_orphan_add() when delete unused block group"), we attempted to fix
a BUG_ON due to ENOSPC when trying to add the orphan item by making the
cleaner kthread reserve one transaction unit before attempting to remove
the block group, but this was not enough. We had a couple user reports
still hitting the same BUG_ON after 4.0, like Stefan Priebe's report on
a 4.2-rc6 kernel for example:
http://www.spinics.net/lists/linux-btrfs/msg46070.html
So fix this by reserving all the necessary units of metadata.
Reported-by: Stefan Priebe <s.priebe@profihost.ag>
Fixes: 3d84be799194 ("Btrfs: fix BUG_ON in btrfs_orphan_add() when delete unused block group")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-11-14 07:57:17 +08:00
|
|
|
num_items, 1);
|
2015-11-14 07:57:16 +08:00
|
|
|
}
|
|
|
|
|
2014-09-18 23:20:02 +08:00
|
|
|
/*
|
|
|
|
* Process the unused_bgs list and remove any that don't have any allocated
|
|
|
|
* space inside of them.
|
|
|
|
*/
|
|
|
|
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
struct btrfs_block_group_cache *block_group;
|
|
|
|
struct btrfs_space_info *space_info;
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
int ret = 0;
|
|
|
|
|
2016-09-03 03:40:02 +08:00
|
|
|
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
|
2014-09-18 23:20:02 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
spin_lock(&fs_info->unused_bgs_lock);
|
|
|
|
while (!list_empty(&fs_info->unused_bgs)) {
|
|
|
|
u64 start, end;
|
2015-06-15 21:41:19 +08:00
|
|
|
int trimming;
|
2014-09-18 23:20:02 +08:00
|
|
|
|
|
|
|
block_group = list_first_entry(&fs_info->unused_bgs,
|
|
|
|
struct btrfs_block_group_cache,
|
|
|
|
bg_list);
|
|
|
|
list_del_init(&block_group->bg_list);
|
2015-09-29 21:03:54 +08:00
|
|
|
|
|
|
|
space_info = block_group->space_info;
|
|
|
|
|
2014-09-18 23:20:02 +08:00
|
|
|
if (ret || btrfs_mixed_space_info(space_info)) {
|
|
|
|
btrfs_put_block_group(block_group);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
spin_unlock(&fs_info->unused_bgs_lock);
|
|
|
|
|
2015-10-08 18:46:44 +08:00
|
|
|
mutex_lock(&fs_info->delete_unused_bgs_mutex);
|
Btrfs: fix race between balance and unused block group deletion
We have a race between deleting an unused block group and balancing the
same block group that leads to an assertion failure/BUG(), producing the
following trace:
[181631.208236] BTRFS: assertion failed: 0, file: fs/btrfs/volumes.c, line: 2622
[181631.220591] ------------[ cut here ]------------
[181631.222959] kernel BUG at fs/btrfs/ctree.h:4062!
[181631.223932] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[181631.224566] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse acpi_cpufreq parpor$
[181631.224566] CPU: 8 PID: 17451 Comm: btrfs Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1
[181631.224566] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014
[181631.224566] task: ffff880127e09590 ti: ffff8800b5824000 task.ti: ffff8800b5824000
[181631.224566] RIP: 0010:[<ffffffffa03f19f6>] [<ffffffffa03f19f6>] assfail.constprop.50+0x1e/0x20 [btrfs]
[181631.224566] RSP: 0018:ffff8800b5827ae8 EFLAGS: 00010246
[181631.224566] RAX: 0000000000000040 RBX: ffff8800109fc218 RCX: ffffffff81095dce
[181631.224566] RDX: 0000000000005124 RSI: ffffffff81464819 RDI: 00000000ffffffff
[181631.224566] RBP: ffff8800b5827ae8 R08: 0000000000000001 R09: 0000000000000000
[181631.224566] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8800109fc200
[181631.224566] R13: ffff880020095000 R14: ffff8800b1a13f38 R15: ffff880020095000
[181631.224566] FS: 00007f70ca0b0c80(0000) GS:ffff88013ec00000(0000) knlGS:0000000000000000
[181631.224566] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[181631.224566] CR2: 00007f2872ab6e68 CR3: 00000000a717c000 CR4: 00000000000006e0
[181631.224566] Stack:
[181631.224566] ffff8800b5827ba8 ffffffffa03f3916 ffff8800b5827b38 ffffffffa03d080e
[181631.224566] ffffffffa03d1423 ffff880020095000 ffff88001233c000 0000000000000001
[181631.224566] ffff880020095000 ffff8800b1a13f38 0000000a69c00000 0000000000000000
[181631.224566] Call Trace:
[181631.224566] [<ffffffffa03f3916>] btrfs_remove_chunk+0xa4/0x6bb [btrfs]
[181631.224566] [<ffffffffa03d080e>] ? join_transaction.isra.8+0xb9/0x3ba [btrfs]
[181631.224566] [<ffffffffa03d1423>] ? wait_current_trans.isra.13+0x22/0xfc [btrfs]
[181631.224566] [<ffffffffa03f3fbc>] btrfs_relocate_chunk.isra.29+0x8f/0xa7 [btrfs]
[181631.224566] [<ffffffffa03f54df>] btrfs_balance+0xaa4/0xc52 [btrfs]
[181631.224566] [<ffffffffa03fd388>] btrfs_ioctl_balance+0x23f/0x2b0 [btrfs]
[181631.224566] [<ffffffff810872f9>] ? trace_hardirqs_on+0xd/0xf
[181631.224566] [<ffffffffa04019a3>] btrfs_ioctl+0xfe2/0x2220 [btrfs]
[181631.224566] [<ffffffff812603ed>] ? __this_cpu_preempt_check+0x13/0x15
[181631.224566] [<ffffffff81084669>] ? arch_local_irq_save+0x9/0xc
[181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2
[181631.224566] [<ffffffff81138def>] ? handle_mm_fault+0x834/0xcd2
[181631.224566] [<ffffffff8103e48c>] ? __do_page_fault+0x211/0x424
[181631.224566] [<ffffffff811755e6>] do_vfs_ioctl+0x3c6/0x479
(...)
The sequence of steps leading to this are:
CPU 0 CPU 1
btrfs_balance()
btrfs_relocate_chunk()
btrfs_relocate_block_group(bg X)
btrfs_lookup_block_group(bg X)
cleaner_kthread
locks fs_info->cleaner_mutex
btrfs_delete_unused_bgs()
finds bg X, which became
unused in the previous
transaction
checks bg X ->ro == 0,
so it proceeds
sets bg X ->ro to 1
(btrfs_set_block_group_ro(bg X))
blocks on fs_info->cleaner_mutex
btrfs_remove_chunk(bg X)
unlocks fs_info->cleaner_mutex
acquires fs_info->cleaner_mutex
relocate_block_group()
--> does nothing, no extents found in
the extent tree from bg X
unlocks fs_info->cleaner_mutex
btrfs_relocate_block_group(bg X) returns
btrfs_remove_chunk(bg X)
extent map not found
--> ASSERT(0)
Fix this by using a new mutex to make sure these 2 operations, block
group relocation and removal, are serialized.
This issue is reproducible by running fstests generic/038 (which stresses
chunk allocation and automatic removal of unused block groups) together
with the following balance loop:
while true; do btrfs balance start -dusage=0 <mountpoint> ; done
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-06-11 07:58:53 +08:00
|
|
|
|
2014-09-18 23:20:02 +08:00
|
|
|
/* Don't want to race with allocators so take the groups_sem */
|
|
|
|
down_write(&space_info->groups_sem);
|
|
|
|
spin_lock(&block_group->lock);
|
btrfs: Don't remove block group that still has pinned down bytes
[BUG]
Under certain KVM load and LTP tests, it is possible to hit the
following calltrace if quota is enabled:
BTRFS critical (device vda2): unable to find logical 8820195328 length 4096
BTRFS critical (device vda2): unable to find logical 8820195328 length 4096
WARNING: CPU: 0 PID: 49 at ../block/blk-core.c:172 blk_status_to_errno+0x1a/0x30
CPU: 0 PID: 49 Comm: kworker/u2:1 Not tainted 4.12.14-15-default #1 SLE15 (unreleased)
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014
Workqueue: btrfs-endio-write btrfs_endio_write_helper [btrfs]
task: ffff9f827b340bc0 task.stack: ffffb4f8c0304000
RIP: 0010:blk_status_to_errno+0x1a/0x30
Call Trace:
submit_extent_page+0x191/0x270 [btrfs]
? btrfs_create_repair_bio+0x130/0x130 [btrfs]
__do_readpage+0x2d2/0x810 [btrfs]
? btrfs_create_repair_bio+0x130/0x130 [btrfs]
? run_one_async_done+0xc0/0xc0 [btrfs]
__extent_read_full_page+0xe7/0x100 [btrfs]
? run_one_async_done+0xc0/0xc0 [btrfs]
read_extent_buffer_pages+0x1ab/0x2d0 [btrfs]
? run_one_async_done+0xc0/0xc0 [btrfs]
btree_read_extent_buffer_pages+0x94/0xf0 [btrfs]
read_tree_block+0x31/0x60 [btrfs]
read_block_for_search.isra.35+0xf0/0x2e0 [btrfs]
btrfs_search_slot+0x46b/0xa00 [btrfs]
? kmem_cache_alloc+0x1a8/0x510
? btrfs_get_token_32+0x5b/0x120 [btrfs]
find_parent_nodes+0x11d/0xeb0 [btrfs]
? leaf_space_used+0xb8/0xd0 [btrfs]
? btrfs_leaf_free_space+0x49/0x90 [btrfs]
? btrfs_find_all_roots_safe+0x93/0x100 [btrfs]
btrfs_find_all_roots_safe+0x93/0x100 [btrfs]
btrfs_find_all_roots+0x45/0x60 [btrfs]
btrfs_qgroup_trace_extent_post+0x20/0x40 [btrfs]
btrfs_add_delayed_data_ref+0x1a3/0x1d0 [btrfs]
btrfs_alloc_reserved_file_extent+0x38/0x40 [btrfs]
insert_reserved_file_extent.constprop.71+0x289/0x2e0 [btrfs]
btrfs_finish_ordered_io+0x2f4/0x7f0 [btrfs]
? pick_next_task_fair+0x2cd/0x530
? __switch_to+0x92/0x4b0
btrfs_worker_helper+0x81/0x300 [btrfs]
process_one_work+0x1da/0x3f0
worker_thread+0x2b/0x3f0
? process_one_work+0x3f0/0x3f0
kthread+0x11a/0x130
? kthread_create_on_node+0x40/0x40
ret_from_fork+0x35/0x40
BTRFS critical (device vda2): unable to find logical 8820195328 length 16384
BTRFS: error (device vda2) in btrfs_finish_ordered_io:3023: errno=-5 IO failure
BTRFS info (device vda2): forced readonly
BTRFS error (device vda2): pending csums is 2887680
[CAUSE]
It's caused by race with block group auto removal:
- There is a meta block group X, which has only one tree block
The tree block belongs to fs tree 257.
- In current transaction, some operation modified fs tree 257
The tree block gets COWed, so the block group X is empty, and marked
as unused, queued to be deleted.
- Some workload (like fsync) wakes up cleaner_kthread()
Which will call btrfs_delete_unused_bgs() to remove unused block
groups.
So block group X along its chunk map get removed.
- Some delalloc work finished for fs tree 257
Quota needs to get the original reference of the extent, which will
read tree blocks of commit root of 257.
Then since the chunk map gets removed, the above warning gets
triggered.
[FIX]
Just let btrfs_delete_unused_bgs() skip block group which still has
pinned bytes.
However there is a minor side effect: currently we only queue empty
blocks at update_block_group(), and such empty block group with pinned
bytes won't go through update_block_group() again, such block group
won't be removed, until it gets new extent allocated and removed.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-06-22 12:35:00 +08:00
|
|
|
if (block_group->reserved || block_group->pinned ||
|
2014-09-18 23:20:02 +08:00
|
|
|
btrfs_block_group_used(&block_group->item) ||
|
2016-10-11 04:43:31 +08:00
|
|
|
block_group->ro ||
|
2015-09-29 21:03:54 +08:00
|
|
|
list_is_singular(&block_group->list)) {
|
2014-09-18 23:20:02 +08:00
|
|
|
/*
|
|
|
|
* We want to bail if we made new allocations or have
|
|
|
|
* outstanding allocations in this block group. We do
|
|
|
|
* the ro check in case balance is currently acting on
|
|
|
|
* this block group.
|
|
|
|
*/
|
2018-04-26 17:17:20 +08:00
|
|
|
trace_btrfs_skip_unused_block_group(block_group);
|
2014-09-18 23:20:02 +08:00
|
|
|
spin_unlock(&block_group->lock);
|
|
|
|
up_write(&space_info->groups_sem);
|
|
|
|
goto next;
|
|
|
|
}
|
|
|
|
spin_unlock(&block_group->lock);
|
|
|
|
|
|
|
|
/* We don't want to force the issue, only flip if it's ok. */
|
2015-08-05 16:43:27 +08:00
|
|
|
ret = inc_block_group_ro(block_group, 0);
|
2014-09-18 23:20:02 +08:00
|
|
|
up_write(&space_info->groups_sem);
|
|
|
|
if (ret < 0) {
|
|
|
|
ret = 0;
|
|
|
|
goto next;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Want to do this before we do anything else so we can recover
|
|
|
|
* properly if we fail to join the transaction.
|
|
|
|
*/
|
Btrfs: fix the number of transaction units needed to remove a block group
We were using only 1 transaction unit when attempting to delete an unused
block group but in reality we need 3 + N units, where N corresponds to the
number of stripes. We were accounting only for the addition of the orphan
item (for the block group's free space cache inode) but we were not
accounting that we need to delete one block group item from the extent
tree, one free space item from the tree of tree roots and N device extent
items from the device tree.
While one unit is not enough, it worked most of the time because for each
single unit we are too pessimistic and assume an entire tree path, with
the highest possible heigth (8), needs to be COWed with eventual node
splits at every possible level in the tree, so there was usually enough
reserved space for removing all the items and adding the orphan item.
However after adding the orphan item, writepages() can by called by the VM
subsystem against the btree inode when we are under memory pressure, which
causes writeback to start for the nodes we COWed before, this forces the
operation to remove the free space item to COW again some (or all of) the
same nodes (in the tree of tree roots). Even without writepages() being
called, we could fail with ENOSPC because these items are located in
multiple trees and one of them might have a higher heigth and require
node/leaf splits at many levels, exhausting all the reserved space before
removing all the items and adding the orphan.
In the kernel 4.0 release, commit 3d84be799194 ("Btrfs: fix BUG_ON in
btrfs_orphan_add() when delete unused block group"), we attempted to fix
a BUG_ON due to ENOSPC when trying to add the orphan item by making the
cleaner kthread reserve one transaction unit before attempting to remove
the block group, but this was not enough. We had a couple user reports
still hitting the same BUG_ON after 4.0, like Stefan Priebe's report on
a 4.2-rc6 kernel for example:
http://www.spinics.net/lists/linux-btrfs/msg46070.html
So fix this by reserving all the necessary units of metadata.
Reported-by: Stefan Priebe <s.priebe@profihost.ag>
Fixes: 3d84be799194 ("Btrfs: fix BUG_ON in btrfs_orphan_add() when delete unused block group")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-11-14 07:57:17 +08:00
|
|
|
trans = btrfs_start_trans_remove_block_group(fs_info,
|
|
|
|
block_group->key.objectid);
|
2014-09-18 23:20:02 +08:00
|
|
|
if (IS_ERR(trans)) {
|
2016-06-23 06:54:24 +08:00
|
|
|
btrfs_dec_block_group_ro(block_group);
|
2014-09-18 23:20:02 +08:00
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
goto next;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We could have pending pinned extents for this block group,
|
|
|
|
* just delete them, we don't care about them anymore.
|
|
|
|
*/
|
|
|
|
start = block_group->key.objectid;
|
|
|
|
end = start + block_group->key.offset - 1;
|
2015-01-30 03:18:25 +08:00
|
|
|
/*
|
|
|
|
* Hold the unused_bg_unpin_mutex lock to avoid racing with
|
|
|
|
* btrfs_finish_extent_commit(). If we are at transaction N,
|
|
|
|
* another task might be running finish_extent_commit() for the
|
|
|
|
* previous transaction N - 1, and have seen a range belonging
|
|
|
|
* to the block group in freed_extents[] before we were able to
|
|
|
|
* clear the whole block group range from freed_extents[]. This
|
|
|
|
* means that task can lookup for the block group after we
|
|
|
|
* unpinned it from freed_extents[] and removed it, leading to
|
|
|
|
* a BUG_ON() at btrfs_unpin_extent_range().
|
|
|
|
*/
|
|
|
|
mutex_lock(&fs_info->unused_bg_unpin_mutex);
|
Btrfs: fix freeing used extent after removing empty block group
Due to ignoring errors returned by clear_extent_bits (at the moment only
-ENOMEM is possible), we can end up freeing an extent that is actually in
use (i.e. return the extent to the free space cache).
The sequence of steps that lead to this:
1) Cleaner thread starts execution and calls btrfs_delete_unused_bgs(), with
the goal of freeing empty block groups;
2) btrfs_delete_unused_bgs() finds an empty block group, joins the current
transaction (or starts a new one if none is running) and attempts to
clear the EXTENT_DIRTY bit for the block group's range from freed_extents[0]
and freed_extents[1] (of which one corresponds to fs_info->pinned_extents);
3) Clearing the EXTENT_DIRTY bit (via clear_extent_bits()) fails with
-ENOMEM, but such error is ignored and btrfs_delete_unused_bgs() proceeds
to delete the block group and the respective chunk, while pinned_extents
remains with that bit set for the whole (or a part of the) range covered
by the block group;
4) Later while the transaction is still running, the chunk ends up being reused
for a new block group (maybe for different purpose, data or metadata), and
extents belonging to the new block group are allocated for file data or btree
nodes/leafs;
5) The current transaction is committed, meaning that we unpinned one or more
extents from the new block group (through btrfs_finish_extent_commit() and
unpin_extent_range()) which are now being used for new file data or new
metadata (through btrfs_finish_extent_commit() and unpin_extent_range()).
And unpinning means we returned the extents to the free space cache of the
new block group, which implies those extents can be used for future allocations
while they're still in use.
Alternatively, we can hit a BUG_ON() when doing a lookup for a block group's cache
object in unpin_extent_range() if a new block group didn't end up being allocated for
the same chunk (step 4 above).
Fix this by not freeing the block group and chunk if we fail to clear the dirty bit.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-11-03 22:08:39 +08:00
|
|
|
ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
|
2016-04-27 05:54:39 +08:00
|
|
|
EXTENT_DIRTY);
|
Btrfs: fix freeing used extent after removing empty block group
Due to ignoring errors returned by clear_extent_bits (at the moment only
-ENOMEM is possible), we can end up freeing an extent that is actually in
use (i.e. return the extent to the free space cache).
The sequence of steps that lead to this:
1) Cleaner thread starts execution and calls btrfs_delete_unused_bgs(), with
the goal of freeing empty block groups;
2) btrfs_delete_unused_bgs() finds an empty block group, joins the current
transaction (or starts a new one if none is running) and attempts to
clear the EXTENT_DIRTY bit for the block group's range from freed_extents[0]
and freed_extents[1] (of which one corresponds to fs_info->pinned_extents);
3) Clearing the EXTENT_DIRTY bit (via clear_extent_bits()) fails with
-ENOMEM, but such error is ignored and btrfs_delete_unused_bgs() proceeds
to delete the block group and the respective chunk, while pinned_extents
remains with that bit set for the whole (or a part of the) range covered
by the block group;
4) Later while the transaction is still running, the chunk ends up being reused
for a new block group (maybe for different purpose, data or metadata), and
extents belonging to the new block group are allocated for file data or btree
nodes/leafs;
5) The current transaction is committed, meaning that we unpinned one or more
extents from the new block group (through btrfs_finish_extent_commit() and
unpin_extent_range()) which are now being used for new file data or new
metadata (through btrfs_finish_extent_commit() and unpin_extent_range()).
And unpinning means we returned the extents to the free space cache of the
new block group, which implies those extents can be used for future allocations
while they're still in use.
Alternatively, we can hit a BUG_ON() when doing a lookup for a block group's cache
object in unpin_extent_range() if a new block group didn't end up being allocated for
the same chunk (step 4 above).
Fix this by not freeing the block group and chunk if we fail to clear the dirty bit.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-11-03 22:08:39 +08:00
|
|
|
if (ret) {
|
2015-01-30 03:18:25 +08:00
|
|
|
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
|
2016-06-23 06:54:24 +08:00
|
|
|
btrfs_dec_block_group_ro(block_group);
|
Btrfs: fix freeing used extent after removing empty block group
Due to ignoring errors returned by clear_extent_bits (at the moment only
-ENOMEM is possible), we can end up freeing an extent that is actually in
use (i.e. return the extent to the free space cache).
The sequence of steps that lead to this:
1) Cleaner thread starts execution and calls btrfs_delete_unused_bgs(), with
the goal of freeing empty block groups;
2) btrfs_delete_unused_bgs() finds an empty block group, joins the current
transaction (or starts a new one if none is running) and attempts to
clear the EXTENT_DIRTY bit for the block group's range from freed_extents[0]
and freed_extents[1] (of which one corresponds to fs_info->pinned_extents);
3) Clearing the EXTENT_DIRTY bit (via clear_extent_bits()) fails with
-ENOMEM, but such error is ignored and btrfs_delete_unused_bgs() proceeds
to delete the block group and the respective chunk, while pinned_extents
remains with that bit set for the whole (or a part of the) range covered
by the block group;
4) Later while the transaction is still running, the chunk ends up being reused
for a new block group (maybe for different purpose, data or metadata), and
extents belonging to the new block group are allocated for file data or btree
nodes/leafs;
5) The current transaction is committed, meaning that we unpinned one or more
extents from the new block group (through btrfs_finish_extent_commit() and
unpin_extent_range()) which are now being used for new file data or new
metadata (through btrfs_finish_extent_commit() and unpin_extent_range()).
And unpinning means we returned the extents to the free space cache of the
new block group, which implies those extents can be used for future allocations
while they're still in use.
Alternatively, we can hit a BUG_ON() when doing a lookup for a block group's cache
object in unpin_extent_range() if a new block group didn't end up being allocated for
the same chunk (step 4 above).
Fix this by not freeing the block group and chunk if we fail to clear the dirty bit.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-11-03 22:08:39 +08:00
|
|
|
goto end_trans;
|
|
|
|
}
|
|
|
|
ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
|
2016-04-27 05:54:39 +08:00
|
|
|
EXTENT_DIRTY);
|
Btrfs: fix freeing used extent after removing empty block group
Due to ignoring errors returned by clear_extent_bits (at the moment only
-ENOMEM is possible), we can end up freeing an extent that is actually in
use (i.e. return the extent to the free space cache).
The sequence of steps that lead to this:
1) Cleaner thread starts execution and calls btrfs_delete_unused_bgs(), with
the goal of freeing empty block groups;
2) btrfs_delete_unused_bgs() finds an empty block group, joins the current
transaction (or starts a new one if none is running) and attempts to
clear the EXTENT_DIRTY bit for the block group's range from freed_extents[0]
and freed_extents[1] (of which one corresponds to fs_info->pinned_extents);
3) Clearing the EXTENT_DIRTY bit (via clear_extent_bits()) fails with
-ENOMEM, but such error is ignored and btrfs_delete_unused_bgs() proceeds
to delete the block group and the respective chunk, while pinned_extents
remains with that bit set for the whole (or a part of the) range covered
by the block group;
4) Later while the transaction is still running, the chunk ends up being reused
for a new block group (maybe for different purpose, data or metadata), and
extents belonging to the new block group are allocated for file data or btree
nodes/leafs;
5) The current transaction is committed, meaning that we unpinned one or more
extents from the new block group (through btrfs_finish_extent_commit() and
unpin_extent_range()) which are now being used for new file data or new
metadata (through btrfs_finish_extent_commit() and unpin_extent_range()).
And unpinning means we returned the extents to the free space cache of the
new block group, which implies those extents can be used for future allocations
while they're still in use.
Alternatively, we can hit a BUG_ON() when doing a lookup for a block group's cache
object in unpin_extent_range() if a new block group didn't end up being allocated for
the same chunk (step 4 above).
Fix this by not freeing the block group and chunk if we fail to clear the dirty bit.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-11-03 22:08:39 +08:00
|
|
|
if (ret) {
|
2015-01-30 03:18:25 +08:00
|
|
|
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
|
2016-06-23 06:54:24 +08:00
|
|
|
btrfs_dec_block_group_ro(block_group);
|
Btrfs: fix freeing used extent after removing empty block group
Due to ignoring errors returned by clear_extent_bits (at the moment only
-ENOMEM is possible), we can end up freeing an extent that is actually in
use (i.e. return the extent to the free space cache).
The sequence of steps that lead to this:
1) Cleaner thread starts execution and calls btrfs_delete_unused_bgs(), with
the goal of freeing empty block groups;
2) btrfs_delete_unused_bgs() finds an empty block group, joins the current
transaction (or starts a new one if none is running) and attempts to
clear the EXTENT_DIRTY bit for the block group's range from freed_extents[0]
and freed_extents[1] (of which one corresponds to fs_info->pinned_extents);
3) Clearing the EXTENT_DIRTY bit (via clear_extent_bits()) fails with
-ENOMEM, but such error is ignored and btrfs_delete_unused_bgs() proceeds
to delete the block group and the respective chunk, while pinned_extents
remains with that bit set for the whole (or a part of the) range covered
by the block group;
4) Later while the transaction is still running, the chunk ends up being reused
for a new block group (maybe for different purpose, data or metadata), and
extents belonging to the new block group are allocated for file data or btree
nodes/leafs;
5) The current transaction is committed, meaning that we unpinned one or more
extents from the new block group (through btrfs_finish_extent_commit() and
unpin_extent_range()) which are now being used for new file data or new
metadata (through btrfs_finish_extent_commit() and unpin_extent_range()).
And unpinning means we returned the extents to the free space cache of the
new block group, which implies those extents can be used for future allocations
while they're still in use.
Alternatively, we can hit a BUG_ON() when doing a lookup for a block group's cache
object in unpin_extent_range() if a new block group didn't end up being allocated for
the same chunk (step 4 above).
Fix this by not freeing the block group and chunk if we fail to clear the dirty bit.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-11-03 22:08:39 +08:00
|
|
|
goto end_trans;
|
|
|
|
}
|
2015-01-30 03:18:25 +08:00
|
|
|
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
|
2014-09-18 23:20:02 +08:00
|
|
|
|
|
|
|
/* Reset pinned so btrfs_put_block_group doesn't complain */
|
2015-02-25 14:17:20 +08:00
|
|
|
spin_lock(&space_info->lock);
|
|
|
|
spin_lock(&block_group->lock);
|
|
|
|
|
2019-06-19 04:09:21 +08:00
|
|
|
btrfs_space_info_update_bytes_pinned(fs_info, space_info,
|
|
|
|
-block_group->pinned);
|
2015-02-25 14:17:20 +08:00
|
|
|
space_info->bytes_readonly += block_group->pinned;
|
btrfs: use customized batch size for total_bytes_pinned
In commit b150a4f10d878 ("Btrfs: use a percpu to keep track of possibly
pinned bytes") we use total_bytes_pinned to track how many bytes we are
going to free in this transaction. When we are close to ENOSPC, we check it
and know if we can make the allocation by commit the current transaction.
For every data/metadata extent we are going to free, we add
total_bytes_pinned in btrfs_free_extent() and btrfs_free_tree_block(), and
release it in unpin_extent_range() when we finish the transaction. So this
is a variable we frequently update but rarely read - just the suitable
use of percpu_counter. But in previous commit we update total_bytes_pinned
by default 32 batch size, making every update essentially a spin lock
protected update. Since every spin lock/unlock operation involves syncing
a globally used variable and some kind of barrier in a SMP system, this is
more expensive than using total_bytes_pinned as a simple atomic64_t.
So fix this by using a customized batch size. Since we only read
total_bytes_pinned when we are close to ENOSPC and fail to allocate new
chunk, we can use a really large batch size and have nearly no penalty
in most cases.
[Test]
We tested the patch on a 4-cores x86 machine:
1. fallocate a 16GiB size test file
2. take snapshot (so all following writes will be COW)
3. run a 180 sec, 4 jobs, 4K random write fio on test file
We also added a temporary lockdep class on percpu_counter's spin lock
used by total_bytes_pinned to track it by lock_stat.
[Results]
unpatched:
lock_stat version 0.4
-----------------------------------------------------------------------
class name con-bounces contentions
waittime-min waittime-max waittime-total waittime-avg acq-bounces
acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
total_bytes_pinned_percpu: 82 82
0.21 0.61 29.46 0.36 298340
635973 0.09 11.01 173476.25 0.27
patched:
lock_stat version 0.4
-----------------------------------------------------------------------
class name con-bounces contentions
waittime-min waittime-max waittime-total waittime-avg acq-bounces
acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
total_bytes_pinned_percpu: 1 1
0.62 0.62 0.62 0.62 13601
31542 0.14 9.61 11016.90 0.35
[Analysis]
Since the spin lock only protects a single in-memory variable, the
contentions (number of lock acquisitions that had to wait) in both
unpatched and patched version are low. But when we see acquisitions and
acq-bounces, we get much lower counts in patched version. Here the most
important metric is acq-bounces. It means how many times the lock gets
transferred between different cpus, so the patch can really reduce
cacheline bouncing of spin lock (also the global counter of percpu_counter)
in a SMP system.
Fixes: b150a4f10d878 ("Btrfs: use a percpu to keep track of possibly pinned bytes")
Signed-off-by: Ethan Lien <ethanlien@synology.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-07-13 16:50:42 +08:00
|
|
|
percpu_counter_add_batch(&space_info->total_bytes_pinned,
|
|
|
|
-block_group->pinned,
|
|
|
|
BTRFS_TOTAL_BYTES_PINNED_BATCH);
|
2014-09-18 23:20:02 +08:00
|
|
|
block_group->pinned = 0;
|
|
|
|
|
2015-02-25 14:17:20 +08:00
|
|
|
spin_unlock(&block_group->lock);
|
|
|
|
spin_unlock(&space_info->lock);
|
|
|
|
|
2015-06-15 21:41:19 +08:00
|
|
|
/* DISCARD can flip during remount */
|
2016-06-23 06:54:23 +08:00
|
|
|
trimming = btrfs_test_opt(fs_info, DISCARD);
|
2015-06-15 21:41:19 +08:00
|
|
|
|
|
|
|
/* Implicit trim during transaction commit. */
|
|
|
|
if (trimming)
|
|
|
|
btrfs_get_block_group_trimming(block_group);
|
|
|
|
|
2014-09-18 23:20:02 +08:00
|
|
|
/*
|
|
|
|
* Btrfs_remove_chunk will abort the transaction if things go
|
|
|
|
* horribly wrong.
|
|
|
|
*/
|
2018-07-21 00:37:53 +08:00
|
|
|
ret = btrfs_remove_chunk(trans, block_group->key.objectid);
|
2015-06-15 21:41:19 +08:00
|
|
|
|
|
|
|
if (ret) {
|
|
|
|
if (trimming)
|
|
|
|
btrfs_put_block_group_trimming(block_group);
|
|
|
|
goto end_trans;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we're not mounted with -odiscard, we can just forget
|
|
|
|
* about this block group. Otherwise we'll need to wait
|
|
|
|
* until transaction commit to do the actual discard.
|
|
|
|
*/
|
|
|
|
if (trimming) {
|
2015-11-27 20:16:16 +08:00
|
|
|
spin_lock(&fs_info->unused_bgs_lock);
|
|
|
|
/*
|
|
|
|
* A concurrent scrub might have added us to the list
|
|
|
|
* fs_info->unused_bgs, so use a list_move operation
|
|
|
|
* to add the block group to the deleted_bgs list.
|
|
|
|
*/
|
2015-06-15 21:41:19 +08:00
|
|
|
list_move(&block_group->bg_list,
|
|
|
|
&trans->transaction->deleted_bgs);
|
2015-11-27 20:16:16 +08:00
|
|
|
spin_unlock(&fs_info->unused_bgs_lock);
|
2015-06-15 21:41:19 +08:00
|
|
|
btrfs_get_block_group(block_group);
|
|
|
|
}
|
Btrfs: fix freeing used extent after removing empty block group
Due to ignoring errors returned by clear_extent_bits (at the moment only
-ENOMEM is possible), we can end up freeing an extent that is actually in
use (i.e. return the extent to the free space cache).
The sequence of steps that lead to this:
1) Cleaner thread starts execution and calls btrfs_delete_unused_bgs(), with
the goal of freeing empty block groups;
2) btrfs_delete_unused_bgs() finds an empty block group, joins the current
transaction (or starts a new one if none is running) and attempts to
clear the EXTENT_DIRTY bit for the block group's range from freed_extents[0]
and freed_extents[1] (of which one corresponds to fs_info->pinned_extents);
3) Clearing the EXTENT_DIRTY bit (via clear_extent_bits()) fails with
-ENOMEM, but such error is ignored and btrfs_delete_unused_bgs() proceeds
to delete the block group and the respective chunk, while pinned_extents
remains with that bit set for the whole (or a part of the) range covered
by the block group;
4) Later while the transaction is still running, the chunk ends up being reused
for a new block group (maybe for different purpose, data or metadata), and
extents belonging to the new block group are allocated for file data or btree
nodes/leafs;
5) The current transaction is committed, meaning that we unpinned one or more
extents from the new block group (through btrfs_finish_extent_commit() and
unpin_extent_range()) which are now being used for new file data or new
metadata (through btrfs_finish_extent_commit() and unpin_extent_range()).
And unpinning means we returned the extents to the free space cache of the
new block group, which implies those extents can be used for future allocations
while they're still in use.
Alternatively, we can hit a BUG_ON() when doing a lookup for a block group's cache
object in unpin_extent_range() if a new block group didn't end up being allocated for
the same chunk (step 4 above).
Fix this by not freeing the block group and chunk if we fail to clear the dirty bit.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-11-03 22:08:39 +08:00
|
|
|
end_trans:
|
2016-09-10 09:39:03 +08:00
|
|
|
btrfs_end_transaction(trans);
|
2014-09-18 23:20:02 +08:00
|
|
|
next:
|
2015-10-08 18:46:44 +08:00
|
|
|
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
|
2014-09-18 23:20:02 +08:00
|
|
|
btrfs_put_block_group(block_group);
|
|
|
|
spin_lock(&fs_info->unused_bgs_lock);
|
|
|
|
}
|
|
|
|
spin_unlock(&fs_info->unused_bgs_lock);
|
|
|
|
}
|
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
|
|
|
|
u64 start, u64 end)
|
2011-01-06 19:30:25 +08:00
|
|
|
{
|
2016-06-23 06:54:24 +08:00
|
|
|
return unpin_extent_range(fs_info, start, end, false);
|
2011-01-06 19:30:25 +08:00
|
|
|
}
|
|
|
|
|
2015-06-15 21:41:17 +08:00
|
|
|
/*
|
|
|
|
* It used to be that old block groups would be left around forever.
|
|
|
|
* Iterating over them would be enough to trim unused space. Since we
|
|
|
|
* now automatically remove them, we also need to iterate over unallocated
|
|
|
|
* space.
|
|
|
|
*
|
|
|
|
* We don't want a transaction for this since the discard may take a
|
|
|
|
* substantial amount of time. We don't require that a transaction be
|
|
|
|
* running, but we do need to take a running transaction into account
|
2018-09-07 05:18:16 +08:00
|
|
|
* to ensure that we're not discarding chunks that were released or
|
|
|
|
* allocated in the current transaction.
|
2015-06-15 21:41:17 +08:00
|
|
|
*
|
|
|
|
* Holding the chunks lock will prevent other threads from allocating
|
|
|
|
* or releasing chunks, but it won't prevent a running transaction
|
|
|
|
* from committing and releasing the memory that the pending chunks
|
|
|
|
* list head uses. For that, we need to take a reference to the
|
2018-09-07 05:18:16 +08:00
|
|
|
* transaction and hold the commit root sem. We only need to hold
|
|
|
|
* it while performing the free space search since we have already
|
|
|
|
* held back allocations.
|
2015-06-15 21:41:17 +08:00
|
|
|
*/
|
2019-06-03 18:06:00 +08:00
|
|
|
static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
|
2015-06-15 21:41:17 +08:00
|
|
|
{
|
2019-06-03 18:06:00 +08:00
|
|
|
u64 start = SZ_1M, len = 0, end = 0;
|
2015-06-15 21:41:17 +08:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
*trimmed = 0;
|
|
|
|
|
2018-09-07 05:18:15 +08:00
|
|
|
/* Discard not supported = nothing to do. */
|
|
|
|
if (!blk_queue_discard(bdev_get_queue(device->bdev)))
|
|
|
|
return 0;
|
|
|
|
|
2018-11-28 19:05:13 +08:00
|
|
|
/* Not writable = nothing to do. */
|
2017-12-04 12:54:52 +08:00
|
|
|
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
|
2015-06-15 21:41:17 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* No free space = nothing to do. */
|
|
|
|
if (device->total_bytes <= device->bytes_used)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
ret = 0;
|
|
|
|
|
|
|
|
while (1) {
|
2016-06-23 06:54:56 +08:00
|
|
|
struct btrfs_fs_info *fs_info = device->fs_info;
|
2015-06-15 21:41:17 +08:00
|
|
|
u64 bytes;
|
|
|
|
|
|
|
|
ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
|
|
|
|
if (ret)
|
2018-09-07 05:18:16 +08:00
|
|
|
break;
|
2015-06-15 21:41:17 +08:00
|
|
|
|
2019-03-27 20:24:18 +08:00
|
|
|
find_first_clear_extent_bit(&device->alloc_state, start,
|
|
|
|
&start, &end,
|
|
|
|
CHUNK_TRIMMED | CHUNK_ALLOCATED);
|
2019-06-03 18:06:01 +08:00
|
|
|
|
|
|
|
/* Ensure we skip the reserved area in the first 1M */
|
|
|
|
start = max_t(u64, start, SZ_1M);
|
|
|
|
|
2019-03-27 20:24:18 +08:00
|
|
|
/*
|
|
|
|
* If find_first_clear_extent_bit find a range that spans the
|
|
|
|
* end of the device it will set end to -1, in this case it's up
|
|
|
|
* to the caller to trim the value to the size of the device.
|
|
|
|
*/
|
|
|
|
end = min(end, device->total_bytes - 1);
|
2019-06-03 18:06:01 +08:00
|
|
|
|
2019-03-27 20:24:18 +08:00
|
|
|
len = end - start + 1;
|
2015-06-15 21:41:17 +08:00
|
|
|
|
2019-03-27 20:24:18 +08:00
|
|
|
/* We didn't find any extents */
|
|
|
|
if (!len) {
|
2015-06-15 21:41:17 +08:00
|
|
|
mutex_unlock(&fs_info->chunk_mutex);
|
2019-03-27 20:24:18 +08:00
|
|
|
ret = 0;
|
2015-06-15 21:41:17 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2019-03-27 20:24:18 +08:00
|
|
|
ret = btrfs_issue_discard(device->bdev, start, len,
|
|
|
|
&bytes);
|
|
|
|
if (!ret)
|
|
|
|
set_extent_bits(&device->alloc_state, start,
|
|
|
|
start + bytes - 1,
|
|
|
|
CHUNK_TRIMMED);
|
2015-06-15 21:41:17 +08:00
|
|
|
mutex_unlock(&fs_info->chunk_mutex);
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
break;
|
|
|
|
|
|
|
|
start += len;
|
|
|
|
*trimmed += bytes;
|
|
|
|
|
|
|
|
if (fatal_signal_pending(current)) {
|
|
|
|
ret = -ERESTARTSYS;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
cond_resched();
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-09-07 14:16:23 +08:00
|
|
|
/*
|
|
|
|
* Trim the whole filesystem by:
|
|
|
|
* 1) trimming the free space in each block group
|
|
|
|
* 2) trimming the unallocated space on each device
|
|
|
|
*
|
|
|
|
* This will also continue trimming even if a block group or device encounters
|
|
|
|
* an error. The return value will be the last error, or 0 if nothing bad
|
|
|
|
* happens.
|
|
|
|
*/
|
2016-06-23 06:54:24 +08:00
|
|
|
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
|
2011-03-24 18:24:28 +08:00
|
|
|
{
|
|
|
|
struct btrfs_block_group_cache *cache = NULL;
|
2015-06-15 21:41:17 +08:00
|
|
|
struct btrfs_device *device;
|
|
|
|
struct list_head *devices;
|
2011-03-24 18:24:28 +08:00
|
|
|
u64 group_trimmed;
|
2019-05-28 16:21:54 +08:00
|
|
|
u64 range_end = U64_MAX;
|
2011-03-24 18:24:28 +08:00
|
|
|
u64 start;
|
|
|
|
u64 end;
|
|
|
|
u64 trimmed = 0;
|
2018-09-07 14:16:23 +08:00
|
|
|
u64 bg_failed = 0;
|
|
|
|
u64 dev_failed = 0;
|
|
|
|
int bg_ret = 0;
|
|
|
|
int dev_ret = 0;
|
2011-03-24 18:24:28 +08:00
|
|
|
int ret = 0;
|
|
|
|
|
2019-05-28 16:21:54 +08:00
|
|
|
/*
|
|
|
|
* Check range overflow if range->len is set.
|
|
|
|
* The default range->len is U64_MAX.
|
|
|
|
*/
|
|
|
|
if (range->len != U64_MAX &&
|
|
|
|
check_add_overflow(range->start, range->len, &range_end))
|
|
|
|
return -EINVAL;
|
|
|
|
|
btrfs: Ensure btrfs_trim_fs can trim the whole filesystem
[BUG]
fstrim on some btrfs only trims the unallocated space, not trimming any
space in existing block groups.
[CAUSE]
Before fstrim_range passed to btrfs_trim_fs(), it gets truncated to
range [0, super->total_bytes). So later btrfs_trim_fs() will only be
able to trim block groups in range [0, super->total_bytes).
While for btrfs, any bytenr aligned to sectorsize is valid, since btrfs
uses its logical address space, there is nothing limiting the location
where we put block groups.
For filesystem with frequent balance, it's quite easy to relocate all
block groups and bytenr of block groups will start beyond
super->total_bytes.
In that case, btrfs will not trim existing block groups.
[FIX]
Just remove the truncation in btrfs_ioctl_fitrim(), so btrfs_trim_fs()
can get the unmodified range, which is normally set to [0, U64_MAX].
Reported-by: Chris Murphy <lists@colorremedies.com>
Fixes: f4c697e6406d ("btrfs: return EINVAL if start > total_bytes in fitrim ioctl")
CC: <stable@vger.kernel.org> # v4.4+
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-09-07 14:16:24 +08:00
|
|
|
cache = btrfs_lookup_first_block_group(fs_info, range->start);
|
2019-03-20 19:01:07 +08:00
|
|
|
for (; cache; cache = next_block_group(cache)) {
|
2019-05-28 16:21:54 +08:00
|
|
|
if (cache->key.objectid >= range_end) {
|
2011-03-24 18:24:28 +08:00
|
|
|
btrfs_put_block_group(cache);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
start = max(range->start, cache->key.objectid);
|
2019-05-28 16:21:54 +08:00
|
|
|
end = min(range_end, cache->key.objectid + cache->key.offset);
|
2011-03-24 18:24:28 +08:00
|
|
|
|
|
|
|
if (end - start >= range->minlen) {
|
|
|
|
if (!block_group_cache_done(cache)) {
|
2012-12-27 17:01:18 +08:00
|
|
|
ret = cache_block_group(cache, 0);
|
2013-06-13 01:56:06 +08:00
|
|
|
if (ret) {
|
2018-09-07 14:16:23 +08:00
|
|
|
bg_failed++;
|
|
|
|
bg_ret = ret;
|
|
|
|
continue;
|
2013-06-13 01:56:06 +08:00
|
|
|
}
|
|
|
|
ret = wait_block_group_cache_done(cache);
|
|
|
|
if (ret) {
|
2018-09-07 14:16:23 +08:00
|
|
|
bg_failed++;
|
|
|
|
bg_ret = ret;
|
|
|
|
continue;
|
2013-06-13 01:56:06 +08:00
|
|
|
}
|
2011-03-24 18:24:28 +08:00
|
|
|
}
|
|
|
|
ret = btrfs_trim_block_group(cache,
|
|
|
|
&group_trimmed,
|
|
|
|
start,
|
|
|
|
end,
|
|
|
|
range->minlen);
|
|
|
|
|
|
|
|
trimmed += group_trimmed;
|
|
|
|
if (ret) {
|
2018-09-07 14:16:23 +08:00
|
|
|
bg_failed++;
|
|
|
|
bg_ret = ret;
|
|
|
|
continue;
|
2011-03-24 18:24:28 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-09-07 14:16:23 +08:00
|
|
|
if (bg_failed)
|
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"failed to trim %llu block group(s), last error %d",
|
|
|
|
bg_failed, bg_ret);
|
2016-06-23 06:54:23 +08:00
|
|
|
mutex_lock(&fs_info->fs_devices->device_list_mutex);
|
2018-09-07 05:18:14 +08:00
|
|
|
devices = &fs_info->fs_devices->devices;
|
|
|
|
list_for_each_entry(device, devices, dev_list) {
|
2019-06-03 18:06:00 +08:00
|
|
|
ret = btrfs_trim_free_extents(device, &group_trimmed);
|
2018-09-07 14:16:23 +08:00
|
|
|
if (ret) {
|
|
|
|
dev_failed++;
|
|
|
|
dev_ret = ret;
|
2015-06-15 21:41:17 +08:00
|
|
|
break;
|
2018-09-07 14:16:23 +08:00
|
|
|
}
|
2015-06-15 21:41:17 +08:00
|
|
|
|
|
|
|
trimmed += group_trimmed;
|
|
|
|
}
|
2016-06-23 06:54:23 +08:00
|
|
|
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
|
2015-06-15 21:41:17 +08:00
|
|
|
|
2018-09-07 14:16:23 +08:00
|
|
|
if (dev_failed)
|
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"failed to trim %llu device(s), last error %d",
|
|
|
|
dev_failed, dev_ret);
|
2011-03-24 18:24:28 +08:00
|
|
|
range->len = trimmed;
|
2018-09-07 14:16:23 +08:00
|
|
|
if (bg_ret)
|
|
|
|
return bg_ret;
|
|
|
|
return dev_ret;
|
2011-03-24 18:24:28 +08:00
|
|
|
}
|
2014-03-06 13:38:19 +08:00
|
|
|
|
|
|
|
/*
|
2017-06-22 08:19:11 +08:00
|
|
|
* btrfs_{start,end}_write_no_snapshotting() are similar to
|
Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain about it.
For example, if we perform the following file operations:
$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
-c "pwrite -S 0xaa -b 32K 0 32K" \
-c "fsync" \
-c "pwrite -S 0xbb -b 32770 16K 32770" \
-c "truncate 90123" \
/mnt/foobar
and the snapshot creation ioctl was just called before the second write, we often
can get the following inode items in the snapshot's btree:
item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0
There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[
for which there's no file extent item covering it. This is because the file write
and file truncate operations happened both right after the snapshot creation ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the
ordered extent that matches the write and, in btrfs_setsize(), we were able to call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possibe to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.
Btrfs' fsck tool complains about such cases with a message like the following:
"root 331 inode 257 errors 100, file extent discount"
>From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:
1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured
But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-10-29 19:57:59 +08:00
|
|
|
* mnt_{want,drop}_write(), they are used to prevent some tasks from writing
|
|
|
|
* data into the page cache through nocow before the subvolume is snapshoted,
|
|
|
|
* but flush the data into disk after the snapshot creation, or to prevent
|
2017-06-22 08:19:11 +08:00
|
|
|
* operations while snapshotting is ongoing and that cause the snapshot to be
|
Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain about it.
For example, if we perform the following file operations:
$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
-c "pwrite -S 0xaa -b 32K 0 32K" \
-c "fsync" \
-c "pwrite -S 0xbb -b 32770 16K 32770" \
-c "truncate 90123" \
/mnt/foobar
and the snapshot creation ioctl was just called before the second write, we often
can get the following inode items in the snapshot's btree:
item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0
There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[
for which there's no file extent item covering it. This is because the file write
and file truncate operations happened both right after the snapshot creation ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the
ordered extent that matches the write and, in btrfs_setsize(), we were able to call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possibe to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.
Btrfs' fsck tool complains about such cases with a message like the following:
"root 331 inode 257 errors 100, file extent discount"
>From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:
1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured
But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-10-29 19:57:59 +08:00
|
|
|
* inconsistent (writes followed by expanding truncates for example).
|
2014-03-06 13:38:19 +08:00
|
|
|
*/
|
2017-06-22 08:19:11 +08:00
|
|
|
void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
|
2014-03-06 13:38:19 +08:00
|
|
|
{
|
|
|
|
percpu_counter_dec(&root->subv_writers->counter);
|
2018-02-26 23:15:17 +08:00
|
|
|
cond_wake_up(&root->subv_writers->wait);
|
2014-03-06 13:38:19 +08:00
|
|
|
}
|
|
|
|
|
2017-06-22 08:19:11 +08:00
|
|
|
int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
|
2014-03-06 13:38:19 +08:00
|
|
|
{
|
2017-06-22 08:19:11 +08:00
|
|
|
if (atomic_read(&root->will_be_snapshotted))
|
2014-03-06 13:38:19 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
percpu_counter_inc(&root->subv_writers->counter);
|
|
|
|
/*
|
|
|
|
* Make sure counter is updated before we check for snapshot creation.
|
|
|
|
*/
|
|
|
|
smp_mb();
|
2017-06-22 08:19:11 +08:00
|
|
|
if (atomic_read(&root->will_be_snapshotted)) {
|
|
|
|
btrfs_end_write_no_snapshotting(root);
|
2014-03-06 13:38:19 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
return 1;
|
|
|
|
}
|
2016-01-06 18:56:36 +08:00
|
|
|
|
|
|
|
void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
|
|
|
|
{
|
|
|
|
while (true) {
|
|
|
|
int ret;
|
|
|
|
|
2017-06-22 08:19:11 +08:00
|
|
|
ret = btrfs_start_write_no_snapshotting(root);
|
2016-01-06 18:56:36 +08:00
|
|
|
if (ret)
|
|
|
|
break;
|
2018-03-15 18:43:08 +08:00
|
|
|
wait_var_event(&root->will_be_snapshotted,
|
|
|
|
!atomic_read(&root->will_be_snapshotted));
|
2016-01-06 18:56:36 +08:00
|
|
|
}
|
|
|
|
}
|
2018-05-22 16:43:47 +08:00
|
|
|
|
|
|
|
void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
|
|
|
|
{
|
|
|
|
struct btrfs_fs_info *fs_info = bg->fs_info;
|
|
|
|
|
|
|
|
spin_lock(&fs_info->unused_bgs_lock);
|
|
|
|
if (list_empty(&bg->bg_list)) {
|
|
|
|
btrfs_get_block_group(bg);
|
|
|
|
trace_btrfs_add_unused_block_group(bg);
|
|
|
|
list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
|
|
|
|
}
|
|
|
|
spin_unlock(&fs_info->unused_bgs_lock);
|
|
|
|
}
|