2007-06-12 21:07:21 +08:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2007 Oracle. All rights reserved.
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public
|
|
|
|
* License v2 as published by the Free Software Foundation.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public
|
|
|
|
* License along with this program; if not, write to the
|
|
|
|
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
|
|
* Boston, MA 021110-1307, USA.
|
|
|
|
*/
|
|
|
|
|
2008-04-26 04:53:30 +08:00
|
|
|
#include <linux/kernel.h>
|
2008-02-21 01:07:25 +08:00
|
|
|
#include <linux/bio.h>
|
2007-06-12 18:35:45 +08:00
|
|
|
#include <linux/buffer_head.h>
|
2008-05-03 02:43:14 +08:00
|
|
|
#include <linux/file.h>
|
2007-06-12 18:35:45 +08:00
|
|
|
#include <linux/fs.h>
|
|
|
|
#include <linux/pagemap.h>
|
|
|
|
#include <linux/highmem.h>
|
|
|
|
#include <linux/time.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/string.h>
|
|
|
|
#include <linux/backing-dev.h>
|
|
|
|
#include <linux/mpage.h>
|
|
|
|
#include <linux/swap.h>
|
|
|
|
#include <linux/writeback.h>
|
|
|
|
#include <linux/statfs.h>
|
|
|
|
#include <linux/compat.h>
|
2007-06-16 01:50:00 +08:00
|
|
|
#include <linux/bit_spinlock.h>
|
2007-11-17 00:45:54 +08:00
|
|
|
#include <linux/xattr.h>
|
2008-07-25 00:16:36 +08:00
|
|
|
#include <linux/posix_acl.h>
|
2008-10-31 02:25:28 +08:00
|
|
|
#include <linux/falloc.h>
|
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the followings.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and try to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
wildly available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build test were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable easily on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
|
|
|
#include <linux/slab.h>
|
2011-05-06 21:33:15 +08:00
|
|
|
#include <linux/ratelimit.h>
|
2011-11-30 23:45:38 +08:00
|
|
|
#include <linux/mount.h>
|
2013-01-29 14:04:50 +08:00
|
|
|
#include <linux/btrfs.h>
|
2013-01-30 07:40:14 +08:00
|
|
|
#include <linux/blkdev.h>
|
2013-06-19 22:16:26 +08:00
|
|
|
#include <linux/posix_acl_xattr.h>
|
2015-02-23 00:58:50 +08:00
|
|
|
#include <linux/uio.h>
|
2007-06-12 18:35:45 +08:00
|
|
|
#include "ctree.h"
|
|
|
|
#include "disk-io.h"
|
|
|
|
#include "transaction.h"
|
|
|
|
#include "btrfs_inode.h"
|
|
|
|
#include "print-tree.h"
|
2008-07-18 00:53:50 +08:00
|
|
|
#include "ordered-data.h"
|
2008-08-28 18:21:17 +08:00
|
|
|
#include "xattr.h"
|
2008-09-06 04:13:11 +08:00
|
|
|
#include "tree-log.h"
|
2011-07-22 21:41:52 +08:00
|
|
|
#include "volumes.h"
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
#include "compression.h"
|
Btrfs: Change btree locking to use explicit blocking points
Most of the btrfs metadata operations can be protected by a spinlock,
but some operations still need to schedule.
So far, btrfs has been using a mutex along with a trylock loop,
most of the time it is able to avoid going for the full mutex, so
the trylock loop is a big performance gain.
This commit is step one for getting rid of the blocking locks entirely.
btrfs_tree_lock takes a spinlock, and the code explicitly switches
to a blocking lock when it starts an operation that can schedule.
We'll be able get rid of the blocking locks in smaller pieces over time.
Tracing allows us to find the most common cause of blocking, so we
can start with the hot spots first.
The basic idea is:
btrfs_tree_lock() returns with the spin lock held
btrfs_set_lock_blocking() sets the EXTENT_BUFFER_BLOCKING bit in
the extent buffer flags, and then drops the spin lock. The buffer is
still considered locked by all of the btrfs code.
If btrfs_tree_lock gets the spinlock but finds the blocking bit set, it drops
the spin lock and waits on a wait queue for the blocking bit to go away.
Much of the code that needs to set the blocking bit finishes without actually
blocking a good percentage of the time. So, an adaptive spin is still
used against the blocking bit to avoid very high context switch rates.
btrfs_clear_lock_blocking() clears the blocking bit and returns
with the spinlock held again.
btrfs_tree_unlock() can be called on either blocking or spinning locks,
it does the right thing based on the blocking bit.
ctree.c has a helper function to set/clear all the locked buffers in a
path as blocking.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-02-04 22:25:08 +08:00
|
|
|
#include "locking.h"
|
2011-01-29 06:05:48 +08:00
|
|
|
#include "free-space-cache.h"
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 10:06:11 +08:00
|
|
|
#include "inode-map.h"
|
2013-01-29 11:18:40 +08:00
|
|
|
#include "backref.h"
|
2013-06-19 22:16:26 +08:00
|
|
|
#include "hash.h"
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
The unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essencialy because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir intex items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 19:47:46 +08:00
|
|
|
#include "props.h"
|
2014-12-12 16:44:35 +08:00
|
|
|
#include "qgroup.h"
|
2007-06-12 18:35:45 +08:00
|
|
|
|
|
|
|
struct btrfs_iget_args {
|
2014-01-10 09:28:00 +08:00
|
|
|
struct btrfs_key *location;
|
2007-06-12 18:35:45 +08:00
|
|
|
struct btrfs_root *root;
|
|
|
|
};
|
|
|
|
|
2015-12-09 03:23:20 +08:00
|
|
|
struct btrfs_dio_data {
|
|
|
|
u64 outstanding_extents;
|
|
|
|
u64 reserve;
|
|
|
|
u64 unsubmitted_oe_range_start;
|
|
|
|
u64 unsubmitted_oe_range_end;
|
|
|
|
};
|
|
|
|
|
2009-09-22 08:01:11 +08:00
|
|
|
static const struct inode_operations btrfs_dir_inode_operations;
|
|
|
|
static const struct inode_operations btrfs_symlink_inode_operations;
|
|
|
|
static const struct inode_operations btrfs_dir_ro_inode_operations;
|
|
|
|
static const struct inode_operations btrfs_special_inode_operations;
|
|
|
|
static const struct inode_operations btrfs_file_inode_operations;
|
2009-09-22 08:01:10 +08:00
|
|
|
static const struct address_space_operations btrfs_aops;
|
|
|
|
static const struct address_space_operations btrfs_symlink_aops;
|
2009-10-02 06:43:56 +08:00
|
|
|
static const struct file_operations btrfs_dir_file_operations;
|
2015-11-19 18:42:28 +08:00
|
|
|
static const struct extent_io_ops btrfs_extent_io_ops;
|
2007-06-12 18:35:45 +08:00
|
|
|
|
|
|
|
static struct kmem_cache *btrfs_inode_cachep;
|
|
|
|
struct kmem_cache *btrfs_trans_handle_cachep;
|
|
|
|
struct kmem_cache *btrfs_transaction_cachep;
|
|
|
|
struct kmem_cache *btrfs_path_cachep;
|
2011-01-29 06:05:48 +08:00
|
|
|
struct kmem_cache *btrfs_free_space_cachep;
|
2007-06-12 18:35:45 +08:00
|
|
|
|
|
|
|
#define S_SHIFT 12
|
2015-11-19 18:42:31 +08:00
|
|
|
static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
|
2007-06-12 18:35:45 +08:00
|
|
|
[S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
|
|
|
|
[S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
|
|
|
|
[S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
|
|
|
|
[S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV,
|
|
|
|
[S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO,
|
|
|
|
[S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK,
|
|
|
|
[S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
|
|
|
|
};
|
|
|
|
|
2013-01-12 10:57:22 +08:00
|
|
|
static int btrfs_setsize(struct inode *inode, struct iattr *attr);
|
2011-02-01 04:30:16 +08:00
|
|
|
static int btrfs_truncate(struct inode *inode);
|
2012-05-03 02:00:54 +08:00
|
|
|
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
|
2008-11-07 11:02:51 +08:00
|
|
|
static noinline int cow_file_range(struct inode *inode,
|
|
|
|
struct page *locked_page,
|
|
|
|
u64 start, u64 end, int *page_started,
|
|
|
|
unsigned long *nr_written, int unlock);
|
2012-10-12 04:54:30 +08:00
|
|
|
static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
|
|
|
|
u64 len, u64 orig_start,
|
|
|
|
u64 block_start, u64 block_len,
|
2013-04-05 02:31:27 +08:00
|
|
|
u64 orig_block_len, u64 ram_bytes,
|
|
|
|
int type);
|
2008-07-25 00:17:14 +08:00
|
|
|
|
2013-04-26 04:41:01 +08:00
|
|
|
static int btrfs_dirty_inode(struct inode *inode);
|
2008-07-25 00:17:14 +08:00
|
|
|
|
2015-03-17 05:38:52 +08:00
|
|
|
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
|
|
|
|
void btrfs_test_inode_set_ops(struct inode *inode)
|
|
|
|
{
|
|
|
|
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2009-11-12 17:35:27 +08:00
|
|
|
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
|
2011-02-02 00:05:39 +08:00
|
|
|
struct inode *inode, struct inode *dir,
|
|
|
|
const struct qstr *qstr)
|
2009-02-04 22:29:13 +08:00
|
|
|
{
|
|
|
|
int err;
|
|
|
|
|
2009-11-12 17:35:27 +08:00
|
|
|
err = btrfs_init_acl(trans, inode, dir);
|
2009-02-04 22:29:13 +08:00
|
|
|
if (!err)
|
2011-02-02 00:05:39 +08:00
|
|
|
err = btrfs_xattr_security_init(trans, inode, dir, qstr);
|
2009-02-04 22:29:13 +08:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
/*
|
|
|
|
* this does all the hard work for inserting an inline extent into
|
|
|
|
* the btree. The caller should have done a btrfs_drop_extents so that
|
|
|
|
* no overlapping inline items exist in the btree
|
|
|
|
*/
|
2014-05-22 04:35:51 +08:00
|
|
|
static int insert_inline_extent(struct btrfs_trans_handle *trans,
|
2014-01-07 19:42:27 +08:00
|
|
|
struct btrfs_path *path, int extent_inserted,
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
struct btrfs_root *root, struct inode *inode,
|
|
|
|
u64 start, size_t size, size_t compressed_size,
|
2011-03-28 16:30:38 +08:00
|
|
|
int compress_type,
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
struct page **compressed_pages)
|
|
|
|
{
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct page *page = NULL;
|
|
|
|
char *kaddr;
|
|
|
|
unsigned long ptr;
|
|
|
|
struct btrfs_file_extent_item *ei;
|
|
|
|
int err = 0;
|
|
|
|
int ret;
|
|
|
|
size_t cur_size = size;
|
|
|
|
unsigned long offset;
|
|
|
|
|
2011-03-28 16:30:38 +08:00
|
|
|
if (compressed_size && compressed_pages)
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
cur_size = compressed_size;
|
|
|
|
|
2014-01-07 19:42:27 +08:00
|
|
|
inode_add_bytes(inode, size);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
|
2014-01-07 19:42:27 +08:00
|
|
|
if (!extent_inserted) {
|
|
|
|
struct btrfs_key key;
|
|
|
|
size_t datasize;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
|
2014-01-07 19:42:27 +08:00
|
|
|
key.objectid = btrfs_ino(inode);
|
|
|
|
key.offset = start;
|
2014-06-05 00:41:45 +08:00
|
|
|
key.type = BTRFS_EXTENT_DATA_KEY;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
|
2014-01-07 19:42:27 +08:00
|
|
|
datasize = btrfs_file_extent_calc_inline_size(cur_size);
|
|
|
|
path->leave_spinning = 1;
|
|
|
|
ret = btrfs_insert_empty_item(trans, root, path, &key,
|
|
|
|
datasize);
|
|
|
|
if (ret) {
|
|
|
|
err = ret;
|
|
|
|
goto fail;
|
|
|
|
}
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
}
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
ei = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
btrfs_set_file_extent_generation(leaf, ei, trans->transid);
|
|
|
|
btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
|
|
|
|
btrfs_set_file_extent_encryption(leaf, ei, 0);
|
|
|
|
btrfs_set_file_extent_other_encoding(leaf, ei, 0);
|
|
|
|
btrfs_set_file_extent_ram_bytes(leaf, ei, size);
|
|
|
|
ptr = btrfs_file_extent_inline_start(ei);
|
|
|
|
|
2010-12-17 14:21:50 +08:00
|
|
|
if (compress_type != BTRFS_COMPRESS_NONE) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
struct page *cpage;
|
|
|
|
int i = 0;
|
2009-01-06 10:25:51 +08:00
|
|
|
while (compressed_size > 0) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
cpage = compressed_pages[i];
|
2008-11-11 22:34:41 +08:00
|
|
|
cur_size = min_t(unsigned long, compressed_size,
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
PAGE_CACHE_SIZE);
|
|
|
|
|
2011-11-25 23:14:28 +08:00
|
|
|
kaddr = kmap_atomic(cpage);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
write_extent_buffer(leaf, kaddr, ptr, cur_size);
|
2011-11-25 23:14:28 +08:00
|
|
|
kunmap_atomic(kaddr);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
|
|
|
|
i++;
|
|
|
|
ptr += cur_size;
|
|
|
|
compressed_size -= cur_size;
|
|
|
|
}
|
|
|
|
btrfs_set_file_extent_compression(leaf, ei,
|
2010-12-17 14:21:50 +08:00
|
|
|
compress_type);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
} else {
|
|
|
|
page = find_get_page(inode->i_mapping,
|
|
|
|
start >> PAGE_CACHE_SHIFT);
|
|
|
|
btrfs_set_file_extent_compression(leaf, ei, 0);
|
2011-11-25 23:14:28 +08:00
|
|
|
kaddr = kmap_atomic(page);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
offset = start & (PAGE_CACHE_SIZE - 1);
|
|
|
|
write_extent_buffer(leaf, kaddr + offset, ptr, size);
|
2011-11-25 23:14:28 +08:00
|
|
|
kunmap_atomic(kaddr);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
page_cache_release(page);
|
|
|
|
}
|
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
2014-01-07 19:42:27 +08:00
|
|
|
btrfs_release_path(path);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
|
2009-11-12 17:34:21 +08:00
|
|
|
/*
|
|
|
|
* we're an inline extent, so nobody can
|
|
|
|
* extend the file past i_size without locking
|
|
|
|
* a page we already have locked.
|
|
|
|
*
|
|
|
|
* We must do any isize and inode updates
|
|
|
|
* before we unlock the pages. Otherwise we
|
|
|
|
* could end up racing with unlink.
|
|
|
|
*/
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
BTRFS_I(inode)->disk_i_size = inode->i_size;
|
2012-03-12 23:03:00 +08:00
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
2009-11-12 17:34:21 +08:00
|
|
|
|
2012-03-12 23:03:00 +08:00
|
|
|
return ret;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
fail:
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* conditionally insert an inline extent into the file. This
|
|
|
|
* does the checks required to make sure the data is small enough
|
|
|
|
* to fit as an inline extent.
|
|
|
|
*/
|
2013-08-15 02:02:47 +08:00
|
|
|
static noinline int cow_file_range_inline(struct btrfs_root *root,
|
|
|
|
struct inode *inode, u64 start,
|
|
|
|
u64 end, size_t compressed_size,
|
|
|
|
int compress_type,
|
|
|
|
struct page **compressed_pages)
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
{
|
2013-08-15 02:02:47 +08:00
|
|
|
struct btrfs_trans_handle *trans;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
u64 isize = i_size_read(inode);
|
|
|
|
u64 actual_end = min(end + 1, isize);
|
|
|
|
u64 inline_len = actual_end - start;
|
2013-02-26 16:10:22 +08:00
|
|
|
u64 aligned_end = ALIGN(end, root->sectorsize);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
u64 data_len = inline_len;
|
|
|
|
int ret;
|
2014-01-07 19:42:27 +08:00
|
|
|
struct btrfs_path *path;
|
|
|
|
int extent_inserted = 0;
|
|
|
|
u32 extent_item_size;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
|
|
|
|
if (compressed_size)
|
|
|
|
data_len = compressed_size;
|
|
|
|
|
|
|
|
if (start > 0 ||
|
2014-07-17 11:44:11 +08:00
|
|
|
actual_end > PAGE_CACHE_SIZE ||
|
|
|
|
data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
(!compressed_size &&
|
|
|
|
(actual_end & (root->sectorsize - 1)) == 0) ||
|
|
|
|
end + 1 < isize ||
|
|
|
|
data_len > root->fs_info->max_inline) {
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2014-01-07 19:42:27 +08:00
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2013-08-15 02:02:47 +08:00
|
|
|
trans = btrfs_join_transaction(root);
|
2014-01-07 19:42:27 +08:00
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
btrfs_free_path(path);
|
2013-08-15 02:02:47 +08:00
|
|
|
return PTR_ERR(trans);
|
2014-01-07 19:42:27 +08:00
|
|
|
}
|
2013-08-15 02:02:47 +08:00
|
|
|
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
|
|
|
|
|
2014-01-07 19:42:27 +08:00
|
|
|
if (compressed_size && compressed_pages)
|
|
|
|
extent_item_size = btrfs_file_extent_calc_inline_size(
|
|
|
|
compressed_size);
|
|
|
|
else
|
|
|
|
extent_item_size = btrfs_file_extent_calc_inline_size(
|
|
|
|
inline_len);
|
|
|
|
|
|
|
|
ret = __btrfs_drop_extents(trans, root, inode, path,
|
|
|
|
start, aligned_end, NULL,
|
|
|
|
1, 1, extent_item_size, &extent_inserted);
|
2013-08-15 02:02:47 +08:00
|
|
|
if (ret) {
|
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
|
|
|
goto out;
|
|
|
|
}
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
|
|
|
|
if (isize > actual_end)
|
|
|
|
inline_len = min_t(u64, isize, actual_end);
|
2014-01-07 19:42:27 +08:00
|
|
|
ret = insert_inline_extent(trans, path, extent_inserted,
|
|
|
|
root, inode, start,
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
inline_len, compressed_size,
|
2011-03-28 16:30:38 +08:00
|
|
|
compress_type, compressed_pages);
|
2012-05-24 04:10:14 +08:00
|
|
|
if (ret && ret != -ENOSPC) {
|
2012-03-12 23:03:00 +08:00
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
2013-08-15 02:02:47 +08:00
|
|
|
goto out;
|
2012-05-24 04:10:14 +08:00
|
|
|
} else if (ret == -ENOSPC) {
|
2013-08-15 02:02:47 +08:00
|
|
|
ret = 1;
|
|
|
|
goto out;
|
2012-03-12 23:03:00 +08:00
|
|
|
}
|
2012-05-24 04:10:14 +08:00
|
|
|
|
2013-03-01 02:23:38 +08:00
|
|
|
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
|
2010-05-16 22:48:47 +08:00
|
|
|
btrfs_delalloc_release_metadata(inode, end + 1 - start);
|
2009-09-12 00:27:37 +08:00
|
|
|
btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
|
2013-08-15 02:02:47 +08:00
|
|
|
out:
|
2015-09-08 17:25:56 +08:00
|
|
|
/*
|
|
|
|
* Don't forget to free the reserved space, as for inlined extent
|
|
|
|
* it won't count as data extent, free them directly here.
|
|
|
|
* And at reserve time, it's always aligned to page size, so
|
|
|
|
* just free one page here.
|
|
|
|
*/
|
|
|
|
btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE);
|
2014-01-07 19:42:27 +08:00
|
|
|
btrfs_free_path(path);
|
2013-08-15 02:02:47 +08:00
|
|
|
btrfs_end_transaction(trans, root);
|
|
|
|
return ret;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
}
|
|
|
|
|
2008-11-07 11:02:51 +08:00
|
|
|
struct async_extent {
|
|
|
|
u64 start;
|
|
|
|
u64 ram_size;
|
|
|
|
u64 compressed_size;
|
|
|
|
struct page **pages;
|
|
|
|
unsigned long nr_pages;
|
2010-12-17 14:21:50 +08:00
|
|
|
int compress_type;
|
2008-11-07 11:02:51 +08:00
|
|
|
struct list_head list;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct async_cow {
|
|
|
|
struct inode *inode;
|
|
|
|
struct btrfs_root *root;
|
|
|
|
struct page *locked_page;
|
|
|
|
u64 start;
|
|
|
|
u64 end;
|
|
|
|
struct list_head extents;
|
|
|
|
struct btrfs_work work;
|
|
|
|
};
|
|
|
|
|
|
|
|
static noinline int add_async_extent(struct async_cow *cow,
|
|
|
|
u64 start, u64 ram_size,
|
|
|
|
u64 compressed_size,
|
|
|
|
struct page **pages,
|
2010-12-17 14:21:50 +08:00
|
|
|
unsigned long nr_pages,
|
|
|
|
int compress_type)
|
2008-11-07 11:02:51 +08:00
|
|
|
{
|
|
|
|
struct async_extent *async_extent;
|
|
|
|
|
|
|
|
async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
|
2012-03-12 23:03:00 +08:00
|
|
|
BUG_ON(!async_extent); /* -ENOMEM */
|
2008-11-07 11:02:51 +08:00
|
|
|
async_extent->start = start;
|
|
|
|
async_extent->ram_size = ram_size;
|
|
|
|
async_extent->compressed_size = compressed_size;
|
|
|
|
async_extent->pages = pages;
|
|
|
|
async_extent->nr_pages = nr_pages;
|
2010-12-17 14:21:50 +08:00
|
|
|
async_extent->compress_type = compress_type;
|
2008-11-07 11:02:51 +08:00
|
|
|
list_add_tail(&async_extent->list, &cow->extents);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2014-07-17 11:44:09 +08:00
|
|
|
static inline int inode_need_compress(struct inode *inode)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
|
|
|
|
/* force compress */
|
|
|
|
if (btrfs_test_opt(root, FORCE_COMPRESS))
|
|
|
|
return 1;
|
|
|
|
/* bad compression ratios */
|
|
|
|
if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
|
|
|
|
return 0;
|
|
|
|
if (btrfs_test_opt(root, COMPRESS) ||
|
|
|
|
BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
|
|
|
|
BTRFS_I(inode)->force_compress)
|
|
|
|
return 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-09-30 03:18:18 +08:00
|
|
|
/*
|
2008-11-07 11:02:51 +08:00
|
|
|
* we create compressed extents in two phases. The first
|
|
|
|
* phase compresses a range of pages that have already been
|
|
|
|
* locked (both pages and state bits are locked).
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
*
|
2008-11-07 11:02:51 +08:00
|
|
|
* This is done inside an ordered work queue, and the compression
|
|
|
|
* is spread across many cpus. The actual IO submission is step
|
|
|
|
* two, and the ordered work queue takes care of making sure that
|
|
|
|
* happens in the same order things were put onto the queue by
|
|
|
|
* writepages and friends.
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
*
|
2008-11-07 11:02:51 +08:00
|
|
|
* If this code finds it can't get good compression, it puts an
|
|
|
|
* entry onto the work queue to write the uncompressed bytes. This
|
|
|
|
* makes sure that both compressed inodes and uncompressed inodes
|
2012-07-25 23:12:06 +08:00
|
|
|
* are written in the same order that the flusher thread sent them
|
|
|
|
* down.
|
2008-09-30 03:18:18 +08:00
|
|
|
*/
|
2014-10-10 04:15:44 +08:00
|
|
|
static noinline void compress_file_range(struct inode *inode,
|
2008-11-07 11:02:51 +08:00
|
|
|
struct page *locked_page,
|
|
|
|
u64 start, u64 end,
|
|
|
|
struct async_cow *async_cow,
|
|
|
|
int *num_added)
|
2007-08-28 04:49:44 +08:00
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2007-10-16 04:15:53 +08:00
|
|
|
u64 num_bytes;
|
|
|
|
u64 blocksize = root->sectorsize;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
u64 actual_end;
|
2008-12-16 00:44:56 +08:00
|
|
|
u64 isize = i_size_read(inode);
|
2008-07-18 00:53:50 +08:00
|
|
|
int ret = 0;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
struct page **pages = NULL;
|
|
|
|
unsigned long nr_pages;
|
|
|
|
unsigned long nr_pages_ret = 0;
|
|
|
|
unsigned long total_compressed = 0;
|
|
|
|
unsigned long total_in = 0;
|
2015-12-15 00:42:10 +08:00
|
|
|
unsigned long max_compressed = SZ_128K;
|
|
|
|
unsigned long max_uncompressed = SZ_128K;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
int i;
|
|
|
|
int will_compress;
|
2010-12-17 14:21:50 +08:00
|
|
|
int compress_type = root->fs_info->compress_type;
|
2013-03-27 01:07:00 +08:00
|
|
|
int redirty = 0;
|
2007-08-28 04:49:44 +08:00
|
|
|
|
2012-03-29 21:57:45 +08:00
|
|
|
/* if this is a small write inside eof, kick off a defrag */
|
2015-12-15 00:42:10 +08:00
|
|
|
if ((end - start + 1) < SZ_16K &&
|
2012-03-29 21:57:45 +08:00
|
|
|
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
|
2011-05-25 03:35:30 +08:00
|
|
|
btrfs_add_inode_defrag(NULL, inode);
|
|
|
|
|
2008-12-16 00:44:56 +08:00
|
|
|
actual_end = min_t(u64, isize, end + 1);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
again:
|
|
|
|
will_compress = 0;
|
|
|
|
nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
|
2015-12-15 00:42:10 +08:00
|
|
|
nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_CACHE_SIZE);
|
2007-12-18 09:14:01 +08:00
|
|
|
|
2009-02-04 22:31:06 +08:00
|
|
|
/*
|
|
|
|
* we don't want to send crud past the end of i_size through
|
|
|
|
* compression, that's just a waste of CPU time. So, if the
|
|
|
|
* end of the file is before the start of our current
|
|
|
|
* requested range of bytes, we bail out to the uncompressed
|
|
|
|
* cleanup code that can deal with all of this.
|
|
|
|
*
|
|
|
|
* It isn't really the fastest way to fix things, but this is a
|
|
|
|
* very uncommon corner.
|
|
|
|
*/
|
|
|
|
if (actual_end <= start)
|
|
|
|
goto cleanup_and_bail_uncompressed;
|
|
|
|
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
total_compressed = actual_end - start;
|
|
|
|
|
2014-10-08 06:44:35 +08:00
|
|
|
/*
|
|
|
|
* skip compression for a small file range(<=blocksize) that
|
|
|
|
* isn't an inline extent, since it dosen't save disk space at all.
|
|
|
|
*/
|
|
|
|
if (total_compressed <= blocksize &&
|
|
|
|
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
|
|
|
|
goto cleanup_and_bail_uncompressed;
|
|
|
|
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
/* we want to make sure that amount of ram required to uncompress
|
|
|
|
* an extent is reasonable, so we limit the total size in ram
|
2008-11-07 11:02:51 +08:00
|
|
|
* of a compressed extent to 128k. This is a crucial number
|
|
|
|
* because it also controls how easily we can spread reads across
|
|
|
|
* cpus for decompression.
|
|
|
|
*
|
|
|
|
* We also want to make sure the amount of IO required to do
|
|
|
|
* a random read is reasonably small, so we limit the size of
|
|
|
|
* a compressed extent to 128k.
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
*/
|
|
|
|
total_compressed = min(total_compressed, max_uncompressed);
|
2013-02-26 16:10:22 +08:00
|
|
|
num_bytes = ALIGN(end - start + 1, blocksize);
|
2007-12-18 09:14:01 +08:00
|
|
|
num_bytes = max(blocksize, num_bytes);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
total_in = 0;
|
|
|
|
ret = 0;
|
2007-10-16 04:15:53 +08:00
|
|
|
|
2008-11-07 11:02:51 +08:00
|
|
|
/*
|
|
|
|
* we do compression for mount -o compress and when the
|
|
|
|
* inode has not been flagged as nocompress. This flag can
|
|
|
|
* change at any time if we discover bad compression ratios.
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
*/
|
2014-07-17 11:44:09 +08:00
|
|
|
if (inode_need_compress(inode)) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
WARN_ON(pages);
|
2015-02-21 01:00:26 +08:00
|
|
|
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
|
2011-09-08 10:22:01 +08:00
|
|
|
if (!pages) {
|
|
|
|
/* just bail out to the uncompressed code */
|
|
|
|
goto cont;
|
|
|
|
}
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
|
2010-12-17 14:21:50 +08:00
|
|
|
if (BTRFS_I(inode)->force_compress)
|
|
|
|
compress_type = BTRFS_I(inode)->force_compress;
|
|
|
|
|
2013-03-27 01:07:00 +08:00
|
|
|
/*
|
|
|
|
* we need to call clear_page_dirty_for_io on each
|
|
|
|
* page in the range. Otherwise applications with the file
|
|
|
|
* mmap'd can wander in and change the page contents while
|
|
|
|
* we are compressing them.
|
|
|
|
*
|
|
|
|
* If the compression fails for any reason, we set the pages
|
|
|
|
* dirty again later on.
|
|
|
|
*/
|
|
|
|
extent_range_clear_dirty_for_io(inode, start, end);
|
|
|
|
redirty = 1;
|
2010-12-17 14:21:50 +08:00
|
|
|
ret = btrfs_compress_pages(compress_type,
|
|
|
|
inode->i_mapping, start,
|
|
|
|
total_compressed, pages,
|
|
|
|
nr_pages, &nr_pages_ret,
|
|
|
|
&total_in,
|
|
|
|
&total_compressed,
|
|
|
|
max_compressed);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
|
|
|
|
if (!ret) {
|
|
|
|
unsigned long offset = total_compressed &
|
|
|
|
(PAGE_CACHE_SIZE - 1);
|
|
|
|
struct page *page = pages[nr_pages_ret - 1];
|
|
|
|
char *kaddr;
|
|
|
|
|
|
|
|
/* zero the tail end of the last page, we might be
|
|
|
|
* sending it down to disk
|
|
|
|
*/
|
|
|
|
if (offset) {
|
2011-11-25 23:14:28 +08:00
|
|
|
kaddr = kmap_atomic(page);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
memset(kaddr + offset, 0,
|
|
|
|
PAGE_CACHE_SIZE - offset);
|
2011-11-25 23:14:28 +08:00
|
|
|
kunmap_atomic(kaddr);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
}
|
|
|
|
will_compress = 1;
|
|
|
|
}
|
|
|
|
}
|
2011-09-08 10:22:01 +08:00
|
|
|
cont:
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
if (start == 0) {
|
|
|
|
/* lets try to make an inline extent */
|
2008-11-07 11:02:51 +08:00
|
|
|
if (ret || total_in < (actual_end - start)) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
/* we didn't compress the entire range, try
|
2008-11-07 11:02:51 +08:00
|
|
|
* to make an uncompressed inline extent.
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
*/
|
2013-08-15 02:02:47 +08:00
|
|
|
ret = cow_file_range_inline(root, inode, start, end,
|
|
|
|
0, 0, NULL);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
} else {
|
2008-11-07 11:02:51 +08:00
|
|
|
/* try making a compressed inline extent */
|
2013-08-15 02:02:47 +08:00
|
|
|
ret = cow_file_range_inline(root, inode, start, end,
|
2011-03-28 16:30:38 +08:00
|
|
|
total_compressed,
|
|
|
|
compress_type, pages);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
}
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret <= 0) {
|
2013-07-30 01:22:24 +08:00
|
|
|
unsigned long clear_flags = EXTENT_DELALLOC |
|
|
|
|
EXTENT_DEFRAG;
|
2014-10-10 17:45:12 +08:00
|
|
|
unsigned long page_error_op;
|
|
|
|
|
2013-07-30 01:22:24 +08:00
|
|
|
clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
|
2014-10-10 17:45:12 +08:00
|
|
|
page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
|
2013-07-30 01:22:24 +08:00
|
|
|
|
2008-11-07 11:02:51 +08:00
|
|
|
/*
|
2012-03-12 23:03:00 +08:00
|
|
|
* inline extent creation worked or returned error,
|
|
|
|
* we don't need to create any more async work items.
|
|
|
|
* Unlock and free up our temp pages.
|
2008-11-07 11:02:51 +08:00
|
|
|
*/
|
2013-07-29 23:20:47 +08:00
|
|
|
extent_clear_unlock_delalloc(inode, start, end, NULL,
|
2013-07-30 01:22:24 +08:00
|
|
|
clear_flags, PAGE_UNLOCK |
|
2013-07-29 23:20:47 +08:00
|
|
|
PAGE_CLEAR_DIRTY |
|
|
|
|
PAGE_SET_WRITEBACK |
|
2014-10-10 17:45:12 +08:00
|
|
|
page_error_op |
|
2013-07-29 23:20:47 +08:00
|
|
|
PAGE_END_WRITEBACK);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
goto free_pages_out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (will_compress) {
|
|
|
|
/*
|
|
|
|
* we aren't doing an inline extent round the compressed size
|
|
|
|
* up to a block size boundary so the allocator does sane
|
|
|
|
* things
|
|
|
|
*/
|
2013-02-26 16:10:22 +08:00
|
|
|
total_compressed = ALIGN(total_compressed, blocksize);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* one last check to make sure the compression is really a
|
|
|
|
* win, compare the page count read with the blocks on disk
|
|
|
|
*/
|
2013-02-26 16:10:22 +08:00
|
|
|
total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
if (total_compressed >= total_in) {
|
|
|
|
will_compress = 0;
|
|
|
|
} else {
|
|
|
|
num_bytes = total_in;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!will_compress && pages) {
|
|
|
|
/*
|
|
|
|
* the compression code ran but failed to make things smaller,
|
|
|
|
* free any pages it allocated and our page pointer array
|
|
|
|
*/
|
|
|
|
for (i = 0; i < nr_pages_ret; i++) {
|
2008-11-01 00:46:39 +08:00
|
|
|
WARN_ON(pages[i]->mapping);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
page_cache_release(pages[i]);
|
|
|
|
}
|
|
|
|
kfree(pages);
|
|
|
|
pages = NULL;
|
|
|
|
total_compressed = 0;
|
|
|
|
nr_pages_ret = 0;
|
|
|
|
|
|
|
|
/* flag the file so we don't compress in the future */
|
2010-03-11 22:42:04 +08:00
|
|
|
if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
|
|
|
|
!(BTRFS_I(inode)->force_compress)) {
|
2010-01-29 05:18:15 +08:00
|
|
|
BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
|
2010-03-11 22:42:04 +08:00
|
|
|
}
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
}
|
2008-11-07 11:02:51 +08:00
|
|
|
if (will_compress) {
|
|
|
|
*num_added += 1;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
|
2008-11-07 11:02:51 +08:00
|
|
|
/* the async work queues will take care of doing actual
|
|
|
|
* allocation on disk for these compressed pages,
|
|
|
|
* and will submit them to the elevator.
|
|
|
|
*/
|
|
|
|
add_async_extent(async_cow, start, num_bytes,
|
2010-12-17 14:21:50 +08:00
|
|
|
total_compressed, pages, nr_pages_ret,
|
|
|
|
compress_type);
|
2007-11-01 23:28:41 +08:00
|
|
|
|
2010-12-06 15:02:36 +08:00
|
|
|
if (start + num_bytes < end) {
|
2008-11-07 11:02:51 +08:00
|
|
|
start += num_bytes;
|
|
|
|
pages = NULL;
|
|
|
|
cond_resched();
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
} else {
|
2009-02-04 22:31:06 +08:00
|
|
|
cleanup_and_bail_uncompressed:
|
2008-11-07 11:02:51 +08:00
|
|
|
/*
|
|
|
|
* No compression, but we still need to write the pages in
|
|
|
|
* the file we've been given so far. redirty the locked
|
|
|
|
* page if it corresponds to our extent and set things up
|
|
|
|
* for the async work queue to run cow_file_range to do
|
|
|
|
* the normal delalloc dance
|
|
|
|
*/
|
|
|
|
if (page_offset(locked_page) >= start &&
|
|
|
|
page_offset(locked_page) <= end) {
|
|
|
|
__set_page_dirty_nobuffers(locked_page);
|
|
|
|
/* unlocked later on in the async handlers */
|
|
|
|
}
|
2013-03-27 01:07:00 +08:00
|
|
|
if (redirty)
|
|
|
|
extent_range_redirty_for_io(inode, start, end);
|
2010-12-17 14:21:50 +08:00
|
|
|
add_async_extent(async_cow, start, end - start + 1,
|
|
|
|
0, NULL, 0, BTRFS_COMPRESS_NONE);
|
2008-11-07 11:02:51 +08:00
|
|
|
*num_added += 1;
|
|
|
|
}
|
2008-04-17 23:29:12 +08:00
|
|
|
|
2014-10-10 04:15:44 +08:00
|
|
|
return;
|
2008-11-07 11:02:51 +08:00
|
|
|
|
|
|
|
free_pages_out:
|
|
|
|
for (i = 0; i < nr_pages_ret; i++) {
|
|
|
|
WARN_ON(pages[i]->mapping);
|
|
|
|
page_cache_release(pages[i]);
|
|
|
|
}
|
2009-01-06 10:25:51 +08:00
|
|
|
kfree(pages);
|
2008-11-07 11:02:51 +08:00
|
|
|
}
|
|
|
|
|
2014-10-07 05:14:24 +08:00
|
|
|
static void free_async_extent_pages(struct async_extent *async_extent)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (!async_extent->pages)
|
|
|
|
return;
|
|
|
|
|
|
|
|
for (i = 0; i < async_extent->nr_pages; i++) {
|
|
|
|
WARN_ON(async_extent->pages[i]->mapping);
|
|
|
|
page_cache_release(async_extent->pages[i]);
|
|
|
|
}
|
|
|
|
kfree(async_extent->pages);
|
|
|
|
async_extent->nr_pages = 0;
|
|
|
|
async_extent->pages = NULL;
|
2008-11-07 11:02:51 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* phase two of compressed writeback. This is the ordered portion
|
|
|
|
* of the code, which only gets called in the order the work was
|
|
|
|
* queued. We walk all the async extents created by compress_file_range
|
|
|
|
* and send them down to the disk.
|
|
|
|
*/
|
2014-10-07 05:14:26 +08:00
|
|
|
static noinline void submit_compressed_extents(struct inode *inode,
|
2008-11-07 11:02:51 +08:00
|
|
|
struct async_cow *async_cow)
|
|
|
|
{
|
|
|
|
struct async_extent *async_extent;
|
|
|
|
u64 alloc_hint = 0;
|
|
|
|
struct btrfs_key ins;
|
|
|
|
struct extent_map *em;
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
|
|
|
|
struct extent_io_tree *io_tree;
|
2009-11-11 10:23:48 +08:00
|
|
|
int ret = 0;
|
2008-11-07 11:02:51 +08:00
|
|
|
|
2013-02-07 05:49:15 +08:00
|
|
|
again:
|
2009-01-06 10:25:51 +08:00
|
|
|
while (!list_empty(&async_cow->extents)) {
|
2008-11-07 11:02:51 +08:00
|
|
|
async_extent = list_entry(async_cow->extents.next,
|
|
|
|
struct async_extent, list);
|
|
|
|
list_del(&async_extent->list);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
|
2008-11-07 11:02:51 +08:00
|
|
|
io_tree = &BTRFS_I(inode)->io_tree;
|
|
|
|
|
2009-11-11 10:23:48 +08:00
|
|
|
retry:
|
2008-11-07 11:02:51 +08:00
|
|
|
/* did the compression code fall back to uncompressed IO? */
|
|
|
|
if (!async_extent->pages) {
|
|
|
|
int page_started = 0;
|
|
|
|
unsigned long nr_written = 0;
|
|
|
|
|
|
|
|
lock_extent(io_tree, async_extent->start,
|
2010-02-04 03:33:23 +08:00
|
|
|
async_extent->start +
|
2012-03-01 21:57:19 +08:00
|
|
|
async_extent->ram_size - 1);
|
2008-11-07 11:02:51 +08:00
|
|
|
|
|
|
|
/* allocate blocks */
|
2009-11-11 10:23:48 +08:00
|
|
|
ret = cow_file_range(inode, async_cow->locked_page,
|
|
|
|
async_extent->start,
|
|
|
|
async_extent->start +
|
|
|
|
async_extent->ram_size - 1,
|
|
|
|
&page_started, &nr_written, 0);
|
2008-11-07 11:02:51 +08:00
|
|
|
|
2012-03-12 23:03:00 +08:00
|
|
|
/* JDM XXX */
|
|
|
|
|
2008-11-07 11:02:51 +08:00
|
|
|
/*
|
|
|
|
* if page_started, cow_file_range inserted an
|
|
|
|
* inline extent and took care of all the unlocking
|
|
|
|
* and IO for us. Otherwise, we need to submit
|
|
|
|
* all those pages down to the drive.
|
|
|
|
*/
|
2009-11-11 10:23:48 +08:00
|
|
|
if (!page_started && !ret)
|
2008-11-07 11:02:51 +08:00
|
|
|
extent_write_locked_range(io_tree,
|
|
|
|
inode, async_extent->start,
|
2009-01-06 10:25:51 +08:00
|
|
|
async_extent->start +
|
2008-11-07 11:02:51 +08:00
|
|
|
async_extent->ram_size - 1,
|
|
|
|
btrfs_get_extent,
|
|
|
|
WB_SYNC_ALL);
|
2013-02-07 05:49:15 +08:00
|
|
|
else if (ret)
|
|
|
|
unlock_page(async_cow->locked_page);
|
2008-11-07 11:02:51 +08:00
|
|
|
kfree(async_extent);
|
|
|
|
cond_resched();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
lock_extent(io_tree, async_extent->start,
|
2012-03-01 21:57:19 +08:00
|
|
|
async_extent->start + async_extent->ram_size - 1);
|
2008-11-07 11:02:51 +08:00
|
|
|
|
2013-08-15 02:02:47 +08:00
|
|
|
ret = btrfs_reserve_extent(root,
|
2008-11-07 11:02:51 +08:00
|
|
|
async_extent->compressed_size,
|
|
|
|
async_extent->compressed_size,
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
0, alloc_hint, &ins, 1, 1);
|
2009-11-11 10:23:48 +08:00
|
|
|
if (ret) {
|
2014-10-07 05:14:24 +08:00
|
|
|
free_async_extent_pages(async_extent);
|
2013-02-07 05:49:15 +08:00
|
|
|
|
2013-06-15 04:58:23 +08:00
|
|
|
if (ret == -ENOSPC) {
|
|
|
|
unlock_extent(io_tree, async_extent->start,
|
|
|
|
async_extent->start +
|
|
|
|
async_extent->ram_size - 1);
|
2014-07-24 22:48:05 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* we need to redirty the pages if we decide to
|
|
|
|
* fallback to uncompressed IO, otherwise we
|
|
|
|
* will not submit these pages down to lower
|
|
|
|
* layers.
|
|
|
|
*/
|
|
|
|
extent_range_redirty_for_io(inode,
|
|
|
|
async_extent->start,
|
|
|
|
async_extent->start +
|
|
|
|
async_extent->ram_size - 1);
|
|
|
|
|
2012-03-12 23:03:00 +08:00
|
|
|
goto retry;
|
2013-06-15 04:58:23 +08:00
|
|
|
}
|
2013-02-07 05:49:15 +08:00
|
|
|
goto out_free;
|
2009-11-11 10:23:48 +08:00
|
|
|
}
|
2009-11-12 17:34:21 +08:00
|
|
|
/*
|
|
|
|
* here we're doing allocation and writeback of the
|
|
|
|
* compressed pages
|
|
|
|
*/
|
|
|
|
btrfs_drop_extent_cache(inode, async_extent->start,
|
|
|
|
async_extent->start +
|
|
|
|
async_extent->ram_size - 1, 0);
|
|
|
|
|
2011-04-21 06:48:27 +08:00
|
|
|
em = alloc_extent_map();
|
2013-05-14 10:12:15 +08:00
|
|
|
if (!em) {
|
|
|
|
ret = -ENOMEM;
|
2013-02-07 05:49:15 +08:00
|
|
|
goto out_free_reserve;
|
2013-05-14 10:12:15 +08:00
|
|
|
}
|
2008-11-07 11:02:51 +08:00
|
|
|
em->start = async_extent->start;
|
|
|
|
em->len = async_extent->ram_size;
|
2008-11-11 00:53:33 +08:00
|
|
|
em->orig_start = em->start;
|
2012-10-13 03:27:49 +08:00
|
|
|
em->mod_start = em->start;
|
|
|
|
em->mod_len = em->len;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
|
2008-11-07 11:02:51 +08:00
|
|
|
em->block_start = ins.objectid;
|
|
|
|
em->block_len = ins.offset;
|
2012-12-03 23:31:19 +08:00
|
|
|
em->orig_block_len = ins.offset;
|
2013-04-05 02:31:27 +08:00
|
|
|
em->ram_bytes = async_extent->ram_size;
|
2008-11-07 11:02:51 +08:00
|
|
|
em->bdev = root->fs_info->fs_devices->latest_bdev;
|
2010-12-17 14:21:50 +08:00
|
|
|
em->compress_type = async_extent->compress_type;
|
2008-11-07 11:02:51 +08:00
|
|
|
set_bit(EXTENT_FLAG_PINNED, &em->flags);
|
|
|
|
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
|
2012-10-12 04:54:30 +08:00
|
|
|
em->generation = -1;
|
2008-11-07 11:02:51 +08:00
|
|
|
|
2009-01-06 10:25:51 +08:00
|
|
|
while (1) {
|
2009-09-03 04:24:52 +08:00
|
|
|
write_lock(&em_tree->lock);
|
2013-04-06 04:51:15 +08:00
|
|
|
ret = add_extent_mapping(em_tree, em, 1);
|
2009-09-03 04:24:52 +08:00
|
|
|
write_unlock(&em_tree->lock);
|
2008-11-07 11:02:51 +08:00
|
|
|
if (ret != -EEXIST) {
|
|
|
|
free_extent_map(em);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
btrfs_drop_extent_cache(inode, async_extent->start,
|
|
|
|
async_extent->start +
|
|
|
|
async_extent->ram_size - 1, 0);
|
|
|
|
}
|
|
|
|
|
2013-02-07 05:49:15 +08:00
|
|
|
if (ret)
|
|
|
|
goto out_free_reserve;
|
|
|
|
|
2010-12-17 14:21:50 +08:00
|
|
|
ret = btrfs_add_ordered_extent_compress(inode,
|
|
|
|
async_extent->start,
|
|
|
|
ins.objectid,
|
|
|
|
async_extent->ram_size,
|
|
|
|
ins.offset,
|
|
|
|
BTRFS_ORDERED_COMPRESSED,
|
|
|
|
async_extent->compress_type);
|
Btrfs: fix corruption after write/fsync failure + fsync + log recovery
While writing to a file, in inode.c:cow_file_range() (and same applies to
submit_compressed_extents()), after reserving an extent for the file data,
we create a new extent map for the written range and insert it into the
extent map cache. After that, we create an ordered operation, but if it
fails (due to a transient/temporary-ENOMEM), we return without dropping
that extent map, which points to a reserved extent that is freed when we
return. A subsequent incremental fsync (when the btrfs inode doesn't have
the flag BTRFS_INODE_NEEDS_FULL_SYNC) considers this extent map valid and
logs a file extent item based on that extent map, which points to a disk
extent that doesn't contain valid data - it was freed by us earlier, at this
point it might contain any random/garbage data.
Therefore, if we reach an error condition when cowing a file range after
we added the new extent map to the cache, drop it from the cache before
returning.
Some sequence of steps that lead to this:
$ mkfs.btrfs -f /dev/sdd
$ mount -o commit=9999 /dev/sdd /mnt
$ cd /mnt
$ xfs_io -f -c "pwrite -S 0x01 -b 4096 0 4096" -c "fsync" foo
$ xfs_io -c "pwrite -S 0x02 -b 4096 4096 4096"
$ sync
$ od -t x1 foo
0000000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
0010000 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02
*
0020000
$ xfs_io -c "pwrite -S 0xa1 -b 4096 0 4096" foo
# Now this write + fsync fail with -ENOMEM, which was returned by
# btrfs_add_ordered_extent() in inode.c:cow_file_range().
$ xfs_io -c "pwrite -S 0xff -b 4096 4096 4096" foo
$ xfs_io -c "fsync" foo
fsync: Cannot allocate memory
# Now do a new write + fsync, which will succeed. Our previous
# -ENOMEM was a transient/temporary error.
$ xfs_io -c "pwrite -S 0xee -b 4096 16384 4096" foo
$ xfs_io -c "fsync" foo
# Our file content (in page cache) is now:
$ od -t x1 foo
0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1
*
0010000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
*
0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0050000
# Now reboot the machine, and mount the fs, so that fsync log replay
# takes place.
# The file content is now weird, in particular the first 8Kb, which
# do not match our data before nor after the sync command above.
$ od -t x1 foo
0000000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0010000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0050000
# In fact these first 4Kb are a duplicate of the last 4kb block.
# The last write got an extent map/file extent item that points to
# the same disk extent that we got in the write+fsync that failed
# with the -ENOMEM error. btrfs-debug-tree and btrfsck allow us to
# verify that:
$ btrfs-debug-tree /dev/sdd
(...)
item 6 key (257 EXTENT_DATA 0) itemoff 15819 itemsize 53
extent data disk byte 12582912 nr 8192
extent data offset 0 nr 8192 ram 8192
item 7 key (257 EXTENT_DATA 8192) itemoff 15766 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 8192 ram 8192
item 8 key (257 EXTENT_DATA 16384) itemoff 15713 itemsize 53
extent data disk byte 12582912 nr 4096
extent data offset 0 nr 4096 ram 4096
$ umount /dev/sdd
$ btrfsck /dev/sdd
Checking filesystem on /dev/sdd
UUID: db5e60e1-050d-41e6-8c7f-3d742dea5d8f
checking extents
extent item 12582912 has multiple extent items
ref mismatch on [12582912 4096] extent item 1, found 2
Backref bytes do not match extent backref, bytenr=12582912, ref bytes=4096, backref bytes=8192
backpointer mismatch on [12582912 4096]
Errors found in extent allocation tree or chunk allocation
checking free space cache
checking fs roots
root 5 inode 257 errors 1000, some csum missing
found 131074 bytes used err is 1
total csum bytes: 4
total tree bytes: 131072
total fs tree bytes: 32768
total extent tree bytes: 16384
btree space waste bytes: 123404
file data blocks allocated: 274432
referenced 274432
Btrfs v3.14.1-96-gcc7fd5a-dirty
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-25 17:43:00 +08:00
|
|
|
if (ret) {
|
|
|
|
btrfs_drop_extent_cache(inode, async_extent->start,
|
|
|
|
async_extent->start +
|
|
|
|
async_extent->ram_size - 1, 0);
|
2013-02-07 05:49:15 +08:00
|
|
|
goto out_free_reserve;
|
Btrfs: fix corruption after write/fsync failure + fsync + log recovery
While writing to a file, in inode.c:cow_file_range() (and same applies to
submit_compressed_extents()), after reserving an extent for the file data,
we create a new extent map for the written range and insert it into the
extent map cache. After that, we create an ordered operation, but if it
fails (due to a transient/temporary-ENOMEM), we return without dropping
that extent map, which points to a reserved extent that is freed when we
return. A subsequent incremental fsync (when the btrfs inode doesn't have
the flag BTRFS_INODE_NEEDS_FULL_SYNC) considers this extent map valid and
logs a file extent item based on that extent map, which points to a disk
extent that doesn't contain valid data - it was freed by us earlier, at this
point it might contain any random/garbage data.
Therefore, if we reach an error condition when cowing a file range after
we added the new extent map to the cache, drop it from the cache before
returning.
Some sequence of steps that lead to this:
$ mkfs.btrfs -f /dev/sdd
$ mount -o commit=9999 /dev/sdd /mnt
$ cd /mnt
$ xfs_io -f -c "pwrite -S 0x01 -b 4096 0 4096" -c "fsync" foo
$ xfs_io -c "pwrite -S 0x02 -b 4096 4096 4096"
$ sync
$ od -t x1 foo
0000000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
0010000 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02
*
0020000
$ xfs_io -c "pwrite -S 0xa1 -b 4096 0 4096" foo
# Now this write + fsync fail with -ENOMEM, which was returned by
# btrfs_add_ordered_extent() in inode.c:cow_file_range().
$ xfs_io -c "pwrite -S 0xff -b 4096 4096 4096" foo
$ xfs_io -c "fsync" foo
fsync: Cannot allocate memory
# Now do a new write + fsync, which will succeed. Our previous
# -ENOMEM was a transient/temporary error.
$ xfs_io -c "pwrite -S 0xee -b 4096 16384 4096" foo
$ xfs_io -c "fsync" foo
# Our file content (in page cache) is now:
$ od -t x1 foo
0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1
*
0010000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
*
0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0050000
# Now reboot the machine, and mount the fs, so that fsync log replay
# takes place.
# The file content is now weird, in particular the first 8Kb, which
# do not match our data before nor after the sync command above.
$ od -t x1 foo
0000000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0010000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0050000
# In fact these first 4Kb are a duplicate of the last 4kb block.
# The last write got an extent map/file extent item that points to
# the same disk extent that we got in the write+fsync that failed
# with the -ENOMEM error. btrfs-debug-tree and btrfsck allow us to
# verify that:
$ btrfs-debug-tree /dev/sdd
(...)
item 6 key (257 EXTENT_DATA 0) itemoff 15819 itemsize 53
extent data disk byte 12582912 nr 8192
extent data offset 0 nr 8192 ram 8192
item 7 key (257 EXTENT_DATA 8192) itemoff 15766 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 8192 ram 8192
item 8 key (257 EXTENT_DATA 16384) itemoff 15713 itemsize 53
extent data disk byte 12582912 nr 4096
extent data offset 0 nr 4096 ram 4096
$ umount /dev/sdd
$ btrfsck /dev/sdd
Checking filesystem on /dev/sdd
UUID: db5e60e1-050d-41e6-8c7f-3d742dea5d8f
checking extents
extent item 12582912 has multiple extent items
ref mismatch on [12582912 4096] extent item 1, found 2
Backref bytes do not match extent backref, bytenr=12582912, ref bytes=4096, backref bytes=8192
backpointer mismatch on [12582912 4096]
Errors found in extent allocation tree or chunk allocation
checking free space cache
checking fs roots
root 5 inode 257 errors 1000, some csum missing
found 131074 bytes used err is 1
total csum bytes: 4
total tree bytes: 131072
total fs tree bytes: 32768
total extent tree bytes: 16384
btree space waste bytes: 123404
file data blocks allocated: 274432
referenced 274432
Btrfs v3.14.1-96-gcc7fd5a-dirty
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-25 17:43:00 +08:00
|
|
|
}
|
2008-11-07 11:02:51 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* clear dirty, set writeback and unlock the pages.
|
|
|
|
*/
|
2013-07-29 23:20:47 +08:00
|
|
|
extent_clear_unlock_delalloc(inode, async_extent->start,
|
2009-10-08 23:27:10 +08:00
|
|
|
async_extent->start +
|
|
|
|
async_extent->ram_size - 1,
|
2013-07-30 01:22:24 +08:00
|
|
|
NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
|
|
|
|
PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
|
2013-07-29 23:20:47 +08:00
|
|
|
PAGE_SET_WRITEBACK);
|
2008-11-07 11:02:51 +08:00
|
|
|
ret = btrfs_submit_compressed_write(inode,
|
2009-01-06 10:25:51 +08:00
|
|
|
async_extent->start,
|
|
|
|
async_extent->ram_size,
|
|
|
|
ins.objectid,
|
|
|
|
ins.offset, async_extent->pages,
|
|
|
|
async_extent->nr_pages);
|
2014-10-07 05:14:23 +08:00
|
|
|
if (ret) {
|
|
|
|
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
|
|
|
|
struct page *p = async_extent->pages[0];
|
|
|
|
const u64 start = async_extent->start;
|
|
|
|
const u64 end = start + async_extent->ram_size - 1;
|
|
|
|
|
|
|
|
p->mapping = inode->i_mapping;
|
|
|
|
tree->ops->writepage_end_io_hook(p, start, end,
|
|
|
|
NULL, 0);
|
|
|
|
p->mapping = NULL;
|
|
|
|
extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
|
|
|
|
PAGE_END_WRITEBACK |
|
|
|
|
PAGE_SET_ERROR);
|
2014-10-07 05:14:24 +08:00
|
|
|
free_async_extent_pages(async_extent);
|
2014-10-07 05:14:23 +08:00
|
|
|
}
|
2008-11-07 11:02:51 +08:00
|
|
|
alloc_hint = ins.objectid + ins.offset;
|
|
|
|
kfree(async_extent);
|
|
|
|
cond_resched();
|
|
|
|
}
|
2014-10-07 05:14:26 +08:00
|
|
|
return;
|
2013-02-07 05:49:15 +08:00
|
|
|
out_free_reserve:
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
|
2012-03-12 23:03:00 +08:00
|
|
|
out_free:
|
2013-07-29 23:20:47 +08:00
|
|
|
extent_clear_unlock_delalloc(inode, async_extent->start,
|
2013-02-07 05:49:15 +08:00
|
|
|
async_extent->start +
|
|
|
|
async_extent->ram_size - 1,
|
2013-07-29 23:20:47 +08:00
|
|
|
NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
|
2013-07-30 01:22:24 +08:00
|
|
|
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
|
|
|
|
PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
|
2014-10-07 05:14:22 +08:00
|
|
|
PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
|
|
|
|
PAGE_SET_ERROR);
|
2014-10-07 05:14:24 +08:00
|
|
|
free_async_extent_pages(async_extent);
|
2012-03-12 23:03:00 +08:00
|
|
|
kfree(async_extent);
|
2013-02-07 05:49:15 +08:00
|
|
|
goto again;
|
2008-11-07 11:02:51 +08:00
|
|
|
}
|
|
|
|
|
2010-05-23 23:00:55 +08:00
|
|
|
static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
|
|
|
|
u64 num_bytes)
|
|
|
|
{
|
|
|
|
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
|
|
|
|
struct extent_map *em;
|
|
|
|
u64 alloc_hint = 0;
|
|
|
|
|
|
|
|
read_lock(&em_tree->lock);
|
|
|
|
em = search_extent_mapping(em_tree, start, num_bytes);
|
|
|
|
if (em) {
|
|
|
|
/*
|
|
|
|
* if block start isn't an actual block number then find the
|
|
|
|
* first block in this inode and use that as a hint. If that
|
|
|
|
* block is also bogus then just don't worry about it.
|
|
|
|
*/
|
|
|
|
if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
|
|
|
|
free_extent_map(em);
|
|
|
|
em = search_extent_mapping(em_tree, 0, 0);
|
|
|
|
if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
|
|
|
|
alloc_hint = em->block_start;
|
|
|
|
if (em)
|
|
|
|
free_extent_map(em);
|
|
|
|
} else {
|
|
|
|
alloc_hint = em->block_start;
|
|
|
|
free_extent_map(em);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
read_unlock(&em_tree->lock);
|
|
|
|
|
|
|
|
return alloc_hint;
|
|
|
|
}
|
|
|
|
|
2008-11-07 11:02:51 +08:00
|
|
|
/*
|
|
|
|
* when extent_io.c finds a delayed allocation range in the file,
|
|
|
|
* the call backs end up in this code. The basic idea is to
|
|
|
|
* allocate extents on disk for the range, and create ordered data structs
|
|
|
|
* in ram to track those extents.
|
|
|
|
*
|
|
|
|
* locked_page is the page that writepage had locked already. We use
|
|
|
|
* it to make sure we don't do extra locks or unlocks.
|
|
|
|
*
|
|
|
|
* *page_started is set to one if we unlock locked_page and do everything
|
|
|
|
* required to start IO on it. It may be clean and already done with
|
|
|
|
* IO when we return.
|
|
|
|
*/
|
2013-08-15 02:02:47 +08:00
|
|
|
static noinline int cow_file_range(struct inode *inode,
|
|
|
|
struct page *locked_page,
|
|
|
|
u64 start, u64 end, int *page_started,
|
|
|
|
unsigned long *nr_written,
|
|
|
|
int unlock)
|
2008-11-07 11:02:51 +08:00
|
|
|
{
|
2013-08-15 02:02:47 +08:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2008-11-07 11:02:51 +08:00
|
|
|
u64 alloc_hint = 0;
|
|
|
|
u64 num_bytes;
|
|
|
|
unsigned long ram_size;
|
|
|
|
u64 disk_num_bytes;
|
|
|
|
u64 cur_alloc_size;
|
|
|
|
u64 blocksize = root->sectorsize;
|
|
|
|
struct btrfs_key ins;
|
|
|
|
struct extent_map *em;
|
|
|
|
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
|
|
|
|
int ret = 0;
|
|
|
|
|
2013-10-26 04:19:08 +08:00
|
|
|
if (btrfs_is_free_space_inode(inode)) {
|
|
|
|
WARN_ON_ONCE(1);
|
2014-02-08 01:21:23 +08:00
|
|
|
ret = -EINVAL;
|
|
|
|
goto out_unlock;
|
2013-10-26 04:19:08 +08:00
|
|
|
}
|
2008-11-07 11:02:51 +08:00
|
|
|
|
2013-02-26 16:10:22 +08:00
|
|
|
num_bytes = ALIGN(end - start + 1, blocksize);
|
2008-11-07 11:02:51 +08:00
|
|
|
num_bytes = max(blocksize, num_bytes);
|
|
|
|
disk_num_bytes = num_bytes;
|
|
|
|
|
2011-05-25 03:35:30 +08:00
|
|
|
/* if this is a small write inside eof, kick off defrag */
|
2015-12-15 00:42:10 +08:00
|
|
|
if (num_bytes < SZ_64K &&
|
2012-03-29 21:57:45 +08:00
|
|
|
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
|
2013-08-15 02:02:47 +08:00
|
|
|
btrfs_add_inode_defrag(NULL, inode);
|
2011-05-25 03:35:30 +08:00
|
|
|
|
2008-11-07 11:02:51 +08:00
|
|
|
if (start == 0) {
|
|
|
|
/* lets try to make an inline extent */
|
2013-08-15 02:02:47 +08:00
|
|
|
ret = cow_file_range_inline(root, inode, start, end, 0, 0,
|
|
|
|
NULL);
|
2008-11-07 11:02:51 +08:00
|
|
|
if (ret == 0) {
|
2013-07-29 23:20:47 +08:00
|
|
|
extent_clear_unlock_delalloc(inode, start, end, NULL,
|
|
|
|
EXTENT_LOCKED | EXTENT_DELALLOC |
|
2013-07-30 01:22:24 +08:00
|
|
|
EXTENT_DEFRAG, PAGE_UNLOCK |
|
2013-07-29 23:20:47 +08:00
|
|
|
PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
|
|
|
|
PAGE_END_WRITEBACK);
|
2009-11-12 17:34:21 +08:00
|
|
|
|
2008-11-07 11:02:51 +08:00
|
|
|
*nr_written = *nr_written +
|
|
|
|
(end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
|
|
|
|
*page_started = 1;
|
|
|
|
goto out;
|
2012-03-12 23:03:00 +08:00
|
|
|
} else if (ret < 0) {
|
|
|
|
goto out_unlock;
|
2008-11-07 11:02:51 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
BUG_ON(disk_num_bytes >
|
2011-04-13 21:41:04 +08:00
|
|
|
btrfs_super_total_bytes(root->fs_info->super_copy));
|
2008-11-07 11:02:51 +08:00
|
|
|
|
2010-05-23 23:00:55 +08:00
|
|
|
alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
|
2008-11-07 11:02:51 +08:00
|
|
|
btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
|
|
|
|
|
2009-01-06 10:25:51 +08:00
|
|
|
while (disk_num_bytes > 0) {
|
2009-10-08 23:27:10 +08:00
|
|
|
unsigned long op;
|
|
|
|
|
2010-03-20 02:07:23 +08:00
|
|
|
cur_alloc_size = disk_num_bytes;
|
2013-08-15 02:02:47 +08:00
|
|
|
ret = btrfs_reserve_extent(root, cur_alloc_size,
|
2008-11-07 11:02:51 +08:00
|
|
|
root->sectorsize, 0, alloc_hint,
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
&ins, 1, 1);
|
2013-08-15 02:02:47 +08:00
|
|
|
if (ret < 0)
|
2012-03-12 23:03:00 +08:00
|
|
|
goto out_unlock;
|
2009-01-06 10:25:51 +08:00
|
|
|
|
2011-04-21 06:48:27 +08:00
|
|
|
em = alloc_extent_map();
|
2013-05-14 10:12:15 +08:00
|
|
|
if (!em) {
|
|
|
|
ret = -ENOMEM;
|
2013-04-22 18:53:47 +08:00
|
|
|
goto out_reserve;
|
2013-05-14 10:12:15 +08:00
|
|
|
}
|
2008-07-18 00:53:50 +08:00
|
|
|
em->start = start;
|
2008-11-11 00:53:33 +08:00
|
|
|
em->orig_start = em->start;
|
2008-11-07 11:02:51 +08:00
|
|
|
ram_size = ins.offset;
|
|
|
|
em->len = ins.offset;
|
2012-10-13 03:27:49 +08:00
|
|
|
em->mod_start = em->start;
|
|
|
|
em->mod_len = em->len;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
|
2008-07-18 00:53:50 +08:00
|
|
|
em->block_start = ins.objectid;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
em->block_len = ins.offset;
|
2012-12-03 23:31:19 +08:00
|
|
|
em->orig_block_len = ins.offset;
|
2013-04-05 02:31:27 +08:00
|
|
|
em->ram_bytes = ram_size;
|
2008-07-18 00:53:50 +08:00
|
|
|
em->bdev = root->fs_info->fs_devices->latest_bdev;
|
2008-07-19 00:01:11 +08:00
|
|
|
set_bit(EXTENT_FLAG_PINNED, &em->flags);
|
2012-10-12 04:54:30 +08:00
|
|
|
em->generation = -1;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
|
2009-01-06 10:25:51 +08:00
|
|
|
while (1) {
|
2009-09-03 04:24:52 +08:00
|
|
|
write_lock(&em_tree->lock);
|
2013-04-06 04:51:15 +08:00
|
|
|
ret = add_extent_mapping(em_tree, em, 1);
|
2009-09-03 04:24:52 +08:00
|
|
|
write_unlock(&em_tree->lock);
|
2008-07-18 00:53:50 +08:00
|
|
|
if (ret != -EEXIST) {
|
|
|
|
free_extent_map(em);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
btrfs_drop_extent_cache(inode, start,
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
start + ram_size - 1, 0);
|
2008-07-18 00:53:50 +08:00
|
|
|
}
|
2013-04-22 18:53:47 +08:00
|
|
|
if (ret)
|
|
|
|
goto out_reserve;
|
2008-07-18 00:53:50 +08:00
|
|
|
|
2008-04-14 21:46:10 +08:00
|
|
|
cur_alloc_size = ins.offset;
|
2008-07-18 00:53:50 +08:00
|
|
|
ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
|
2008-11-07 11:02:51 +08:00
|
|
|
ram_size, cur_alloc_size, 0);
|
2013-04-22 18:53:47 +08:00
|
|
|
if (ret)
|
Btrfs: fix corruption after write/fsync failure + fsync + log recovery
While writing to a file, in inode.c:cow_file_range() (and same applies to
submit_compressed_extents()), after reserving an extent for the file data,
we create a new extent map for the written range and insert it into the
extent map cache. After that, we create an ordered operation, but if it
fails (due to a transient/temporary-ENOMEM), we return without dropping
that extent map, which points to a reserved extent that is freed when we
return. A subsequent incremental fsync (when the btrfs inode doesn't have
the flag BTRFS_INODE_NEEDS_FULL_SYNC) considers this extent map valid and
logs a file extent item based on that extent map, which points to a disk
extent that doesn't contain valid data - it was freed by us earlier, at this
point it might contain any random/garbage data.
Therefore, if we reach an error condition when cowing a file range after
we added the new extent map to the cache, drop it from the cache before
returning.
Some sequence of steps that lead to this:
$ mkfs.btrfs -f /dev/sdd
$ mount -o commit=9999 /dev/sdd /mnt
$ cd /mnt
$ xfs_io -f -c "pwrite -S 0x01 -b 4096 0 4096" -c "fsync" foo
$ xfs_io -c "pwrite -S 0x02 -b 4096 4096 4096"
$ sync
$ od -t x1 foo
0000000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
0010000 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02
*
0020000
$ xfs_io -c "pwrite -S 0xa1 -b 4096 0 4096" foo
# Now this write + fsync fail with -ENOMEM, which was returned by
# btrfs_add_ordered_extent() in inode.c:cow_file_range().
$ xfs_io -c "pwrite -S 0xff -b 4096 4096 4096" foo
$ xfs_io -c "fsync" foo
fsync: Cannot allocate memory
# Now do a new write + fsync, which will succeed. Our previous
# -ENOMEM was a transient/temporary error.
$ xfs_io -c "pwrite -S 0xee -b 4096 16384 4096" foo
$ xfs_io -c "fsync" foo
# Our file content (in page cache) is now:
$ od -t x1 foo
0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1
*
0010000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
*
0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0050000
# Now reboot the machine, and mount the fs, so that fsync log replay
# takes place.
# The file content is now weird, in particular the first 8Kb, which
# do not match our data before nor after the sync command above.
$ od -t x1 foo
0000000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0010000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0050000
# In fact these first 4Kb are a duplicate of the last 4kb block.
# The last write got an extent map/file extent item that points to
# the same disk extent that we got in the write+fsync that failed
# with the -ENOMEM error. btrfs-debug-tree and btrfsck allow us to
# verify that:
$ btrfs-debug-tree /dev/sdd
(...)
item 6 key (257 EXTENT_DATA 0) itemoff 15819 itemsize 53
extent data disk byte 12582912 nr 8192
extent data offset 0 nr 8192 ram 8192
item 7 key (257 EXTENT_DATA 8192) itemoff 15766 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 8192 ram 8192
item 8 key (257 EXTENT_DATA 16384) itemoff 15713 itemsize 53
extent data disk byte 12582912 nr 4096
extent data offset 0 nr 4096 ram 4096
$ umount /dev/sdd
$ btrfsck /dev/sdd
Checking filesystem on /dev/sdd
UUID: db5e60e1-050d-41e6-8c7f-3d742dea5d8f
checking extents
extent item 12582912 has multiple extent items
ref mismatch on [12582912 4096] extent item 1, found 2
Backref bytes do not match extent backref, bytenr=12582912, ref bytes=4096, backref bytes=8192
backpointer mismatch on [12582912 4096]
Errors found in extent allocation tree or chunk allocation
checking free space cache
checking fs roots
root 5 inode 257 errors 1000, some csum missing
found 131074 bytes used err is 1
total csum bytes: 4
total tree bytes: 131072
total fs tree bytes: 32768
total extent tree bytes: 16384
btree space waste bytes: 123404
file data blocks allocated: 274432
referenced 274432
Btrfs v3.14.1-96-gcc7fd5a-dirty
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-25 17:43:00 +08:00
|
|
|
goto out_drop_extent_cache;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
|
2008-12-12 23:03:38 +08:00
|
|
|
if (root->root_key.objectid ==
|
|
|
|
BTRFS_DATA_RELOC_TREE_OBJECTID) {
|
|
|
|
ret = btrfs_reloc_clone_csums(inode, start,
|
|
|
|
cur_alloc_size);
|
2013-08-15 02:02:47 +08:00
|
|
|
if (ret)
|
Btrfs: fix corruption after write/fsync failure + fsync + log recovery
While writing to a file, in inode.c:cow_file_range() (and same applies to
submit_compressed_extents()), after reserving an extent for the file data,
we create a new extent map for the written range and insert it into the
extent map cache. After that, we create an ordered operation, but if it
fails (due to a transient/temporary-ENOMEM), we return without dropping
that extent map, which points to a reserved extent that is freed when we
return. A subsequent incremental fsync (when the btrfs inode doesn't have
the flag BTRFS_INODE_NEEDS_FULL_SYNC) considers this extent map valid and
logs a file extent item based on that extent map, which points to a disk
extent that doesn't contain valid data - it was freed by us earlier, at this
point it might contain any random/garbage data.
Therefore, if we reach an error condition when cowing a file range after
we added the new extent map to the cache, drop it from the cache before
returning.
Some sequence of steps that lead to this:
$ mkfs.btrfs -f /dev/sdd
$ mount -o commit=9999 /dev/sdd /mnt
$ cd /mnt
$ xfs_io -f -c "pwrite -S 0x01 -b 4096 0 4096" -c "fsync" foo
$ xfs_io -c "pwrite -S 0x02 -b 4096 4096 4096"
$ sync
$ od -t x1 foo
0000000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
0010000 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02
*
0020000
$ xfs_io -c "pwrite -S 0xa1 -b 4096 0 4096" foo
# Now this write + fsync fail with -ENOMEM, which was returned by
# btrfs_add_ordered_extent() in inode.c:cow_file_range().
$ xfs_io -c "pwrite -S 0xff -b 4096 4096 4096" foo
$ xfs_io -c "fsync" foo
fsync: Cannot allocate memory
# Now do a new write + fsync, which will succeed. Our previous
# -ENOMEM was a transient/temporary error.
$ xfs_io -c "pwrite -S 0xee -b 4096 16384 4096" foo
$ xfs_io -c "fsync" foo
# Our file content (in page cache) is now:
$ od -t x1 foo
0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1
*
0010000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
*
0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0050000
# Now reboot the machine, and mount the fs, so that fsync log replay
# takes place.
# The file content is now weird, in particular the first 8Kb, which
# do not match our data before nor after the sync command above.
$ od -t x1 foo
0000000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0010000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0050000
# In fact these first 4Kb are a duplicate of the last 4kb block.
# The last write got an extent map/file extent item that points to
# the same disk extent that we got in the write+fsync that failed
# with the -ENOMEM error. btrfs-debug-tree and btrfsck allow us to
# verify that:
$ btrfs-debug-tree /dev/sdd
(...)
item 6 key (257 EXTENT_DATA 0) itemoff 15819 itemsize 53
extent data disk byte 12582912 nr 8192
extent data offset 0 nr 8192 ram 8192
item 7 key (257 EXTENT_DATA 8192) itemoff 15766 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 8192 ram 8192
item 8 key (257 EXTENT_DATA 16384) itemoff 15713 itemsize 53
extent data disk byte 12582912 nr 4096
extent data offset 0 nr 4096 ram 4096
$ umount /dev/sdd
$ btrfsck /dev/sdd
Checking filesystem on /dev/sdd
UUID: db5e60e1-050d-41e6-8c7f-3d742dea5d8f
checking extents
extent item 12582912 has multiple extent items
ref mismatch on [12582912 4096] extent item 1, found 2
Backref bytes do not match extent backref, bytenr=12582912, ref bytes=4096, backref bytes=8192
backpointer mismatch on [12582912 4096]
Errors found in extent allocation tree or chunk allocation
checking free space cache
checking fs roots
root 5 inode 257 errors 1000, some csum missing
found 131074 bytes used err is 1
total csum bytes: 4
total tree bytes: 131072
total fs tree bytes: 32768
total extent tree bytes: 16384
btree space waste bytes: 123404
file data blocks allocated: 274432
referenced 274432
Btrfs v3.14.1-96-gcc7fd5a-dirty
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-25 17:43:00 +08:00
|
|
|
goto out_drop_extent_cache;
|
2008-12-12 23:03:38 +08:00
|
|
|
}
|
|
|
|
|
2009-01-06 10:25:51 +08:00
|
|
|
if (disk_num_bytes < cur_alloc_size)
|
2008-04-17 23:29:12 +08:00
|
|
|
break;
|
2009-01-06 10:25:51 +08:00
|
|
|
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
/* we're not doing compressed IO, don't unlock the first
|
|
|
|
* page (which the caller expects to stay locked), don't
|
|
|
|
* clear any dirty bits and don't set any writeback bits
|
2009-09-03 04:53:46 +08:00
|
|
|
*
|
|
|
|
* Do set the Private2 bit so we know this page was properly
|
|
|
|
* setup for writepage
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
*/
|
2013-07-29 23:20:47 +08:00
|
|
|
op = unlock ? PAGE_UNLOCK : 0;
|
|
|
|
op |= PAGE_SET_PRIVATE2;
|
2009-10-08 23:27:10 +08:00
|
|
|
|
2013-07-29 23:20:47 +08:00
|
|
|
extent_clear_unlock_delalloc(inode, start,
|
|
|
|
start + ram_size - 1, locked_page,
|
|
|
|
EXTENT_LOCKED | EXTENT_DELALLOC,
|
|
|
|
op);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
disk_num_bytes -= cur_alloc_size;
|
2007-12-18 09:14:04 +08:00
|
|
|
num_bytes -= cur_alloc_size;
|
|
|
|
alloc_hint = ins.objectid + ins.offset;
|
|
|
|
start += cur_alloc_size;
|
2007-08-28 04:49:44 +08:00
|
|
|
}
|
2012-03-12 23:03:00 +08:00
|
|
|
out:
|
2007-12-18 09:14:01 +08:00
|
|
|
return ret;
|
2012-11-01 15:32:18 +08:00
|
|
|
|
Btrfs: fix corruption after write/fsync failure + fsync + log recovery
While writing to a file, in inode.c:cow_file_range() (and same applies to
submit_compressed_extents()), after reserving an extent for the file data,
we create a new extent map for the written range and insert it into the
extent map cache. After that, we create an ordered operation, but if it
fails (due to a transient/temporary-ENOMEM), we return without dropping
that extent map, which points to a reserved extent that is freed when we
return. A subsequent incremental fsync (when the btrfs inode doesn't have
the flag BTRFS_INODE_NEEDS_FULL_SYNC) considers this extent map valid and
logs a file extent item based on that extent map, which points to a disk
extent that doesn't contain valid data - it was freed by us earlier, at this
point it might contain any random/garbage data.
Therefore, if we reach an error condition when cowing a file range after
we added the new extent map to the cache, drop it from the cache before
returning.
Some sequence of steps that lead to this:
$ mkfs.btrfs -f /dev/sdd
$ mount -o commit=9999 /dev/sdd /mnt
$ cd /mnt
$ xfs_io -f -c "pwrite -S 0x01 -b 4096 0 4096" -c "fsync" foo
$ xfs_io -c "pwrite -S 0x02 -b 4096 4096 4096"
$ sync
$ od -t x1 foo
0000000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
0010000 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02
*
0020000
$ xfs_io -c "pwrite -S 0xa1 -b 4096 0 4096" foo
# Now this write + fsync fail with -ENOMEM, which was returned by
# btrfs_add_ordered_extent() in inode.c:cow_file_range().
$ xfs_io -c "pwrite -S 0xff -b 4096 4096 4096" foo
$ xfs_io -c "fsync" foo
fsync: Cannot allocate memory
# Now do a new write + fsync, which will succeed. Our previous
# -ENOMEM was a transient/temporary error.
$ xfs_io -c "pwrite -S 0xee -b 4096 16384 4096" foo
$ xfs_io -c "fsync" foo
# Our file content (in page cache) is now:
$ od -t x1 foo
0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1
*
0010000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
*
0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0050000
# Now reboot the machine, and mount the fs, so that fsync log replay
# takes place.
# The file content is now weird, in particular the first 8Kb, which
# do not match our data before nor after the sync command above.
$ od -t x1 foo
0000000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0010000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
*
0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
*
0050000
# In fact these first 4Kb are a duplicate of the last 4kb block.
# The last write got an extent map/file extent item that points to
# the same disk extent that we got in the write+fsync that failed
# with the -ENOMEM error. btrfs-debug-tree and btrfsck allow us to
# verify that:
$ btrfs-debug-tree /dev/sdd
(...)
item 6 key (257 EXTENT_DATA 0) itemoff 15819 itemsize 53
extent data disk byte 12582912 nr 8192
extent data offset 0 nr 8192 ram 8192
item 7 key (257 EXTENT_DATA 8192) itemoff 15766 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 8192 ram 8192
item 8 key (257 EXTENT_DATA 16384) itemoff 15713 itemsize 53
extent data disk byte 12582912 nr 4096
extent data offset 0 nr 4096 ram 4096
$ umount /dev/sdd
$ btrfsck /dev/sdd
Checking filesystem on /dev/sdd
UUID: db5e60e1-050d-41e6-8c7f-3d742dea5d8f
checking extents
extent item 12582912 has multiple extent items
ref mismatch on [12582912 4096] extent item 1, found 2
Backref bytes do not match extent backref, bytenr=12582912, ref bytes=4096, backref bytes=8192
backpointer mismatch on [12582912 4096]
Errors found in extent allocation tree or chunk allocation
checking free space cache
checking fs roots
root 5 inode 257 errors 1000, some csum missing
found 131074 bytes used err is 1
total csum bytes: 4
total tree bytes: 131072
total fs tree bytes: 32768
total extent tree bytes: 16384
btree space waste bytes: 123404
file data blocks allocated: 274432
referenced 274432
Btrfs v3.14.1-96-gcc7fd5a-dirty
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-25 17:43:00 +08:00
|
|
|
out_drop_extent_cache:
|
|
|
|
btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
|
2013-04-22 18:53:47 +08:00
|
|
|
out_reserve:
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
|
2012-03-12 23:03:00 +08:00
|
|
|
out_unlock:
|
2013-07-29 23:20:47 +08:00
|
|
|
extent_clear_unlock_delalloc(inode, start, end, locked_page,
|
2013-07-30 01:22:24 +08:00
|
|
|
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
|
|
|
|
EXTENT_DELALLOC | EXTENT_DEFRAG,
|
|
|
|
PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
|
|
|
|
PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
|
2012-03-12 23:03:00 +08:00
|
|
|
goto out;
|
2008-11-07 11:02:51 +08:00
|
|
|
}
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
|
2008-11-07 11:02:51 +08:00
|
|
|
/*
|
|
|
|
* work queue call back to started compression on a file and pages
|
|
|
|
*/
|
|
|
|
static noinline void async_cow_start(struct btrfs_work *work)
|
|
|
|
{
|
|
|
|
struct async_cow *async_cow;
|
|
|
|
int num_added = 0;
|
|
|
|
async_cow = container_of(work, struct async_cow, work);
|
|
|
|
|
|
|
|
compress_file_range(async_cow->inode, async_cow->locked_page,
|
|
|
|
async_cow->start, async_cow->end, async_cow,
|
|
|
|
&num_added);
|
2012-06-09 03:16:12 +08:00
|
|
|
if (num_added == 0) {
|
2012-06-16 02:19:48 +08:00
|
|
|
btrfs_add_delayed_iput(async_cow->inode);
|
2008-11-07 11:02:51 +08:00
|
|
|
async_cow->inode = NULL;
|
2012-06-09 03:16:12 +08:00
|
|
|
}
|
2008-11-07 11:02:51 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* work queue call back to submit previously compressed pages
|
|
|
|
*/
|
|
|
|
static noinline void async_cow_submit(struct btrfs_work *work)
|
|
|
|
{
|
|
|
|
struct async_cow *async_cow;
|
|
|
|
struct btrfs_root *root;
|
|
|
|
unsigned long nr_pages;
|
|
|
|
|
|
|
|
async_cow = container_of(work, struct async_cow, work);
|
|
|
|
|
|
|
|
root = async_cow->root;
|
|
|
|
nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
|
|
|
|
PAGE_CACHE_SHIFT;
|
|
|
|
|
2015-02-17 02:41:40 +08:00
|
|
|
/*
|
|
|
|
* atomic_sub_return implies a barrier for waitqueue_active
|
|
|
|
*/
|
2012-08-02 03:36:24 +08:00
|
|
|
if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
|
2015-12-15 00:42:10 +08:00
|
|
|
5 * SZ_1M &&
|
2008-11-07 11:02:51 +08:00
|
|
|
waitqueue_active(&root->fs_info->async_submit_wait))
|
|
|
|
wake_up(&root->fs_info->async_submit_wait);
|
|
|
|
|
2009-01-06 10:25:51 +08:00
|
|
|
if (async_cow->inode)
|
2008-11-07 11:02:51 +08:00
|
|
|
submit_compressed_extents(async_cow->inode, async_cow);
|
|
|
|
}
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
|
2008-11-07 11:02:51 +08:00
|
|
|
static noinline void async_cow_free(struct btrfs_work *work)
|
|
|
|
{
|
|
|
|
struct async_cow *async_cow;
|
|
|
|
async_cow = container_of(work, struct async_cow, work);
|
2012-06-09 03:16:12 +08:00
|
|
|
if (async_cow->inode)
|
2012-06-16 02:19:48 +08:00
|
|
|
btrfs_add_delayed_iput(async_cow->inode);
|
2008-11-07 11:02:51 +08:00
|
|
|
kfree(async_cow);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int cow_file_range_async(struct inode *inode, struct page *locked_page,
|
|
|
|
u64 start, u64 end, int *page_started,
|
|
|
|
unsigned long *nr_written)
|
|
|
|
{
|
|
|
|
struct async_cow *async_cow;
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
unsigned long nr_pages;
|
|
|
|
u64 cur_end;
|
2015-12-15 00:42:10 +08:00
|
|
|
int limit = 10 * SZ_1M;
|
2008-11-07 11:02:51 +08:00
|
|
|
|
2009-10-09 00:30:20 +08:00
|
|
|
clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
|
|
|
|
1, 0, NULL, GFP_NOFS);
|
2009-01-06 10:25:51 +08:00
|
|
|
while (start < end) {
|
2008-11-07 11:02:51 +08:00
|
|
|
async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
|
2012-03-12 23:03:00 +08:00
|
|
|
BUG_ON(!async_cow); /* -ENOMEM */
|
2012-06-09 03:16:12 +08:00
|
|
|
async_cow->inode = igrab(inode);
|
2008-11-07 11:02:51 +08:00
|
|
|
async_cow->root = root;
|
|
|
|
async_cow->locked_page = locked_page;
|
|
|
|
async_cow->start = start;
|
|
|
|
|
2014-07-17 11:44:09 +08:00
|
|
|
if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
|
|
|
|
!btrfs_test_opt(root, FORCE_COMPRESS))
|
2008-11-07 11:02:51 +08:00
|
|
|
cur_end = end;
|
|
|
|
else
|
2015-12-15 00:42:10 +08:00
|
|
|
cur_end = min(end, start + SZ_512K - 1);
|
2008-11-07 11:02:51 +08:00
|
|
|
|
|
|
|
async_cow->end = cur_end;
|
|
|
|
INIT_LIST_HEAD(&async_cow->extents);
|
|
|
|
|
Btrfs: fix task hang under heavy compressed write
This has been reported and discussed for a long time, and this hang occurs in
both 3.15 and 3.16.
Btrfs now migrates to use kernel workqueue, but it introduces this hang problem.
Btrfs has a kind of work queued as an ordered way, which means that its
ordered_func() must be processed in the way of FIFO, so it usually looks like --
normal_work_helper(arg)
work = container_of(arg, struct btrfs_work, normal_work);
work->func() <---- (we name it work X)
for ordered_work in wq->ordered_list
ordered_work->ordered_func()
ordered_work->ordered_free()
The hang is a rare case, first when we find free space, we get an uncached block
group, then we go to read its free space cache inode for free space information,
so it will
file a readahead request
btrfs_readpages()
for page that is not in page cache
__do_readpage()
submit_extent_page()
btrfs_submit_bio_hook()
btrfs_bio_wq_end_io()
submit_bio()
end_workqueue_bio() <--(ret by the 1st endio)
queue a work(named work Y) for the 2nd
also the real endio()
So the hang occurs when work Y's work_struct and work X's work_struct happens
to share the same address.
A bit more explanation,
A,B,C -- struct btrfs_work
arg -- struct work_struct
kthread:
worker_thread()
pick up a work_struct from @worklist
process_one_work(arg)
worker->current_work = arg; <-- arg is A->normal_work
worker->current_func(arg)
normal_work_helper(arg)
A = container_of(arg, struct btrfs_work, normal_work);
A->func()
A->ordered_func()
A->ordered_free() <-- A gets freed
B->ordered_func()
submit_compressed_extents()
find_free_extent()
load_free_space_inode()
... <-- (the above readhead stack)
end_workqueue_bio()
btrfs_queue_work(work C)
B->ordered_free()
As if work A has a high priority in wq->ordered_list and there are more ordered
works queued after it, such as B->ordered_func(), its memory could have been
freed before normal_work_helper() returns, which means that kernel workqueue
code worker_thread() still has worker->current_work pointer to be work
A->normal_work's, ie. arg's address.
Meanwhile, work C is allocated after work A is freed, work C->normal_work
and work A->normal_work are likely to share the same address(I confirmed this
with ftrace output, so I'm not just guessing, it's rare though).
When another kthread picks up work C->normal_work to process, and finds our
kthread is processing it(see find_worker_executing_work()), it'll think
work C as a collision and skip then, which ends up nobody processing work C.
So the situation is that our kthread is waiting forever on work C.
Besides, there're other cases that can lead to deadlock, but the real problem
is that all btrfs workqueue shares one work->func, -- normal_work_helper,
so this makes each workqueue to have its own helper function, but only a
wraper pf normal_work_helper.
With this patch, I no long hit the above hang.
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-15 23:36:53 +08:00
|
|
|
btrfs_init_work(&async_cow->work,
|
|
|
|
btrfs_delalloc_helper,
|
|
|
|
async_cow_start, async_cow_submit,
|
|
|
|
async_cow_free);
|
2008-11-07 11:02:51 +08:00
|
|
|
|
|
|
|
nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
|
|
|
|
PAGE_CACHE_SHIFT;
|
|
|
|
atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
|
|
|
|
|
2014-02-28 10:46:07 +08:00
|
|
|
btrfs_queue_work(root->fs_info->delalloc_workers,
|
|
|
|
&async_cow->work);
|
2008-11-07 11:02:51 +08:00
|
|
|
|
|
|
|
if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
|
|
|
|
wait_event(root->fs_info->async_submit_wait,
|
|
|
|
(atomic_read(&root->fs_info->async_delalloc_pages) <
|
|
|
|
limit));
|
|
|
|
}
|
|
|
|
|
2009-01-06 10:25:51 +08:00
|
|
|
while (atomic_read(&root->fs_info->async_submit_draining) &&
|
2008-11-07 11:02:51 +08:00
|
|
|
atomic_read(&root->fs_info->async_delalloc_pages)) {
|
|
|
|
wait_event(root->fs_info->async_submit_wait,
|
|
|
|
(atomic_read(&root->fs_info->async_delalloc_pages) ==
|
|
|
|
0));
|
|
|
|
}
|
|
|
|
|
|
|
|
*nr_written += nr_pages;
|
|
|
|
start = cur_end + 1;
|
|
|
|
}
|
|
|
|
*page_started = 1;
|
|
|
|
return 0;
|
2007-12-18 09:14:01 +08:00
|
|
|
}
|
|
|
|
|
2009-01-06 10:25:51 +08:00
|
|
|
static noinline int csum_exist_in_range(struct btrfs_root *root,
|
2008-12-12 23:03:38 +08:00
|
|
|
u64 bytenr, u64 num_bytes)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct btrfs_ordered_sum *sums;
|
|
|
|
LIST_HEAD(list);
|
|
|
|
|
2009-01-07 00:42:00 +08:00
|
|
|
ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
|
2011-03-08 21:14:00 +08:00
|
|
|
bytenr + num_bytes - 1, &list, 0);
|
2008-12-12 23:03:38 +08:00
|
|
|
if (ret == 0 && list_empty(&list))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
while (!list_empty(&list)) {
|
|
|
|
sums = list_entry(list.next, struct btrfs_ordered_sum, list);
|
|
|
|
list_del(&sums->list);
|
|
|
|
kfree(sums);
|
|
|
|
}
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2008-09-30 03:18:18 +08:00
|
|
|
/*
|
|
|
|
* when nowcow writeback call back. This checks for snapshots or COW copies
|
|
|
|
* of the extents that exist in the file, and COWs the file as required.
|
|
|
|
*
|
|
|
|
* If no cow copies or snapshots exist, we write directly to the existing
|
|
|
|
* blocks on disk
|
|
|
|
*/
|
2009-03-13 08:12:45 +08:00
|
|
|
static noinline int run_delalloc_nocow(struct inode *inode,
|
|
|
|
struct page *locked_page,
|
2008-11-07 11:02:51 +08:00
|
|
|
u64 start, u64 end, int *page_started, int force,
|
|
|
|
unsigned long *nr_written)
|
2007-12-18 09:14:01 +08:00
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2008-08-06 01:05:02 +08:00
|
|
|
struct btrfs_trans_handle *trans;
|
2007-12-18 09:14:01 +08:00
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct btrfs_path *path;
|
2008-10-31 02:20:02 +08:00
|
|
|
struct btrfs_file_extent_item *fi;
|
2007-12-18 09:14:01 +08:00
|
|
|
struct btrfs_key found_key;
|
2008-10-31 02:20:02 +08:00
|
|
|
u64 cow_start;
|
|
|
|
u64 cur_offset;
|
|
|
|
u64 extent_end;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
u64 extent_offset;
|
2008-10-31 02:20:02 +08:00
|
|
|
u64 disk_bytenr;
|
|
|
|
u64 num_bytes;
|
2012-12-03 23:31:19 +08:00
|
|
|
u64 disk_num_bytes;
|
2013-04-05 02:31:27 +08:00
|
|
|
u64 ram_bytes;
|
2008-10-31 02:20:02 +08:00
|
|
|
int extent_type;
|
2012-03-12 23:03:00 +08:00
|
|
|
int ret, err;
|
2008-10-31 02:25:28 +08:00
|
|
|
int type;
|
2008-10-31 02:20:02 +08:00
|
|
|
int nocow;
|
|
|
|
int check_prev = 1;
|
2011-04-20 10:33:24 +08:00
|
|
|
bool nolock;
|
2011-04-20 10:31:50 +08:00
|
|
|
u64 ino = btrfs_ino(inode);
|
2007-12-18 09:14:01 +08:00
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
2012-06-01 03:58:55 +08:00
|
|
|
if (!path) {
|
2013-07-29 23:20:47 +08:00
|
|
|
extent_clear_unlock_delalloc(inode, start, end, locked_page,
|
|
|
|
EXTENT_LOCKED | EXTENT_DELALLOC |
|
2013-07-30 01:22:24 +08:00
|
|
|
EXTENT_DO_ACCOUNTING |
|
|
|
|
EXTENT_DEFRAG, PAGE_UNLOCK |
|
2013-07-29 23:20:47 +08:00
|
|
|
PAGE_CLEAR_DIRTY |
|
|
|
|
PAGE_SET_WRITEBACK |
|
|
|
|
PAGE_END_WRITEBACK);
|
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-14 01:38:47 +08:00
|
|
|
return -ENOMEM;
|
2012-06-01 03:58:55 +08:00
|
|
|
}
|
2011-04-20 10:33:24 +08:00
|
|
|
|
2012-07-10 19:28:39 +08:00
|
|
|
nolock = btrfs_is_free_space_inode(inode);
|
2011-04-20 10:33:24 +08:00
|
|
|
|
|
|
|
if (nolock)
|
2011-04-14 00:54:33 +08:00
|
|
|
trans = btrfs_join_transaction_nolock(root);
|
2011-04-20 10:33:24 +08:00
|
|
|
else
|
2011-04-14 00:54:33 +08:00
|
|
|
trans = btrfs_join_transaction(root);
|
2011-05-28 19:00:39 +08:00
|
|
|
|
2012-03-12 23:03:00 +08:00
|
|
|
if (IS_ERR(trans)) {
|
2013-07-29 23:20:47 +08:00
|
|
|
extent_clear_unlock_delalloc(inode, start, end, locked_page,
|
|
|
|
EXTENT_LOCKED | EXTENT_DELALLOC |
|
2013-07-30 01:22:24 +08:00
|
|
|
EXTENT_DO_ACCOUNTING |
|
|
|
|
EXTENT_DEFRAG, PAGE_UNLOCK |
|
2013-07-29 23:20:47 +08:00
|
|
|
PAGE_CLEAR_DIRTY |
|
|
|
|
PAGE_SET_WRITEBACK |
|
|
|
|
PAGE_END_WRITEBACK);
|
2012-03-12 23:03:00 +08:00
|
|
|
btrfs_free_path(path);
|
|
|
|
return PTR_ERR(trans);
|
|
|
|
}
|
|
|
|
|
2011-04-14 00:02:53 +08:00
|
|
|
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
|
2007-12-18 09:14:01 +08:00
|
|
|
|
2008-10-31 02:20:02 +08:00
|
|
|
cow_start = (u64)-1;
|
|
|
|
cur_offset = start;
|
|
|
|
while (1) {
|
2011-04-20 10:31:50 +08:00
|
|
|
ret = btrfs_lookup_file_extent(trans, root, path, ino,
|
2008-10-31 02:20:02 +08:00
|
|
|
cur_offset, 0);
|
2013-10-26 04:55:08 +08:00
|
|
|
if (ret < 0)
|
2012-03-12 23:03:00 +08:00
|
|
|
goto error;
|
2008-10-31 02:20:02 +08:00
|
|
|
if (ret > 0 && path->slots[0] > 0 && check_prev) {
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key,
|
|
|
|
path->slots[0] - 1);
|
2011-04-20 10:31:50 +08:00
|
|
|
if (found_key.objectid == ino &&
|
2008-10-31 02:20:02 +08:00
|
|
|
found_key.type == BTRFS_EXTENT_DATA_KEY)
|
|
|
|
path->slots[0]--;
|
|
|
|
}
|
|
|
|
check_prev = 0;
|
|
|
|
next_slot:
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
|
|
|
|
ret = btrfs_next_leaf(root, path);
|
2013-10-26 04:55:08 +08:00
|
|
|
if (ret < 0)
|
2012-03-12 23:03:00 +08:00
|
|
|
goto error;
|
2008-10-31 02:20:02 +08:00
|
|
|
if (ret > 0)
|
|
|
|
break;
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
}
|
2007-12-18 09:14:01 +08:00
|
|
|
|
2008-10-31 02:20:02 +08:00
|
|
|
nocow = 0;
|
|
|
|
disk_bytenr = 0;
|
2008-12-12 23:03:38 +08:00
|
|
|
num_bytes = 0;
|
2008-10-31 02:20:02 +08:00
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
|
|
|
|
|
Btrfs: fix race leading to BUG_ON when running delalloc for nodatacow
If we are using the NO_HOLES feature, we have a tiny time window when
running delalloc for a nodatacow inode where we can race with a concurrent
link or xattr add operation leading to a BUG_ON.
This happens because at run_delalloc_nocow() we end up casting a leaf item
of type BTRFS_INODE_[REF|EXTREF]_KEY or of type BTRFS_XATTR_ITEM_KEY to a
file extent item (struct btrfs_file_extent_item) and then analyse its
extent type field, which won't match any of the expected extent types
(values BTRFS_FILE_EXTENT_[REG|PREALLOC|INLINE]) and therefore trigger an
explicit BUG_ON(1).
The following sequence diagram shows how the race happens when running a
no-cow dellaloc range [4K, 8K[ for inode 257 and we have the following
neighbour leafs:
Leaf X (has N items) Leaf Y
[ ... (257 INODE_ITEM 0) (257 INODE_REF 256) ] [ (257 EXTENT_DATA 8192), ... ]
slot N - 2 slot N - 1 slot 0
(Note the implicit hole for inode 257 regarding the [0, 8K[ range)
CPU 1 CPU 2
run_dealloc_nocow()
btrfs_lookup_file_extent()
--> searches for a key with value
(257 EXTENT_DATA 4096) in the
fs/subvol tree
--> returns us a path with
path->nodes[0] == leaf X and
path->slots[0] == N
because path->slots[0] is >=
btrfs_header_nritems(leaf X), it
calls btrfs_next_leaf()
btrfs_next_leaf()
--> releases the path
hard link added to our inode,
with key (257 INODE_REF 500)
added to the end of leaf X,
so leaf X now has N + 1 keys
--> searches for the key
(257 INODE_REF 256), because
it was the last key in leaf X
before it released the path,
with path->keep_locks set to 1
--> ends up at leaf X again and
it verifies that the key
(257 INODE_REF 256) is no longer
the last key in the leaf, so it
returns with path->nodes[0] ==
leaf X and path->slots[0] == N,
pointing to the new item with
key (257 INODE_REF 500)
the loop iteration of run_dealloc_nocow()
does not break out the loop and continues
because the key referenced in the path
at path->nodes[0] and path->slots[0] is
for inode 257, its type is < BTRFS_EXTENT_DATA_KEY
and its offset (500) is less then our delalloc
range's end (8192)
the item pointed by the path, an inode reference item,
is (incorrectly) interpreted as a file extent item and
we get an invalid extent type, leading to the BUG_ON(1):
if (extent_type == BTRFS_FILE_EXTENT_REG ||
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
(...)
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
(...)
} else {
BUG_ON(1)
}
The same can happen if a xattr is added concurrently and ends up having
a key with an offset smaller then the delalloc's range end.
So fix this by skipping keys with a type smaller than
BTRFS_EXTENT_DATA_KEY.
Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-11-09 08:33:58 +08:00
|
|
|
if (found_key.objectid > ino)
|
|
|
|
break;
|
|
|
|
if (WARN_ON_ONCE(found_key.objectid < ino) ||
|
|
|
|
found_key.type < BTRFS_EXTENT_DATA_KEY) {
|
|
|
|
path->slots[0]++;
|
|
|
|
goto next_slot;
|
|
|
|
}
|
|
|
|
if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
|
2008-10-31 02:20:02 +08:00
|
|
|
found_key.offset > end)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (found_key.offset > cur_offset) {
|
|
|
|
extent_end = found_key.offset;
|
2009-10-09 21:57:45 +08:00
|
|
|
extent_type = 0;
|
2008-10-31 02:20:02 +08:00
|
|
|
goto out_check;
|
|
|
|
}
|
|
|
|
|
|
|
|
fi = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
extent_type = btrfs_file_extent_type(leaf, fi);
|
|
|
|
|
2013-04-05 02:31:27 +08:00
|
|
|
ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
|
2008-10-31 02:25:28 +08:00
|
|
|
if (extent_type == BTRFS_FILE_EXTENT_REG ||
|
|
|
|
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
|
2008-10-31 02:20:02 +08:00
|
|
|
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
extent_offset = btrfs_file_extent_offset(leaf, fi);
|
2008-10-31 02:20:02 +08:00
|
|
|
extent_end = found_key.offset +
|
|
|
|
btrfs_file_extent_num_bytes(leaf, fi);
|
2012-12-03 23:31:19 +08:00
|
|
|
disk_num_bytes =
|
|
|
|
btrfs_file_extent_disk_num_bytes(leaf, fi);
|
2008-10-31 02:20:02 +08:00
|
|
|
if (extent_end <= start) {
|
|
|
|
path->slots[0]++;
|
|
|
|
goto next_slot;
|
|
|
|
}
|
2008-12-12 23:03:38 +08:00
|
|
|
if (disk_bytenr == 0)
|
|
|
|
goto out_check;
|
2008-10-31 02:20:02 +08:00
|
|
|
if (btrfs_file_extent_compression(leaf, fi) ||
|
|
|
|
btrfs_file_extent_encryption(leaf, fi) ||
|
|
|
|
btrfs_file_extent_other_encoding(leaf, fi))
|
|
|
|
goto out_check;
|
2008-10-31 02:25:28 +08:00
|
|
|
if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
|
|
|
|
goto out_check;
|
2008-12-12 05:30:39 +08:00
|
|
|
if (btrfs_extent_readonly(root, disk_bytenr))
|
2008-10-31 02:20:02 +08:00
|
|
|
goto out_check;
|
2011-04-20 10:31:50 +08:00
|
|
|
if (btrfs_cross_ref_exist(trans, root, ino,
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
found_key.offset -
|
|
|
|
extent_offset, disk_bytenr))
|
2008-12-12 23:03:38 +08:00
|
|
|
goto out_check;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
disk_bytenr += extent_offset;
|
2008-12-12 23:03:38 +08:00
|
|
|
disk_bytenr += cur_offset - found_key.offset;
|
|
|
|
num_bytes = min(end + 1, extent_end) - cur_offset;
|
2014-03-27 11:12:25 +08:00
|
|
|
/*
|
|
|
|
* if there are pending snapshots for this root,
|
|
|
|
* we fall into common COW way.
|
|
|
|
*/
|
|
|
|
if (!nolock) {
|
Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain about it.
For example, if we perform the following file operations:
$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
-c "pwrite -S 0xaa -b 32K 0 32K" \
-c "fsync" \
-c "pwrite -S 0xbb -b 32770 16K 32770" \
-c "truncate 90123" \
/mnt/foobar
and the snapshot creation ioctl was just called before the second write, we often
can get the following inode items in the snapshot's btree:
item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0
There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[
for which there's no file extent item covering it. This is because the file write
and file truncate operations happened both right after the snapshot creation ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the
ordered extent that matches the write and, in btrfs_setsize(), we were able to call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possibe to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.
Btrfs' fsck tool complains about such cases with a message like the following:
"root 331 inode 257 errors 100, file extent discount"
>From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:
1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured
But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-10-29 19:57:59 +08:00
|
|
|
err = btrfs_start_write_no_snapshoting(root);
|
2014-03-27 11:12:25 +08:00
|
|
|
if (!err)
|
|
|
|
goto out_check;
|
|
|
|
}
|
2008-12-12 23:03:38 +08:00
|
|
|
/*
|
|
|
|
* force cow if csum exists in the range.
|
|
|
|
* this ensure that csum for a given extent are
|
|
|
|
* either valid or do not exist.
|
|
|
|
*/
|
|
|
|
if (csum_exist_in_range(root, disk_bytenr, num_bytes))
|
|
|
|
goto out_check;
|
2008-10-31 02:20:02 +08:00
|
|
|
nocow = 1;
|
|
|
|
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
|
|
|
|
extent_end = found_key.offset +
|
2014-01-04 13:07:00 +08:00
|
|
|
btrfs_file_extent_inline_len(leaf,
|
|
|
|
path->slots[0], fi);
|
2008-10-31 02:20:02 +08:00
|
|
|
extent_end = ALIGN(extent_end, root->sectorsize);
|
|
|
|
} else {
|
|
|
|
BUG_ON(1);
|
|
|
|
}
|
|
|
|
out_check:
|
|
|
|
if (extent_end <= start) {
|
|
|
|
path->slots[0]++;
|
2014-03-27 11:12:25 +08:00
|
|
|
if (!nolock && nocow)
|
Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain about it.
For example, if we perform the following file operations:
$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
-c "pwrite -S 0xaa -b 32K 0 32K" \
-c "fsync" \
-c "pwrite -S 0xbb -b 32770 16K 32770" \
-c "truncate 90123" \
/mnt/foobar
and the snapshot creation ioctl was just called before the second write, we often
can get the following inode items in the snapshot's btree:
item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0
There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[
for which there's no file extent item covering it. This is because the file write
and file truncate operations happened both right after the snapshot creation ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the
ordered extent that matches the write and, in btrfs_setsize(), we were able to call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possibe to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.
Btrfs' fsck tool complains about such cases with a message like the following:
"root 331 inode 257 errors 100, file extent discount"
>From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:
1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured
But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-10-29 19:57:59 +08:00
|
|
|
btrfs_end_write_no_snapshoting(root);
|
2008-10-31 02:20:02 +08:00
|
|
|
goto next_slot;
|
|
|
|
}
|
|
|
|
if (!nocow) {
|
|
|
|
if (cow_start == (u64)-1)
|
|
|
|
cow_start = cur_offset;
|
|
|
|
cur_offset = extent_end;
|
|
|
|
if (cur_offset > end)
|
|
|
|
break;
|
|
|
|
path->slots[0]++;
|
|
|
|
goto next_slot;
|
2008-08-06 01:05:02 +08:00
|
|
|
}
|
|
|
|
|
2011-04-21 07:20:15 +08:00
|
|
|
btrfs_release_path(path);
|
2008-10-31 02:20:02 +08:00
|
|
|
if (cow_start != (u64)-1) {
|
2013-08-15 02:02:47 +08:00
|
|
|
ret = cow_file_range(inode, locked_page,
|
|
|
|
cow_start, found_key.offset - 1,
|
|
|
|
page_started, nr_written, 1);
|
2014-03-27 11:12:25 +08:00
|
|
|
if (ret) {
|
|
|
|
if (!nolock && nocow)
|
Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain about it.
For example, if we perform the following file operations:
$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
-c "pwrite -S 0xaa -b 32K 0 32K" \
-c "fsync" \
-c "pwrite -S 0xbb -b 32770 16K 32770" \
-c "truncate 90123" \
/mnt/foobar
and the snapshot creation ioctl was just called before the second write, we often
can get the following inode items in the snapshot's btree:
item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0
There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[
for which there's no file extent item covering it. This is because the file write
and file truncate operations happened both right after the snapshot creation ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the
ordered extent that matches the write and, in btrfs_setsize(), we were able to call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possibe to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.
Btrfs' fsck tool complains about such cases with a message like the following:
"root 331 inode 257 errors 100, file extent discount"
>From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:
1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured
But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-10-29 19:57:59 +08:00
|
|
|
btrfs_end_write_no_snapshoting(root);
|
2012-03-12 23:03:00 +08:00
|
|
|
goto error;
|
2014-03-27 11:12:25 +08:00
|
|
|
}
|
2008-10-31 02:20:02 +08:00
|
|
|
cow_start = (u64)-1;
|
2008-08-06 01:05:02 +08:00
|
|
|
}
|
2008-10-31 02:20:02 +08:00
|
|
|
|
2008-10-31 02:25:28 +08:00
|
|
|
if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
|
|
|
|
struct extent_map *em;
|
|
|
|
struct extent_map_tree *em_tree;
|
|
|
|
em_tree = &BTRFS_I(inode)->extent_tree;
|
2011-04-21 06:48:27 +08:00
|
|
|
em = alloc_extent_map();
|
2012-03-12 23:03:00 +08:00
|
|
|
BUG_ON(!em); /* -ENOMEM */
|
2008-10-31 02:25:28 +08:00
|
|
|
em->start = cur_offset;
|
2012-10-12 04:54:30 +08:00
|
|
|
em->orig_start = found_key.offset - extent_offset;
|
2008-10-31 02:25:28 +08:00
|
|
|
em->len = num_bytes;
|
|
|
|
em->block_len = num_bytes;
|
|
|
|
em->block_start = disk_bytenr;
|
2012-12-03 23:31:19 +08:00
|
|
|
em->orig_block_len = disk_num_bytes;
|
2013-04-05 02:31:27 +08:00
|
|
|
em->ram_bytes = ram_bytes;
|
2008-10-31 02:25:28 +08:00
|
|
|
em->bdev = root->fs_info->fs_devices->latest_bdev;
|
2012-10-13 03:27:49 +08:00
|
|
|
em->mod_start = em->start;
|
|
|
|
em->mod_len = em->len;
|
2008-10-31 02:25:28 +08:00
|
|
|
set_bit(EXTENT_FLAG_PINNED, &em->flags);
|
2012-12-03 23:58:15 +08:00
|
|
|
set_bit(EXTENT_FLAG_FILLING, &em->flags);
|
2012-10-12 04:54:30 +08:00
|
|
|
em->generation = -1;
|
2008-10-31 02:25:28 +08:00
|
|
|
while (1) {
|
2009-09-03 04:24:52 +08:00
|
|
|
write_lock(&em_tree->lock);
|
2013-04-06 04:51:15 +08:00
|
|
|
ret = add_extent_mapping(em_tree, em, 1);
|
2009-09-03 04:24:52 +08:00
|
|
|
write_unlock(&em_tree->lock);
|
2008-10-31 02:25:28 +08:00
|
|
|
if (ret != -EEXIST) {
|
|
|
|
free_extent_map(em);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
btrfs_drop_extent_cache(inode, em->start,
|
|
|
|
em->start + em->len - 1, 0);
|
|
|
|
}
|
|
|
|
type = BTRFS_ORDERED_PREALLOC;
|
|
|
|
} else {
|
|
|
|
type = BTRFS_ORDERED_NOCOW;
|
|
|
|
}
|
2008-10-31 02:20:02 +08:00
|
|
|
|
|
|
|
ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
|
2008-10-31 02:25:28 +08:00
|
|
|
num_bytes, num_bytes, type);
|
2012-03-12 23:03:00 +08:00
|
|
|
BUG_ON(ret); /* -ENOMEM */
|
2008-11-07 11:02:51 +08:00
|
|
|
|
2010-05-16 22:49:59 +08:00
|
|
|
if (root->root_key.objectid ==
|
|
|
|
BTRFS_DATA_RELOC_TREE_OBJECTID) {
|
|
|
|
ret = btrfs_reloc_clone_csums(inode, cur_offset,
|
|
|
|
num_bytes);
|
2014-03-27 11:12:25 +08:00
|
|
|
if (ret) {
|
|
|
|
if (!nolock && nocow)
|
Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain about it.
For example, if we perform the following file operations:
$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
-c "pwrite -S 0xaa -b 32K 0 32K" \
-c "fsync" \
-c "pwrite -S 0xbb -b 32770 16K 32770" \
-c "truncate 90123" \
/mnt/foobar
and the snapshot creation ioctl was just called before the second write, we often
can get the following inode items in the snapshot's btree:
item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0
There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[
for which there's no file extent item covering it. This is because the file write
and file truncate operations happened both right after the snapshot creation ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the
ordered extent that matches the write and, in btrfs_setsize(), we were able to call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possibe to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.
Btrfs' fsck tool complains about such cases with a message like the following:
"root 331 inode 257 errors 100, file extent discount"
>From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:
1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured
But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-10-29 19:57:59 +08:00
|
|
|
btrfs_end_write_no_snapshoting(root);
|
2012-03-12 23:03:00 +08:00
|
|
|
goto error;
|
2014-03-27 11:12:25 +08:00
|
|
|
}
|
2010-05-16 22:49:59 +08:00
|
|
|
}
|
|
|
|
|
2013-07-29 23:20:47 +08:00
|
|
|
extent_clear_unlock_delalloc(inode, cur_offset,
|
|
|
|
cur_offset + num_bytes - 1,
|
|
|
|
locked_page, EXTENT_LOCKED |
|
|
|
|
EXTENT_DELALLOC, PAGE_UNLOCK |
|
|
|
|
PAGE_SET_PRIVATE2);
|
2014-03-27 11:12:25 +08:00
|
|
|
if (!nolock && nocow)
|
Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain about it.
For example, if we perform the following file operations:
$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
-c "pwrite -S 0xaa -b 32K 0 32K" \
-c "fsync" \
-c "pwrite -S 0xbb -b 32770 16K 32770" \
-c "truncate 90123" \
/mnt/foobar
and the snapshot creation ioctl was just called before the second write, we often
can get the following inode items in the snapshot's btree:
item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0
There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[
for which there's no file extent item covering it. This is because the file write
and file truncate operations happened both right after the snapshot creation ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the
ordered extent that matches the write and, in btrfs_setsize(), we were able to call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possibe to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.
Btrfs' fsck tool complains about such cases with a message like the following:
"root 331 inode 257 errors 100, file extent discount"
>From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:
1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured
But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-10-29 19:57:59 +08:00
|
|
|
btrfs_end_write_no_snapshoting(root);
|
2008-10-31 02:20:02 +08:00
|
|
|
cur_offset = extent_end;
|
|
|
|
if (cur_offset > end)
|
|
|
|
break;
|
2007-12-18 09:14:01 +08:00
|
|
|
}
|
2011-04-21 07:20:15 +08:00
|
|
|
btrfs_release_path(path);
|
2008-10-31 02:20:02 +08:00
|
|
|
|
2012-06-01 03:58:55 +08:00
|
|
|
if (cur_offset <= end && cow_start == (u64)-1) {
|
2008-10-31 02:20:02 +08:00
|
|
|
cow_start = cur_offset;
|
2012-06-01 03:58:55 +08:00
|
|
|
cur_offset = end;
|
|
|
|
}
|
|
|
|
|
2008-10-31 02:20:02 +08:00
|
|
|
if (cow_start != (u64)-1) {
|
2013-08-15 02:02:47 +08:00
|
|
|
ret = cow_file_range(inode, locked_page, cow_start, end,
|
|
|
|
page_started, nr_written, 1);
|
2013-10-26 04:55:08 +08:00
|
|
|
if (ret)
|
2012-03-12 23:03:00 +08:00
|
|
|
goto error;
|
2008-10-31 02:20:02 +08:00
|
|
|
}
|
|
|
|
|
2012-03-12 23:03:00 +08:00
|
|
|
error:
|
2012-09-20 15:51:59 +08:00
|
|
|
err = btrfs_end_transaction(trans, root);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (!ret)
|
|
|
|
ret = err;
|
|
|
|
|
2012-06-01 03:58:55 +08:00
|
|
|
if (ret && cur_offset < end)
|
2013-07-29 23:20:47 +08:00
|
|
|
extent_clear_unlock_delalloc(inode, cur_offset, end,
|
|
|
|
locked_page, EXTENT_LOCKED |
|
2013-07-30 01:22:24 +08:00
|
|
|
EXTENT_DELALLOC | EXTENT_DEFRAG |
|
|
|
|
EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
|
|
|
|
PAGE_CLEAR_DIRTY |
|
2013-07-29 23:20:47 +08:00
|
|
|
PAGE_SET_WRITEBACK |
|
|
|
|
PAGE_END_WRITEBACK);
|
2008-08-06 01:05:02 +08:00
|
|
|
btrfs_free_path(path);
|
2012-03-12 23:03:00 +08:00
|
|
|
return ret;
|
2007-12-18 09:14:01 +08:00
|
|
|
}
|
|
|
|
|
2014-07-03 18:22:07 +08:00
|
|
|
static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
|
|
|
|
!(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* @defrag_bytes is a hint value, no spinlock held here,
|
|
|
|
* if is not zero, it means the file is defragging.
|
|
|
|
* Force cow if given extent needs to be defragged.
|
|
|
|
*/
|
|
|
|
if (BTRFS_I(inode)->defrag_bytes &&
|
|
|
|
test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
|
|
|
|
EXTENT_DEFRAG, 0, NULL))
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-09-30 03:18:18 +08:00
|
|
|
/*
|
|
|
|
* extent_io.c call back to do delayed allocation processing
|
|
|
|
*/
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
static int run_delalloc_range(struct inode *inode, struct page *locked_page,
|
2008-11-07 11:02:51 +08:00
|
|
|
u64 start, u64 end, int *page_started,
|
|
|
|
unsigned long *nr_written)
|
2007-12-18 09:14:01 +08:00
|
|
|
{
|
|
|
|
int ret;
|
2014-07-03 18:22:07 +08:00
|
|
|
int force_cow = need_force_cow(inode, start, end);
|
2008-06-26 04:01:30 +08:00
|
|
|
|
2014-07-03 18:22:07 +08:00
|
|
|
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
ret = run_delalloc_nocow(inode, locked_page, start, end,
|
2009-01-06 10:25:51 +08:00
|
|
|
page_started, 1, nr_written);
|
2014-07-03 18:22:07 +08:00
|
|
|
} else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
|
2008-10-31 02:25:28 +08:00
|
|
|
ret = run_delalloc_nocow(inode, locked_page, start, end,
|
2009-01-06 10:25:51 +08:00
|
|
|
page_started, 0, nr_written);
|
2014-07-17 11:44:10 +08:00
|
|
|
} else if (!inode_need_compress(inode)) {
|
2009-03-13 08:12:45 +08:00
|
|
|
ret = cow_file_range(inode, locked_page, start, end,
|
|
|
|
page_started, nr_written, 1);
|
2012-06-09 03:26:47 +08:00
|
|
|
} else {
|
|
|
|
set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
|
|
|
|
&BTRFS_I(inode)->runtime_flags);
|
2008-11-07 11:02:51 +08:00
|
|
|
ret = cow_file_range_async(inode, locked_page, start, end,
|
2009-01-06 10:25:51 +08:00
|
|
|
page_started, nr_written);
|
2012-06-09 03:26:47 +08:00
|
|
|
}
|
2007-08-28 04:49:44 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2011-07-22 00:56:09 +08:00
|
|
|
static void btrfs_split_extent_hook(struct inode *inode,
|
|
|
|
struct extent_state *orig, u64 split)
|
2009-09-12 04:12:44 +08:00
|
|
|
{
|
2015-02-12 04:08:59 +08:00
|
|
|
u64 size;
|
|
|
|
|
2010-05-16 22:48:47 +08:00
|
|
|
/* not delalloc, ignore it */
|
2009-09-12 04:12:44 +08:00
|
|
|
if (!(orig->state & EXTENT_DELALLOC))
|
2011-07-22 00:56:09 +08:00
|
|
|
return;
|
2009-09-12 04:12:44 +08:00
|
|
|
|
2015-02-12 04:08:59 +08:00
|
|
|
size = orig->end - orig->start + 1;
|
|
|
|
if (size > BTRFS_MAX_EXTENT_SIZE) {
|
|
|
|
u64 num_extents;
|
|
|
|
u64 new_size;
|
|
|
|
|
|
|
|
/*
|
2015-03-14 03:01:24 +08:00
|
|
|
* See the explanation in btrfs_merge_extent_hook, the same
|
|
|
|
* applies here, just in reverse.
|
2015-02-12 04:08:59 +08:00
|
|
|
*/
|
|
|
|
new_size = orig->end - split + 1;
|
2015-03-14 03:01:24 +08:00
|
|
|
num_extents = div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
|
2015-02-12 04:08:59 +08:00
|
|
|
BTRFS_MAX_EXTENT_SIZE);
|
2015-03-14 03:01:24 +08:00
|
|
|
new_size = split - orig->start;
|
|
|
|
num_extents += div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
|
|
|
|
BTRFS_MAX_EXTENT_SIZE);
|
|
|
|
if (div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
|
|
|
|
BTRFS_MAX_EXTENT_SIZE) >= num_extents)
|
2015-02-12 04:08:59 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2011-07-15 23:16:44 +08:00
|
|
|
spin_lock(&BTRFS_I(inode)->lock);
|
|
|
|
BTRFS_I(inode)->outstanding_extents++;
|
|
|
|
spin_unlock(&BTRFS_I(inode)->lock);
|
2009-09-12 04:12:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* extent_io.c merge_extent_hook, used to track merged delayed allocation
|
|
|
|
* extents so we can keep track of new extents that are just merged onto old
|
|
|
|
* extents, such as when we are doing sequential writes, so we can properly
|
|
|
|
* account for the metadata space we'll need.
|
|
|
|
*/
|
2011-07-22 00:56:09 +08:00
|
|
|
static void btrfs_merge_extent_hook(struct inode *inode,
|
|
|
|
struct extent_state *new,
|
|
|
|
struct extent_state *other)
|
2009-09-12 04:12:44 +08:00
|
|
|
{
|
2015-02-12 04:08:59 +08:00
|
|
|
u64 new_size, old_size;
|
|
|
|
u64 num_extents;
|
|
|
|
|
2009-09-12 04:12:44 +08:00
|
|
|
/* not delalloc, ignore it */
|
|
|
|
if (!(other->state & EXTENT_DELALLOC))
|
2011-07-22 00:56:09 +08:00
|
|
|
return;
|
2009-09-12 04:12:44 +08:00
|
|
|
|
2015-03-14 03:12:08 +08:00
|
|
|
if (new->start > other->start)
|
|
|
|
new_size = new->end - other->start + 1;
|
|
|
|
else
|
|
|
|
new_size = other->end - new->start + 1;
|
2015-02-12 04:08:59 +08:00
|
|
|
|
|
|
|
/* we're not bigger than the max, unreserve the space and go */
|
|
|
|
if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
|
|
|
|
spin_lock(&BTRFS_I(inode)->lock);
|
|
|
|
BTRFS_I(inode)->outstanding_extents--;
|
|
|
|
spin_unlock(&BTRFS_I(inode)->lock);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2015-03-14 03:01:24 +08:00
|
|
|
* We have to add up either side to figure out how many extents were
|
|
|
|
* accounted for before we merged into one big extent. If the number of
|
|
|
|
* extents we accounted for is <= the amount we need for the new range
|
|
|
|
* then we can return, otherwise drop. Think of it like this
|
|
|
|
*
|
|
|
|
* [ 4k][MAX_SIZE]
|
|
|
|
*
|
|
|
|
* So we've grown the extent by a MAX_SIZE extent, this would mean we
|
|
|
|
* need 2 outstanding extents, on one side we have 1 and the other side
|
|
|
|
* we have 1 so they are == and we can return. But in this case
|
|
|
|
*
|
|
|
|
* [MAX_SIZE+4k][MAX_SIZE+4k]
|
|
|
|
*
|
|
|
|
* Each range on their own accounts for 2 extents, but merged together
|
|
|
|
* they are only 3 extents worth of accounting, so we need to drop in
|
|
|
|
* this case.
|
2015-02-12 04:08:59 +08:00
|
|
|
*/
|
2015-03-14 03:01:24 +08:00
|
|
|
old_size = other->end - other->start + 1;
|
2015-02-12 04:08:59 +08:00
|
|
|
num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
|
|
|
|
BTRFS_MAX_EXTENT_SIZE);
|
2015-03-14 03:01:24 +08:00
|
|
|
old_size = new->end - new->start + 1;
|
|
|
|
num_extents += div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
|
|
|
|
BTRFS_MAX_EXTENT_SIZE);
|
|
|
|
|
2015-02-12 04:08:59 +08:00
|
|
|
if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
|
2015-03-14 03:01:24 +08:00
|
|
|
BTRFS_MAX_EXTENT_SIZE) >= num_extents)
|
2015-02-12 04:08:59 +08:00
|
|
|
return;
|
|
|
|
|
2011-07-15 23:16:44 +08:00
|
|
|
spin_lock(&BTRFS_I(inode)->lock);
|
|
|
|
BTRFS_I(inode)->outstanding_extents--;
|
|
|
|
spin_unlock(&BTRFS_I(inode)->lock);
|
2009-09-12 04:12:44 +08:00
|
|
|
}
|
|
|
|
|
2013-05-15 15:48:22 +08:00
|
|
|
static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
|
|
|
|
struct inode *inode)
|
|
|
|
{
|
|
|
|
spin_lock(&root->delalloc_lock);
|
|
|
|
if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
|
|
|
|
list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
|
|
|
|
&root->delalloc_inodes);
|
|
|
|
set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
|
|
|
|
&BTRFS_I(inode)->runtime_flags);
|
|
|
|
root->nr_delalloc_inodes++;
|
|
|
|
if (root->nr_delalloc_inodes == 1) {
|
|
|
|
spin_lock(&root->fs_info->delalloc_root_lock);
|
|
|
|
BUG_ON(!list_empty(&root->delalloc_root));
|
|
|
|
list_add_tail(&root->delalloc_root,
|
|
|
|
&root->fs_info->delalloc_roots);
|
|
|
|
spin_unlock(&root->fs_info->delalloc_root_lock);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
spin_unlock(&root->delalloc_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void btrfs_del_delalloc_inode(struct btrfs_root *root,
|
|
|
|
struct inode *inode)
|
|
|
|
{
|
|
|
|
spin_lock(&root->delalloc_lock);
|
|
|
|
if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
|
|
|
|
list_del_init(&BTRFS_I(inode)->delalloc_inodes);
|
|
|
|
clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
|
|
|
|
&BTRFS_I(inode)->runtime_flags);
|
|
|
|
root->nr_delalloc_inodes--;
|
|
|
|
if (!root->nr_delalloc_inodes) {
|
|
|
|
spin_lock(&root->fs_info->delalloc_root_lock);
|
|
|
|
BUG_ON(list_empty(&root->delalloc_root));
|
|
|
|
list_del_init(&root->delalloc_root);
|
|
|
|
spin_unlock(&root->fs_info->delalloc_root_lock);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
spin_unlock(&root->delalloc_lock);
|
|
|
|
}
|
|
|
|
|
2008-09-30 03:18:18 +08:00
|
|
|
/*
|
|
|
|
* extent_io.c set_bit_hook, used to track delayed allocation
|
|
|
|
* bytes in this file, and to maintain the list of inodes that
|
|
|
|
* have pending delalloc work to be done.
|
|
|
|
*/
|
2011-07-22 00:56:09 +08:00
|
|
|
static void btrfs_set_bit_hook(struct inode *inode,
|
2015-01-15 02:52:13 +08:00
|
|
|
struct extent_state *state, unsigned *bits)
|
2008-01-30 04:55:23 +08:00
|
|
|
{
|
2009-09-12 04:12:44 +08:00
|
|
|
|
2014-07-03 18:22:07 +08:00
|
|
|
if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
|
|
|
|
WARN_ON(1);
|
2008-12-16 04:54:40 +08:00
|
|
|
/*
|
|
|
|
* set_bit and clear bit hooks normally require _irqsave/restore
|
2011-05-21 04:20:32 +08:00
|
|
|
* but in this case, we are only testing for the DELALLOC
|
2008-12-16 04:54:40 +08:00
|
|
|
* bit, which is only set or cleared with irqs on
|
|
|
|
*/
|
2010-05-16 22:48:47 +08:00
|
|
|
if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
|
2008-01-30 04:55:23 +08:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2010-05-16 22:48:47 +08:00
|
|
|
u64 len = state->end + 1 - state->start;
|
2012-07-10 19:28:39 +08:00
|
|
|
bool do_list = !btrfs_is_free_space_inode(inode);
|
2009-09-12 04:12:44 +08:00
|
|
|
|
2011-07-15 23:16:44 +08:00
|
|
|
if (*bits & EXTENT_FIRST_DELALLOC) {
|
2010-05-16 22:48:47 +08:00
|
|
|
*bits &= ~EXTENT_FIRST_DELALLOC;
|
2011-07-15 23:16:44 +08:00
|
|
|
} else {
|
|
|
|
spin_lock(&BTRFS_I(inode)->lock);
|
|
|
|
BTRFS_I(inode)->outstanding_extents++;
|
|
|
|
spin_unlock(&BTRFS_I(inode)->lock);
|
|
|
|
}
|
2010-03-20 02:07:23 +08:00
|
|
|
|
2015-03-17 05:38:52 +08:00
|
|
|
/* For sanity tests */
|
|
|
|
if (btrfs_test_is_dummy_root(root))
|
|
|
|
return;
|
|
|
|
|
2013-01-29 18:10:51 +08:00
|
|
|
__percpu_counter_add(&root->fs_info->delalloc_bytes, len,
|
|
|
|
root->fs_info->delalloc_batch);
|
2013-01-29 18:11:59 +08:00
|
|
|
spin_lock(&BTRFS_I(inode)->lock);
|
2010-05-16 22:48:47 +08:00
|
|
|
BTRFS_I(inode)->delalloc_bytes += len;
|
2014-07-03 18:22:07 +08:00
|
|
|
if (*bits & EXTENT_DEFRAG)
|
|
|
|
BTRFS_I(inode)->defrag_bytes += len;
|
2013-01-29 18:11:59 +08:00
|
|
|
if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
|
2013-05-15 15:48:22 +08:00
|
|
|
&BTRFS_I(inode)->runtime_flags))
|
|
|
|
btrfs_add_delalloc_inodes(root, inode);
|
2013-01-29 18:11:59 +08:00
|
|
|
spin_unlock(&BTRFS_I(inode)->lock);
|
2008-01-30 04:55:23 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-09-30 03:18:18 +08:00
|
|
|
/*
|
|
|
|
* extent_io.c clear_bit_hook, see set_bit_hook for why
|
|
|
|
*/
|
2011-07-22 00:56:09 +08:00
|
|
|
static void btrfs_clear_bit_hook(struct inode *inode,
|
2013-04-29 21:38:46 +08:00
|
|
|
struct extent_state *state,
|
2015-01-15 02:52:13 +08:00
|
|
|
unsigned *bits)
|
2008-01-30 04:55:23 +08:00
|
|
|
{
|
2014-07-03 18:22:07 +08:00
|
|
|
u64 len = state->end + 1 - state->start;
|
2015-02-12 04:08:59 +08:00
|
|
|
u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE -1,
|
|
|
|
BTRFS_MAX_EXTENT_SIZE);
|
2014-07-03 18:22:07 +08:00
|
|
|
|
|
|
|
spin_lock(&BTRFS_I(inode)->lock);
|
|
|
|
if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
|
|
|
|
BTRFS_I(inode)->defrag_bytes -= len;
|
|
|
|
spin_unlock(&BTRFS_I(inode)->lock);
|
|
|
|
|
2008-12-16 04:54:40 +08:00
|
|
|
/*
|
|
|
|
* set_bit and clear bit hooks normally require _irqsave/restore
|
2011-05-21 04:20:32 +08:00
|
|
|
* but in this case, we are only testing for the DELALLOC
|
2008-12-16 04:54:40 +08:00
|
|
|
* bit, which is only set or cleared with irqs on
|
|
|
|
*/
|
2010-05-16 22:48:47 +08:00
|
|
|
if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
|
2008-01-30 04:55:23 +08:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2012-07-10 19:28:39 +08:00
|
|
|
bool do_list = !btrfs_is_free_space_inode(inode);
|
2008-04-23 01:26:47 +08:00
|
|
|
|
2011-07-15 23:16:44 +08:00
|
|
|
if (*bits & EXTENT_FIRST_DELALLOC) {
|
2010-05-16 22:48:47 +08:00
|
|
|
*bits &= ~EXTENT_FIRST_DELALLOC;
|
2011-07-15 23:16:44 +08:00
|
|
|
} else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
|
|
|
|
spin_lock(&BTRFS_I(inode)->lock);
|
2015-02-12 04:08:59 +08:00
|
|
|
BTRFS_I(inode)->outstanding_extents -= num_extents;
|
2011-07-15 23:16:44 +08:00
|
|
|
spin_unlock(&BTRFS_I(inode)->lock);
|
|
|
|
}
|
2010-05-16 22:48:47 +08:00
|
|
|
|
2013-09-28 02:57:43 +08:00
|
|
|
/*
|
|
|
|
* We don't reserve metadata space for space cache inodes so we
|
|
|
|
* don't need to call dellalloc_release_metadata if there is an
|
|
|
|
* error.
|
|
|
|
*/
|
|
|
|
if (*bits & EXTENT_DO_ACCOUNTING &&
|
|
|
|
root != root->fs_info->tree_root)
|
2010-05-16 22:48:47 +08:00
|
|
|
btrfs_delalloc_release_metadata(inode, len);
|
|
|
|
|
2015-03-17 05:38:52 +08:00
|
|
|
/* For sanity tests. */
|
|
|
|
if (btrfs_test_is_dummy_root(root))
|
|
|
|
return;
|
|
|
|
|
2010-07-03 00:14:14 +08:00
|
|
|
if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
|
2013-06-22 04:37:03 +08:00
|
|
|
&& do_list && !(state->state & EXTENT_NORESERVE))
|
2015-10-08 18:19:37 +08:00
|
|
|
btrfs_free_reserved_data_space_noquota(inode,
|
|
|
|
state->start, len);
|
2009-09-12 04:12:44 +08:00
|
|
|
|
2013-01-29 18:10:51 +08:00
|
|
|
__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
|
|
|
|
root->fs_info->delalloc_batch);
|
2013-01-29 18:11:59 +08:00
|
|
|
spin_lock(&BTRFS_I(inode)->lock);
|
2010-05-16 22:48:47 +08:00
|
|
|
BTRFS_I(inode)->delalloc_bytes -= len;
|
2010-07-03 00:14:14 +08:00
|
|
|
if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
|
2013-01-29 18:11:59 +08:00
|
|
|
test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
|
2013-05-15 15:48:22 +08:00
|
|
|
&BTRFS_I(inode)->runtime_flags))
|
|
|
|
btrfs_del_delalloc_inode(root, inode);
|
2013-01-29 18:11:59 +08:00
|
|
|
spin_unlock(&BTRFS_I(inode)->lock);
|
2008-01-30 04:55:23 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-09-30 03:18:18 +08:00
|
|
|
/*
|
|
|
|
* extent_io.c merge_bio_hook, this must check the chunk tree to make sure
|
|
|
|
* we don't create bios that span stripes or chunks
|
|
|
|
*/
|
2009-07-16 06:29:37 +08:00
|
|
|
int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
size_t size, struct bio *bio,
|
|
|
|
unsigned long bio_flags)
|
2008-03-25 03:02:07 +08:00
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
|
2013-10-12 06:44:27 +08:00
|
|
|
u64 logical = (u64)bio->bi_iter.bi_sector << 9;
|
2008-03-25 03:02:07 +08:00
|
|
|
u64 length = 0;
|
|
|
|
u64 map_length;
|
|
|
|
int ret;
|
|
|
|
|
2008-11-07 11:02:51 +08:00
|
|
|
if (bio_flags & EXTENT_BIO_COMPRESSED)
|
|
|
|
return 0;
|
|
|
|
|
2013-10-12 06:44:27 +08:00
|
|
|
length = bio->bi_iter.bi_size;
|
2008-03-25 03:02:07 +08:00
|
|
|
map_length = length;
|
2009-07-16 06:29:37 +08:00
|
|
|
ret = btrfs_map_block(root->fs_info, rw, logical,
|
2008-04-10 04:28:12 +08:00
|
|
|
&map_length, NULL, 0);
|
2012-11-05 22:46:42 +08:00
|
|
|
/* Will always return 0 with map_multi == NULL */
|
2011-10-04 11:23:13 +08:00
|
|
|
BUG_ON(ret < 0);
|
2009-01-06 10:25:51 +08:00
|
|
|
if (map_length < length + size)
|
2008-03-25 03:02:07 +08:00
|
|
|
return 1;
|
2011-10-04 11:23:13 +08:00
|
|
|
return 0;
|
2008-03-25 03:02:07 +08:00
|
|
|
}
|
|
|
|
|
2008-09-30 03:18:18 +08:00
|
|
|
/*
|
|
|
|
* in order to insert checksums into the metadata in large chunks,
|
|
|
|
* we wait until bio submission time. All the pages in the bio are
|
|
|
|
* checksummed and sums are attached onto the ordered extent record.
|
|
|
|
*
|
|
|
|
* At IO completion time the cums attached on the ordered extent record
|
|
|
|
* are inserted into the btree
|
|
|
|
*/
|
2009-01-06 10:25:51 +08:00
|
|
|
static int __btrfs_submit_bio_start(struct inode *inode, int rw,
|
|
|
|
struct bio *bio, int mirror_num,
|
2010-05-25 21:48:28 +08:00
|
|
|
unsigned long bio_flags,
|
|
|
|
u64 bio_offset)
|
2008-02-21 01:07:25 +08:00
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
int ret = 0;
|
2008-04-16 23:15:20 +08:00
|
|
|
|
Btrfs: move data checksumming into a dedicated tree
Btrfs stores checksums for each data block. Until now, they have
been stored in the subvolume trees, indexed by the inode that is
referencing the data block. This means that when we read the inode,
we've probably read in at least some checksums as well.
But, this has a few problems:
* The checksums are indexed by logical offset in the file. When
compression is on, this means we have to do the expensive checksumming
on the uncompressed data. It would be faster if we could checksum
the compressed data instead.
* If we implement encryption, we'll be checksumming the plain text and
storing that on disk. This is significantly less secure.
* For either compression or encryption, we have to get the plain text
back before we can verify the checksum as correct. This makes the raid
layer balancing and extent moving much more expensive.
* It makes the front end caching code more complex, as we have touch
the subvolume and inodes as we cache extents.
* There is potentitally one copy of the checksum in each subvolume
referencing an extent.
The solution used here is to store the extent checksums in a dedicated
tree. This allows us to index the checksums by phyiscal extent
start and length. It means:
* The checksum is against the data stored on disk, after any compression
or encryption is done.
* The checksum is stored in a central location, and can be verified without
following back references, or reading inodes.
This makes compression significantly faster by reducing the amount of
data that needs to be checksummed. It will also allow much faster
raid management code in general.
The checksums are indexed by a key with a fixed objectid (a magic value
in ctree.h) and offset set to the starting byte of the extent. This
allows us to copy the checksum items into the fsync log tree directly (or
any other tree), without having to invent a second format for them.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-12-09 05:58:54 +08:00
|
|
|
ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
|
2012-03-12 23:03:00 +08:00
|
|
|
BUG_ON(ret); /* -ENOMEM */
|
Btrfs: Add ordered async work queues
Btrfs uses kernel threads to create async work queues for cpu intensive
operations such as checksumming and decompression. These work well,
but they make it difficult to keep IO order intact.
A single writepages call from pdflush or fsync will turn into a number
of bios, and each bio is checksummed in parallel. Once the checksum is
computed, the bio is sent down to the disk, and since we don't control
the order in which the parallel operations happen, they might go down to
the disk in almost any order.
The code deals with this somewhat by having deep work queues for a single
kernel thread, making it very likely that a single thread will process all
the bios for a single inode.
This patch introduces an explicitly ordered work queue. As work structs
are placed into the queue they are put onto the tail of a list. They have
three callbacks:
->func (cpu intensive processing here)
->ordered_func (order sensitive processing here)
->ordered_free (free the work struct, all processing is done)
The work struct has three callbacks. The func callback does the cpu intensive
work, and when it completes the work struct is marked as done.
Every time a work struct completes, the list is checked to see if the head
is marked as done. If so the ordered_func callback is used to do the
order sensitive processing and the ordered_free callback is used to do
any cleanup. Then we loop back and check the head of the list again.
This patch also changes the checksumming code to use the ordered workqueues.
One a 4 drive array, it increases streaming writes from 280MB/s to 350MB/s.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-11-07 11:03:00 +08:00
|
|
|
return 0;
|
|
|
|
}
|
2008-04-16 23:15:20 +08:00
|
|
|
|
Btrfs: Add ordered async work queues
Btrfs uses kernel threads to create async work queues for cpu intensive
operations such as checksumming and decompression. These work well,
but they make it difficult to keep IO order intact.
A single writepages call from pdflush or fsync will turn into a number
of bios, and each bio is checksummed in parallel. Once the checksum is
computed, the bio is sent down to the disk, and since we don't control
the order in which the parallel operations happen, they might go down to
the disk in almost any order.
The code deals with this somewhat by having deep work queues for a single
kernel thread, making it very likely that a single thread will process all
the bios for a single inode.
This patch introduces an explicitly ordered work queue. As work structs
are placed into the queue they are put onto the tail of a list. They have
three callbacks:
->func (cpu intensive processing here)
->ordered_func (order sensitive processing here)
->ordered_free (free the work struct, all processing is done)
The work struct has three callbacks. The func callback does the cpu intensive
work, and when it completes the work struct is marked as done.
Every time a work struct completes, the list is checked to see if the head
is marked as done. If so the ordered_func callback is used to do the
order sensitive processing and the ordered_free callback is used to do
any cleanup. Then we loop back and check the head of the list again.
This patch also changes the checksumming code to use the ordered workqueues.
One a 4 drive array, it increases streaming writes from 280MB/s to 350MB/s.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-11-07 11:03:00 +08:00
|
|
|
/*
|
|
|
|
* in order to insert checksums into the metadata in large chunks,
|
|
|
|
* we wait until bio submission time. All the pages in the bio are
|
|
|
|
* checksummed and sums are attached onto the ordered extent record.
|
|
|
|
*
|
|
|
|
* At IO completion time the cums attached on the ordered extent record
|
|
|
|
* are inserted into the btree
|
|
|
|
*/
|
2008-12-02 22:54:17 +08:00
|
|
|
static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
|
2010-05-25 21:48:28 +08:00
|
|
|
int mirror_num, unsigned long bio_flags,
|
|
|
|
u64 bio_offset)
|
Btrfs: Add ordered async work queues
Btrfs uses kernel threads to create async work queues for cpu intensive
operations such as checksumming and decompression. These work well,
but they make it difficult to keep IO order intact.
A single writepages call from pdflush or fsync will turn into a number
of bios, and each bio is checksummed in parallel. Once the checksum is
computed, the bio is sent down to the disk, and since we don't control
the order in which the parallel operations happen, they might go down to
the disk in almost any order.
The code deals with this somewhat by having deep work queues for a single
kernel thread, making it very likely that a single thread will process all
the bios for a single inode.
This patch introduces an explicitly ordered work queue. As work structs
are placed into the queue they are put onto the tail of a list. They have
three callbacks:
->func (cpu intensive processing here)
->ordered_func (order sensitive processing here)
->ordered_free (free the work struct, all processing is done)
The work struct has three callbacks. The func callback does the cpu intensive
work, and when it completes the work struct is marked as done.
Every time a work struct completes, the list is checked to see if the head
is marked as done. If so the ordered_func callback is used to do the
order sensitive processing and the ordered_free callback is used to do
any cleanup. Then we loop back and check the head of the list again.
This patch also changes the checksumming code to use the ordered workqueues.
One a 4 drive array, it increases streaming writes from 280MB/s to 350MB/s.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-11-07 11:03:00 +08:00
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2012-11-06 01:51:52 +08:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
|
2015-07-20 21:29:37 +08:00
|
|
|
if (ret) {
|
|
|
|
bio->bi_error = ret;
|
|
|
|
bio_endio(bio);
|
|
|
|
}
|
2012-11-06 01:51:52 +08:00
|
|
|
return ret;
|
2008-04-16 23:14:51 +08:00
|
|
|
}
|
|
|
|
|
2008-09-30 03:18:18 +08:00
|
|
|
/*
|
2008-12-18 03:51:42 +08:00
|
|
|
* extent_io.c submission hook. This does the right thing for csum calculation
|
|
|
|
* on write, or reading the csums from the tree before a read
|
2008-09-30 03:18:18 +08:00
|
|
|
*/
|
2008-12-02 22:54:17 +08:00
|
|
|
static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
|
2010-05-25 21:48:28 +08:00
|
|
|
int mirror_num, unsigned long bio_flags,
|
|
|
|
u64 bio_offset)
|
2008-04-16 23:14:51 +08:00
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2015-07-27 17:56:43 +08:00
|
|
|
enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
|
2008-04-16 23:14:51 +08:00
|
|
|
int ret = 0;
|
2008-10-31 02:23:13 +08:00
|
|
|
int skip_sum;
|
2012-11-17 02:56:32 +08:00
|
|
|
int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
|
2008-04-16 23:14:51 +08:00
|
|
|
|
2009-04-17 16:37:41 +08:00
|
|
|
skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
|
2008-12-18 03:51:42 +08:00
|
|
|
|
2012-07-10 19:28:39 +08:00
|
|
|
if (btrfs_is_free_space_inode(inode))
|
2015-07-27 17:56:43 +08:00
|
|
|
metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
|
2011-10-04 11:23:12 +08:00
|
|
|
|
2010-08-08 00:20:39 +08:00
|
|
|
if (!(rw & REQ_WRITE)) {
|
2012-05-03 02:00:54 +08:00
|
|
|
ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
|
|
|
|
if (ret)
|
2012-11-06 01:51:52 +08:00
|
|
|
goto out;
|
2012-05-03 02:00:54 +08:00
|
|
|
|
Btrfs: move data checksumming into a dedicated tree
Btrfs stores checksums for each data block. Until now, they have
been stored in the subvolume trees, indexed by the inode that is
referencing the data block. This means that when we read the inode,
we've probably read in at least some checksums as well.
But, this has a few problems:
* The checksums are indexed by logical offset in the file. When
compression is on, this means we have to do the expensive checksumming
on the uncompressed data. It would be faster if we could checksum
the compressed data instead.
* If we implement encryption, we'll be checksumming the plain text and
storing that on disk. This is significantly less secure.
* For either compression or encryption, we have to get the plain text
back before we can verify the checksum as correct. This makes the raid
layer balancing and extent moving much more expensive.
* It makes the front end caching code more complex, as we have touch
the subvolume and inodes as we cache extents.
* There is potentitally one copy of the checksum in each subvolume
referencing an extent.
The solution used here is to store the extent checksums in a dedicated
tree. This allows us to index the checksums by phyiscal extent
start and length. It means:
* The checksum is against the data stored on disk, after any compression
or encryption is done.
* The checksum is stored in a central location, and can be verified without
following back references, or reading inodes.
This makes compression significantly faster by reducing the amount of
data that needs to be checksummed. It will also allow much faster
raid management code in general.
The checksums are indexed by a key with a fixed objectid (a magic value
in ctree.h) and offset set to the starting byte of the extent. This
allows us to copy the checksum items into the fsync log tree directly (or
any other tree), without having to invent a second format for them.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-12-09 05:58:54 +08:00
|
|
|
if (bio_flags & EXTENT_BIO_COMPRESSED) {
|
2012-11-06 01:51:52 +08:00
|
|
|
ret = btrfs_submit_compressed_read(inode, bio,
|
|
|
|
mirror_num,
|
|
|
|
bio_flags);
|
|
|
|
goto out;
|
2011-03-01 14:48:31 +08:00
|
|
|
} else if (!skip_sum) {
|
|
|
|
ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
|
|
|
|
if (ret)
|
2012-11-06 01:51:52 +08:00
|
|
|
goto out;
|
2011-03-01 14:48:31 +08:00
|
|
|
}
|
2008-08-20 21:44:52 +08:00
|
|
|
goto mapit;
|
2012-11-17 02:56:32 +08:00
|
|
|
} else if (async && !skip_sum) {
|
2008-12-12 23:03:38 +08:00
|
|
|
/* csum items have already been cloned */
|
|
|
|
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
|
|
|
|
goto mapit;
|
2008-10-31 02:23:13 +08:00
|
|
|
/* we're doing a write, do the async checksumming */
|
2012-11-06 01:51:52 +08:00
|
|
|
ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
|
2008-04-16 23:14:51 +08:00
|
|
|
inode, rw, bio, mirror_num,
|
2010-05-25 21:48:28 +08:00
|
|
|
bio_flags, bio_offset,
|
|
|
|
__btrfs_submit_bio_start,
|
Btrfs: Add ordered async work queues
Btrfs uses kernel threads to create async work queues for cpu intensive
operations such as checksumming and decompression. These work well,
but they make it difficult to keep IO order intact.
A single writepages call from pdflush or fsync will turn into a number
of bios, and each bio is checksummed in parallel. Once the checksum is
computed, the bio is sent down to the disk, and since we don't control
the order in which the parallel operations happen, they might go down to
the disk in almost any order.
The code deals with this somewhat by having deep work queues for a single
kernel thread, making it very likely that a single thread will process all
the bios for a single inode.
This patch introduces an explicitly ordered work queue. As work structs
are placed into the queue they are put onto the tail of a list. They have
three callbacks:
->func (cpu intensive processing here)
->ordered_func (order sensitive processing here)
->ordered_free (free the work struct, all processing is done)
The work struct has three callbacks. The func callback does the cpu intensive
work, and when it completes the work struct is marked as done.
Every time a work struct completes, the list is checked to see if the head
is marked as done. If so the ordered_func callback is used to do the
order sensitive processing and the ordered_free callback is used to do
any cleanup. Then we loop back and check the head of the list again.
This patch also changes the checksumming code to use the ordered workqueues.
One a 4 drive array, it increases streaming writes from 280MB/s to 350MB/s.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-11-07 11:03:00 +08:00
|
|
|
__btrfs_submit_bio_done);
|
2012-11-06 01:51:52 +08:00
|
|
|
goto out;
|
2012-11-17 02:56:32 +08:00
|
|
|
} else if (!skip_sum) {
|
|
|
|
ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
2008-10-31 02:23:13 +08:00
|
|
|
}
|
|
|
|
|
2008-03-25 03:01:56 +08:00
|
|
|
mapit:
|
2012-11-06 01:51:52 +08:00
|
|
|
ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
|
|
|
|
|
|
|
|
out:
|
2015-07-20 21:29:37 +08:00
|
|
|
if (ret < 0) {
|
|
|
|
bio->bi_error = ret;
|
|
|
|
bio_endio(bio);
|
|
|
|
}
|
2012-11-06 01:51:52 +08:00
|
|
|
return ret;
|
2008-02-21 01:07:25 +08:00
|
|
|
}
|
2008-02-21 05:11:05 +08:00
|
|
|
|
2008-09-30 03:18:18 +08:00
|
|
|
/*
|
|
|
|
* given a list of ordered sums record them in the inode. This happens
|
|
|
|
* at IO completion time based on sums calculated at bio submission time.
|
|
|
|
*/
|
2008-07-18 00:54:15 +08:00
|
|
|
static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
|
2008-07-18 00:53:50 +08:00
|
|
|
struct inode *inode, u64 file_offset,
|
|
|
|
struct list_head *list)
|
|
|
|
{
|
|
|
|
struct btrfs_ordered_sum *sum;
|
|
|
|
|
2009-01-21 23:59:08 +08:00
|
|
|
list_for_each_entry(sum, list, list) {
|
2013-03-28 16:08:20 +08:00
|
|
|
trans->adding_csums = 1;
|
Btrfs: move data checksumming into a dedicated tree
Btrfs stores checksums for each data block. Until now, they have
been stored in the subvolume trees, indexed by the inode that is
referencing the data block. This means that when we read the inode,
we've probably read in at least some checksums as well.
But, this has a few problems:
* The checksums are indexed by logical offset in the file. When
compression is on, this means we have to do the expensive checksumming
on the uncompressed data. It would be faster if we could checksum
the compressed data instead.
* If we implement encryption, we'll be checksumming the plain text and
storing that on disk. This is significantly less secure.
* For either compression or encryption, we have to get the plain text
back before we can verify the checksum as correct. This makes the raid
layer balancing and extent moving much more expensive.
* It makes the front end caching code more complex, as we have touch
the subvolume and inodes as we cache extents.
* There is potentitally one copy of the checksum in each subvolume
referencing an extent.
The solution used here is to store the extent checksums in a dedicated
tree. This allows us to index the checksums by phyiscal extent
start and length. It means:
* The checksum is against the data stored on disk, after any compression
or encryption is done.
* The checksum is stored in a central location, and can be verified without
following back references, or reading inodes.
This makes compression significantly faster by reducing the amount of
data that needs to be checksummed. It will also allow much faster
raid management code in general.
The checksums are indexed by a key with a fixed objectid (a magic value
in ctree.h) and offset set to the starting byte of the extent. This
allows us to copy the checksum items into the fsync log tree directly (or
any other tree), without having to invent a second format for them.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-12-09 05:58:54 +08:00
|
|
|
btrfs_csum_file_blocks(trans,
|
|
|
|
BTRFS_I(inode)->root->fs_info->csum_root, sum);
|
2013-03-28 16:08:20 +08:00
|
|
|
trans->adding_csums = 0;
|
2008-07-18 00:53:50 +08:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2010-02-04 03:33:23 +08:00
|
|
|
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
|
|
|
|
struct extent_state **cached_state)
|
2008-08-05 11:17:27 +08:00
|
|
|
{
|
2012-11-04 04:30:18 +08:00
|
|
|
WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
|
2008-08-05 11:17:27 +08:00
|
|
|
return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
|
2010-02-04 03:33:23 +08:00
|
|
|
cached_state, GFP_NOFS);
|
2008-08-05 11:17:27 +08:00
|
|
|
}
|
|
|
|
|
2008-09-30 03:18:18 +08:00
|
|
|
/* see btrfs_writepage_start_hook for details on why this is required */
|
2008-07-18 00:53:51 +08:00
|
|
|
struct btrfs_writepage_fixup {
|
|
|
|
struct page *page;
|
|
|
|
struct btrfs_work work;
|
|
|
|
};
|
|
|
|
|
2008-12-02 22:54:17 +08:00
|
|
|
static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
|
2008-07-18 00:53:51 +08:00
|
|
|
{
|
|
|
|
struct btrfs_writepage_fixup *fixup;
|
|
|
|
struct btrfs_ordered_extent *ordered;
|
2010-02-04 03:33:23 +08:00
|
|
|
struct extent_state *cached_state = NULL;
|
2008-07-18 00:53:51 +08:00
|
|
|
struct page *page;
|
|
|
|
struct inode *inode;
|
|
|
|
u64 page_start;
|
|
|
|
u64 page_end;
|
2012-02-15 23:23:57 +08:00
|
|
|
int ret;
|
2008-07-18 00:53:51 +08:00
|
|
|
|
|
|
|
fixup = container_of(work, struct btrfs_writepage_fixup, work);
|
|
|
|
page = fixup->page;
|
2008-07-21 22:29:44 +08:00
|
|
|
again:
|
2008-07-18 00:53:51 +08:00
|
|
|
lock_page(page);
|
|
|
|
if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
|
|
|
|
ClearPageChecked(page);
|
|
|
|
goto out_page;
|
|
|
|
}
|
|
|
|
|
|
|
|
inode = page->mapping->host;
|
|
|
|
page_start = page_offset(page);
|
|
|
|
page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
|
|
|
|
|
2015-12-03 21:30:40 +08:00
|
|
|
lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
|
2012-03-01 21:57:19 +08:00
|
|
|
&cached_state);
|
2008-07-21 22:29:44 +08:00
|
|
|
|
|
|
|
/* already ordered? We're done */
|
2009-09-03 04:53:46 +08:00
|
|
|
if (PagePrivate2(page))
|
2008-07-18 00:53:51 +08:00
|
|
|
goto out;
|
2008-07-21 22:29:44 +08:00
|
|
|
|
|
|
|
ordered = btrfs_lookup_ordered_extent(inode, page_start);
|
|
|
|
if (ordered) {
|
2010-02-04 03:33:23 +08:00
|
|
|
unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
|
|
|
|
page_end, &cached_state, GFP_NOFS);
|
2008-07-21 22:29:44 +08:00
|
|
|
unlock_page(page);
|
|
|
|
btrfs_start_ordered_extent(inode, ordered, 1);
|
2012-02-15 23:23:57 +08:00
|
|
|
btrfs_put_ordered_extent(ordered);
|
2008-07-21 22:29:44 +08:00
|
|
|
goto again;
|
|
|
|
}
|
2008-07-18 00:53:51 +08:00
|
|
|
|
2015-09-08 17:25:55 +08:00
|
|
|
ret = btrfs_delalloc_reserve_space(inode, page_start,
|
|
|
|
PAGE_CACHE_SIZE);
|
2012-02-15 23:23:57 +08:00
|
|
|
if (ret) {
|
|
|
|
mapping_set_error(page->mapping, ret);
|
|
|
|
end_extent_writepage(page, ret, page_start, page_end);
|
|
|
|
ClearPageChecked(page);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2010-02-04 03:33:23 +08:00
|
|
|
btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
|
2008-07-18 00:53:51 +08:00
|
|
|
ClearPageChecked(page);
|
2012-02-15 23:23:57 +08:00
|
|
|
set_page_dirty(page);
|
2008-07-18 00:53:51 +08:00
|
|
|
out:
|
2010-02-04 03:33:23 +08:00
|
|
|
unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
|
|
|
|
&cached_state, GFP_NOFS);
|
2008-07-18 00:53:51 +08:00
|
|
|
out_page:
|
|
|
|
unlock_page(page);
|
|
|
|
page_cache_release(page);
|
2011-01-26 16:19:22 +08:00
|
|
|
kfree(fixup);
|
2008-07-18 00:53:51 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* There are a few paths in the higher layers of the kernel that directly
|
|
|
|
* set the page dirty bit without asking the filesystem if it is a
|
|
|
|
* good idea. This causes problems because we want to make sure COW
|
|
|
|
* properly happens and the data=ordered rules are followed.
|
|
|
|
*
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
* In our case any range that doesn't have the ORDERED bit set
|
2008-07-18 00:53:51 +08:00
|
|
|
* hasn't been properly setup for IO. We kick off an async process
|
|
|
|
* to fix it up. The async helper will wait for ordered extents, set
|
|
|
|
* the delalloc bit and make it safe to write the page.
|
|
|
|
*/
|
2008-12-02 22:54:17 +08:00
|
|
|
static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
|
2008-07-18 00:53:51 +08:00
|
|
|
{
|
|
|
|
struct inode *inode = page->mapping->host;
|
|
|
|
struct btrfs_writepage_fixup *fixup;
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
|
2009-09-03 04:53:46 +08:00
|
|
|
/* this page is properly in the ordered list */
|
|
|
|
if (TestClearPagePrivate2(page))
|
2008-07-18 00:53:51 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (PageChecked(page))
|
|
|
|
return -EAGAIN;
|
|
|
|
|
|
|
|
fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
|
|
|
|
if (!fixup)
|
|
|
|
return -EAGAIN;
|
2008-07-22 23:18:09 +08:00
|
|
|
|
2008-07-18 00:53:51 +08:00
|
|
|
SetPageChecked(page);
|
|
|
|
page_cache_get(page);
|
Btrfs: fix task hang under heavy compressed write
This has been reported and discussed for a long time, and this hang occurs in
both 3.15 and 3.16.
Btrfs now migrates to use kernel workqueue, but it introduces this hang problem.
Btrfs has a kind of work queued as an ordered way, which means that its
ordered_func() must be processed in the way of FIFO, so it usually looks like --
normal_work_helper(arg)
work = container_of(arg, struct btrfs_work, normal_work);
work->func() <---- (we name it work X)
for ordered_work in wq->ordered_list
ordered_work->ordered_func()
ordered_work->ordered_free()
The hang is a rare case, first when we find free space, we get an uncached block
group, then we go to read its free space cache inode for free space information,
so it will
file a readahead request
btrfs_readpages()
for page that is not in page cache
__do_readpage()
submit_extent_page()
btrfs_submit_bio_hook()
btrfs_bio_wq_end_io()
submit_bio()
end_workqueue_bio() <--(ret by the 1st endio)
queue a work(named work Y) for the 2nd
also the real endio()
So the hang occurs when work Y's work_struct and work X's work_struct happens
to share the same address.
A bit more explanation,
A,B,C -- struct btrfs_work
arg -- struct work_struct
kthread:
worker_thread()
pick up a work_struct from @worklist
process_one_work(arg)
worker->current_work = arg; <-- arg is A->normal_work
worker->current_func(arg)
normal_work_helper(arg)
A = container_of(arg, struct btrfs_work, normal_work);
A->func()
A->ordered_func()
A->ordered_free() <-- A gets freed
B->ordered_func()
submit_compressed_extents()
find_free_extent()
load_free_space_inode()
... <-- (the above readhead stack)
end_workqueue_bio()
btrfs_queue_work(work C)
B->ordered_free()
As if work A has a high priority in wq->ordered_list and there are more ordered
works queued after it, such as B->ordered_func(), its memory could have been
freed before normal_work_helper() returns, which means that kernel workqueue
code worker_thread() still has worker->current_work pointer to be work
A->normal_work's, ie. arg's address.
Meanwhile, work C is allocated after work A is freed, work C->normal_work
and work A->normal_work are likely to share the same address(I confirmed this
with ftrace output, so I'm not just guessing, it's rare though).
When another kthread picks up work C->normal_work to process, and finds our
kthread is processing it(see find_worker_executing_work()), it'll think
work C as a collision and skip then, which ends up nobody processing work C.
So the situation is that our kthread is waiting forever on work C.
Besides, there're other cases that can lead to deadlock, but the real problem
is that all btrfs workqueue shares one work->func, -- normal_work_helper,
so this makes each workqueue to have its own helper function, but only a
wraper pf normal_work_helper.
With this patch, I no long hit the above hang.
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-15 23:36:53 +08:00
|
|
|
btrfs_init_work(&fixup->work, btrfs_fixup_helper,
|
|
|
|
btrfs_writepage_fixup_worker, NULL, NULL);
|
2008-07-18 00:53:51 +08:00
|
|
|
fixup->page = page;
|
2014-02-28 10:46:14 +08:00
|
|
|
btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
|
2012-02-15 23:23:57 +08:00
|
|
|
return -EBUSY;
|
2008-07-18 00:53:51 +08:00
|
|
|
}
|
|
|
|
|
2008-10-31 02:25:28 +08:00
|
|
|
static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
|
|
|
|
struct inode *inode, u64 file_pos,
|
|
|
|
u64 disk_bytenr, u64 disk_num_bytes,
|
|
|
|
u64 num_bytes, u64 ram_bytes,
|
|
|
|
u8 compression, u8 encryption,
|
|
|
|
u16 other_encoding, int extent_type)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_file_extent_item *fi;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct btrfs_key ins;
|
2014-01-07 19:42:27 +08:00
|
|
|
int extent_inserted = 0;
|
2008-10-31 02:25:28 +08:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-14 01:38:47 +08:00
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
2008-10-31 02:25:28 +08:00
|
|
|
|
2009-09-12 00:27:37 +08:00
|
|
|
/*
|
|
|
|
* we may be replacing one extent in the tree with another.
|
|
|
|
* The new extent is pinned in the extent map, and we don't want
|
|
|
|
* to drop it from the cache until it is completely in the btree.
|
|
|
|
*
|
|
|
|
* So, tell btrfs_drop_extents to leave this extent in the cache.
|
|
|
|
* the caller is expected to unpin it and allow it to be merged
|
|
|
|
* with the others.
|
|
|
|
*/
|
2014-01-07 19:42:27 +08:00
|
|
|
ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
|
|
|
|
file_pos + num_bytes, NULL, 0,
|
|
|
|
1, sizeof(*fi), &extent_inserted);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
2008-10-31 02:25:28 +08:00
|
|
|
|
2014-01-07 19:42:27 +08:00
|
|
|
if (!extent_inserted) {
|
|
|
|
ins.objectid = btrfs_ino(inode);
|
|
|
|
ins.offset = file_pos;
|
|
|
|
ins.type = BTRFS_EXTENT_DATA_KEY;
|
|
|
|
|
|
|
|
path->leave_spinning = 1;
|
|
|
|
ret = btrfs_insert_empty_item(trans, root, path, &ins,
|
|
|
|
sizeof(*fi));
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
}
|
2008-10-31 02:25:28 +08:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
fi = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
|
|
|
|
btrfs_set_file_extent_type(leaf, fi, extent_type);
|
|
|
|
btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
|
|
|
|
btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
|
|
|
|
btrfs_set_file_extent_offset(leaf, fi, 0);
|
|
|
|
btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
|
|
|
|
btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
|
|
|
|
btrfs_set_file_extent_compression(leaf, fi, compression);
|
|
|
|
btrfs_set_file_extent_encryption(leaf, fi, encryption);
|
|
|
|
btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
|
2009-03-13 23:00:37 +08:00
|
|
|
|
2008-10-31 02:25:28 +08:00
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
2012-09-26 03:26:16 +08:00
|
|
|
btrfs_release_path(path);
|
2008-10-31 02:25:28 +08:00
|
|
|
|
|
|
|
inode_add_bytes(inode, num_bytes);
|
|
|
|
|
|
|
|
ins.objectid = disk_bytenr;
|
|
|
|
ins.offset = disk_num_bytes;
|
|
|
|
ins.type = BTRFS_EXTENT_ITEM_KEY;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
ret = btrfs_alloc_reserved_file_extent(trans, root,
|
|
|
|
root->root_key.objectid,
|
2015-10-26 14:11:18 +08:00
|
|
|
btrfs_ino(inode), file_pos,
|
|
|
|
ram_bytes, &ins);
|
2015-09-08 17:08:37 +08:00
|
|
|
/*
|
2015-10-26 14:11:18 +08:00
|
|
|
* Release the reserved range from inode dirty range map, as it is
|
|
|
|
* already moved into delayed_ref_head
|
2015-09-08 17:08:37 +08:00
|
|
|
*/
|
|
|
|
btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
|
2012-03-12 23:03:00 +08:00
|
|
|
out:
|
2008-10-31 02:25:28 +08:00
|
|
|
btrfs_free_path(path);
|
2009-03-13 23:00:37 +08:00
|
|
|
|
2012-03-12 23:03:00 +08:00
|
|
|
return ret;
|
2008-10-31 02:25:28 +08:00
|
|
|
}
|
|
|
|
|
2013-01-29 11:18:40 +08:00
|
|
|
/* snapshot-aware defrag */
|
|
|
|
struct sa_defrag_extent_backref {
|
|
|
|
struct rb_node node;
|
|
|
|
struct old_sa_defrag_extent *old;
|
|
|
|
u64 root_id;
|
|
|
|
u64 inum;
|
|
|
|
u64 file_pos;
|
|
|
|
u64 extent_offset;
|
|
|
|
u64 num_bytes;
|
|
|
|
u64 generation;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct old_sa_defrag_extent {
|
|
|
|
struct list_head list;
|
|
|
|
struct new_sa_defrag_extent *new;
|
|
|
|
|
|
|
|
u64 extent_offset;
|
|
|
|
u64 bytenr;
|
|
|
|
u64 offset;
|
|
|
|
u64 len;
|
|
|
|
int count;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct new_sa_defrag_extent {
|
|
|
|
struct rb_root root;
|
|
|
|
struct list_head head;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct inode *inode;
|
|
|
|
u64 file_pos;
|
|
|
|
u64 len;
|
|
|
|
u64 bytenr;
|
|
|
|
u64 disk_len;
|
|
|
|
u8 compress_type;
|
|
|
|
};
|
|
|
|
|
|
|
|
static int backref_comp(struct sa_defrag_extent_backref *b1,
|
|
|
|
struct sa_defrag_extent_backref *b2)
|
|
|
|
{
|
|
|
|
if (b1->root_id < b2->root_id)
|
|
|
|
return -1;
|
|
|
|
else if (b1->root_id > b2->root_id)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
if (b1->inum < b2->inum)
|
|
|
|
return -1;
|
|
|
|
else if (b1->inum > b2->inum)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
if (b1->file_pos < b2->file_pos)
|
|
|
|
return -1;
|
|
|
|
else if (b1->file_pos > b2->file_pos)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* [------------------------------] ===> (a range of space)
|
|
|
|
* |<--->| |<---->| =============> (fs/file tree A)
|
|
|
|
* |<---------------------------->| ===> (fs/file tree B)
|
|
|
|
*
|
|
|
|
* A range of space can refer to two file extents in one tree while
|
|
|
|
* refer to only one file extent in another tree.
|
|
|
|
*
|
|
|
|
* So we may process a disk offset more than one time(two extents in A)
|
|
|
|
* and locate at the same extent(one extent in B), then insert two same
|
|
|
|
* backrefs(both refer to the extent in B).
|
|
|
|
*/
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void backref_insert(struct rb_root *root,
|
|
|
|
struct sa_defrag_extent_backref *backref)
|
|
|
|
{
|
|
|
|
struct rb_node **p = &root->rb_node;
|
|
|
|
struct rb_node *parent = NULL;
|
|
|
|
struct sa_defrag_extent_backref *entry;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
while (*p) {
|
|
|
|
parent = *p;
|
|
|
|
entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
|
|
|
|
|
|
|
|
ret = backref_comp(backref, entry);
|
|
|
|
if (ret < 0)
|
|
|
|
p = &(*p)->rb_left;
|
|
|
|
else
|
|
|
|
p = &(*p)->rb_right;
|
|
|
|
}
|
|
|
|
|
|
|
|
rb_link_node(&backref->node, parent, p);
|
|
|
|
rb_insert_color(&backref->node, root);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Note the backref might has changed, and in this case we just return 0.
|
|
|
|
*/
|
|
|
|
static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
|
|
|
|
void *ctx)
|
|
|
|
{
|
|
|
|
struct btrfs_file_extent_item *extent;
|
|
|
|
struct btrfs_fs_info *fs_info;
|
|
|
|
struct old_sa_defrag_extent *old = ctx;
|
|
|
|
struct new_sa_defrag_extent *new = old->new;
|
|
|
|
struct btrfs_path *path = new->path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_root *root;
|
|
|
|
struct sa_defrag_extent_backref *backref;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct inode *inode = new->inode;
|
|
|
|
int slot;
|
|
|
|
int ret;
|
|
|
|
u64 extent_offset;
|
|
|
|
u64 num_bytes;
|
|
|
|
|
|
|
|
if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
|
|
|
|
inum == btrfs_ino(inode))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
key.objectid = root_id;
|
|
|
|
key.type = BTRFS_ROOT_ITEM_KEY;
|
|
|
|
key.offset = (u64)-1;
|
|
|
|
|
|
|
|
fs_info = BTRFS_I(inode)->root->fs_info;
|
|
|
|
root = btrfs_read_fs_root_no_name(fs_info, &key);
|
|
|
|
if (IS_ERR(root)) {
|
|
|
|
if (PTR_ERR(root) == -ENOENT)
|
|
|
|
return 0;
|
|
|
|
WARN_ON(1);
|
|
|
|
pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
|
|
|
|
inum, offset, root_id);
|
|
|
|
return PTR_ERR(root);
|
|
|
|
}
|
|
|
|
|
|
|
|
key.objectid = inum;
|
|
|
|
key.type = BTRFS_EXTENT_DATA_KEY;
|
|
|
|
if (offset > (u64)-1 << 32)
|
|
|
|
key.offset = 0;
|
|
|
|
else
|
|
|
|
key.offset = offset;
|
|
|
|
|
|
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
2013-10-31 13:00:08 +08:00
|
|
|
if (WARN_ON(ret < 0))
|
2013-01-29 11:18:40 +08:00
|
|
|
return ret;
|
2013-07-23 00:50:37 +08:00
|
|
|
ret = 0;
|
2013-01-29 11:18:40 +08:00
|
|
|
|
|
|
|
while (1) {
|
|
|
|
cond_resched();
|
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
slot = path->slots[0];
|
|
|
|
|
|
|
|
if (slot >= btrfs_header_nritems(leaf)) {
|
|
|
|
ret = btrfs_next_leaf(root, path);
|
|
|
|
if (ret < 0) {
|
|
|
|
goto out;
|
|
|
|
} else if (ret > 0) {
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
path->slots[0]++;
|
|
|
|
|
|
|
|
btrfs_item_key_to_cpu(leaf, &key, slot);
|
|
|
|
|
|
|
|
if (key.objectid > inum)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
extent = btrfs_item_ptr(leaf, slot,
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
|
|
|
|
if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
|
|
|
|
continue;
|
|
|
|
|
2013-07-01 22:13:26 +08:00
|
|
|
/*
|
|
|
|
* 'offset' refers to the exact key.offset,
|
|
|
|
* NOT the 'offset' field in btrfs_extent_data_ref, ie.
|
|
|
|
* (key.offset - extent_offset).
|
|
|
|
*/
|
|
|
|
if (key.offset != offset)
|
2013-01-29 11:18:40 +08:00
|
|
|
continue;
|
|
|
|
|
2013-07-01 22:13:26 +08:00
|
|
|
extent_offset = btrfs_file_extent_offset(leaf, extent);
|
2013-01-29 11:18:40 +08:00
|
|
|
num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
|
2013-07-01 22:13:26 +08:00
|
|
|
|
2013-01-29 11:18:40 +08:00
|
|
|
if (extent_offset >= old->extent_offset + old->offset +
|
|
|
|
old->len || extent_offset + num_bytes <=
|
|
|
|
old->extent_offset + old->offset)
|
|
|
|
continue;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
backref = kmalloc(sizeof(*backref), GFP_NOFS);
|
|
|
|
if (!backref) {
|
|
|
|
ret = -ENOENT;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
backref->root_id = root_id;
|
|
|
|
backref->inum = inum;
|
2013-07-01 22:13:26 +08:00
|
|
|
backref->file_pos = offset;
|
2013-01-29 11:18:40 +08:00
|
|
|
backref->num_bytes = num_bytes;
|
|
|
|
backref->extent_offset = extent_offset;
|
|
|
|
backref->generation = btrfs_file_extent_generation(leaf, extent);
|
|
|
|
backref->old = old;
|
|
|
|
backref_insert(&new->root, backref);
|
|
|
|
old->count++;
|
|
|
|
out:
|
|
|
|
btrfs_release_path(path);
|
|
|
|
WARN_ON(ret);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static noinline bool record_extent_backrefs(struct btrfs_path *path,
|
|
|
|
struct new_sa_defrag_extent *new)
|
|
|
|
{
|
|
|
|
struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
|
|
|
|
struct old_sa_defrag_extent *old, *tmp;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
new->path = path;
|
|
|
|
|
|
|
|
list_for_each_entry_safe(old, tmp, &new->head, list) {
|
2013-07-01 22:13:26 +08:00
|
|
|
ret = iterate_inodes_from_logical(old->bytenr +
|
|
|
|
old->extent_offset, fs_info,
|
2013-01-29 11:18:40 +08:00
|
|
|
path, record_one_backref,
|
|
|
|
old);
|
2013-11-06 00:11:40 +08:00
|
|
|
if (ret < 0 && ret != -ENOENT)
|
|
|
|
return false;
|
2013-01-29 11:18:40 +08:00
|
|
|
|
|
|
|
/* no backref to be processed for this extent */
|
|
|
|
if (!old->count) {
|
|
|
|
list_del(&old->list);
|
|
|
|
kfree(old);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (list_empty(&new->head))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int relink_is_mergable(struct extent_buffer *leaf,
|
|
|
|
struct btrfs_file_extent_item *fi,
|
2013-08-02 16:30:40 +08:00
|
|
|
struct new_sa_defrag_extent *new)
|
2013-01-29 11:18:40 +08:00
|
|
|
{
|
2013-08-02 16:30:40 +08:00
|
|
|
if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
|
2013-01-29 11:18:40 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
|
|
|
|
return 0;
|
|
|
|
|
2013-08-02 16:30:40 +08:00
|
|
|
if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (btrfs_file_extent_encryption(leaf, fi) ||
|
2013-01-29 11:18:40 +08:00
|
|
|
btrfs_file_extent_other_encoding(leaf, fi))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Note the backref might has changed, and in this case we just return 0.
|
|
|
|
*/
|
|
|
|
static noinline int relink_extent_backref(struct btrfs_path *path,
|
|
|
|
struct sa_defrag_extent_backref *prev,
|
|
|
|
struct sa_defrag_extent_backref *backref)
|
|
|
|
{
|
|
|
|
struct btrfs_file_extent_item *extent;
|
|
|
|
struct btrfs_file_extent_item *item;
|
|
|
|
struct btrfs_ordered_extent *ordered;
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct btrfs_fs_info *fs_info;
|
|
|
|
struct btrfs_root *root;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct old_sa_defrag_extent *old = backref->old;
|
|
|
|
struct new_sa_defrag_extent *new = old->new;
|
|
|
|
struct inode *src_inode = new->inode;
|
|
|
|
struct inode *inode;
|
|
|
|
struct extent_state *cached = NULL;
|
|
|
|
int ret = 0;
|
|
|
|
u64 start;
|
|
|
|
u64 len;
|
|
|
|
u64 lock_start;
|
|
|
|
u64 lock_end;
|
|
|
|
bool merge = false;
|
|
|
|
int index;
|
|
|
|
|
|
|
|
if (prev && prev->root_id == backref->root_id &&
|
|
|
|
prev->inum == backref->inum &&
|
|
|
|
prev->file_pos + prev->num_bytes == backref->file_pos)
|
|
|
|
merge = true;
|
|
|
|
|
|
|
|
/* step 1: get root */
|
|
|
|
key.objectid = backref->root_id;
|
|
|
|
key.type = BTRFS_ROOT_ITEM_KEY;
|
|
|
|
key.offset = (u64)-1;
|
|
|
|
|
|
|
|
fs_info = BTRFS_I(src_inode)->root->fs_info;
|
|
|
|
index = srcu_read_lock(&fs_info->subvol_srcu);
|
|
|
|
|
|
|
|
root = btrfs_read_fs_root_no_name(fs_info, &key);
|
|
|
|
if (IS_ERR(root)) {
|
|
|
|
srcu_read_unlock(&fs_info->subvol_srcu, index);
|
|
|
|
if (PTR_ERR(root) == -ENOENT)
|
|
|
|
return 0;
|
|
|
|
return PTR_ERR(root);
|
|
|
|
}
|
|
|
|
|
2014-02-08 23:46:35 +08:00
|
|
|
if (btrfs_root_readonly(root)) {
|
|
|
|
srcu_read_unlock(&fs_info->subvol_srcu, index);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-01-29 11:18:40 +08:00
|
|
|
/* step 2: get inode */
|
|
|
|
key.objectid = backref->inum;
|
|
|
|
key.type = BTRFS_INODE_ITEM_KEY;
|
|
|
|
key.offset = 0;
|
|
|
|
|
|
|
|
inode = btrfs_iget(fs_info->sb, &key, root, NULL);
|
|
|
|
if (IS_ERR(inode)) {
|
|
|
|
srcu_read_unlock(&fs_info->subvol_srcu, index);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
srcu_read_unlock(&fs_info->subvol_srcu, index);
|
|
|
|
|
|
|
|
/* step 3: relink backref */
|
|
|
|
lock_start = backref->file_pos;
|
|
|
|
lock_end = backref->file_pos + backref->num_bytes - 1;
|
|
|
|
lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
|
2015-12-03 21:30:40 +08:00
|
|
|
&cached);
|
2013-01-29 11:18:40 +08:00
|
|
|
|
|
|
|
ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
|
|
|
|
if (ordered) {
|
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
|
|
|
trans = btrfs_join_transaction(root);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
|
|
|
key.objectid = backref->inum;
|
|
|
|
key.type = BTRFS_EXTENT_DATA_KEY;
|
|
|
|
key.offset = backref->file_pos;
|
|
|
|
|
|
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
|
|
|
if (ret < 0) {
|
|
|
|
goto out_free_path;
|
|
|
|
} else if (ret > 0) {
|
|
|
|
ret = 0;
|
|
|
|
goto out_free_path;
|
|
|
|
}
|
|
|
|
|
|
|
|
extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
|
|
|
|
if (btrfs_file_extent_generation(path->nodes[0], extent) !=
|
|
|
|
backref->generation)
|
|
|
|
goto out_free_path;
|
|
|
|
|
|
|
|
btrfs_release_path(path);
|
|
|
|
|
|
|
|
start = backref->file_pos;
|
|
|
|
if (backref->extent_offset < old->extent_offset + old->offset)
|
|
|
|
start += old->extent_offset + old->offset -
|
|
|
|
backref->extent_offset;
|
|
|
|
|
|
|
|
len = min(backref->extent_offset + backref->num_bytes,
|
|
|
|
old->extent_offset + old->offset + old->len);
|
|
|
|
len -= max(backref->extent_offset, old->extent_offset + old->offset);
|
|
|
|
|
|
|
|
ret = btrfs_drop_extents(trans, root, inode, start,
|
|
|
|
start + len, 1);
|
|
|
|
if (ret)
|
|
|
|
goto out_free_path;
|
|
|
|
again:
|
|
|
|
key.objectid = btrfs_ino(inode);
|
|
|
|
key.type = BTRFS_EXTENT_DATA_KEY;
|
|
|
|
key.offset = start;
|
|
|
|
|
2013-03-11 17:20:58 +08:00
|
|
|
path->leave_spinning = 1;
|
2013-01-29 11:18:40 +08:00
|
|
|
if (merge) {
|
|
|
|
struct btrfs_file_extent_item *fi;
|
|
|
|
u64 extent_len;
|
|
|
|
struct btrfs_key found_key;
|
|
|
|
|
2014-01-23 13:41:09 +08:00
|
|
|
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
|
2013-01-29 11:18:40 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out_free_path;
|
|
|
|
|
|
|
|
path->slots[0]--;
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
|
|
|
|
|
|
|
|
fi = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
extent_len = btrfs_file_extent_num_bytes(leaf, fi);
|
|
|
|
|
2013-08-02 16:30:40 +08:00
|
|
|
if (extent_len + found_key.offset == start &&
|
|
|
|
relink_is_mergable(leaf, fi, new)) {
|
2013-01-29 11:18:40 +08:00
|
|
|
btrfs_set_file_extent_num_bytes(leaf, fi,
|
|
|
|
extent_len + len);
|
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
|
|
|
inode_add_bytes(inode, len);
|
|
|
|
|
|
|
|
ret = 1;
|
|
|
|
goto out_free_path;
|
|
|
|
} else {
|
|
|
|
merge = false;
|
|
|
|
btrfs_release_path(path);
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = btrfs_insert_empty_item(trans, root, path, &key,
|
|
|
|
sizeof(*extent));
|
|
|
|
if (ret) {
|
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
|
|
|
goto out_free_path;
|
|
|
|
}
|
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
item = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
|
|
|
|
btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
|
|
|
|
btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
|
|
|
|
btrfs_set_file_extent_num_bytes(leaf, item, len);
|
|
|
|
btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
|
|
|
|
btrfs_set_file_extent_generation(leaf, item, trans->transid);
|
|
|
|
btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
|
|
|
|
btrfs_set_file_extent_compression(leaf, item, new->compress_type);
|
|
|
|
btrfs_set_file_extent_encryption(leaf, item, 0);
|
|
|
|
btrfs_set_file_extent_other_encoding(leaf, item, 0);
|
|
|
|
|
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
|
|
|
inode_add_bytes(inode, len);
|
2013-03-11 17:20:58 +08:00
|
|
|
btrfs_release_path(path);
|
2013-01-29 11:18:40 +08:00
|
|
|
|
|
|
|
ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
|
|
|
|
new->disk_len, 0,
|
|
|
|
backref->root_id, backref->inum,
|
Btrfs: fix regression running delayed references when using qgroups
In the kernel 4.2 merge window we had a big changes to the implementation
of delayed references and qgroups which made the no_quota field of delayed
references not used anymore. More specifically the no_quota field is not
used anymore as of:
commit 0ed4792af0e8 ("btrfs: qgroup: Switch to new extent-oriented qgroup mechanism.")
Leaving the no_quota field actually prevents delayed references from
getting merged, which in turn cause the following BUG_ON(), at
fs/btrfs/extent-tree.c, to be hit when qgroups are enabled:
static int run_delayed_tree_ref(...)
{
(...)
BUG_ON(node->ref_mod != 1);
(...)
}
This happens on a scenario like the following:
1) Ref1 bytenr X, action = BTRFS_ADD_DELAYED_REF, no_quota = 1, added.
2) Ref2 bytenr X, action = BTRFS_DROP_DELAYED_REF, no_quota = 0, added.
It's not merged with Ref1 because Ref1->no_quota != Ref2->no_quota.
3) Ref3 bytenr X, action = BTRFS_ADD_DELAYED_REF, no_quota = 1, added.
It's not merged with the reference at the tail of the list of refs
for bytenr X because the reference at the tail, Ref2 is incompatible
due to Ref2->no_quota != Ref3->no_quota.
4) Ref4 bytenr X, action = BTRFS_DROP_DELAYED_REF, no_quota = 0, added.
It's not merged with the reference at the tail of the list of refs
for bytenr X because the reference at the tail, Ref3 is incompatible
due to Ref3->no_quota != Ref4->no_quota.
5) We run delayed references, trigger merging of delayed references,
through __btrfs_run_delayed_refs() -> btrfs_merge_delayed_refs().
6) Ref1 and Ref3 are merged as Ref1->no_quota = Ref3->no_quota and
all other conditions are satisfied too. So Ref1 gets a ref_mod
value of 2.
7) Ref2 and Ref4 are merged as Ref2->no_quota = Ref4->no_quota and
all other conditions are satisfied too. So Ref2 gets a ref_mod
value of 2.
8) Ref1 and Ref2 aren't merged, because they have different values
for their no_quota field.
9) Delayed reference Ref1 is picked for running (select_delayed_ref()
always prefers references with an action == BTRFS_ADD_DELAYED_REF).
So run_delayed_tree_ref() is called for Ref1 which triggers the
BUG_ON because Ref1->red_mod != 1 (equals 2).
So fix this by removing the no_quota field, as it's not used anymore as
of commit 0ed4792af0e8 ("btrfs: qgroup: Switch to new extent-oriented
qgroup mechanism.").
The use of no_quota was also buggy in at least two places:
1) At delayed-refs.c:btrfs_add_delayed_tree_ref() - we were setting
no_quota to 0 instead of 1 when the following condition was true:
is_fstree(ref_root) || !fs_info->quota_enabled
2) At extent-tree.c:__btrfs_inc_extent_ref() - we were attempting to
reset a node's no_quota when the condition "!is_fstree(root_objectid)
|| !root->fs_info->quota_enabled" was true but we did it only in
an unused local stack variable, that is, we never reset the no_quota
value in the node itself.
This fixes the remainder of problems several people have been having when
running delayed references, mostly while a balance is running in parallel,
on a 4.2+ kernel.
Very special thanks to Stéphane Lesimple for helping debugging this issue
and testing this fix on his multi terabyte filesystem (which took more
than one day to balance alone, plus fsck, etc).
Also, this fixes deadlock issue when using the clone ioctl with qgroups
enabled, as reported by Elias Probst in the mailing list. The deadlock
happens because after calling btrfs_insert_empty_item we have our path
holding a write lock on a leaf of the fs/subvol tree and then before
releasing the path we called check_ref() which did backref walking, when
qgroups are enabled, and tried to read lock the same leaf. The trace for
this case is the following:
INFO: task systemd-nspawn:6095 blocked for more than 120 seconds.
(...)
Call Trace:
[<ffffffff86999201>] schedule+0x74/0x83
[<ffffffff863ef64c>] btrfs_tree_read_lock+0xc0/0xea
[<ffffffff86137ed7>] ? wait_woken+0x74/0x74
[<ffffffff8639f0a7>] btrfs_search_old_slot+0x51a/0x810
[<ffffffff863a129b>] btrfs_next_old_leaf+0xdf/0x3ce
[<ffffffff86413a00>] ? ulist_add_merge+0x1b/0x127
[<ffffffff86411688>] __resolve_indirect_refs+0x62a/0x667
[<ffffffff863ef546>] ? btrfs_clear_lock_blocking_rw+0x78/0xbe
[<ffffffff864122d3>] find_parent_nodes+0xaf3/0xfc6
[<ffffffff86412838>] __btrfs_find_all_roots+0x92/0xf0
[<ffffffff864128f2>] btrfs_find_all_roots+0x45/0x65
[<ffffffff8639a75b>] ? btrfs_get_tree_mod_seq+0x2b/0x88
[<ffffffff863e852e>] check_ref+0x64/0xc4
[<ffffffff863e9e01>] btrfs_clone+0x66e/0xb5d
[<ffffffff863ea77f>] btrfs_ioctl_clone+0x48f/0x5bb
[<ffffffff86048a68>] ? native_sched_clock+0x28/0x77
[<ffffffff863ed9b0>] btrfs_ioctl+0xabc/0x25cb
(...)
The problem goes away by eleminating check_ref(), which no longer is
needed as its purpose was to get a value for the no_quota field of
a delayed reference (this patch removes the no_quota field as mentioned
earlier).
Reported-by: Stéphane Lesimple <stephane_btrfs@lesimple.fr>
Tested-by: Stéphane Lesimple <stephane_btrfs@lesimple.fr>
Reported-by: Elias Probst <mail@eliasprobst.eu>
Reported-by: Peter Becker <floyd.net@gmail.com>
Reported-by: Malte Schröder <malte@tnxip.de>
Reported-by: Derek Dongray <derek@valedon.co.uk>
Reported-by: Erkki Seppala <flux-btrfs@inside.org>
Cc: stable@vger.kernel.org # 4.2+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
2015-10-23 14:52:54 +08:00
|
|
|
new->file_pos); /* start - extent_offset */
|
2013-01-29 11:18:40 +08:00
|
|
|
if (ret) {
|
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
|
|
|
goto out_free_path;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = 1;
|
|
|
|
out_free_path:
|
|
|
|
btrfs_release_path(path);
|
2013-03-11 17:20:58 +08:00
|
|
|
path->leave_spinning = 0;
|
2013-01-29 11:18:40 +08:00
|
|
|
btrfs_end_transaction(trans, root);
|
|
|
|
out_unlock:
|
|
|
|
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
|
|
|
|
&cached, GFP_NOFS);
|
|
|
|
iput(inode);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-10-29 10:45:05 +08:00
|
|
|
static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
|
|
|
|
{
|
|
|
|
struct old_sa_defrag_extent *old, *tmp;
|
|
|
|
|
|
|
|
if (!new)
|
|
|
|
return;
|
|
|
|
|
|
|
|
list_for_each_entry_safe(old, tmp, &new->head, list) {
|
|
|
|
kfree(old);
|
|
|
|
}
|
|
|
|
kfree(new);
|
|
|
|
}
|
|
|
|
|
2013-01-29 11:18:40 +08:00
|
|
|
static void relink_file_extents(struct new_sa_defrag_extent *new)
|
|
|
|
{
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct sa_defrag_extent_backref *backref;
|
|
|
|
struct sa_defrag_extent_backref *prev = NULL;
|
|
|
|
struct inode *inode;
|
|
|
|
struct btrfs_root *root;
|
|
|
|
struct rb_node *node;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
inode = new->inode;
|
|
|
|
root = BTRFS_I(inode)->root;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (!record_extent_backrefs(path, new)) {
|
|
|
|
btrfs_free_path(path);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
btrfs_release_path(path);
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
node = rb_first(&new->root);
|
|
|
|
if (!node)
|
|
|
|
break;
|
|
|
|
rb_erase(node, &new->root);
|
|
|
|
|
|
|
|
backref = rb_entry(node, struct sa_defrag_extent_backref, node);
|
|
|
|
|
|
|
|
ret = relink_extent_backref(path, prev, backref);
|
|
|
|
WARN_ON(ret < 0);
|
|
|
|
|
|
|
|
kfree(prev);
|
|
|
|
|
|
|
|
if (ret == 1)
|
|
|
|
prev = backref;
|
|
|
|
else
|
|
|
|
prev = NULL;
|
|
|
|
cond_resched();
|
|
|
|
}
|
|
|
|
kfree(prev);
|
|
|
|
|
|
|
|
btrfs_free_path(path);
|
|
|
|
out:
|
2013-10-29 10:45:05 +08:00
|
|
|
free_sa_defrag_extent(new);
|
|
|
|
|
2013-01-29 11:18:40 +08:00
|
|
|
atomic_dec(&root->fs_info->defrag_running);
|
|
|
|
wake_up(&root->fs_info->transaction_wait);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct new_sa_defrag_extent *
|
|
|
|
record_old_file_extents(struct inode *inode,
|
|
|
|
struct btrfs_ordered_extent *ordered)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
2013-10-29 10:45:05 +08:00
|
|
|
struct old_sa_defrag_extent *old;
|
2013-01-29 11:18:40 +08:00
|
|
|
struct new_sa_defrag_extent *new;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
new = kmalloc(sizeof(*new), GFP_NOFS);
|
|
|
|
if (!new)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
new->inode = inode;
|
|
|
|
new->file_pos = ordered->file_offset;
|
|
|
|
new->len = ordered->len;
|
|
|
|
new->bytenr = ordered->start;
|
|
|
|
new->disk_len = ordered->disk_len;
|
|
|
|
new->compress_type = ordered->compress_type;
|
|
|
|
new->root = RB_ROOT;
|
|
|
|
INIT_LIST_HEAD(&new->head);
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
goto out_kfree;
|
|
|
|
|
|
|
|
key.objectid = btrfs_ino(inode);
|
|
|
|
key.type = BTRFS_EXTENT_DATA_KEY;
|
|
|
|
key.offset = new->file_pos;
|
|
|
|
|
|
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out_free_path;
|
|
|
|
if (ret > 0 && path->slots[0] > 0)
|
|
|
|
path->slots[0]--;
|
|
|
|
|
|
|
|
/* find out all the old extents for the file range */
|
|
|
|
while (1) {
|
|
|
|
struct btrfs_file_extent_item *extent;
|
|
|
|
struct extent_buffer *l;
|
|
|
|
int slot;
|
|
|
|
u64 num_bytes;
|
|
|
|
u64 offset;
|
|
|
|
u64 end;
|
|
|
|
u64 disk_bytenr;
|
|
|
|
u64 extent_offset;
|
|
|
|
|
|
|
|
l = path->nodes[0];
|
|
|
|
slot = path->slots[0];
|
|
|
|
|
|
|
|
if (slot >= btrfs_header_nritems(l)) {
|
|
|
|
ret = btrfs_next_leaf(root, path);
|
|
|
|
if (ret < 0)
|
2013-10-29 10:45:05 +08:00
|
|
|
goto out_free_path;
|
2013-01-29 11:18:40 +08:00
|
|
|
else if (ret > 0)
|
|
|
|
break;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
btrfs_item_key_to_cpu(l, &key, slot);
|
|
|
|
|
|
|
|
if (key.objectid != btrfs_ino(inode))
|
|
|
|
break;
|
|
|
|
if (key.type != BTRFS_EXTENT_DATA_KEY)
|
|
|
|
break;
|
|
|
|
if (key.offset >= new->file_pos + new->len)
|
|
|
|
break;
|
|
|
|
|
|
|
|
extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
|
|
|
|
|
|
|
|
num_bytes = btrfs_file_extent_num_bytes(l, extent);
|
|
|
|
if (key.offset + num_bytes < new->file_pos)
|
|
|
|
goto next;
|
|
|
|
|
|
|
|
disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
|
|
|
|
if (!disk_bytenr)
|
|
|
|
goto next;
|
|
|
|
|
|
|
|
extent_offset = btrfs_file_extent_offset(l, extent);
|
|
|
|
|
|
|
|
old = kmalloc(sizeof(*old), GFP_NOFS);
|
|
|
|
if (!old)
|
2013-10-29 10:45:05 +08:00
|
|
|
goto out_free_path;
|
2013-01-29 11:18:40 +08:00
|
|
|
|
|
|
|
offset = max(new->file_pos, key.offset);
|
|
|
|
end = min(new->file_pos + new->len, key.offset + num_bytes);
|
|
|
|
|
|
|
|
old->bytenr = disk_bytenr;
|
|
|
|
old->extent_offset = extent_offset;
|
|
|
|
old->offset = offset - key.offset;
|
|
|
|
old->len = end - offset;
|
|
|
|
old->new = new;
|
|
|
|
old->count = 0;
|
|
|
|
list_add_tail(&old->list, &new->head);
|
|
|
|
next:
|
|
|
|
path->slots[0]++;
|
|
|
|
cond_resched();
|
|
|
|
}
|
|
|
|
|
|
|
|
btrfs_free_path(path);
|
|
|
|
atomic_inc(&root->fs_info->defrag_running);
|
|
|
|
|
|
|
|
return new;
|
|
|
|
|
|
|
|
out_free_path:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
out_kfree:
|
2013-10-29 10:45:05 +08:00
|
|
|
free_sa_defrag_extent(new);
|
2013-01-29 11:18:40 +08:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
|
|
|
|
u64 start, u64 len)
|
|
|
|
{
|
|
|
|
struct btrfs_block_group_cache *cache;
|
|
|
|
|
|
|
|
cache = btrfs_lookup_block_group(root->fs_info, start);
|
|
|
|
ASSERT(cache);
|
|
|
|
|
|
|
|
spin_lock(&cache->lock);
|
|
|
|
cache->delalloc_bytes -= len;
|
|
|
|
spin_unlock(&cache->lock);
|
|
|
|
|
|
|
|
btrfs_put_block_group(cache);
|
|
|
|
}
|
|
|
|
|
2008-09-30 03:18:18 +08:00
|
|
|
/* as ordered data IO finishes, this gets called so we can finish
|
|
|
|
* an ordered extent if the range of bytes in the file it covers are
|
|
|
|
* fully written.
|
|
|
|
*/
|
2012-05-03 02:00:54 +08:00
|
|
|
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
|
2008-07-18 00:53:50 +08:00
|
|
|
{
|
2012-05-03 02:00:54 +08:00
|
|
|
struct inode *inode = ordered_extent->inode;
|
2008-07-18 00:53:50 +08:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2010-05-16 22:48:47 +08:00
|
|
|
struct btrfs_trans_handle *trans = NULL;
|
2008-07-18 00:53:50 +08:00
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
2010-02-04 03:33:23 +08:00
|
|
|
struct extent_state *cached_state = NULL;
|
2013-01-29 11:18:40 +08:00
|
|
|
struct new_sa_defrag_extent *new = NULL;
|
2010-12-17 14:21:50 +08:00
|
|
|
int compress_type = 0;
|
2013-08-30 01:57:21 +08:00
|
|
|
int ret = 0;
|
|
|
|
u64 logical_len = ordered_extent->len;
|
2011-04-20 10:33:24 +08:00
|
|
|
bool nolock;
|
2013-08-30 01:57:21 +08:00
|
|
|
bool truncated = false;
|
2008-07-18 00:53:50 +08:00
|
|
|
|
2012-07-10 19:28:39 +08:00
|
|
|
nolock = btrfs_is_free_space_inode(inode);
|
2010-07-03 00:14:14 +08:00
|
|
|
|
2012-05-03 02:00:54 +08:00
|
|
|
if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
|
|
|
|
ret = -EIO;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
Btrfs: cleanup the read failure record after write or when the inode is freeing
After the data is written successfully, we should cleanup the read failure record
in that range because
- If we set data COW for the file, the range that the failure record pointed to is
mapped to a new place, so it is invalid.
- If we set no data COW for the file, and if there is no error during writting,
the corrupted data is corrected, so the failure record can be removed. And if
some errors happen on the mirrors, we also needn't worry about it because the
failure record will be recreated if we read the same place again.
Sometimes, we may fail to correct the data, so the failure records will be left
in the tree, we need free them when we free the inode or the memory leak happens.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-09-12 18:44:04 +08:00
|
|
|
btrfs_free_io_failure_record(inode, ordered_extent->file_offset,
|
|
|
|
ordered_extent->file_offset +
|
|
|
|
ordered_extent->len - 1);
|
|
|
|
|
2013-08-30 01:57:21 +08:00
|
|
|
if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
|
|
|
|
truncated = true;
|
|
|
|
logical_len = ordered_extent->truncated_len;
|
|
|
|
/* Truncated the entire extent, don't bother adding */
|
|
|
|
if (!logical_len)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2009-11-12 17:34:21 +08:00
|
|
|
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
|
2012-03-12 23:03:00 +08:00
|
|
|
BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
|
2015-09-08 17:25:56 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* For mwrite(mmap + memset to write) case, we still reserve
|
|
|
|
* space for NOCOW range.
|
|
|
|
* As NOCOW won't cause a new delayed ref, just free the space
|
|
|
|
*/
|
|
|
|
btrfs_qgroup_free_data(inode, ordered_extent->file_offset,
|
|
|
|
ordered_extent->len);
|
2012-11-09 23:53:21 +08:00
|
|
|
btrfs_ordered_update_i_size(inode, 0, ordered_extent);
|
|
|
|
if (nolock)
|
|
|
|
trans = btrfs_join_transaction_nolock(root);
|
|
|
|
else
|
|
|
|
trans = btrfs_join_transaction(root);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
trans = NULL;
|
|
|
|
goto out;
|
2009-11-12 17:34:21 +08:00
|
|
|
}
|
2012-11-09 23:53:21 +08:00
|
|
|
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
|
|
|
|
ret = btrfs_update_inode_fallback(trans, root, inode);
|
|
|
|
if (ret) /* -ENOMEM or corruption */
|
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
2009-11-12 17:34:21 +08:00
|
|
|
goto out;
|
|
|
|
}
|
2008-07-18 00:53:50 +08:00
|
|
|
|
2010-02-04 03:33:23 +08:00
|
|
|
lock_extent_bits(io_tree, ordered_extent->file_offset,
|
|
|
|
ordered_extent->file_offset + ordered_extent->len - 1,
|
2015-12-03 21:30:40 +08:00
|
|
|
&cached_state);
|
2008-07-18 00:53:50 +08:00
|
|
|
|
2013-01-29 11:18:40 +08:00
|
|
|
ret = test_range_bit(io_tree, ordered_extent->file_offset,
|
|
|
|
ordered_extent->file_offset + ordered_extent->len - 1,
|
|
|
|
EXTENT_DEFRAG, 1, cached_state);
|
|
|
|
if (ret) {
|
|
|
|
u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
|
2014-01-30 05:05:30 +08:00
|
|
|
if (0 && last_snapshot >= BTRFS_I(inode)->generation)
|
2013-01-29 11:18:40 +08:00
|
|
|
/* the inode is shared */
|
|
|
|
new = record_old_file_extents(inode, ordered_extent);
|
|
|
|
|
|
|
|
clear_extent_bit(io_tree, ordered_extent->file_offset,
|
|
|
|
ordered_extent->file_offset + ordered_extent->len - 1,
|
|
|
|
EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
|
|
|
|
}
|
|
|
|
|
2010-07-03 00:14:14 +08:00
|
|
|
if (nolock)
|
2011-04-14 00:54:33 +08:00
|
|
|
trans = btrfs_join_transaction_nolock(root);
|
2010-07-03 00:14:14 +08:00
|
|
|
else
|
2011-04-14 00:54:33 +08:00
|
|
|
trans = btrfs_join_transaction(root);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
trans = NULL;
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
2014-05-23 07:18:52 +08:00
|
|
|
|
2010-05-16 22:48:47 +08:00
|
|
|
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
|
2009-11-12 17:34:21 +08:00
|
|
|
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
|
2010-12-17 14:21:50 +08:00
|
|
|
compress_type = ordered_extent->compress_type;
|
2008-10-31 02:25:28 +08:00
|
|
|
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
|
2010-12-17 14:21:50 +08:00
|
|
|
BUG_ON(compress_type);
|
2009-11-12 17:34:08 +08:00
|
|
|
ret = btrfs_mark_extent_written(trans, inode,
|
2008-10-31 02:25:28 +08:00
|
|
|
ordered_extent->file_offset,
|
|
|
|
ordered_extent->file_offset +
|
2013-08-30 01:57:21 +08:00
|
|
|
logical_len);
|
2008-10-31 02:25:28 +08:00
|
|
|
} else {
|
2010-06-22 02:48:16 +08:00
|
|
|
BUG_ON(root == root->fs_info->tree_root);
|
2008-10-31 02:25:28 +08:00
|
|
|
ret = insert_reserved_file_extent(trans, inode,
|
|
|
|
ordered_extent->file_offset,
|
|
|
|
ordered_extent->start,
|
|
|
|
ordered_extent->disk_len,
|
2013-08-30 01:57:21 +08:00
|
|
|
logical_len, logical_len,
|
2010-12-17 14:21:50 +08:00
|
|
|
compress_type, 0, 0,
|
2008-10-31 02:25:28 +08:00
|
|
|
BTRFS_FILE_EXTENT_REG);
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
if (!ret)
|
|
|
|
btrfs_release_delalloc_bytes(root,
|
|
|
|
ordered_extent->start,
|
|
|
|
ordered_extent->disk_len);
|
2008-10-31 02:25:28 +08:00
|
|
|
}
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-18 01:14:17 +08:00
|
|
|
unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
|
|
|
|
ordered_extent->file_offset, ordered_extent->len,
|
|
|
|
trans->transid);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret < 0) {
|
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
2012-05-03 02:00:54 +08:00
|
|
|
goto out_unlock;
|
2012-03-12 23:03:00 +08:00
|
|
|
}
|
2010-02-04 03:33:23 +08:00
|
|
|
|
2008-07-18 00:53:50 +08:00
|
|
|
add_pending_csums(trans, inode, ordered_extent->file_offset,
|
|
|
|
&ordered_extent->list);
|
|
|
|
|
2012-11-09 23:53:21 +08:00
|
|
|
btrfs_ordered_update_i_size(inode, 0, ordered_extent);
|
|
|
|
ret = btrfs_update_inode_fallback(trans, root, inode);
|
|
|
|
if (ret) { /* -ENOMEM or corruption */
|
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
|
|
|
goto out_unlock;
|
2011-04-06 07:25:36 +08:00
|
|
|
}
|
|
|
|
ret = 0;
|
2012-05-03 02:00:54 +08:00
|
|
|
out_unlock:
|
|
|
|
unlock_extent_cached(io_tree, ordered_extent->file_offset,
|
|
|
|
ordered_extent->file_offset +
|
|
|
|
ordered_extent->len - 1, &cached_state, GFP_NOFS);
|
2009-11-12 17:34:21 +08:00
|
|
|
out:
|
2011-10-06 20:58:24 +08:00
|
|
|
if (root != root->fs_info->tree_root)
|
2010-07-03 00:14:14 +08:00
|
|
|
btrfs_delalloc_release_metadata(inode, ordered_extent->len);
|
2012-09-20 15:51:59 +08:00
|
|
|
if (trans)
|
|
|
|
btrfs_end_transaction(trans, root);
|
2010-07-03 00:14:14 +08:00
|
|
|
|
2013-08-30 01:57:21 +08:00
|
|
|
if (ret || truncated) {
|
|
|
|
u64 start, end;
|
|
|
|
|
|
|
|
if (truncated)
|
|
|
|
start = ordered_extent->file_offset + logical_len;
|
|
|
|
else
|
|
|
|
start = ordered_extent->file_offset;
|
|
|
|
end = ordered_extent->file_offset + ordered_extent->len - 1;
|
|
|
|
clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS);
|
|
|
|
|
|
|
|
/* Drop the cache for the part of the extent we didn't write. */
|
|
|
|
btrfs_drop_extent_cache(inode, start, end, 0);
|
2012-05-03 02:00:54 +08:00
|
|
|
|
2013-02-01 03:58:00 +08:00
|
|
|
/*
|
|
|
|
* If the ordered extent had an IOERR or something else went
|
|
|
|
* wrong we need to return the space for this ordered extent
|
2013-08-30 01:57:21 +08:00
|
|
|
* back to the allocator. We only free the extent in the
|
|
|
|
* truncated case if we didn't write out the extent at all.
|
2013-02-01 03:58:00 +08:00
|
|
|
*/
|
2013-08-30 01:57:21 +08:00
|
|
|
if ((ret || !logical_len) &&
|
|
|
|
!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
|
2013-02-01 03:58:00 +08:00
|
|
|
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
|
|
|
|
btrfs_free_reserved_extent(root, ordered_extent->start,
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
ordered_extent->disk_len, 1);
|
2013-02-01 03:58:00 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2012-05-03 02:00:54 +08:00
|
|
|
/*
|
2012-06-18 12:14:23 +08:00
|
|
|
* This needs to be done to make sure anybody waiting knows we are done
|
|
|
|
* updating everything for this ordered extent.
|
2012-05-03 02:00:54 +08:00
|
|
|
*/
|
|
|
|
btrfs_remove_ordered_extent(inode, ordered_extent);
|
|
|
|
|
2013-01-29 11:18:40 +08:00
|
|
|
/* for snapshot-aware defrag */
|
2013-10-29 10:45:05 +08:00
|
|
|
if (new) {
|
|
|
|
if (ret) {
|
|
|
|
free_sa_defrag_extent(new);
|
|
|
|
atomic_dec(&root->fs_info->defrag_running);
|
|
|
|
} else {
|
|
|
|
relink_file_extents(new);
|
|
|
|
}
|
|
|
|
}
|
2013-01-29 11:18:40 +08:00
|
|
|
|
2008-07-18 00:53:50 +08:00
|
|
|
/* once for us */
|
|
|
|
btrfs_put_ordered_extent(ordered_extent);
|
|
|
|
/* once for the tree */
|
|
|
|
btrfs_put_ordered_extent(ordered_extent);
|
|
|
|
|
2012-05-03 02:00:54 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void finish_ordered_fn(struct btrfs_work *work)
|
|
|
|
{
|
|
|
|
struct btrfs_ordered_extent *ordered_extent;
|
|
|
|
ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
|
|
|
|
btrfs_finish_ordered_io(ordered_extent);
|
2008-07-18 00:53:50 +08:00
|
|
|
}
|
|
|
|
|
2008-12-02 22:54:17 +08:00
|
|
|
static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
|
2008-07-18 23:56:15 +08:00
|
|
|
struct extent_state *state, int uptodate)
|
|
|
|
{
|
2012-05-03 02:00:54 +08:00
|
|
|
struct inode *inode = page->mapping->host;
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_ordered_extent *ordered_extent = NULL;
|
Btrfs: fix task hang under heavy compressed write
This has been reported and discussed for a long time, and this hang occurs in
both 3.15 and 3.16.
Btrfs now migrates to use kernel workqueue, but it introduces this hang problem.
Btrfs has a kind of work queued as an ordered way, which means that its
ordered_func() must be processed in the way of FIFO, so it usually looks like --
normal_work_helper(arg)
work = container_of(arg, struct btrfs_work, normal_work);
work->func() <---- (we name it work X)
for ordered_work in wq->ordered_list
ordered_work->ordered_func()
ordered_work->ordered_free()
The hang is a rare case, first when we find free space, we get an uncached block
group, then we go to read its free space cache inode for free space information,
so it will
file a readahead request
btrfs_readpages()
for page that is not in page cache
__do_readpage()
submit_extent_page()
btrfs_submit_bio_hook()
btrfs_bio_wq_end_io()
submit_bio()
end_workqueue_bio() <--(ret by the 1st endio)
queue a work(named work Y) for the 2nd
also the real endio()
So the hang occurs when work Y's work_struct and work X's work_struct happens
to share the same address.
A bit more explanation,
A,B,C -- struct btrfs_work
arg -- struct work_struct
kthread:
worker_thread()
pick up a work_struct from @worklist
process_one_work(arg)
worker->current_work = arg; <-- arg is A->normal_work
worker->current_func(arg)
normal_work_helper(arg)
A = container_of(arg, struct btrfs_work, normal_work);
A->func()
A->ordered_func()
A->ordered_free() <-- A gets freed
B->ordered_func()
submit_compressed_extents()
find_free_extent()
load_free_space_inode()
... <-- (the above readhead stack)
end_workqueue_bio()
btrfs_queue_work(work C)
B->ordered_free()
As if work A has a high priority in wq->ordered_list and there are more ordered
works queued after it, such as B->ordered_func(), its memory could have been
freed before normal_work_helper() returns, which means that kernel workqueue
code worker_thread() still has worker->current_work pointer to be work
A->normal_work's, ie. arg's address.
Meanwhile, work C is allocated after work A is freed, work C->normal_work
and work A->normal_work are likely to share the same address(I confirmed this
with ftrace output, so I'm not just guessing, it's rare though).
When another kthread picks up work C->normal_work to process, and finds our
kthread is processing it(see find_worker_executing_work()), it'll think
work C as a collision and skip then, which ends up nobody processing work C.
So the situation is that our kthread is waiting forever on work C.
Besides, there're other cases that can lead to deadlock, but the real problem
is that all btrfs workqueue shares one work->func, -- normal_work_helper,
so this makes each workqueue to have its own helper function, but only a
wraper pf normal_work_helper.
With this patch, I no long hit the above hang.
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-15 23:36:53 +08:00
|
|
|
struct btrfs_workqueue *wq;
|
|
|
|
btrfs_work_func_t func;
|
2012-05-03 02:00:54 +08:00
|
|
|
|
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordere_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are cirtical resourses and produce a lot of corner cases during writeback,
so it is valuable to know how page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when a inode is created, when a inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
Chunk is a link between physical offset and logical offset, and stands for space
infomation in btrfs, and these are helpful on tracing space things.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 19:18:59 +08:00
|
|
|
trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
|
|
|
|
|
2009-09-03 04:53:46 +08:00
|
|
|
ClearPagePrivate2(page);
|
2012-05-03 02:00:54 +08:00
|
|
|
if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
|
|
|
|
end - start + 1, uptodate))
|
|
|
|
return 0;
|
|
|
|
|
Btrfs: fix task hang under heavy compressed write
This has been reported and discussed for a long time, and this hang occurs in
both 3.15 and 3.16.
Btrfs now migrates to use kernel workqueue, but it introduces this hang problem.
Btrfs has a kind of work queued as an ordered way, which means that its
ordered_func() must be processed in the way of FIFO, so it usually looks like --
normal_work_helper(arg)
work = container_of(arg, struct btrfs_work, normal_work);
work->func() <---- (we name it work X)
for ordered_work in wq->ordered_list
ordered_work->ordered_func()
ordered_work->ordered_free()
The hang is a rare case, first when we find free space, we get an uncached block
group, then we go to read its free space cache inode for free space information,
so it will
file a readahead request
btrfs_readpages()
for page that is not in page cache
__do_readpage()
submit_extent_page()
btrfs_submit_bio_hook()
btrfs_bio_wq_end_io()
submit_bio()
end_workqueue_bio() <--(ret by the 1st endio)
queue a work(named work Y) for the 2nd
also the real endio()
So the hang occurs when work Y's work_struct and work X's work_struct happens
to share the same address.
A bit more explanation,
A,B,C -- struct btrfs_work
arg -- struct work_struct
kthread:
worker_thread()
pick up a work_struct from @worklist
process_one_work(arg)
worker->current_work = arg; <-- arg is A->normal_work
worker->current_func(arg)
normal_work_helper(arg)
A = container_of(arg, struct btrfs_work, normal_work);
A->func()
A->ordered_func()
A->ordered_free() <-- A gets freed
B->ordered_func()
submit_compressed_extents()
find_free_extent()
load_free_space_inode()
... <-- (the above readhead stack)
end_workqueue_bio()
btrfs_queue_work(work C)
B->ordered_free()
As if work A has a high priority in wq->ordered_list and there are more ordered
works queued after it, such as B->ordered_func(), its memory could have been
freed before normal_work_helper() returns, which means that kernel workqueue
code worker_thread() still has worker->current_work pointer to be work
A->normal_work's, ie. arg's address.
Meanwhile, work C is allocated after work A is freed, work C->normal_work
and work A->normal_work are likely to share the same address(I confirmed this
with ftrace output, so I'm not just guessing, it's rare though).
When another kthread picks up work C->normal_work to process, and finds our
kthread is processing it(see find_worker_executing_work()), it'll think
work C as a collision and skip then, which ends up nobody processing work C.
So the situation is that our kthread is waiting forever on work C.
Besides, there're other cases that can lead to deadlock, but the real problem
is that all btrfs workqueue shares one work->func, -- normal_work_helper,
so this makes each workqueue to have its own helper function, but only a
wraper pf normal_work_helper.
With this patch, I no long hit the above hang.
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-15 23:36:53 +08:00
|
|
|
if (btrfs_is_free_space_inode(inode)) {
|
|
|
|
wq = root->fs_info->endio_freespace_worker;
|
|
|
|
func = btrfs_freespace_write_helper;
|
|
|
|
} else {
|
|
|
|
wq = root->fs_info->endio_write_workers;
|
|
|
|
func = btrfs_endio_write_helper;
|
|
|
|
}
|
2012-05-03 02:00:54 +08:00
|
|
|
|
Btrfs: fix task hang under heavy compressed write
This has been reported and discussed for a long time, and this hang occurs in
both 3.15 and 3.16.
Btrfs now migrates to use kernel workqueue, but it introduces this hang problem.
Btrfs has a kind of work queued as an ordered way, which means that its
ordered_func() must be processed in the way of FIFO, so it usually looks like --
normal_work_helper(arg)
work = container_of(arg, struct btrfs_work, normal_work);
work->func() <---- (we name it work X)
for ordered_work in wq->ordered_list
ordered_work->ordered_func()
ordered_work->ordered_free()
The hang is a rare case, first when we find free space, we get an uncached block
group, then we go to read its free space cache inode for free space information,
so it will
file a readahead request
btrfs_readpages()
for page that is not in page cache
__do_readpage()
submit_extent_page()
btrfs_submit_bio_hook()
btrfs_bio_wq_end_io()
submit_bio()
end_workqueue_bio() <--(ret by the 1st endio)
queue a work(named work Y) for the 2nd
also the real endio()
So the hang occurs when work Y's work_struct and work X's work_struct happens
to share the same address.
A bit more explanation,
A,B,C -- struct btrfs_work
arg -- struct work_struct
kthread:
worker_thread()
pick up a work_struct from @worklist
process_one_work(arg)
worker->current_work = arg; <-- arg is A->normal_work
worker->current_func(arg)
normal_work_helper(arg)
A = container_of(arg, struct btrfs_work, normal_work);
A->func()
A->ordered_func()
A->ordered_free() <-- A gets freed
B->ordered_func()
submit_compressed_extents()
find_free_extent()
load_free_space_inode()
... <-- (the above readhead stack)
end_workqueue_bio()
btrfs_queue_work(work C)
B->ordered_free()
As if work A has a high priority in wq->ordered_list and there are more ordered
works queued after it, such as B->ordered_func(), its memory could have been
freed before normal_work_helper() returns, which means that kernel workqueue
code worker_thread() still has worker->current_work pointer to be work
A->normal_work's, ie. arg's address.
Meanwhile, work C is allocated after work A is freed, work C->normal_work
and work A->normal_work are likely to share the same address(I confirmed this
with ftrace output, so I'm not just guessing, it's rare though).
When another kthread picks up work C->normal_work to process, and finds our
kthread is processing it(see find_worker_executing_work()), it'll think
work C as a collision and skip then, which ends up nobody processing work C.
So the situation is that our kthread is waiting forever on work C.
Besides, there're other cases that can lead to deadlock, but the real problem
is that all btrfs workqueue shares one work->func, -- normal_work_helper,
so this makes each workqueue to have its own helper function, but only a
wraper pf normal_work_helper.
With this patch, I no long hit the above hang.
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-15 23:36:53 +08:00
|
|
|
btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
|
|
|
|
NULL);
|
|
|
|
btrfs_queue_work(wq, &ordered_extent->work);
|
2012-05-03 02:00:54 +08:00
|
|
|
|
|
|
|
return 0;
|
2008-07-18 23:56:15 +08:00
|
|
|
}
|
|
|
|
|
2014-09-12 18:43:55 +08:00
|
|
|
static int __readpage_endio_check(struct inode *inode,
|
|
|
|
struct btrfs_io_bio *io_bio,
|
|
|
|
int icsum, struct page *page,
|
|
|
|
int pgoff, u64 start, size_t len)
|
|
|
|
{
|
|
|
|
char *kaddr;
|
|
|
|
u32 csum_expected;
|
|
|
|
u32 csum = ~(u32)0;
|
|
|
|
|
|
|
|
csum_expected = *(((u32 *)io_bio->csum) + icsum);
|
|
|
|
|
|
|
|
kaddr = kmap_atomic(page);
|
|
|
|
csum = btrfs_csum_data(kaddr + pgoff, csum, len);
|
|
|
|
btrfs_csum_final(csum, (char *)&csum);
|
|
|
|
if (csum != csum_expected)
|
|
|
|
goto zeroit;
|
|
|
|
|
|
|
|
kunmap_atomic(kaddr);
|
|
|
|
return 0;
|
|
|
|
zeroit:
|
2015-10-08 17:01:36 +08:00
|
|
|
btrfs_warn_rl(BTRFS_I(inode)->root->fs_info,
|
|
|
|
"csum failed ino %llu off %llu csum %u expected csum %u",
|
2014-09-12 18:43:55 +08:00
|
|
|
btrfs_ino(inode), start, csum, csum_expected);
|
|
|
|
memset(kaddr + pgoff, 1, len);
|
|
|
|
flush_dcache_page(page);
|
|
|
|
kunmap_atomic(kaddr);
|
|
|
|
if (csum_expected == 0)
|
|
|
|
return 0;
|
|
|
|
return -EIO;
|
|
|
|
}
|
|
|
|
|
2008-09-30 03:18:18 +08:00
|
|
|
/*
|
|
|
|
* when reads are done, we need to check csums to verify the data is correct
|
2011-07-22 21:41:52 +08:00
|
|
|
* if there's a match, we allow the bio to finish. If not, the code in
|
|
|
|
* extent_io.c will try to find good copies for us.
|
2008-09-30 03:18:18 +08:00
|
|
|
*/
|
2013-07-25 19:22:34 +08:00
|
|
|
static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
|
|
|
|
u64 phy_offset, struct page *page,
|
|
|
|
u64 start, u64 end, int mirror)
|
2007-08-30 20:50:51 +08:00
|
|
|
{
|
2012-12-21 17:17:45 +08:00
|
|
|
size_t offset = start - page_offset(page);
|
2007-08-30 20:50:51 +08:00
|
|
|
struct inode *inode = page->mapping->host;
|
2008-01-25 05:13:08 +08:00
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
2007-10-16 04:22:25 +08:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2008-01-25 05:13:08 +08:00
|
|
|
|
Btrfs: move data checksumming into a dedicated tree
Btrfs stores checksums for each data block. Until now, they have
been stored in the subvolume trees, indexed by the inode that is
referencing the data block. This means that when we read the inode,
we've probably read in at least some checksums as well.
But, this has a few problems:
* The checksums are indexed by logical offset in the file. When
compression is on, this means we have to do the expensive checksumming
on the uncompressed data. It would be faster if we could checksum
the compressed data instead.
* If we implement encryption, we'll be checksumming the plain text and
storing that on disk. This is significantly less secure.
* For either compression or encryption, we have to get the plain text
back before we can verify the checksum as correct. This makes the raid
layer balancing and extent moving much more expensive.
* It makes the front end caching code more complex, as we have touch
the subvolume and inodes as we cache extents.
* There is potentitally one copy of the checksum in each subvolume
referencing an extent.
The solution used here is to store the extent checksums in a dedicated
tree. This allows us to index the checksums by phyiscal extent
start and length. It means:
* The checksum is against the data stored on disk, after any compression
or encryption is done.
* The checksum is stored in a central location, and can be verified without
following back references, or reading inodes.
This makes compression significantly faster by reducing the amount of
data that needs to be checksummed. It will also allow much faster
raid management code in general.
The checksums are indexed by a key with a fixed objectid (a magic value
in ctree.h) and offset set to the starting byte of the extent. This
allows us to copy the checksum items into the fsync log tree directly (or
any other tree), without having to invent a second format for them.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-12-09 05:58:54 +08:00
|
|
|
if (PageChecked(page)) {
|
|
|
|
ClearPageChecked(page);
|
2014-09-12 18:43:55 +08:00
|
|
|
return 0;
|
Btrfs: move data checksumming into a dedicated tree
Btrfs stores checksums for each data block. Until now, they have
been stored in the subvolume trees, indexed by the inode that is
referencing the data block. This means that when we read the inode,
we've probably read in at least some checksums as well.
But, this has a few problems:
* The checksums are indexed by logical offset in the file. When
compression is on, this means we have to do the expensive checksumming
on the uncompressed data. It would be faster if we could checksum
the compressed data instead.
* If we implement encryption, we'll be checksumming the plain text and
storing that on disk. This is significantly less secure.
* For either compression or encryption, we have to get the plain text
back before we can verify the checksum as correct. This makes the raid
layer balancing and extent moving much more expensive.
* It makes the front end caching code more complex, as we have touch
the subvolume and inodes as we cache extents.
* There is potentitally one copy of the checksum in each subvolume
referencing an extent.
The solution used here is to store the extent checksums in a dedicated
tree. This allows us to index the checksums by phyiscal extent
start and length. It means:
* The checksum is against the data stored on disk, after any compression
or encryption is done.
* The checksum is stored in a central location, and can be verified without
following back references, or reading inodes.
This makes compression significantly faster by reducing the amount of
data that needs to be checksummed. It will also allow much faster
raid management code in general.
The checksums are indexed by a key with a fixed objectid (a magic value
in ctree.h) and offset set to the starting byte of the extent. This
allows us to copy the checksum items into the fsync log tree directly (or
any other tree), without having to invent a second format for them.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-12-09 05:58:54 +08:00
|
|
|
}
|
2009-04-17 16:37:41 +08:00
|
|
|
|
|
|
|
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
|
2014-09-12 18:43:55 +08:00
|
|
|
return 0;
|
2008-12-12 23:03:38 +08:00
|
|
|
|
|
|
|
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
|
2009-09-03 03:22:30 +08:00
|
|
|
test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
|
2008-12-12 23:03:38 +08:00
|
|
|
clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
|
|
|
|
GFP_NOFS);
|
2007-12-15 04:30:32 +08:00
|
|
|
return 0;
|
2008-12-12 23:03:38 +08:00
|
|
|
}
|
Btrfs: move data checksumming into a dedicated tree
Btrfs stores checksums for each data block. Until now, they have
been stored in the subvolume trees, indexed by the inode that is
referencing the data block. This means that when we read the inode,
we've probably read in at least some checksums as well.
But, this has a few problems:
* The checksums are indexed by logical offset in the file. When
compression is on, this means we have to do the expensive checksumming
on the uncompressed data. It would be faster if we could checksum
the compressed data instead.
* If we implement encryption, we'll be checksumming the plain text and
storing that on disk. This is significantly less secure.
* For either compression or encryption, we have to get the plain text
back before we can verify the checksum as correct. This makes the raid
layer balancing and extent moving much more expensive.
* It makes the front end caching code more complex, as we have touch
the subvolume and inodes as we cache extents.
* There is potentitally one copy of the checksum in each subvolume
referencing an extent.
The solution used here is to store the extent checksums in a dedicated
tree. This allows us to index the checksums by phyiscal extent
start and length. It means:
* The checksum is against the data stored on disk, after any compression
or encryption is done.
* The checksum is stored in a central location, and can be verified without
following back references, or reading inodes.
This makes compression significantly faster by reducing the amount of
data that needs to be checksummed. It will also allow much faster
raid management code in general.
The checksums are indexed by a key with a fixed objectid (a magic value
in ctree.h) and offset set to the starting byte of the extent. This
allows us to copy the checksum items into the fsync log tree directly (or
any other tree), without having to invent a second format for them.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-12-09 05:58:54 +08:00
|
|
|
|
2013-07-25 19:22:34 +08:00
|
|
|
phy_offset >>= inode->i_sb->s_blocksize_bits;
|
2014-09-12 18:43:55 +08:00
|
|
|
return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
|
|
|
|
start, (size_t)(end - start + 1));
|
2007-08-30 20:50:51 +08:00
|
|
|
}
|
2007-08-28 04:49:44 +08:00
|
|
|
|
2009-11-12 17:36:34 +08:00
|
|
|
void btrfs_add_delayed_iput(struct inode *inode)
|
|
|
|
{
|
|
|
|
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
|
2015-11-19 21:15:51 +08:00
|
|
|
struct btrfs_inode *binode = BTRFS_I(inode);
|
2009-11-12 17:36:34 +08:00
|
|
|
|
|
|
|
if (atomic_add_unless(&inode->i_count, -1, 1))
|
|
|
|
return;
|
|
|
|
|
|
|
|
spin_lock(&fs_info->delayed_iput_lock);
|
2015-11-19 21:15:51 +08:00
|
|
|
if (binode->delayed_iput_count == 0) {
|
|
|
|
ASSERT(list_empty(&binode->delayed_iput));
|
|
|
|
list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
|
|
|
|
} else {
|
|
|
|
binode->delayed_iput_count++;
|
|
|
|
}
|
2009-11-12 17:36:34 +08:00
|
|
|
spin_unlock(&fs_info->delayed_iput_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
void btrfs_run_delayed_iputs(struct btrfs_root *root)
|
|
|
|
{
|
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
|
|
|
|
|
|
|
spin_lock(&fs_info->delayed_iput_lock);
|
2015-11-19 21:15:51 +08:00
|
|
|
while (!list_empty(&fs_info->delayed_iputs)) {
|
|
|
|
struct btrfs_inode *inode;
|
|
|
|
|
|
|
|
inode = list_first_entry(&fs_info->delayed_iputs,
|
|
|
|
struct btrfs_inode, delayed_iput);
|
|
|
|
if (inode->delayed_iput_count) {
|
|
|
|
inode->delayed_iput_count--;
|
|
|
|
list_move_tail(&inode->delayed_iput,
|
|
|
|
&fs_info->delayed_iputs);
|
|
|
|
} else {
|
|
|
|
list_del_init(&inode->delayed_iput);
|
|
|
|
}
|
|
|
|
spin_unlock(&fs_info->delayed_iput_lock);
|
|
|
|
iput(&inode->vfs_inode);
|
|
|
|
spin_lock(&fs_info->delayed_iput_lock);
|
2009-11-12 17:36:34 +08:00
|
|
|
}
|
2015-11-19 21:15:51 +08:00
|
|
|
spin_unlock(&fs_info->delayed_iput_lock);
|
2009-11-12 17:36:34 +08:00
|
|
|
}
|
|
|
|
|
2010-05-16 22:49:58 +08:00
|
|
|
/*
|
2011-11-29 12:31:00 +08:00
|
|
|
* This is called in transaction commit time. If there are no orphan
|
2010-05-16 22:49:58 +08:00
|
|
|
* files in the subvolume, it removes orphan item and frees block_rsv
|
|
|
|
* structure.
|
|
|
|
*/
|
|
|
|
void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root)
|
|
|
|
{
|
2011-12-03 04:44:12 +08:00
|
|
|
struct btrfs_block_rsv *block_rsv;
|
2010-05-16 22:49:58 +08:00
|
|
|
int ret;
|
|
|
|
|
2012-05-24 02:26:42 +08:00
|
|
|
if (atomic_read(&root->orphan_inodes) ||
|
2010-05-16 22:49:58 +08:00
|
|
|
root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
|
|
|
|
return;
|
|
|
|
|
2011-12-03 04:44:12 +08:00
|
|
|
spin_lock(&root->orphan_lock);
|
2012-05-24 02:26:42 +08:00
|
|
|
if (atomic_read(&root->orphan_inodes)) {
|
2011-12-03 04:44:12 +08:00
|
|
|
spin_unlock(&root->orphan_lock);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
|
|
|
|
spin_unlock(&root->orphan_lock);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
block_rsv = root->orphan_block_rsv;
|
|
|
|
root->orphan_block_rsv = NULL;
|
|
|
|
spin_unlock(&root->orphan_lock);
|
|
|
|
|
2014-04-02 19:51:05 +08:00
|
|
|
if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
|
2010-05-16 22:49:58 +08:00
|
|
|
btrfs_root_refs(&root->root_item) > 0) {
|
|
|
|
ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
|
|
|
|
root->root_key.objectid);
|
2013-08-14 02:10:08 +08:00
|
|
|
if (ret)
|
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
|
|
|
else
|
2014-04-02 19:51:05 +08:00
|
|
|
clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
|
|
|
|
&root->state);
|
2010-05-16 22:49:58 +08:00
|
|
|
}
|
|
|
|
|
2011-12-03 04:44:12 +08:00
|
|
|
if (block_rsv) {
|
|
|
|
WARN_ON(block_rsv->size > 0);
|
|
|
|
btrfs_free_block_rsv(root, block_rsv);
|
2010-05-16 22:49:58 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-07-25 00:17:14 +08:00
|
|
|
/*
|
|
|
|
* This creates an orphan entry for the given inode in case something goes
|
|
|
|
* wrong in the middle of an unlink/truncate.
|
2010-05-16 22:49:58 +08:00
|
|
|
*
|
|
|
|
* NOTE: caller of this function should reserve 5 units of metadata for
|
|
|
|
* this function.
|
2008-07-25 00:17:14 +08:00
|
|
|
*/
|
|
|
|
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2010-05-16 22:49:58 +08:00
|
|
|
struct btrfs_block_rsv *block_rsv = NULL;
|
|
|
|
int reserve = 0;
|
|
|
|
int insert = 0;
|
|
|
|
int ret;
|
2008-07-25 00:17:14 +08:00
|
|
|
|
2010-05-16 22:49:58 +08:00
|
|
|
if (!root->orphan_block_rsv) {
|
2012-09-06 18:02:28 +08:00
|
|
|
block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
|
2011-07-19 15:27:20 +08:00
|
|
|
if (!block_rsv)
|
|
|
|
return -ENOMEM;
|
2010-05-16 22:49:58 +08:00
|
|
|
}
|
2008-07-25 00:17:14 +08:00
|
|
|
|
2010-05-16 22:49:58 +08:00
|
|
|
spin_lock(&root->orphan_lock);
|
|
|
|
if (!root->orphan_block_rsv) {
|
|
|
|
root->orphan_block_rsv = block_rsv;
|
|
|
|
} else if (block_rsv) {
|
|
|
|
btrfs_free_block_rsv(root, block_rsv);
|
|
|
|
block_rsv = NULL;
|
2008-07-25 00:17:14 +08:00
|
|
|
}
|
|
|
|
|
2012-05-24 02:26:42 +08:00
|
|
|
if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
|
|
|
|
&BTRFS_I(inode)->runtime_flags)) {
|
2010-05-16 22:49:58 +08:00
|
|
|
#if 0
|
|
|
|
/*
|
|
|
|
* For proper ENOSPC handling, we should do orphan
|
|
|
|
* cleanup when mounting. But this introduces backward
|
|
|
|
* compatibility issue.
|
|
|
|
*/
|
|
|
|
if (!xchg(&root->orphan_item_inserted, 1))
|
|
|
|
insert = 2;
|
|
|
|
else
|
|
|
|
insert = 1;
|
|
|
|
#endif
|
|
|
|
insert = 1;
|
2012-08-29 12:13:02 +08:00
|
|
|
atomic_inc(&root->orphan_inodes);
|
2008-07-25 00:17:14 +08:00
|
|
|
}
|
|
|
|
|
2012-05-24 02:13:11 +08:00
|
|
|
if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
|
|
|
|
&BTRFS_I(inode)->runtime_flags))
|
2010-05-16 22:49:58 +08:00
|
|
|
reserve = 1;
|
|
|
|
spin_unlock(&root->orphan_lock);
|
2008-07-25 00:17:14 +08:00
|
|
|
|
2010-05-16 22:49:58 +08:00
|
|
|
/* grab metadata reservation from transaction handle */
|
|
|
|
if (reserve) {
|
|
|
|
ret = btrfs_orphan_reserve_metadata(trans, inode);
|
2012-03-12 23:03:00 +08:00
|
|
|
BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */
|
2010-05-16 22:49:58 +08:00
|
|
|
}
|
2008-07-25 00:17:14 +08:00
|
|
|
|
2010-05-16 22:49:58 +08:00
|
|
|
/* insert an orphan item to track this unlinked/truncated file */
|
|
|
|
if (insert >= 1) {
|
2011-04-20 10:31:50 +08:00
|
|
|
ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
|
2013-08-14 02:10:08 +08:00
|
|
|
if (ret) {
|
2013-09-23 04:54:55 +08:00
|
|
|
atomic_dec(&root->orphan_inodes);
|
2013-08-14 02:10:08 +08:00
|
|
|
if (reserve) {
|
|
|
|
clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
|
|
|
|
&BTRFS_I(inode)->runtime_flags);
|
|
|
|
btrfs_orphan_release_metadata(inode);
|
|
|
|
}
|
|
|
|
if (ret != -EEXIST) {
|
2013-08-22 03:54:00 +08:00
|
|
|
clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
|
|
|
|
&BTRFS_I(inode)->runtime_flags);
|
2013-08-14 02:10:08 +08:00
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
|
|
|
return ret;
|
|
|
|
}
|
2012-03-12 23:03:00 +08:00
|
|
|
}
|
|
|
|
ret = 0;
|
2010-05-16 22:49:58 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* insert an orphan item to track subvolume contains orphan files */
|
|
|
|
if (insert >= 2) {
|
|
|
|
ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
|
|
|
|
root->root_key.objectid);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret && ret != -EEXIST) {
|
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
|
|
|
return ret;
|
|
|
|
}
|
2010-05-16 22:49:58 +08:00
|
|
|
}
|
|
|
|
return 0;
|
2008-07-25 00:17:14 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We have done the truncate/delete so we can go ahead and remove the orphan
|
|
|
|
* item for this particular inode.
|
|
|
|
*/
|
2013-04-26 04:41:01 +08:00
|
|
|
static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
|
|
|
|
struct inode *inode)
|
2008-07-25 00:17:14 +08:00
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2010-05-16 22:49:58 +08:00
|
|
|
int delete_item = 0;
|
|
|
|
int release_rsv = 0;
|
2008-07-25 00:17:14 +08:00
|
|
|
int ret = 0;
|
|
|
|
|
2010-05-16 22:49:58 +08:00
|
|
|
spin_lock(&root->orphan_lock);
|
2012-05-24 02:26:42 +08:00
|
|
|
if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
|
|
|
|
&BTRFS_I(inode)->runtime_flags))
|
2010-05-16 22:49:58 +08:00
|
|
|
delete_item = 1;
|
2008-07-25 00:17:14 +08:00
|
|
|
|
2012-05-24 02:13:11 +08:00
|
|
|
if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
|
|
|
|
&BTRFS_I(inode)->runtime_flags))
|
2010-05-16 22:49:58 +08:00
|
|
|
release_rsv = 1;
|
|
|
|
spin_unlock(&root->orphan_lock);
|
2008-07-25 00:17:14 +08:00
|
|
|
|
2013-09-23 04:54:55 +08:00
|
|
|
if (delete_item) {
|
2012-05-24 02:26:42 +08:00
|
|
|
atomic_dec(&root->orphan_inodes);
|
2013-09-23 04:54:55 +08:00
|
|
|
if (trans)
|
|
|
|
ret = btrfs_del_orphan_item(trans, root,
|
|
|
|
btrfs_ino(inode));
|
2012-05-24 02:26:42 +08:00
|
|
|
}
|
2008-07-25 00:17:14 +08:00
|
|
|
|
2013-09-23 04:54:55 +08:00
|
|
|
if (release_rsv)
|
|
|
|
btrfs_orphan_release_metadata(inode);
|
|
|
|
|
2013-08-14 02:10:08 +08:00
|
|
|
return ret;
|
2008-07-25 00:17:14 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* this cleans up any orphans that may be left on the list from the last use
|
|
|
|
* of this root.
|
|
|
|
*/
|
2011-02-01 05:22:42 +08:00
|
|
|
int btrfs_orphan_cleanup(struct btrfs_root *root)
|
2008-07-25 00:17:14 +08:00
|
|
|
{
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct btrfs_key key, found_key;
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct inode *inode;
|
2011-09-27 03:55:20 +08:00
|
|
|
u64 last_objectid = 0;
|
2008-07-25 00:17:14 +08:00
|
|
|
int ret = 0, nr_unlink = 0, nr_truncate = 0;
|
|
|
|
|
2010-05-16 22:49:58 +08:00
|
|
|
if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
|
2011-02-01 05:22:42 +08:00
|
|
|
return 0;
|
2009-11-12 17:34:40 +08:00
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
2011-02-01 05:22:42 +08:00
|
|
|
if (!path) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
2015-11-27 23:31:35 +08:00
|
|
|
path->reada = READA_BACK;
|
2008-07-25 00:17:14 +08:00
|
|
|
|
|
|
|
key.objectid = BTRFS_ORPHAN_OBJECTID;
|
2014-06-05 00:41:45 +08:00
|
|
|
key.type = BTRFS_ORPHAN_ITEM_KEY;
|
2008-07-25 00:17:14 +08:00
|
|
|
key.offset = (u64)-1;
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
2011-02-01 05:22:42 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2008-07-25 00:17:14 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* if ret == 0 means we found what we were searching for, which
|
2011-03-31 09:57:33 +08:00
|
|
|
* is weird, but possible, so only screw with path if we didn't
|
2008-07-25 00:17:14 +08:00
|
|
|
* find the key and see if we have stuff that matches
|
|
|
|
*/
|
|
|
|
if (ret > 0) {
|
2011-02-01 05:22:42 +08:00
|
|
|
ret = 0;
|
2008-07-25 00:17:14 +08:00
|
|
|
if (path->slots[0] == 0)
|
|
|
|
break;
|
|
|
|
path->slots[0]--;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* pull out the item */
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
|
|
|
|
|
|
|
|
/* make sure the item matches what we want */
|
|
|
|
if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
|
|
|
|
break;
|
2014-06-05 00:41:45 +08:00
|
|
|
if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
|
2008-07-25 00:17:14 +08:00
|
|
|
break;
|
|
|
|
|
|
|
|
/* release the path since we're done with it */
|
2011-04-21 07:20:15 +08:00
|
|
|
btrfs_release_path(path);
|
2008-07-25 00:17:14 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* this is where we are basically btrfs_lookup, without the
|
|
|
|
* crossing root thing. we store the inode number in the
|
|
|
|
* offset of the orphan item.
|
|
|
|
*/
|
2011-09-27 03:55:20 +08:00
|
|
|
|
|
|
|
if (found_key.offset == last_objectid) {
|
2013-03-20 06:41:23 +08:00
|
|
|
btrfs_err(root->fs_info,
|
|
|
|
"Error removing orphan entry, stopping orphan cleanup");
|
2011-09-27 03:55:20 +08:00
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
last_objectid = found_key.offset;
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
found_key.objectid = found_key.offset;
|
|
|
|
found_key.type = BTRFS_INODE_ITEM_KEY;
|
|
|
|
found_key.offset = 0;
|
Btrfs: change how we mount subvolumes
This work is in preperation for being able to set a different root as the
default mounting root.
There is currently a problem with how we mount subvolumes. We cannot currently
mount a subvolume of a subvolume, you can only mount subvolumes/snapshots of the
default subvolume. So say you take a snapshot of the default subvolume and call
it snap1, and then take a snapshot of snap1 and call it snap2, so now you have
/
/snap1
/snap1/snap2
as your available volumes. Currently you can only mount / and /snap1,
you cannot mount /snap1/snap2. To fix this problem instead of passing
subvolid=<name> you must pass in subvolid=<treeid>, where <treeid> is
the tree id that gets spit out via the subvolume listing you get from
the subvolume listing patches (btrfs filesystem list). This allows us
to mount /, /snap1 and /snap1/snap2 as the root volume.
In addition to the above, we also now read the default dir item in the
tree root to get the root key that it points to. For now this just
points at what has always been the default subvolme, but later on I plan
to change it to point at whatever root you want to be the new default
root, so you can just set the default mount and not have to mount with
-o subvolid=<treeid>. I tested this out with the above scenario and it
worked perfectly. Thanks,
mount -o subvol operates inside the selected subvolid. For example:
mount -o subvol=snap1,subvolid=256 /dev/xxx /mnt
/mnt will have the snap1 directory for the subvolume with id
256.
mount -o subvol=snap /dev/xxx /mnt
/mnt will be the snap directory of whatever the default subvolume
is.
Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-12-05 01:38:27 +08:00
|
|
|
inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
|
2013-07-15 09:50:32 +08:00
|
|
|
ret = PTR_ERR_OR_ZERO(inode);
|
2011-09-22 04:55:59 +08:00
|
|
|
if (ret && ret != -ESTALE)
|
2011-02-01 05:22:42 +08:00
|
|
|
goto out;
|
2008-07-25 00:17:14 +08:00
|
|
|
|
2011-12-15 09:12:02 +08:00
|
|
|
if (ret == -ESTALE && root == root->fs_info->tree_root) {
|
|
|
|
struct btrfs_root *dead_root;
|
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
|
|
|
int is_dead_root = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* this is an orphan in the tree root. Currently these
|
|
|
|
* could come from 2 sources:
|
|
|
|
* a) a snapshot deletion in progress
|
|
|
|
* b) a free space cache inode
|
|
|
|
* We need to distinguish those two, as the snapshot
|
|
|
|
* orphan must not get deleted.
|
|
|
|
* find_dead_roots already ran before us, so if this
|
|
|
|
* is a snapshot deletion, we should find the root
|
|
|
|
* in the dead_roots list
|
|
|
|
*/
|
|
|
|
spin_lock(&fs_info->trans_lock);
|
|
|
|
list_for_each_entry(dead_root, &fs_info->dead_roots,
|
|
|
|
root_list) {
|
|
|
|
if (dead_root->root_key.objectid ==
|
|
|
|
found_key.objectid) {
|
|
|
|
is_dead_root = 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
spin_unlock(&fs_info->trans_lock);
|
|
|
|
if (is_dead_root) {
|
|
|
|
/* prevent this orphan from being found again */
|
|
|
|
key.offset = found_key.objectid - 1;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
2008-07-25 00:17:14 +08:00
|
|
|
/*
|
2011-09-22 04:55:59 +08:00
|
|
|
* Inode is already gone but the orphan item is still there,
|
|
|
|
* kill the orphan item.
|
2008-07-25 00:17:14 +08:00
|
|
|
*/
|
2011-09-22 04:55:59 +08:00
|
|
|
if (ret == -ESTALE) {
|
|
|
|
trans = btrfs_start_transaction(root, 1);
|
2011-02-01 05:22:42 +08:00
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
goto out;
|
|
|
|
}
|
2013-03-20 06:41:23 +08:00
|
|
|
btrfs_debug(root->fs_info, "auto deleting %Lu",
|
|
|
|
found_key.objectid);
|
2011-09-22 04:55:59 +08:00
|
|
|
ret = btrfs_del_orphan_item(trans, root,
|
|
|
|
found_key.objectid);
|
2008-09-26 22:05:38 +08:00
|
|
|
btrfs_end_transaction(trans, root);
|
2013-08-14 02:10:08 +08:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
2008-07-25 00:17:14 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2011-09-22 04:55:59 +08:00
|
|
|
/*
|
|
|
|
* add this inode to the orphan list so btrfs_orphan_del does
|
|
|
|
* the proper thing when we hit it
|
|
|
|
*/
|
2012-05-24 02:26:42 +08:00
|
|
|
set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
|
|
|
|
&BTRFS_I(inode)->runtime_flags);
|
2013-02-02 04:57:47 +08:00
|
|
|
atomic_inc(&root->orphan_inodes);
|
2011-09-22 04:55:59 +08:00
|
|
|
|
2008-07-25 00:17:14 +08:00
|
|
|
/* if we have links, this was a truncate, lets do that */
|
|
|
|
if (inode->i_nlink) {
|
2013-10-31 13:00:08 +08:00
|
|
|
if (WARN_ON(!S_ISREG(inode->i_mode))) {
|
2011-02-01 04:30:16 +08:00
|
|
|
iput(inode);
|
|
|
|
continue;
|
|
|
|
}
|
2008-07-25 00:17:14 +08:00
|
|
|
nr_truncate++;
|
2013-01-08 06:03:21 +08:00
|
|
|
|
|
|
|
/* 1 for the orphan item deletion. */
|
|
|
|
trans = btrfs_start_transaction(root, 1);
|
|
|
|
if (IS_ERR(trans)) {
|
2013-06-04 04:51:23 +08:00
|
|
|
iput(inode);
|
2013-01-08 06:03:21 +08:00
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
ret = btrfs_orphan_add(trans, inode);
|
|
|
|
btrfs_end_transaction(trans, root);
|
2013-06-04 04:51:23 +08:00
|
|
|
if (ret) {
|
|
|
|
iput(inode);
|
2013-01-08 06:03:21 +08:00
|
|
|
goto out;
|
2013-06-04 04:51:23 +08:00
|
|
|
}
|
2013-01-08 06:03:21 +08:00
|
|
|
|
2011-02-01 05:22:42 +08:00
|
|
|
ret = btrfs_truncate(inode);
|
2013-02-08 05:27:28 +08:00
|
|
|
if (ret)
|
|
|
|
btrfs_orphan_del(NULL, inode);
|
2008-07-25 00:17:14 +08:00
|
|
|
} else {
|
|
|
|
nr_unlink++;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* this will do delete_inode and everything for us */
|
|
|
|
iput(inode);
|
2011-02-01 05:22:42 +08:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
2008-07-25 00:17:14 +08:00
|
|
|
}
|
2011-11-11 09:45:05 +08:00
|
|
|
/* release the path since we're done with it */
|
|
|
|
btrfs_release_path(path);
|
|
|
|
|
2010-05-16 22:49:58 +08:00
|
|
|
root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
|
|
|
|
|
|
|
|
if (root->orphan_block_rsv)
|
|
|
|
btrfs_block_rsv_release(root, root->orphan_block_rsv,
|
|
|
|
(u64)-1);
|
|
|
|
|
2014-04-02 19:51:05 +08:00
|
|
|
if (root->orphan_block_rsv ||
|
|
|
|
test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
|
2011-04-14 00:54:33 +08:00
|
|
|
trans = btrfs_join_transaction(root);
|
2011-02-01 05:22:42 +08:00
|
|
|
if (!IS_ERR(trans))
|
|
|
|
btrfs_end_transaction(trans, root);
|
2010-05-16 22:49:58 +08:00
|
|
|
}
|
2008-07-25 00:17:14 +08:00
|
|
|
|
|
|
|
if (nr_unlink)
|
2013-03-20 21:31:27 +08:00
|
|
|
btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink);
|
2008-07-25 00:17:14 +08:00
|
|
|
if (nr_truncate)
|
2013-03-20 21:31:27 +08:00
|
|
|
btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate);
|
2011-02-01 05:22:42 +08:00
|
|
|
|
|
|
|
out:
|
|
|
|
if (ret)
|
2014-12-20 01:38:37 +08:00
|
|
|
btrfs_err(root->fs_info,
|
2013-03-20 06:41:23 +08:00
|
|
|
"could not do orphan cleanup %d", ret);
|
2011-02-01 05:22:42 +08:00
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
2008-07-25 00:17:14 +08:00
|
|
|
}
|
|
|
|
|
2009-04-27 23:47:50 +08:00
|
|
|
/*
|
|
|
|
* very simple check to peek ahead in the leaf looking for xattrs. If we
|
|
|
|
* don't find any xattrs, we know there can't be any acls.
|
|
|
|
*
|
|
|
|
* slot is the slot the inode is in, objectid is the objectid of the inode
|
|
|
|
*/
|
|
|
|
static noinline int acls_after_inode_item(struct extent_buffer *leaf,
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
The unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essencialy because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir intex items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 19:47:46 +08:00
|
|
|
int slot, u64 objectid,
|
|
|
|
int *first_xattr_slot)
|
2009-04-27 23:47:50 +08:00
|
|
|
{
|
|
|
|
u32 nritems = btrfs_header_nritems(leaf);
|
|
|
|
struct btrfs_key found_key;
|
2013-06-19 22:16:26 +08:00
|
|
|
static u64 xattr_access = 0;
|
|
|
|
static u64 xattr_default = 0;
|
2009-04-27 23:47:50 +08:00
|
|
|
int scanned = 0;
|
|
|
|
|
2013-06-19 22:16:26 +08:00
|
|
|
if (!xattr_access) {
|
2015-12-02 21:44:35 +08:00
|
|
|
xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
|
|
|
|
strlen(XATTR_NAME_POSIX_ACL_ACCESS));
|
|
|
|
xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
|
|
|
|
strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
|
2013-06-19 22:16:26 +08:00
|
|
|
}
|
|
|
|
|
2009-04-27 23:47:50 +08:00
|
|
|
slot++;
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
The unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essencialy because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir intex items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 19:47:46 +08:00
|
|
|
*first_xattr_slot = -1;
|
2009-04-27 23:47:50 +08:00
|
|
|
while (slot < nritems) {
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, slot);
|
|
|
|
|
|
|
|
/* we found a different objectid, there must not be acls */
|
|
|
|
if (found_key.objectid != objectid)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* we found an xattr, assume we've got an acl */
|
2013-06-19 22:16:26 +08:00
|
|
|
if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
The unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essencialy because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir intex items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 19:47:46 +08:00
|
|
|
if (*first_xattr_slot == -1)
|
|
|
|
*first_xattr_slot = slot;
|
2013-06-19 22:16:26 +08:00
|
|
|
if (found_key.offset == xattr_access ||
|
|
|
|
found_key.offset == xattr_default)
|
|
|
|
return 1;
|
|
|
|
}
|
2009-04-27 23:47:50 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* we found a key greater than an xattr key, there can't
|
|
|
|
* be any acls later on
|
|
|
|
*/
|
|
|
|
if (found_key.type > BTRFS_XATTR_ITEM_KEY)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
slot++;
|
|
|
|
scanned++;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* it goes inode, inode backrefs, xattrs, extents,
|
|
|
|
* so if there are a ton of hard links to an inode there can
|
|
|
|
* be a lot of backrefs. Don't waste time searching too hard,
|
|
|
|
* this is just an optimization
|
|
|
|
*/
|
|
|
|
if (scanned >= 8)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
/* we hit the end of the leaf before we found an xattr or
|
|
|
|
* something larger than an xattr. We have to assume the inode
|
|
|
|
* has acls
|
|
|
|
*/
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
The unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essencialy because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir intex items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 19:47:46 +08:00
|
|
|
if (*first_xattr_slot == -1)
|
|
|
|
*first_xattr_slot = slot;
|
2009-04-27 23:47:50 +08:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2008-09-30 03:18:18 +08:00
|
|
|
/*
|
|
|
|
* read an inode from the btree into the in-memory inode
|
|
|
|
*/
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
static void btrfs_read_locked_inode(struct inode *inode)
|
2007-06-12 18:35:45 +08:00
|
|
|
{
|
|
|
|
struct btrfs_path *path;
|
2007-10-16 04:14:19 +08:00
|
|
|
struct extent_buffer *leaf;
|
2007-06-12 18:35:45 +08:00
|
|
|
struct btrfs_inode_item *inode_item;
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_key location;
|
2013-12-26 13:07:06 +08:00
|
|
|
unsigned long ptr;
|
2009-04-27 23:47:50 +08:00
|
|
|
int maybe_acls;
|
2007-07-11 22:18:17 +08:00
|
|
|
u32 rdev;
|
2007-06-12 18:35:45 +08:00
|
|
|
int ret;
|
2011-06-23 15:27:13 +08:00
|
|
|
bool filled = false;
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
The unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essencialy because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir intex items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 19:47:46 +08:00
|
|
|
int first_xattr_slot;
|
2011-06-23 15:27:13 +08:00
|
|
|
|
|
|
|
ret = btrfs_fill_inode(inode, &rdev);
|
|
|
|
if (!ret)
|
|
|
|
filled = true;
|
2007-06-12 18:35:45 +08:00
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
2011-07-13 02:25:31 +08:00
|
|
|
if (!path)
|
|
|
|
goto make_bad;
|
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
|
2008-01-09 04:46:30 +08:00
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
|
2007-10-16 04:14:19 +08:00
|
|
|
if (ret)
|
2007-06-12 18:35:45 +08:00
|
|
|
goto make_bad;
|
|
|
|
|
2007-10-16 04:14:19 +08:00
|
|
|
leaf = path->nodes[0];
|
2011-06-23 15:27:13 +08:00
|
|
|
|
|
|
|
if (filled)
|
2013-12-26 13:07:06 +08:00
|
|
|
goto cache_index;
|
2011-06-23 15:27:13 +08:00
|
|
|
|
2007-10-16 04:14:19 +08:00
|
|
|
inode_item = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_inode_item);
|
|
|
|
inode->i_mode = btrfs_inode_mode(leaf, inode_item);
|
2011-10-28 20:13:29 +08:00
|
|
|
set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
|
2012-02-11 03:05:07 +08:00
|
|
|
i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
|
|
|
|
i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
|
2008-07-18 00:54:05 +08:00
|
|
|
btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2014-12-13 00:39:12 +08:00
|
|
|
inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
|
|
|
|
inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2014-12-13 00:39:12 +08:00
|
|
|
inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
|
|
|
|
inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2014-12-13 00:39:12 +08:00
|
|
|
inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
|
|
|
|
inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2012-07-04 15:18:07 +08:00
|
|
|
BTRFS_I(inode)->i_otime.tv_sec =
|
|
|
|
btrfs_timespec_sec(leaf, &inode_item->otime);
|
|
|
|
BTRFS_I(inode)->i_otime.tv_nsec =
|
|
|
|
btrfs_timespec_nsec(leaf, &inode_item->otime);
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2008-10-09 23:46:29 +08:00
|
|
|
inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
|
2008-09-06 04:13:11 +08:00
|
|
|
BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-18 01:14:17 +08:00
|
|
|
BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
|
|
|
|
|
2015-04-09 12:08:43 +08:00
|
|
|
inode->i_version = btrfs_inode_sequence(leaf, inode_item);
|
|
|
|
inode->i_generation = BTRFS_I(inode)->generation;
|
|
|
|
inode->i_rdev = 0;
|
|
|
|
rdev = btrfs_inode_rdev(leaf, inode_item);
|
|
|
|
|
|
|
|
BTRFS_I(inode)->index_cnt = (u64)-1;
|
|
|
|
BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
|
|
|
|
|
|
|
|
cache_index:
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-18 01:14:17 +08:00
|
|
|
/*
|
|
|
|
* If we were modified in the current generation and evicted from memory
|
|
|
|
* and then re-read we need to do a full sync since we don't have any
|
|
|
|
* idea about which extents were modified before we were evicted from
|
|
|
|
* cache.
|
2015-04-09 12:08:43 +08:00
|
|
|
*
|
|
|
|
* This is required for both inode re-read from disk and delayed inode
|
|
|
|
* in delayed_nodes_tree.
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-18 01:14:17 +08:00
|
|
|
*/
|
|
|
|
if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
|
|
|
|
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
|
|
|
|
&BTRFS_I(inode)->runtime_flags);
|
|
|
|
|
Btrfs: fix stale dir entries after unlink, inode eviction and fsync
If we remove a hard link from an inode, the inode gets evicted, then
we fsync the inode and then power fail/crash, when the log tree is
replayed, the parent directory inode still has entries pointing to
the name that no longer exists, while our inode no longer has the
BTRFS_INODE_REF_KEY item matching the deleted hard link (as expected),
leaving the filesystem in an inconsistent state. The stale directory
entries can not be deleted (an attempt to delete them causes -ESTALE
errors), which makes it impossible to delete the parent directory.
This happens because we track the id of the transaction where the last
unlink operation for the inode happened (last_unlink_trans) in an
in-memory only field of the inode, that is, a value that is never
persisted in the inode item stored on the fs/subvol btree. So if an
inode is evicted and loaded again, the value for last_unlink_trans is
set to 0, which prevents the fsync from logging the parent directory
at btrfs_log_inode_parent(). So fix this by setting last_unlink_trans
to the id of the transaction that last modified the inode when we
load the inode. This is a pessimistic approach but it always ensures
correctness with the trade off of ocassional full transaction commits
when an fsync is done against the inode in the same transaction where
it was evicted and reloaded when our inode is a directory and often
logging its parent unnecessarily when our inode is not a directory.
The following test case for fstests triggers the problem:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
_cleanup_flakey
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
. ./common/dmflakey
# real QA test starts here
_need_to_be_root
_supported_fs generic
_supported_os Linux
_require_scratch
_require_dm_flakey
_require_metadata_journaling $SCRATCH_DEV
rm -f $seqres.full
_scratch_mkfs >>$seqres.full 2>&1
_init_flakey
_mount_flakey
# Create our test file with 2 hard links.
mkdir $SCRATCH_MNT/testdir
touch $SCRATCH_MNT/testdir/foo
ln $SCRATCH_MNT/testdir/foo $SCRATCH_MNT/testdir/bar
# Make sure everything done so far is durably persisted.
sync
# Now remove one of the links, trigger inode eviction and then fsync
# our inode.
unlink $SCRATCH_MNT/testdir/bar
echo 2 > /proc/sys/vm/drop_caches
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/testdir/foo
# Silently drop all writes on our scratch device to simulate a power failure.
_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey
# Allow writes again and mount the fs to trigger log/journal replay.
_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey
# Now verify our directory entries.
echo "Entries in testdir:"
ls -1 $SCRATCH_MNT/testdir
# If we remove our inode, its parent should become empty and therefore we should
# be able to remove the parent.
rm -f $SCRATCH_MNT/testdir/*
rmdir $SCRATCH_MNT/testdir
_unmount_flakey
# The fstests framework will call fsck against our filesystem which will verify
# that all metadata is in a consistent state.
status=0
exit
The test failed on btrfs with:
generic/098 4s ... - output mismatch (see /home/fdmanana/git/hub/xfstests/results//generic/098.out.bad)
--- tests/generic/098.out 2015-07-23 18:01:12.616175932 +0100
+++ /home/fdmanana/git/hub/xfstests/results//generic/098.out.bad 2015-07-23 18:04:58.924138308 +0100
@@ -1,3 +1,6 @@
QA output created by 098
Entries in testdir:
+bar
foo
+rm: cannot remove '/home/fdmanana/btrfs-tests/scratch_1/testdir/foo': Stale file handle
+rmdir: failed to remove '/home/fdmanana/btrfs-tests/scratch_1/testdir': Directory not empty
...
(Run 'diff -u tests/generic/098.out /home/fdmanana/git/hub/xfstests/results//generic/098.out.bad' to see the entire diff)
_check_btrfs_filesystem: filesystem on /dev/sdc is inconsistent (see /home/fdmanana/git/hub/xfstests/results//generic/098.full)
$ cat /home/fdmanana/git/hub/xfstests/results//generic/098.full
(...)
checking fs roots
root 5 inode 258 errors 2001, no inode item, link count wrong
unresolved ref dir 257 index 0 namelen 3 name foo filetype 1 errors 6, no dir index, no inode ref
unresolved ref dir 257 index 3 namelen 3 name bar filetype 1 errors 5, no dir item, no inode ref
Checking filesystem on /dev/sdc
(...)
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-07-24 07:00:19 +08:00
|
|
|
/*
|
|
|
|
* We don't persist the id of the transaction where an unlink operation
|
|
|
|
* against the inode was last made. So here we assume the inode might
|
|
|
|
* have been evicted, and therefore the exact value of last_unlink_trans
|
|
|
|
* lost, and set it to last_trans to avoid metadata inconsistencies
|
|
|
|
* between the inode and its parent if the inode is fsync'ed and the log
|
|
|
|
* replayed. For example, in the scenario:
|
|
|
|
*
|
|
|
|
* touch mydir/foo
|
|
|
|
* ln mydir/foo mydir/bar
|
|
|
|
* sync
|
|
|
|
* unlink mydir/bar
|
|
|
|
* echo 2 > /proc/sys/vm/drop_caches # evicts inode
|
|
|
|
* xfs_io -c fsync mydir/foo
|
|
|
|
* <power failure>
|
|
|
|
* mount fs, triggers fsync log replay
|
|
|
|
*
|
|
|
|
* We must make sure that when we fsync our inode foo we also log its
|
|
|
|
* parent inode, otherwise after log replay the parent still has the
|
|
|
|
* dentry with the "bar" name but our inode foo has a link count of 1
|
|
|
|
* and doesn't have an inode ref with the name "bar" anymore.
|
|
|
|
*
|
|
|
|
* Setting last_unlink_trans to last_trans is a pessimistic approach,
|
|
|
|
* but it guarantees correctness at the expense of ocassional full
|
|
|
|
* transaction commits on fsync if our inode is a directory, or if our
|
|
|
|
* inode is not a directory, logging its parent unnecessarily.
|
|
|
|
*/
|
|
|
|
BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
|
|
|
|
|
2013-12-26 13:07:06 +08:00
|
|
|
path->slots[0]++;
|
|
|
|
if (inode->i_nlink != 1 ||
|
|
|
|
path->slots[0] >= btrfs_header_nritems(leaf))
|
|
|
|
goto cache_acl;
|
|
|
|
|
|
|
|
btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
|
|
|
|
if (location.objectid != btrfs_ino(inode))
|
|
|
|
goto cache_acl;
|
|
|
|
|
|
|
|
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
|
|
|
|
if (location.type == BTRFS_INODE_REF_KEY) {
|
|
|
|
struct btrfs_inode_ref *ref;
|
|
|
|
|
|
|
|
ref = (struct btrfs_inode_ref *)ptr;
|
|
|
|
BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
|
|
|
|
} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
|
|
|
|
struct btrfs_inode_extref *extref;
|
|
|
|
|
|
|
|
extref = (struct btrfs_inode_extref *)ptr;
|
|
|
|
BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
|
|
|
|
extref);
|
|
|
|
}
|
2011-06-23 15:27:13 +08:00
|
|
|
cache_acl:
|
2009-04-27 23:47:50 +08:00
|
|
|
/*
|
|
|
|
* try to precache a NULL acl entry for files that don't have
|
|
|
|
* any xattrs or acls
|
|
|
|
*/
|
2011-04-20 10:31:50 +08:00
|
|
|
maybe_acls = acls_after_inode_item(leaf, path->slots[0],
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
The unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essencialy because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir intex items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 19:47:46 +08:00
|
|
|
btrfs_ino(inode), &first_xattr_slot);
|
|
|
|
if (first_xattr_slot != -1) {
|
|
|
|
path->slots[0] = first_xattr_slot;
|
|
|
|
ret = btrfs_load_inode_props(inode, path);
|
|
|
|
if (ret)
|
|
|
|
btrfs_err(root->fs_info,
|
2014-05-15 22:48:20 +08:00
|
|
|
"error loading props for ino %llu (root %llu): %d",
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
The unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essencialy because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir intex items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 19:47:46 +08:00
|
|
|
btrfs_ino(inode),
|
|
|
|
root->root_key.objectid, ret);
|
|
|
|
}
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
2009-06-25 04:58:48 +08:00
|
|
|
if (!maybe_acls)
|
|
|
|
cache_no_acl(inode);
|
2009-04-27 23:47:50 +08:00
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
switch (inode->i_mode & S_IFMT) {
|
|
|
|
case S_IFREG:
|
|
|
|
inode->i_mapping->a_ops = &btrfs_aops;
|
2008-01-25 05:13:08 +08:00
|
|
|
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
|
2007-06-12 18:35:45 +08:00
|
|
|
inode->i_fop = &btrfs_file_operations;
|
|
|
|
inode->i_op = &btrfs_file_inode_operations;
|
|
|
|
break;
|
|
|
|
case S_IFDIR:
|
|
|
|
inode->i_fop = &btrfs_dir_file_operations;
|
|
|
|
if (root == root->fs_info->tree_root)
|
|
|
|
inode->i_op = &btrfs_dir_ro_inode_operations;
|
|
|
|
else
|
|
|
|
inode->i_op = &btrfs_dir_inode_operations;
|
|
|
|
break;
|
|
|
|
case S_IFLNK:
|
|
|
|
inode->i_op = &btrfs_symlink_inode_operations;
|
2015-11-17 14:07:57 +08:00
|
|
|
inode_nohighmem(inode);
|
2007-06-12 18:35:45 +08:00
|
|
|
inode->i_mapping->a_ops = &btrfs_symlink_aops;
|
|
|
|
break;
|
2007-07-11 22:18:17 +08:00
|
|
|
default:
|
2009-02-04 22:29:13 +08:00
|
|
|
inode->i_op = &btrfs_special_inode_operations;
|
2007-07-11 22:18:17 +08:00
|
|
|
init_special_inode(inode, inode->i_mode, rdev);
|
|
|
|
break;
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
2009-04-17 16:37:41 +08:00
|
|
|
|
|
|
|
btrfs_update_iflags(inode);
|
2007-06-12 18:35:45 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
make_bad:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
make_bad_inode(inode);
|
|
|
|
}
|
|
|
|
|
2008-09-30 03:18:18 +08:00
|
|
|
/*
|
|
|
|
* given a leaf and an inode, copy the inode fields into the leaf
|
|
|
|
*/
|
2008-09-06 04:13:11 +08:00
|
|
|
static void fill_inode_item(struct btrfs_trans_handle *trans,
|
|
|
|
struct extent_buffer *leaf,
|
2007-10-16 04:14:19 +08:00
|
|
|
struct btrfs_inode_item *item,
|
2007-06-12 18:35:45 +08:00
|
|
|
struct inode *inode)
|
|
|
|
{
|
2012-12-27 17:01:21 +08:00
|
|
|
struct btrfs_map_token token;
|
|
|
|
|
|
|
|
btrfs_init_map_token(&token);
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2012-12-27 17:01:21 +08:00
|
|
|
btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
|
|
|
|
btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
|
|
|
|
btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
|
|
|
|
&token);
|
|
|
|
btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
|
|
|
|
btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2014-12-13 00:39:12 +08:00
|
|
|
btrfs_set_token_timespec_sec(leaf, &item->atime,
|
2012-12-27 17:01:21 +08:00
|
|
|
inode->i_atime.tv_sec, &token);
|
2014-12-13 00:39:12 +08:00
|
|
|
btrfs_set_token_timespec_nsec(leaf, &item->atime,
|
2012-12-27 17:01:21 +08:00
|
|
|
inode->i_atime.tv_nsec, &token);
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2014-12-13 00:39:12 +08:00
|
|
|
btrfs_set_token_timespec_sec(leaf, &item->mtime,
|
2012-12-27 17:01:21 +08:00
|
|
|
inode->i_mtime.tv_sec, &token);
|
2014-12-13 00:39:12 +08:00
|
|
|
btrfs_set_token_timespec_nsec(leaf, &item->mtime,
|
2012-12-27 17:01:21 +08:00
|
|
|
inode->i_mtime.tv_nsec, &token);
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2014-12-13 00:39:12 +08:00
|
|
|
btrfs_set_token_timespec_sec(leaf, &item->ctime,
|
2012-12-27 17:01:21 +08:00
|
|
|
inode->i_ctime.tv_sec, &token);
|
2014-12-13 00:39:12 +08:00
|
|
|
btrfs_set_token_timespec_nsec(leaf, &item->ctime,
|
2012-12-27 17:01:21 +08:00
|
|
|
inode->i_ctime.tv_nsec, &token);
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2012-07-04 15:18:07 +08:00
|
|
|
btrfs_set_token_timespec_sec(leaf, &item->otime,
|
|
|
|
BTRFS_I(inode)->i_otime.tv_sec, &token);
|
|
|
|
btrfs_set_token_timespec_nsec(leaf, &item->otime,
|
|
|
|
BTRFS_I(inode)->i_otime.tv_nsec, &token);
|
|
|
|
|
2012-12-27 17:01:21 +08:00
|
|
|
btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
|
|
|
|
&token);
|
|
|
|
btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
|
|
|
|
&token);
|
|
|
|
btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
|
|
|
|
btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
|
|
|
|
btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
|
|
|
|
btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
|
|
|
|
btrfs_set_token_inode_block_group(leaf, item, 0, &token);
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
|
|
|
|
2008-09-30 03:18:18 +08:00
|
|
|
/*
|
|
|
|
* copy everything in the in-memory inode into the btree.
|
|
|
|
*/
|
2011-11-11 09:39:08 +08:00
|
|
|
static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
|
2009-01-06 10:25:51 +08:00
|
|
|
struct btrfs_root *root, struct inode *inode)
|
2007-06-12 18:35:45 +08:00
|
|
|
{
|
|
|
|
struct btrfs_inode_item *inode_item;
|
|
|
|
struct btrfs_path *path;
|
2007-10-16 04:14:19 +08:00
|
|
|
struct extent_buffer *leaf;
|
2007-06-12 18:35:45 +08:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 18:12:22 +08:00
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2009-03-13 23:00:37 +08:00
|
|
|
path->leave_spinning = 1;
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 18:12:22 +08:00
|
|
|
ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
|
|
|
|
1);
|
2007-06-12 18:35:45 +08:00
|
|
|
if (ret) {
|
|
|
|
if (ret > 0)
|
|
|
|
ret = -ENOENT;
|
|
|
|
goto failed;
|
|
|
|
}
|
|
|
|
|
2007-10-16 04:14:19 +08:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
inode_item = btrfs_item_ptr(leaf, path->slots[0],
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 18:12:22 +08:00
|
|
|
struct btrfs_inode_item);
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2008-09-06 04:13:11 +08:00
|
|
|
fill_inode_item(trans, leaf, inode_item, inode);
|
2007-10-16 04:14:19 +08:00
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
2007-08-11 04:22:09 +08:00
|
|
|
btrfs_set_inode_last_trans(trans, inode);
|
2007-06-12 18:35:45 +08:00
|
|
|
ret = 0;
|
|
|
|
failed:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2011-11-11 09:39:08 +08:00
|
|
|
/*
|
|
|
|
* copy everything in the in-memory inode into the btree.
|
|
|
|
*/
|
|
|
|
noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root, struct inode *inode)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the inode is a free space inode, we can deadlock during commit
|
|
|
|
* if we put it into the delayed code.
|
|
|
|
*
|
|
|
|
* The data relocation inode should also be directly updated
|
|
|
|
* without delay
|
|
|
|
*/
|
2012-07-10 19:28:39 +08:00
|
|
|
if (!btrfs_is_free_space_inode(inode)
|
2014-09-18 23:30:44 +08:00
|
|
|
&& root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
|
|
|
|
&& !root->fs_info->log_root_recovering) {
|
2012-07-25 23:35:53 +08:00
|
|
|
btrfs_update_root_times(trans, root);
|
|
|
|
|
2011-11-11 09:39:08 +08:00
|
|
|
ret = btrfs_delayed_update_inode(trans, root, inode);
|
|
|
|
if (!ret)
|
|
|
|
btrfs_set_inode_last_trans(trans, inode);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
return btrfs_update_inode_item(trans, root, inode);
|
|
|
|
}
|
|
|
|
|
2012-10-23 03:43:12 +08:00
|
|
|
noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
|
|
|
struct inode *inode)
|
2011-11-11 09:39:08 +08:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
|
|
|
if (ret == -ENOSPC)
|
|
|
|
return btrfs_update_inode_item(trans, root, inode);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2008-09-30 03:18:18 +08:00
|
|
|
/*
|
|
|
|
* unlink helper that gets used here in inode.c and in the tree logging
|
|
|
|
* recovery code. It remove a link in a directory with a given name, and
|
|
|
|
* also drops the back refs in the inode to the directory
|
|
|
|
*/
|
2011-03-05 01:14:37 +08:00
|
|
|
static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
|
|
|
struct inode *dir, struct inode *inode,
|
|
|
|
const char *name, int name_len)
|
2007-06-12 18:35:45 +08:00
|
|
|
{
|
|
|
|
struct btrfs_path *path;
|
|
|
|
int ret = 0;
|
2007-10-16 04:14:19 +08:00
|
|
|
struct extent_buffer *leaf;
|
2007-06-12 18:35:45 +08:00
|
|
|
struct btrfs_dir_item *di;
|
2007-10-16 04:14:19 +08:00
|
|
|
struct btrfs_key key;
|
2008-07-25 00:12:38 +08:00
|
|
|
u64 index;
|
2011-04-20 10:31:50 +08:00
|
|
|
u64 ino = btrfs_ino(inode);
|
|
|
|
u64 dir_ino = btrfs_ino(dir);
|
2007-06-12 18:35:45 +08:00
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
2007-06-23 02:16:25 +08:00
|
|
|
if (!path) {
|
|
|
|
ret = -ENOMEM;
|
2011-02-03 11:16:25 +08:00
|
|
|
goto out;
|
2007-06-23 02:16:25 +08:00
|
|
|
}
|
|
|
|
|
2009-03-13 23:00:37 +08:00
|
|
|
path->leave_spinning = 1;
|
2011-04-20 10:31:50 +08:00
|
|
|
di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
|
2007-06-12 18:35:45 +08:00
|
|
|
name, name_len, -1);
|
|
|
|
if (IS_ERR(di)) {
|
|
|
|
ret = PTR_ERR(di);
|
|
|
|
goto err;
|
|
|
|
}
|
|
|
|
if (!di) {
|
|
|
|
ret = -ENOENT;
|
|
|
|
goto err;
|
|
|
|
}
|
2007-10-16 04:14:19 +08:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_dir_item_key_to_cpu(leaf, di, &key);
|
2007-06-12 18:35:45 +08:00
|
|
|
ret = btrfs_delete_one_dir_name(trans, root, path, di);
|
2007-06-23 02:16:25 +08:00
|
|
|
if (ret)
|
|
|
|
goto err;
|
2011-04-21 07:20:15 +08:00
|
|
|
btrfs_release_path(path);
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2013-12-26 13:07:06 +08:00
|
|
|
/*
|
|
|
|
* If we don't have dir index, we have to get it by looking up
|
|
|
|
* the inode ref, since we get the inode ref, remove it directly,
|
|
|
|
* it is unnecessary to do delayed deletion.
|
|
|
|
*
|
|
|
|
* But if we have dir index, needn't search inode ref to get it.
|
|
|
|
* Since the inode ref is close to the inode item, it is better
|
|
|
|
* that we delay to delete it, and just do this deletion when
|
|
|
|
* we update the inode item.
|
|
|
|
*/
|
|
|
|
if (BTRFS_I(inode)->dir_index) {
|
|
|
|
ret = btrfs_delayed_delete_inode_ref(inode);
|
|
|
|
if (!ret) {
|
|
|
|
index = BTRFS_I(inode)->dir_index;
|
|
|
|
goto skip_backref;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-04-20 10:31:50 +08:00
|
|
|
ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
|
|
|
|
dir_ino, &index);
|
2008-07-25 00:12:38 +08:00
|
|
|
if (ret) {
|
2013-03-20 06:41:23 +08:00
|
|
|
btrfs_info(root->fs_info,
|
|
|
|
"failed to delete reference to %.*s, inode %llu parent %llu",
|
2013-08-20 19:20:07 +08:00
|
|
|
name_len, name, ino, dir_ino);
|
2012-03-12 23:03:00 +08:00
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
2008-07-25 00:12:38 +08:00
|
|
|
goto err;
|
|
|
|
}
|
2013-12-26 13:07:06 +08:00
|
|
|
skip_backref:
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 18:12:22 +08:00
|
|
|
ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret) {
|
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
2007-06-12 18:35:45 +08:00
|
|
|
goto err;
|
2012-03-12 23:03:00 +08:00
|
|
|
}
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2008-09-06 04:13:11 +08:00
|
|
|
ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
|
2011-04-20 10:31:50 +08:00
|
|
|
inode, dir_ino);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret != 0 && ret != -ENOENT) {
|
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
|
|
|
goto err;
|
|
|
|
}
|
2008-09-06 04:13:11 +08:00
|
|
|
|
|
|
|
ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
|
|
|
|
dir, index);
|
2010-10-30 19:34:24 +08:00
|
|
|
if (ret == -ENOENT)
|
|
|
|
ret = 0;
|
2013-04-03 05:02:16 +08:00
|
|
|
else if (ret)
|
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
2007-06-12 18:35:45 +08:00
|
|
|
err:
|
|
|
|
btrfs_free_path(path);
|
2008-09-06 04:13:11 +08:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
btrfs_i_size_write(dir, dir->i_size - name_len * 2);
|
2012-04-06 03:03:02 +08:00
|
|
|
inode_inc_iversion(inode);
|
|
|
|
inode_inc_iversion(dir);
|
2008-09-06 04:13:11 +08:00
|
|
|
inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
|
2012-06-26 11:25:22 +08:00
|
|
|
ret = btrfs_update_inode(trans, root, dir);
|
2008-09-06 04:13:11 +08:00
|
|
|
out:
|
2007-06-12 18:35:45 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2011-03-05 01:14:37 +08:00
|
|
|
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
|
|
|
struct inode *dir, struct inode *inode,
|
|
|
|
const char *name, int name_len)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
|
|
|
|
if (!ret) {
|
2013-10-17 03:10:34 +08:00
|
|
|
drop_nlink(inode);
|
2011-03-05 01:14:37 +08:00
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2010-05-16 22:48:46 +08:00
|
|
|
/*
|
|
|
|
* helper to start transaction for unlink and rmdir.
|
|
|
|
*
|
2013-05-30 02:54:47 +08:00
|
|
|
* unlink and rmdir are special in btrfs, they do not always free space, so
|
|
|
|
* if we cannot make our reservations the normal way try and see if there is
|
|
|
|
* plenty of slack room in the global reserve to migrate, otherwise we cannot
|
|
|
|
* allow the unlink to occur.
|
2010-05-16 22:48:46 +08:00
|
|
|
*/
|
2013-05-30 02:54:47 +08:00
|
|
|
static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
|
2009-09-22 03:56:00 +08:00
|
|
|
{
|
2010-05-16 22:48:46 +08:00
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
2009-09-22 03:56:00 +08:00
|
|
|
|
2011-10-12 02:18:24 +08:00
|
|
|
/*
|
|
|
|
* 1 for the possible orphan item
|
|
|
|
* 1 for the dir item
|
|
|
|
* 1 for the dir index
|
|
|
|
* 1 for the inode ref
|
|
|
|
* 1 for the inode
|
|
|
|
*/
|
2015-11-14 07:57:16 +08:00
|
|
|
return btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
|
2010-05-16 22:48:46 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
|
|
|
struct btrfs_trans_handle *trans;
|
2015-03-18 06:25:59 +08:00
|
|
|
struct inode *inode = d_inode(dentry);
|
2010-05-16 22:48:46 +08:00
|
|
|
int ret;
|
|
|
|
|
2013-05-30 02:54:47 +08:00
|
|
|
trans = __unlink_start_trans(dir);
|
2010-05-16 22:48:46 +08:00
|
|
|
if (IS_ERR(trans))
|
|
|
|
return PTR_ERR(trans);
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2015-03-18 06:25:59 +08:00
|
|
|
btrfs_record_unlink_dir(trans, dir, d_inode(dentry), 0);
|
2009-03-24 22:24:20 +08:00
|
|
|
|
2015-03-18 06:25:59 +08:00
|
|
|
ret = btrfs_unlink_inode(trans, root, dir, d_inode(dentry),
|
2008-09-06 04:13:11 +08:00
|
|
|
dentry->d_name.name, dentry->d_name.len);
|
2011-07-19 15:27:20 +08:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
2008-07-25 00:17:14 +08:00
|
|
|
|
2010-05-16 22:48:46 +08:00
|
|
|
if (inode->i_nlink == 0) {
|
2008-07-25 00:17:14 +08:00
|
|
|
ret = btrfs_orphan_add(trans, inode);
|
2011-07-19 15:27:20 +08:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
2010-05-16 22:48:46 +08:00
|
|
|
}
|
2008-07-25 00:17:14 +08:00
|
|
|
|
2011-07-19 15:27:20 +08:00
|
|
|
out:
|
2013-05-30 02:54:47 +08:00
|
|
|
btrfs_end_transaction(trans, root);
|
2012-11-14 22:34:34 +08:00
|
|
|
btrfs_btree_balance_dirty(root);
|
2007-06-12 18:35:45 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2009-09-22 03:56:00 +08:00
|
|
|
int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
|
|
|
struct inode *dir, u64 objectid,
|
|
|
|
const char *name, int name_len)
|
|
|
|
{
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct btrfs_dir_item *di;
|
|
|
|
struct btrfs_key key;
|
|
|
|
u64 index;
|
|
|
|
int ret;
|
2011-04-20 10:31:50 +08:00
|
|
|
u64 dir_ino = btrfs_ino(dir);
|
2009-09-22 03:56:00 +08:00
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2011-04-20 10:31:50 +08:00
|
|
|
di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
|
2009-09-22 03:56:00 +08:00
|
|
|
name, name_len, -1);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (IS_ERR_OR_NULL(di)) {
|
|
|
|
if (!di)
|
|
|
|
ret = -ENOENT;
|
|
|
|
else
|
|
|
|
ret = PTR_ERR(di);
|
|
|
|
goto out;
|
|
|
|
}
|
2009-09-22 03:56:00 +08:00
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_dir_item_key_to_cpu(leaf, di, &key);
|
|
|
|
WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
|
|
|
|
ret = btrfs_delete_one_dir_name(trans, root, path, di);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret) {
|
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
|
|
|
goto out;
|
|
|
|
}
|
2011-04-21 07:20:15 +08:00
|
|
|
btrfs_release_path(path);
|
2009-09-22 03:56:00 +08:00
|
|
|
|
|
|
|
ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
|
|
|
|
objectid, root->root_key.objectid,
|
2011-04-20 10:31:50 +08:00
|
|
|
dir_ino, &index, name, name_len);
|
2009-09-22 03:56:00 +08:00
|
|
|
if (ret < 0) {
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret != -ENOENT) {
|
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
|
|
|
goto out;
|
|
|
|
}
|
2011-04-20 10:31:50 +08:00
|
|
|
di = btrfs_search_dir_index_item(root, path, dir_ino,
|
2009-09-22 03:56:00 +08:00
|
|
|
name, name_len);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (IS_ERR_OR_NULL(di)) {
|
|
|
|
if (!di)
|
|
|
|
ret = -ENOENT;
|
|
|
|
else
|
|
|
|
ret = PTR_ERR(di);
|
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
|
|
|
goto out;
|
|
|
|
}
|
2009-09-22 03:56:00 +08:00
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
|
2011-04-21 07:20:15 +08:00
|
|
|
btrfs_release_path(path);
|
2009-09-22 03:56:00 +08:00
|
|
|
index = key.offset;
|
|
|
|
}
|
2011-05-23 00:33:42 +08:00
|
|
|
btrfs_release_path(path);
|
2009-09-22 03:56:00 +08:00
|
|
|
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 18:12:22 +08:00
|
|
|
ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret) {
|
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
|
|
|
goto out;
|
|
|
|
}
|
2009-09-22 03:56:00 +08:00
|
|
|
|
|
|
|
btrfs_i_size_write(dir, dir->i_size - name_len * 2);
|
2012-04-06 03:03:02 +08:00
|
|
|
inode_inc_iversion(dir);
|
2009-09-22 03:56:00 +08:00
|
|
|
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
|
2012-08-09 00:12:59 +08:00
|
|
|
ret = btrfs_update_inode_fallback(trans, root, dir);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret)
|
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
|
|
|
out:
|
2011-06-15 02:24:32 +08:00
|
|
|
btrfs_free_path(path);
|
2012-03-12 23:03:00 +08:00
|
|
|
return ret;
|
2009-09-22 03:56:00 +08:00
|
|
|
}
|
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
|
|
|
|
{
|
2015-03-18 06:25:59 +08:00
|
|
|
struct inode *inode = d_inode(dentry);
|
2007-12-22 05:27:21 +08:00
|
|
|
int err = 0;
|
2007-06-12 18:35:45 +08:00
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
|
2012-09-14 06:04:34 +08:00
|
|
|
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
|
2007-10-26 03:49:25 +08:00
|
|
|
return -ENOTEMPTY;
|
2012-09-14 06:04:34 +08:00
|
|
|
if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
|
|
|
|
return -EPERM;
|
2007-10-26 03:49:25 +08:00
|
|
|
|
2013-05-30 02:54:47 +08:00
|
|
|
trans = __unlink_start_trans(dir);
|
2010-05-16 22:48:46 +08:00
|
|
|
if (IS_ERR(trans))
|
2009-11-11 10:23:48 +08:00
|
|
|
return PTR_ERR(trans);
|
|
|
|
|
2011-04-20 10:31:50 +08:00
|
|
|
if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
|
2009-09-22 03:56:00 +08:00
|
|
|
err = btrfs_unlink_subvol(trans, root, dir,
|
|
|
|
BTRFS_I(inode)->location.objectid,
|
|
|
|
dentry->d_name.name,
|
|
|
|
dentry->d_name.len);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2008-07-25 00:17:14 +08:00
|
|
|
err = btrfs_orphan_add(trans, inode);
|
|
|
|
if (err)
|
2009-09-22 03:56:00 +08:00
|
|
|
goto out;
|
2008-07-25 00:17:14 +08:00
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
/* now the directory is empty */
|
2015-03-18 06:25:59 +08:00
|
|
|
err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry),
|
2008-09-06 04:13:11 +08:00
|
|
|
dentry->d_name.name, dentry->d_name.len);
|
2009-01-06 10:25:51 +08:00
|
|
|
if (!err)
|
2008-07-18 00:54:05 +08:00
|
|
|
btrfs_i_size_write(inode, 0);
|
2009-09-22 03:56:00 +08:00
|
|
|
out:
|
2013-05-30 02:54:47 +08:00
|
|
|
btrfs_end_transaction(trans, root);
|
2012-11-14 22:34:34 +08:00
|
|
|
btrfs_btree_balance_dirty(root);
|
2007-12-13 03:38:19 +08:00
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2015-02-04 22:59:29 +08:00
|
|
|
static int truncate_space_check(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
|
|
|
u64 bytes_deleted)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted);
|
|
|
|
ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv,
|
|
|
|
bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
|
|
|
|
if (!ret)
|
|
|
|
trans->bytes_reserved += bytes_deleted;
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
Btrfs: fix truncation of compressed and inlined extents
When truncating a file to a smaller size which consists of an inline
extent that is compressed, we did not discard (or made unusable) the
data between the new file size and the old file size, wasting metadata
space and allowing for the truncated data to be leaked and the data
corruption/loss mentioned below.
We were also not correctly decrementing the number of bytes used by the
inode, we were setting it to zero, giving a wrong report for callers of
the stat(2) syscall. The fsck tool also reported an error about a mismatch
between the nbytes of the file versus the real space used by the file.
Now because we weren't discarding the truncated region of the file, it
was possible for a caller of the clone ioctl to actually read the data
that was truncated, allowing for a security breach without requiring root
access to the system, using only standard filesystem operations. The
scenario is the following:
1) User A creates a file which consists of an inline and compressed
extent with a size of 2000 bytes - the file is not accessible to
any other users (no read, write or execution permission for anyone
else);
2) The user truncates the file to a size of 1000 bytes;
3) User A makes the file world readable;
4) User B creates a file consisting of an inline extent of 2000 bytes;
5) User B issues a clone operation from user A's file into its own
file (using a length argument of 0, clone the whole range);
6) User B now gets to see the 1000 bytes that user A truncated from
its file before it made its file world readbale. User B also lost
the bytes in the range [1000, 2000[ bytes from its own file, but
that might be ok if his/her intention was reading stale data from
user A that was never supposed to be public.
Note that this contrasts with the case where we truncate a file from 2000
bytes to 1000 bytes and then truncate it back from 1000 to 2000 bytes. In
this case reading any byte from the range [1000, 2000[ will return a value
of 0x00, instead of the original data.
This problem exists since the clone ioctl was added and happens both with
and without my recent data loss and file corruption fixes for the clone
ioctl (patch "Btrfs: fix file corruption and data loss after cloning
inline extents").
So fix this by truncating the compressed inline extents as we do for the
non-compressed case, which involves decompressing, if the data isn't already
in the page cache, compressing the truncated version of the extent, writing
the compressed content into the inline extent and then truncate it.
The following test case for fstests reproduces the problem. In order for
the test to pass both this fix and my previous fix for the clone ioctl
that forbids cloning a smaller inline extent into a larger one,
which is titled "Btrfs: fix file corruption and data loss after cloning
inline extents", are needed. Without that other fix the test fails in a
different way that does not leak the truncated data, instead part of
destination file gets replaced with zeroes (because the destination file
has a larger inline extent than the source).
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_cloner
rm -f $seqres.full
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount "-o compress"
# Create our test files. File foo is going to be the source of a clone operation
# and consists of a single inline extent with an uncompressed size of 512 bytes,
# while file bar consists of a single inline extent with an uncompressed size of
# 256 bytes. For our test's purpose, it's important that file bar has an inline
# extent with a size smaller than foo's inline extent.
$XFS_IO_PROG -f -c "pwrite -S 0xa1 0 128" \
-c "pwrite -S 0x2a 128 384" \
$SCRATCH_MNT/foo | _filter_xfs_io
$XFS_IO_PROG -f -c "pwrite -S 0xbb 0 256" $SCRATCH_MNT/bar | _filter_xfs_io
# Now durably persist all metadata and data. We do this to make sure that we get
# on disk an inline extent with a size of 512 bytes for file foo.
sync
# Now truncate our file foo to a smaller size. Because it consists of a
# compressed and inline extent, btrfs did not shrink the inline extent to the
# new size (if the extent was not compressed, btrfs would shrink it to 128
# bytes), it only updates the inode's i_size to 128 bytes.
$XFS_IO_PROG -c "truncate 128" $SCRATCH_MNT/foo
# Now clone foo's inline extent into bar.
# This clone operation should fail with errno EOPNOTSUPP because the source
# file consists only of an inline extent and the file's size is smaller than
# the inline extent of the destination (128 bytes < 256 bytes). However the
# clone ioctl was not prepared to deal with a file that has a size smaller
# than the size of its inline extent (something that happens only for compressed
# inline extents), resulting in copying the full inline extent from the source
# file into the destination file.
#
# Note that btrfs' clone operation for inline extents consists of removing the
# inline extent from the destination inode and copy the inline extent from the
# source inode into the destination inode, meaning that if the destination
# inode's inline extent is larger (N bytes) than the source inode's inline
# extent (M bytes), some bytes (N - M bytes) will be lost from the destination
# file. Btrfs could copy the source inline extent's data into the destination's
# inline extent so that we would not lose any data, but that's currently not
# done due to the complexity that would be needed to deal with such cases
# (specially when one or both extents are compressed), returning EOPNOTSUPP, as
# it's normally not a very common case to clone very small files (only case
# where we get inline extents) and copying inline extents does not save any
# space (unlike for normal, non-inlined extents).
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/foo $SCRATCH_MNT/bar
# Now because the above clone operation used to succeed, and due to foo's inline
# extent not being shinked by the truncate operation, our file bar got the whole
# inline extent copied from foo, making us lose the last 128 bytes from bar
# which got replaced by the bytes in range [128, 256[ from foo before foo was
# truncated - in other words, data loss from bar and being able to read old and
# stale data from foo that should not be possible to read anymore through normal
# filesystem operations. Contrast with the case where we truncate a file from a
# size N to a smaller size M, truncate it back to size N and then read the range
# [M, N[, we should always get the value 0x00 for all the bytes in that range.
# We expected the clone operation to fail with errno EOPNOTSUPP and therefore
# not modify our file's bar data/metadata. So its content should be 256 bytes
# long with all bytes having the value 0xbb.
#
# Without the btrfs bug fix, the clone operation succeeded and resulted in
# leaking truncated data from foo, the bytes that belonged to its range
# [128, 256[, and losing data from bar in that same range. So reading the
# file gave us the following content:
#
# 0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1
# *
# 0000200 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a
# *
# 0000400
echo "File bar's content after the clone operation:"
od -t x1 $SCRATCH_MNT/bar
# Also because the foo's inline extent was not shrunk by the truncate
# operation, btrfs' fsck, which is run by the fstests framework everytime a
# test completes, failed reporting the following error:
#
# root 5 inode 257 errors 400, nbytes wrong
status=0
exit
Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-10-16 19:34:25 +08:00
|
|
|
static int truncate_inline_extent(struct inode *inode,
|
|
|
|
struct btrfs_path *path,
|
|
|
|
struct btrfs_key *found_key,
|
|
|
|
const u64 item_end,
|
|
|
|
const u64 new_size)
|
|
|
|
{
|
|
|
|
struct extent_buffer *leaf = path->nodes[0];
|
|
|
|
int slot = path->slots[0];
|
|
|
|
struct btrfs_file_extent_item *fi;
|
|
|
|
u32 size = (u32)(new_size - found_key->offset);
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
|
|
|
|
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
|
|
|
|
|
|
|
|
if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) {
|
|
|
|
loff_t offset = new_size;
|
|
|
|
loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Zero out the remaining of the last page of our inline extent,
|
|
|
|
* instead of directly truncating our inline extent here - that
|
|
|
|
* would be much more complex (decompressing all the data, then
|
|
|
|
* compressing the truncated data, which might be bigger than
|
|
|
|
* the size of the inline extent, resize the extent, etc).
|
|
|
|
* We release the path because to get the page we might need to
|
|
|
|
* read the extent item from disk (data not in the page cache).
|
|
|
|
*/
|
|
|
|
btrfs_release_path(path);
|
|
|
|
return btrfs_truncate_page(inode, offset, page_end - offset, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
btrfs_set_file_extent_ram_bytes(leaf, fi, size);
|
|
|
|
size = btrfs_file_extent_calc_inline_size(size);
|
|
|
|
btrfs_truncate_item(root, path, size, 1);
|
|
|
|
|
|
|
|
if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
|
|
|
|
inode_sub_bytes(inode, item_end + 1 - new_size);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
/*
|
|
|
|
* this can truncate away extent items, csum items and directory items.
|
|
|
|
* It starts at a high offset and removes keys until it can't find
|
2008-09-30 03:18:18 +08:00
|
|
|
* any higher than new_size
|
2007-06-12 18:35:45 +08:00
|
|
|
*
|
|
|
|
* csum items that cross the new i_size are truncated to the new size
|
|
|
|
* as well.
|
2008-07-25 00:17:14 +08:00
|
|
|
*
|
|
|
|
* min_type is the minimum key type to truncate down to. If set to 0, this
|
|
|
|
* will kill all the items on this inode, including the INODE_ITEM_KEY.
|
2007-06-12 18:35:45 +08:00
|
|
|
*/
|
2009-11-12 17:35:36 +08:00
|
|
|
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
|
|
|
struct inode *inode,
|
|
|
|
u64 new_size, u32 min_type)
|
2007-06-12 18:35:45 +08:00
|
|
|
{
|
|
|
|
struct btrfs_path *path;
|
2007-10-16 04:14:19 +08:00
|
|
|
struct extent_buffer *leaf;
|
2007-06-12 18:35:45 +08:00
|
|
|
struct btrfs_file_extent_item *fi;
|
2009-11-12 17:35:36 +08:00
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_key found_key;
|
2007-06-12 18:35:45 +08:00
|
|
|
u64 extent_start = 0;
|
2007-10-16 04:15:53 +08:00
|
|
|
u64 extent_num_bytes = 0;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
u64 extent_offset = 0;
|
2007-06-12 18:35:45 +08:00
|
|
|
u64 item_end = 0;
|
Btrfs: fix shrinking truncate when the no_holes feature is enabled
If the no_holes feature is enabled, we attempt to shrink a file to a size
that ends up in the middle of a hole and we don't have any file extent
items in the fs/subvol tree that go beyond the new file size (or any
ordered extents that will insert such file extent items), we end up not
updating the inode's disk_i_size, we only update the inode's i_size.
This means that after unmounting and mounting the filesystem, or after
the inode is evicted and reloaded, its i_size ends up being incorrect
(an inode's i_size is set to the disk_i_size field when an inode is
loaded). This happens when btrfs_truncate_inode_items() doesn't find
any file extent items to drop - in this case it never makes a call to
btrfs_ordered_update_i_size() in order to update the inode's disk_i_size.
Example reproducer:
$ mkfs.btrfs -O no-holes -f /dev/sdd
$ mount /dev/sdd /mnt
# Create our test file with some data and durably persist it.
$ xfs_io -f -c "pwrite -S 0xaa 0 128K" /mnt/foo
$ sync
# Append some data to the file, increasing its size, and leave a hole
# between the old size and the start offset if the following write. So
# our file gets a hole in the range [128Kb, 256Kb[.
$ xfs_io -c "truncate 160K" /mnt/foo
# We expect to see our file with a size of 160Kb, with the first 128Kb
# of data all having the value 0xaa and the remaining 32Kb of data all
# having the value 0x00.
$ od -t x1 /mnt/foo
0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa
*
0400000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0500000
# Now cleanly unmount and mount again the filesystem.
$ umount /mnt
$ mount /dev/sdd /mnt
# We expect to get the same result as before, a file with a size of
# 160Kb, with the first 128Kb of data all having the value 0xaa and the
# remaining 32Kb of data all having the value 0x00.
$ od -t x1 /mnt/foo
0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa
*
0400000
In the example above the file size/data do not match what they were before
the remount.
Fix this by always calling btrfs_ordered_update_i_size() with a size
matching the size the file was truncated to if btrfs_truncate_inode_items()
is not called for a log tree and no file extent items were dropped. This
ensures the same behaviour as when the no_holes feature is not enabled.
A test case for fstests follows soon.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-06-21 01:20:09 +08:00
|
|
|
u64 last_size = new_size;
|
2009-11-12 17:35:36 +08:00
|
|
|
u32 found_type = (u8)-1;
|
2007-06-12 18:35:45 +08:00
|
|
|
int found_extent;
|
|
|
|
int del_item;
|
2008-01-30 04:11:36 +08:00
|
|
|
int pending_del_nr = 0;
|
|
|
|
int pending_del_slot = 0;
|
2007-11-01 23:28:41 +08:00
|
|
|
int extent_type = -1;
|
2009-11-12 17:35:36 +08:00
|
|
|
int ret;
|
|
|
|
int err = 0;
|
2011-04-20 10:31:50 +08:00
|
|
|
u64 ino = btrfs_ino(inode);
|
2014-12-18 01:41:04 +08:00
|
|
|
u64 bytes_deleted = 0;
|
2015-02-03 23:50:16 +08:00
|
|
|
bool be_nice = 0;
|
|
|
|
bool should_throttle = 0;
|
2015-02-04 22:59:29 +08:00
|
|
|
bool should_end = 0;
|
2009-11-12 17:35:36 +08:00
|
|
|
|
|
|
|
BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2014-12-18 01:41:04 +08:00
|
|
|
/*
|
|
|
|
* for non-free space inodes and ref cows, we want to back off from
|
|
|
|
* time to time
|
|
|
|
*/
|
|
|
|
if (!btrfs_is_free_space_inode(inode) &&
|
|
|
|
test_bit(BTRFS_ROOT_REF_COWS, &root->state))
|
|
|
|
be_nice = 1;
|
|
|
|
|
2011-07-13 07:44:10 +08:00
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
2015-11-27 23:31:35 +08:00
|
|
|
path->reada = READA_BACK;
|
2011-07-13 07:44:10 +08:00
|
|
|
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-18 01:14:17 +08:00
|
|
|
/*
|
|
|
|
* We want to drop from the next block forward in case this new size is
|
|
|
|
* not block aligned since we will be keeping the last block of the
|
|
|
|
* extent just the way it is.
|
|
|
|
*/
|
2014-04-02 19:51:05 +08:00
|
|
|
if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
|
|
|
|
root == root->fs_info->tree_root)
|
2013-02-26 16:10:22 +08:00
|
|
|
btrfs_drop_extent_cache(inode, ALIGN(new_size,
|
|
|
|
root->sectorsize), (u64)-1, 0);
|
2009-11-12 17:35:36 +08:00
|
|
|
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 18:12:22 +08:00
|
|
|
/*
|
|
|
|
* This function is also used to drop the items in the log tree before
|
|
|
|
* we relog the inode, so if root != BTRFS_I(inode)->root, it means
|
|
|
|
* it is used to drop the loged items. So we shouldn't kill the delayed
|
|
|
|
* items.
|
|
|
|
*/
|
|
|
|
if (min_type == 0 && root == BTRFS_I(inode)->root)
|
|
|
|
btrfs_kill_delayed_inode_items(inode);
|
|
|
|
|
2011-04-20 10:31:50 +08:00
|
|
|
key.objectid = ino;
|
2007-06-12 18:35:45 +08:00
|
|
|
key.offset = (u64)-1;
|
2007-10-16 04:14:19 +08:00
|
|
|
key.type = (u8)-1;
|
|
|
|
|
2008-01-30 04:11:36 +08:00
|
|
|
search_again:
|
2014-12-18 01:41:04 +08:00
|
|
|
/*
|
|
|
|
* with a 16K leaf size and 128MB extents, you can actually queue
|
|
|
|
* up a huge file in a single leaf. Most of the time that
|
|
|
|
* bytes_deleted is > 0, it will be huge by the time we get here
|
|
|
|
*/
|
2015-12-15 00:42:10 +08:00
|
|
|
if (be_nice && bytes_deleted > SZ_32M) {
|
2014-12-18 01:41:04 +08:00
|
|
|
if (btrfs_should_end_transaction(trans, root)) {
|
|
|
|
err = -EAGAIN;
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2009-03-13 23:00:37 +08:00
|
|
|
path->leave_spinning = 1;
|
2008-01-30 04:11:36 +08:00
|
|
|
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
|
2009-11-12 17:35:36 +08:00
|
|
|
if (ret < 0) {
|
|
|
|
err = ret;
|
|
|
|
goto out;
|
|
|
|
}
|
2009-01-06 10:25:51 +08:00
|
|
|
|
2008-01-30 04:11:36 +08:00
|
|
|
if (ret > 0) {
|
2008-09-06 04:13:11 +08:00
|
|
|
/* there are no items in the tree for us to truncate, we're
|
|
|
|
* done
|
|
|
|
*/
|
2009-11-12 17:35:36 +08:00
|
|
|
if (path->slots[0] == 0)
|
|
|
|
goto out;
|
2008-01-30 04:11:36 +08:00
|
|
|
path->slots[0]--;
|
|
|
|
}
|
|
|
|
|
2009-01-06 10:25:51 +08:00
|
|
|
while (1) {
|
2007-06-12 18:35:45 +08:00
|
|
|
fi = NULL;
|
2007-10-16 04:14:19 +08:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
|
2014-06-05 00:41:45 +08:00
|
|
|
found_type = found_key.type;
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2011-04-20 10:31:50 +08:00
|
|
|
if (found_key.objectid != ino)
|
2007-06-12 18:35:45 +08:00
|
|
|
break;
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2008-01-30 04:11:36 +08:00
|
|
|
if (found_type < min_type)
|
2007-06-12 18:35:45 +08:00
|
|
|
break;
|
|
|
|
|
2007-10-16 04:14:19 +08:00
|
|
|
item_end = found_key.offset;
|
2007-06-12 18:35:45 +08:00
|
|
|
if (found_type == BTRFS_EXTENT_DATA_KEY) {
|
2007-10-16 04:14:19 +08:00
|
|
|
fi = btrfs_item_ptr(leaf, path->slots[0],
|
2007-06-12 18:35:45 +08:00
|
|
|
struct btrfs_file_extent_item);
|
2007-11-01 23:28:41 +08:00
|
|
|
extent_type = btrfs_file_extent_type(leaf, fi);
|
|
|
|
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
|
2007-10-16 04:14:19 +08:00
|
|
|
item_end +=
|
2007-10-16 04:15:53 +08:00
|
|
|
btrfs_file_extent_num_bytes(leaf, fi);
|
2007-11-01 23:28:41 +08:00
|
|
|
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
|
|
|
|
item_end += btrfs_file_extent_inline_len(leaf,
|
2014-01-04 13:07:00 +08:00
|
|
|
path->slots[0], fi);
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
2007-11-08 02:31:09 +08:00
|
|
|
item_end--;
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
2009-11-12 17:35:36 +08:00
|
|
|
if (found_type > min_type) {
|
|
|
|
del_item = 1;
|
|
|
|
} else {
|
|
|
|
if (item_end < new_size)
|
2007-08-28 04:49:44 +08:00
|
|
|
break;
|
2009-11-12 17:35:36 +08:00
|
|
|
if (found_key.offset >= new_size)
|
|
|
|
del_item = 1;
|
|
|
|
else
|
|
|
|
del_item = 0;
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
|
|
|
found_extent = 0;
|
|
|
|
/* FIXME, shrink the extent if the ref count is only 1 */
|
2007-11-01 23:28:41 +08:00
|
|
|
if (found_type != BTRFS_EXTENT_DATA_KEY)
|
|
|
|
goto delete;
|
|
|
|
|
2013-08-30 04:43:28 +08:00
|
|
|
if (del_item)
|
|
|
|
last_size = found_key.offset;
|
|
|
|
else
|
|
|
|
last_size = new_size;
|
|
|
|
|
2007-11-01 23:28:41 +08:00
|
|
|
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
|
2007-06-12 18:35:45 +08:00
|
|
|
u64 num_dec;
|
2007-10-16 04:15:53 +08:00
|
|
|
extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
|
2012-01-13 08:10:12 +08:00
|
|
|
if (!del_item) {
|
2007-10-16 04:15:53 +08:00
|
|
|
u64 orig_num_bytes =
|
|
|
|
btrfs_file_extent_num_bytes(leaf, fi);
|
2013-02-26 16:10:22 +08:00
|
|
|
extent_num_bytes = ALIGN(new_size -
|
|
|
|
found_key.offset,
|
|
|
|
root->sectorsize);
|
2007-10-16 04:15:53 +08:00
|
|
|
btrfs_set_file_extent_num_bytes(leaf, fi,
|
|
|
|
extent_num_bytes);
|
|
|
|
num_dec = (orig_num_bytes -
|
2008-02-09 02:49:28 +08:00
|
|
|
extent_num_bytes);
|
2014-04-02 19:51:05 +08:00
|
|
|
if (test_bit(BTRFS_ROOT_REF_COWS,
|
|
|
|
&root->state) &&
|
|
|
|
extent_start != 0)
|
2008-10-09 23:46:29 +08:00
|
|
|
inode_sub_bytes(inode, num_dec);
|
2007-10-16 04:14:19 +08:00
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
2007-06-12 18:35:45 +08:00
|
|
|
} else {
|
2007-10-16 04:15:53 +08:00
|
|
|
extent_num_bytes =
|
|
|
|
btrfs_file_extent_disk_num_bytes(leaf,
|
|
|
|
fi);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
extent_offset = found_key.offset -
|
|
|
|
btrfs_file_extent_offset(leaf, fi);
|
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
/* FIXME blocksize != 4096 */
|
2008-02-09 02:49:28 +08:00
|
|
|
num_dec = btrfs_file_extent_num_bytes(leaf, fi);
|
2007-06-12 18:35:45 +08:00
|
|
|
if (extent_start != 0) {
|
|
|
|
found_extent = 1;
|
2014-04-02 19:51:05 +08:00
|
|
|
if (test_bit(BTRFS_ROOT_REF_COWS,
|
|
|
|
&root->state))
|
2008-10-09 23:46:29 +08:00
|
|
|
inode_sub_bytes(inode, num_dec);
|
2008-09-06 04:13:11 +08:00
|
|
|
}
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
2008-02-09 02:49:28 +08:00
|
|
|
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
/*
|
|
|
|
* we can't truncate inline items that have had
|
|
|
|
* special encodings
|
|
|
|
*/
|
|
|
|
if (!del_item &&
|
|
|
|
btrfs_file_extent_encryption(leaf, fi) == 0 &&
|
|
|
|
btrfs_file_extent_other_encoding(leaf, fi) == 0) {
|
2014-01-04 13:07:00 +08:00
|
|
|
|
|
|
|
/*
|
Btrfs: fix truncation of compressed and inlined extents
When truncating a file to a smaller size which consists of an inline
extent that is compressed, we did not discard (or made unusable) the
data between the new file size and the old file size, wasting metadata
space and allowing for the truncated data to be leaked and the data
corruption/loss mentioned below.
We were also not correctly decrementing the number of bytes used by the
inode, we were setting it to zero, giving a wrong report for callers of
the stat(2) syscall. The fsck tool also reported an error about a mismatch
between the nbytes of the file versus the real space used by the file.
Now because we weren't discarding the truncated region of the file, it
was possible for a caller of the clone ioctl to actually read the data
that was truncated, allowing for a security breach without requiring root
access to the system, using only standard filesystem operations. The
scenario is the following:
1) User A creates a file which consists of an inline and compressed
extent with a size of 2000 bytes - the file is not accessible to
any other users (no read, write or execution permission for anyone
else);
2) The user truncates the file to a size of 1000 bytes;
3) User A makes the file world readable;
4) User B creates a file consisting of an inline extent of 2000 bytes;
5) User B issues a clone operation from user A's file into its own
file (using a length argument of 0, clone the whole range);
6) User B now gets to see the 1000 bytes that user A truncated from
its file before it made its file world readbale. User B also lost
the bytes in the range [1000, 2000[ bytes from its own file, but
that might be ok if his/her intention was reading stale data from
user A that was never supposed to be public.
Note that this contrasts with the case where we truncate a file from 2000
bytes to 1000 bytes and then truncate it back from 1000 to 2000 bytes. In
this case reading any byte from the range [1000, 2000[ will return a value
of 0x00, instead of the original data.
This problem exists since the clone ioctl was added and happens both with
and without my recent data loss and file corruption fixes for the clone
ioctl (patch "Btrfs: fix file corruption and data loss after cloning
inline extents").
So fix this by truncating the compressed inline extents as we do for the
non-compressed case, which involves decompressing, if the data isn't already
in the page cache, compressing the truncated version of the extent, writing
the compressed content into the inline extent and then truncate it.
The following test case for fstests reproduces the problem. In order for
the test to pass both this fix and my previous fix for the clone ioctl
that forbids cloning a smaller inline extent into a larger one,
which is titled "Btrfs: fix file corruption and data loss after cloning
inline extents", are needed. Without that other fix the test fails in a
different way that does not leak the truncated data, instead part of
destination file gets replaced with zeroes (because the destination file
has a larger inline extent than the source).
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_cloner
rm -f $seqres.full
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount "-o compress"
# Create our test files. File foo is going to be the source of a clone operation
# and consists of a single inline extent with an uncompressed size of 512 bytes,
# while file bar consists of a single inline extent with an uncompressed size of
# 256 bytes. For our test's purpose, it's important that file bar has an inline
# extent with a size smaller than foo's inline extent.
$XFS_IO_PROG -f -c "pwrite -S 0xa1 0 128" \
-c "pwrite -S 0x2a 128 384" \
$SCRATCH_MNT/foo | _filter_xfs_io
$XFS_IO_PROG -f -c "pwrite -S 0xbb 0 256" $SCRATCH_MNT/bar | _filter_xfs_io
# Now durably persist all metadata and data. We do this to make sure that we get
# on disk an inline extent with a size of 512 bytes for file foo.
sync
# Now truncate our file foo to a smaller size. Because it consists of a
# compressed and inline extent, btrfs did not shrink the inline extent to the
# new size (if the extent was not compressed, btrfs would shrink it to 128
# bytes), it only updates the inode's i_size to 128 bytes.
$XFS_IO_PROG -c "truncate 128" $SCRATCH_MNT/foo
# Now clone foo's inline extent into bar.
# This clone operation should fail with errno EOPNOTSUPP because the source
# file consists only of an inline extent and the file's size is smaller than
# the inline extent of the destination (128 bytes < 256 bytes). However the
# clone ioctl was not prepared to deal with a file that has a size smaller
# than the size of its inline extent (something that happens only for compressed
# inline extents), resulting in copying the full inline extent from the source
# file into the destination file.
#
# Note that btrfs' clone operation for inline extents consists of removing the
# inline extent from the destination inode and copy the inline extent from the
# source inode into the destination inode, meaning that if the destination
# inode's inline extent is larger (N bytes) than the source inode's inline
# extent (M bytes), some bytes (N - M bytes) will be lost from the destination
# file. Btrfs could copy the source inline extent's data into the destination's
# inline extent so that we would not lose any data, but that's currently not
# done due to the complexity that would be needed to deal with such cases
# (specially when one or both extents are compressed), returning EOPNOTSUPP, as
# it's normally not a very common case to clone very small files (only case
# where we get inline extents) and copying inline extents does not save any
# space (unlike for normal, non-inlined extents).
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/foo $SCRATCH_MNT/bar
# Now because the above clone operation used to succeed, and due to foo's inline
# extent not being shinked by the truncate operation, our file bar got the whole
# inline extent copied from foo, making us lose the last 128 bytes from bar
# which got replaced by the bytes in range [128, 256[ from foo before foo was
# truncated - in other words, data loss from bar and being able to read old and
# stale data from foo that should not be possible to read anymore through normal
# filesystem operations. Contrast with the case where we truncate a file from a
# size N to a smaller size M, truncate it back to size N and then read the range
# [M, N[, we should always get the value 0x00 for all the bytes in that range.
# We expected the clone operation to fail with errno EOPNOTSUPP and therefore
# not modify our file's bar data/metadata. So its content should be 256 bytes
# long with all bytes having the value 0xbb.
#
# Without the btrfs bug fix, the clone operation succeeded and resulted in
# leaking truncated data from foo, the bytes that belonged to its range
# [128, 256[, and losing data from bar in that same range. So reading the
# file gave us the following content:
#
# 0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1
# *
# 0000200 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a
# *
# 0000400
echo "File bar's content after the clone operation:"
od -t x1 $SCRATCH_MNT/bar
# Also because the foo's inline extent was not shrunk by the truncate
# operation, btrfs' fsck, which is run by the fstests framework everytime a
# test completes, failed reporting the following error:
#
# root 5 inode 257 errors 400, nbytes wrong
status=0
exit
Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-10-16 19:34:25 +08:00
|
|
|
* Need to release path in order to truncate a
|
|
|
|
* compressed extent. So delete any accumulated
|
|
|
|
* extent items so far.
|
2014-01-04 13:07:00 +08:00
|
|
|
*/
|
Btrfs: fix truncation of compressed and inlined extents
When truncating a file to a smaller size which consists of an inline
extent that is compressed, we did not discard (or made unusable) the
data between the new file size and the old file size, wasting metadata
space and allowing for the truncated data to be leaked and the data
corruption/loss mentioned below.
We were also not correctly decrementing the number of bytes used by the
inode, we were setting it to zero, giving a wrong report for callers of
the stat(2) syscall. The fsck tool also reported an error about a mismatch
between the nbytes of the file versus the real space used by the file.
Now because we weren't discarding the truncated region of the file, it
was possible for a caller of the clone ioctl to actually read the data
that was truncated, allowing for a security breach without requiring root
access to the system, using only standard filesystem operations. The
scenario is the following:
1) User A creates a file which consists of an inline and compressed
extent with a size of 2000 bytes - the file is not accessible to
any other users (no read, write or execution permission for anyone
else);
2) The user truncates the file to a size of 1000 bytes;
3) User A makes the file world readable;
4) User B creates a file consisting of an inline extent of 2000 bytes;
5) User B issues a clone operation from user A's file into its own
file (using a length argument of 0, clone the whole range);
6) User B now gets to see the 1000 bytes that user A truncated from
its file before it made its file world readbale. User B also lost
the bytes in the range [1000, 2000[ bytes from its own file, but
that might be ok if his/her intention was reading stale data from
user A that was never supposed to be public.
Note that this contrasts with the case where we truncate a file from 2000
bytes to 1000 bytes and then truncate it back from 1000 to 2000 bytes. In
this case reading any byte from the range [1000, 2000[ will return a value
of 0x00, instead of the original data.
This problem exists since the clone ioctl was added and happens both with
and without my recent data loss and file corruption fixes for the clone
ioctl (patch "Btrfs: fix file corruption and data loss after cloning
inline extents").
So fix this by truncating the compressed inline extents as we do for the
non-compressed case, which involves decompressing, if the data isn't already
in the page cache, compressing the truncated version of the extent, writing
the compressed content into the inline extent and then truncate it.
The following test case for fstests reproduces the problem. In order for
the test to pass both this fix and my previous fix for the clone ioctl
that forbids cloning a smaller inline extent into a larger one,
which is titled "Btrfs: fix file corruption and data loss after cloning
inline extents", are needed. Without that other fix the test fails in a
different way that does not leak the truncated data, instead part of
destination file gets replaced with zeroes (because the destination file
has a larger inline extent than the source).
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_cloner
rm -f $seqres.full
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount "-o compress"
# Create our test files. File foo is going to be the source of a clone operation
# and consists of a single inline extent with an uncompressed size of 512 bytes,
# while file bar consists of a single inline extent with an uncompressed size of
# 256 bytes. For our test's purpose, it's important that file bar has an inline
# extent with a size smaller than foo's inline extent.
$XFS_IO_PROG -f -c "pwrite -S 0xa1 0 128" \
-c "pwrite -S 0x2a 128 384" \
$SCRATCH_MNT/foo | _filter_xfs_io
$XFS_IO_PROG -f -c "pwrite -S 0xbb 0 256" $SCRATCH_MNT/bar | _filter_xfs_io
# Now durably persist all metadata and data. We do this to make sure that we get
# on disk an inline extent with a size of 512 bytes for file foo.
sync
# Now truncate our file foo to a smaller size. Because it consists of a
# compressed and inline extent, btrfs did not shrink the inline extent to the
# new size (if the extent was not compressed, btrfs would shrink it to 128
# bytes), it only updates the inode's i_size to 128 bytes.
$XFS_IO_PROG -c "truncate 128" $SCRATCH_MNT/foo
# Now clone foo's inline extent into bar.
# This clone operation should fail with errno EOPNOTSUPP because the source
# file consists only of an inline extent and the file's size is smaller than
# the inline extent of the destination (128 bytes < 256 bytes). However the
# clone ioctl was not prepared to deal with a file that has a size smaller
# than the size of its inline extent (something that happens only for compressed
# inline extents), resulting in copying the full inline extent from the source
# file into the destination file.
#
# Note that btrfs' clone operation for inline extents consists of removing the
# inline extent from the destination inode and copy the inline extent from the
# source inode into the destination inode, meaning that if the destination
# inode's inline extent is larger (N bytes) than the source inode's inline
# extent (M bytes), some bytes (N - M bytes) will be lost from the destination
# file. Btrfs could copy the source inline extent's data into the destination's
# inline extent so that we would not lose any data, but that's currently not
# done due to the complexity that would be needed to deal with such cases
# (specially when one or both extents are compressed), returning EOPNOTSUPP, as
# it's normally not a very common case to clone very small files (only case
# where we get inline extents) and copying inline extents does not save any
# space (unlike for normal, non-inlined extents).
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/foo $SCRATCH_MNT/bar
# Now because the above clone operation used to succeed, and due to foo's inline
# extent not being shinked by the truncate operation, our file bar got the whole
# inline extent copied from foo, making us lose the last 128 bytes from bar
# which got replaced by the bytes in range [128, 256[ from foo before foo was
# truncated - in other words, data loss from bar and being able to read old and
# stale data from foo that should not be possible to read anymore through normal
# filesystem operations. Contrast with the case where we truncate a file from a
# size N to a smaller size M, truncate it back to size N and then read the range
# [M, N[, we should always get the value 0x00 for all the bytes in that range.
# We expected the clone operation to fail with errno EOPNOTSUPP and therefore
# not modify our file's bar data/metadata. So its content should be 256 bytes
# long with all bytes having the value 0xbb.
#
# Without the btrfs bug fix, the clone operation succeeded and resulted in
# leaking truncated data from foo, the bytes that belonged to its range
# [128, 256[, and losing data from bar in that same range. So reading the
# file gave us the following content:
#
# 0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1
# *
# 0000200 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a
# *
# 0000400
echo "File bar's content after the clone operation:"
od -t x1 $SCRATCH_MNT/bar
# Also because the foo's inline extent was not shrunk by the truncate
# operation, btrfs' fsck, which is run by the fstests framework everytime a
# test completes, failed reporting the following error:
#
# root 5 inode 257 errors 400, nbytes wrong
status=0
exit
Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-10-16 19:34:25 +08:00
|
|
|
if (btrfs_file_extent_compression(leaf, fi) !=
|
|
|
|
BTRFS_COMPRESS_NONE && pending_del_nr) {
|
|
|
|
err = btrfs_del_items(trans, root, path,
|
|
|
|
pending_del_slot,
|
|
|
|
pending_del_nr);
|
|
|
|
if (err) {
|
|
|
|
btrfs_abort_transaction(trans,
|
|
|
|
root,
|
|
|
|
err);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
pending_del_nr = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
err = truncate_inline_extent(inode, path,
|
|
|
|
&found_key,
|
|
|
|
item_end,
|
|
|
|
new_size);
|
|
|
|
if (err) {
|
|
|
|
btrfs_abort_transaction(trans,
|
|
|
|
root, err);
|
|
|
|
goto error;
|
|
|
|
}
|
2014-04-02 19:51:05 +08:00
|
|
|
} else if (test_bit(BTRFS_ROOT_REF_COWS,
|
|
|
|
&root->state)) {
|
Btrfs: fix truncation of compressed and inlined extents
When truncating a file to a smaller size which consists of an inline
extent that is compressed, we did not discard (or made unusable) the
data between the new file size and the old file size, wasting metadata
space and allowing for the truncated data to be leaked and the data
corruption/loss mentioned below.
We were also not correctly decrementing the number of bytes used by the
inode, we were setting it to zero, giving a wrong report for callers of
the stat(2) syscall. The fsck tool also reported an error about a mismatch
between the nbytes of the file versus the real space used by the file.
Now because we weren't discarding the truncated region of the file, it
was possible for a caller of the clone ioctl to actually read the data
that was truncated, allowing for a security breach without requiring root
access to the system, using only standard filesystem operations. The
scenario is the following:
1) User A creates a file which consists of an inline and compressed
extent with a size of 2000 bytes - the file is not accessible to
any other users (no read, write or execution permission for anyone
else);
2) The user truncates the file to a size of 1000 bytes;
3) User A makes the file world readable;
4) User B creates a file consisting of an inline extent of 2000 bytes;
5) User B issues a clone operation from user A's file into its own
file (using a length argument of 0, clone the whole range);
6) User B now gets to see the 1000 bytes that user A truncated from
its file before it made its file world readbale. User B also lost
the bytes in the range [1000, 2000[ bytes from its own file, but
that might be ok if his/her intention was reading stale data from
user A that was never supposed to be public.
Note that this contrasts with the case where we truncate a file from 2000
bytes to 1000 bytes and then truncate it back from 1000 to 2000 bytes. In
this case reading any byte from the range [1000, 2000[ will return a value
of 0x00, instead of the original data.
This problem exists since the clone ioctl was added and happens both with
and without my recent data loss and file corruption fixes for the clone
ioctl (patch "Btrfs: fix file corruption and data loss after cloning
inline extents").
So fix this by truncating the compressed inline extents as we do for the
non-compressed case, which involves decompressing, if the data isn't already
in the page cache, compressing the truncated version of the extent, writing
the compressed content into the inline extent and then truncate it.
The following test case for fstests reproduces the problem. In order for
the test to pass both this fix and my previous fix for the clone ioctl
that forbids cloning a smaller inline extent into a larger one,
which is titled "Btrfs: fix file corruption and data loss after cloning
inline extents", are needed. Without that other fix the test fails in a
different way that does not leak the truncated data, instead part of
destination file gets replaced with zeroes (because the destination file
has a larger inline extent than the source).
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_cloner
rm -f $seqres.full
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount "-o compress"
# Create our test files. File foo is going to be the source of a clone operation
# and consists of a single inline extent with an uncompressed size of 512 bytes,
# while file bar consists of a single inline extent with an uncompressed size of
# 256 bytes. For our test's purpose, it's important that file bar has an inline
# extent with a size smaller than foo's inline extent.
$XFS_IO_PROG -f -c "pwrite -S 0xa1 0 128" \
-c "pwrite -S 0x2a 128 384" \
$SCRATCH_MNT/foo | _filter_xfs_io
$XFS_IO_PROG -f -c "pwrite -S 0xbb 0 256" $SCRATCH_MNT/bar | _filter_xfs_io
# Now durably persist all metadata and data. We do this to make sure that we get
# on disk an inline extent with a size of 512 bytes for file foo.
sync
# Now truncate our file foo to a smaller size. Because it consists of a
# compressed and inline extent, btrfs did not shrink the inline extent to the
# new size (if the extent was not compressed, btrfs would shrink it to 128
# bytes), it only updates the inode's i_size to 128 bytes.
$XFS_IO_PROG -c "truncate 128" $SCRATCH_MNT/foo
# Now clone foo's inline extent into bar.
# This clone operation should fail with errno EOPNOTSUPP because the source
# file consists only of an inline extent and the file's size is smaller than
# the inline extent of the destination (128 bytes < 256 bytes). However the
# clone ioctl was not prepared to deal with a file that has a size smaller
# than the size of its inline extent (something that happens only for compressed
# inline extents), resulting in copying the full inline extent from the source
# file into the destination file.
#
# Note that btrfs' clone operation for inline extents consists of removing the
# inline extent from the destination inode and copy the inline extent from the
# source inode into the destination inode, meaning that if the destination
# inode's inline extent is larger (N bytes) than the source inode's inline
# extent (M bytes), some bytes (N - M bytes) will be lost from the destination
# file. Btrfs could copy the source inline extent's data into the destination's
# inline extent so that we would not lose any data, but that's currently not
# done due to the complexity that would be needed to deal with such cases
# (specially when one or both extents are compressed), returning EOPNOTSUPP, as
# it's normally not a very common case to clone very small files (only case
# where we get inline extents) and copying inline extents does not save any
# space (unlike for normal, non-inlined extents).
$CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/foo $SCRATCH_MNT/bar
# Now because the above clone operation used to succeed, and due to foo's inline
# extent not being shinked by the truncate operation, our file bar got the whole
# inline extent copied from foo, making us lose the last 128 bytes from bar
# which got replaced by the bytes in range [128, 256[ from foo before foo was
# truncated - in other words, data loss from bar and being able to read old and
# stale data from foo that should not be possible to read anymore through normal
# filesystem operations. Contrast with the case where we truncate a file from a
# size N to a smaller size M, truncate it back to size N and then read the range
# [M, N[, we should always get the value 0x00 for all the bytes in that range.
# We expected the clone operation to fail with errno EOPNOTSUPP and therefore
# not modify our file's bar data/metadata. So its content should be 256 bytes
# long with all bytes having the value 0xbb.
#
# Without the btrfs bug fix, the clone operation succeeded and resulted in
# leaking truncated data from foo, the bytes that belonged to its range
# [128, 256[, and losing data from bar in that same range. So reading the
# file gave us the following content:
#
# 0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1
# *
# 0000200 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a
# *
# 0000400
echo "File bar's content after the clone operation:"
od -t x1 $SCRATCH_MNT/bar
# Also because the foo's inline extent was not shrunk by the truncate
# operation, btrfs' fsck, which is run by the fstests framework everytime a
# test completes, failed reporting the following error:
#
# root 5 inode 257 errors 400, nbytes wrong
status=0
exit
Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-10-16 19:34:25 +08:00
|
|
|
inode_sub_bytes(inode, item_end + 1 - new_size);
|
2008-02-09 02:49:28 +08:00
|
|
|
}
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
2007-11-01 23:28:41 +08:00
|
|
|
delete:
|
2007-06-12 18:35:45 +08:00
|
|
|
if (del_item) {
|
2008-01-30 04:11:36 +08:00
|
|
|
if (!pending_del_nr) {
|
|
|
|
/* no pending yet, add ourselves */
|
|
|
|
pending_del_slot = path->slots[0];
|
|
|
|
pending_del_nr = 1;
|
|
|
|
} else if (pending_del_nr &&
|
|
|
|
path->slots[0] + 1 == pending_del_slot) {
|
|
|
|
/* hop on the pending chunk */
|
|
|
|
pending_del_nr++;
|
|
|
|
pending_del_slot = path->slots[0];
|
|
|
|
} else {
|
2009-01-06 10:25:51 +08:00
|
|
|
BUG();
|
2008-01-30 04:11:36 +08:00
|
|
|
}
|
2007-06-12 18:35:45 +08:00
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
}
|
2015-02-04 22:59:29 +08:00
|
|
|
should_throttle = 0;
|
|
|
|
|
2014-04-02 19:51:05 +08:00
|
|
|
if (found_extent &&
|
|
|
|
(test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
|
|
|
|
root == root->fs_info->tree_root)) {
|
2009-03-13 23:00:37 +08:00
|
|
|
btrfs_set_path_blocking(path);
|
2014-12-18 01:41:04 +08:00
|
|
|
bytes_deleted += extent_num_bytes;
|
2007-06-12 18:35:45 +08:00
|
|
|
ret = btrfs_free_extent(trans, root, extent_start,
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
extent_num_bytes, 0,
|
|
|
|
btrfs_header_owner(leaf),
|
Btrfs: fix regression running delayed references when using qgroups
In the kernel 4.2 merge window we had a big changes to the implementation
of delayed references and qgroups which made the no_quota field of delayed
references not used anymore. More specifically the no_quota field is not
used anymore as of:
commit 0ed4792af0e8 ("btrfs: qgroup: Switch to new extent-oriented qgroup mechanism.")
Leaving the no_quota field actually prevents delayed references from
getting merged, which in turn cause the following BUG_ON(), at
fs/btrfs/extent-tree.c, to be hit when qgroups are enabled:
static int run_delayed_tree_ref(...)
{
(...)
BUG_ON(node->ref_mod != 1);
(...)
}
This happens on a scenario like the following:
1) Ref1 bytenr X, action = BTRFS_ADD_DELAYED_REF, no_quota = 1, added.
2) Ref2 bytenr X, action = BTRFS_DROP_DELAYED_REF, no_quota = 0, added.
It's not merged with Ref1 because Ref1->no_quota != Ref2->no_quota.
3) Ref3 bytenr X, action = BTRFS_ADD_DELAYED_REF, no_quota = 1, added.
It's not merged with the reference at the tail of the list of refs
for bytenr X because the reference at the tail, Ref2 is incompatible
due to Ref2->no_quota != Ref3->no_quota.
4) Ref4 bytenr X, action = BTRFS_DROP_DELAYED_REF, no_quota = 0, added.
It's not merged with the reference at the tail of the list of refs
for bytenr X because the reference at the tail, Ref3 is incompatible
due to Ref3->no_quota != Ref4->no_quota.
5) We run delayed references, trigger merging of delayed references,
through __btrfs_run_delayed_refs() -> btrfs_merge_delayed_refs().
6) Ref1 and Ref3 are merged as Ref1->no_quota = Ref3->no_quota and
all other conditions are satisfied too. So Ref1 gets a ref_mod
value of 2.
7) Ref2 and Ref4 are merged as Ref2->no_quota = Ref4->no_quota and
all other conditions are satisfied too. So Ref2 gets a ref_mod
value of 2.
8) Ref1 and Ref2 aren't merged, because they have different values
for their no_quota field.
9) Delayed reference Ref1 is picked for running (select_delayed_ref()
always prefers references with an action == BTRFS_ADD_DELAYED_REF).
So run_delayed_tree_ref() is called for Ref1 which triggers the
BUG_ON because Ref1->red_mod != 1 (equals 2).
So fix this by removing the no_quota field, as it's not used anymore as
of commit 0ed4792af0e8 ("btrfs: qgroup: Switch to new extent-oriented
qgroup mechanism.").
The use of no_quota was also buggy in at least two places:
1) At delayed-refs.c:btrfs_add_delayed_tree_ref() - we were setting
no_quota to 0 instead of 1 when the following condition was true:
is_fstree(ref_root) || !fs_info->quota_enabled
2) At extent-tree.c:__btrfs_inc_extent_ref() - we were attempting to
reset a node's no_quota when the condition "!is_fstree(root_objectid)
|| !root->fs_info->quota_enabled" was true but we did it only in
an unused local stack variable, that is, we never reset the no_quota
value in the node itself.
This fixes the remainder of problems several people have been having when
running delayed references, mostly while a balance is running in parallel,
on a 4.2+ kernel.
Very special thanks to Stéphane Lesimple for helping debugging this issue
and testing this fix on his multi terabyte filesystem (which took more
than one day to balance alone, plus fsck, etc).
Also, this fixes deadlock issue when using the clone ioctl with qgroups
enabled, as reported by Elias Probst in the mailing list. The deadlock
happens because after calling btrfs_insert_empty_item we have our path
holding a write lock on a leaf of the fs/subvol tree and then before
releasing the path we called check_ref() which did backref walking, when
qgroups are enabled, and tried to read lock the same leaf. The trace for
this case is the following:
INFO: task systemd-nspawn:6095 blocked for more than 120 seconds.
(...)
Call Trace:
[<ffffffff86999201>] schedule+0x74/0x83
[<ffffffff863ef64c>] btrfs_tree_read_lock+0xc0/0xea
[<ffffffff86137ed7>] ? wait_woken+0x74/0x74
[<ffffffff8639f0a7>] btrfs_search_old_slot+0x51a/0x810
[<ffffffff863a129b>] btrfs_next_old_leaf+0xdf/0x3ce
[<ffffffff86413a00>] ? ulist_add_merge+0x1b/0x127
[<ffffffff86411688>] __resolve_indirect_refs+0x62a/0x667
[<ffffffff863ef546>] ? btrfs_clear_lock_blocking_rw+0x78/0xbe
[<ffffffff864122d3>] find_parent_nodes+0xaf3/0xfc6
[<ffffffff86412838>] __btrfs_find_all_roots+0x92/0xf0
[<ffffffff864128f2>] btrfs_find_all_roots+0x45/0x65
[<ffffffff8639a75b>] ? btrfs_get_tree_mod_seq+0x2b/0x88
[<ffffffff863e852e>] check_ref+0x64/0xc4
[<ffffffff863e9e01>] btrfs_clone+0x66e/0xb5d
[<ffffffff863ea77f>] btrfs_ioctl_clone+0x48f/0x5bb
[<ffffffff86048a68>] ? native_sched_clock+0x28/0x77
[<ffffffff863ed9b0>] btrfs_ioctl+0xabc/0x25cb
(...)
The problem goes away by eleminating check_ref(), which no longer is
needed as its purpose was to get a value for the no_quota field of
a delayed reference (this patch removes the no_quota field as mentioned
earlier).
Reported-by: Stéphane Lesimple <stephane_btrfs@lesimple.fr>
Tested-by: Stéphane Lesimple <stephane_btrfs@lesimple.fr>
Reported-by: Elias Probst <mail@eliasprobst.eu>
Reported-by: Peter Becker <floyd.net@gmail.com>
Reported-by: Malte Schröder <malte@tnxip.de>
Reported-by: Derek Dongray <derek@valedon.co.uk>
Reported-by: Erkki Seppala <flux-btrfs@inside.org>
Cc: stable@vger.kernel.org # 4.2+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
2015-10-23 14:52:54 +08:00
|
|
|
ino, extent_offset);
|
2007-06-12 18:35:45 +08:00
|
|
|
BUG_ON(ret);
|
2015-02-03 23:50:16 +08:00
|
|
|
if (btrfs_should_throttle_delayed_refs(trans, root))
|
2014-12-18 01:41:04 +08:00
|
|
|
btrfs_async_run_delayed_refs(root,
|
|
|
|
trans->delayed_ref_updates * 2, 0);
|
2015-02-04 22:59:29 +08:00
|
|
|
if (be_nice) {
|
|
|
|
if (truncate_space_check(trans, root,
|
|
|
|
extent_num_bytes)) {
|
|
|
|
should_end = 1;
|
|
|
|
}
|
|
|
|
if (btrfs_should_throttle_delayed_refs(trans,
|
|
|
|
root)) {
|
|
|
|
should_throttle = 1;
|
|
|
|
}
|
|
|
|
}
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
2008-01-30 04:11:36 +08:00
|
|
|
|
2009-11-12 17:35:36 +08:00
|
|
|
if (found_type == BTRFS_INODE_ITEM_KEY)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (path->slots[0] == 0 ||
|
2015-02-03 23:50:16 +08:00
|
|
|
path->slots[0] != pending_del_slot ||
|
2015-02-04 22:59:29 +08:00
|
|
|
should_throttle || should_end) {
|
2009-11-12 17:35:36 +08:00
|
|
|
if (pending_del_nr) {
|
|
|
|
ret = btrfs_del_items(trans, root, path,
|
|
|
|
pending_del_slot,
|
|
|
|
pending_del_nr);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret) {
|
|
|
|
btrfs_abort_transaction(trans,
|
|
|
|
root, ret);
|
|
|
|
goto error;
|
|
|
|
}
|
2009-11-12 17:35:36 +08:00
|
|
|
pending_del_nr = 0;
|
|
|
|
}
|
2011-04-21 07:20:15 +08:00
|
|
|
btrfs_release_path(path);
|
2015-02-04 22:59:29 +08:00
|
|
|
if (should_throttle) {
|
2015-02-03 23:50:16 +08:00
|
|
|
unsigned long updates = trans->delayed_ref_updates;
|
|
|
|
if (updates) {
|
|
|
|
trans->delayed_ref_updates = 0;
|
|
|
|
ret = btrfs_run_delayed_refs(trans, root, updates * 2);
|
|
|
|
if (ret && !err)
|
|
|
|
err = ret;
|
|
|
|
}
|
|
|
|
}
|
2015-02-04 22:59:29 +08:00
|
|
|
/*
|
|
|
|
* if we failed to refill our space rsv, bail out
|
|
|
|
* and let the transaction restart
|
|
|
|
*/
|
|
|
|
if (should_end) {
|
|
|
|
err = -EAGAIN;
|
|
|
|
goto error;
|
|
|
|
}
|
2008-01-30 04:11:36 +08:00
|
|
|
goto search_again;
|
2009-11-12 17:35:36 +08:00
|
|
|
} else {
|
|
|
|
path->slots[0]--;
|
2008-01-30 04:11:36 +08:00
|
|
|
}
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
2009-11-12 17:35:36 +08:00
|
|
|
out:
|
2008-01-30 04:11:36 +08:00
|
|
|
if (pending_del_nr) {
|
|
|
|
ret = btrfs_del_items(trans, root, path, pending_del_slot,
|
|
|
|
pending_del_nr);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret)
|
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
2008-01-30 04:11:36 +08:00
|
|
|
}
|
2012-03-12 23:03:00 +08:00
|
|
|
error:
|
Btrfs: fix shrinking truncate when the no_holes feature is enabled
If the no_holes feature is enabled, we attempt to shrink a file to a size
that ends up in the middle of a hole and we don't have any file extent
items in the fs/subvol tree that go beyond the new file size (or any
ordered extents that will insert such file extent items), we end up not
updating the inode's disk_i_size, we only update the inode's i_size.
This means that after unmounting and mounting the filesystem, or after
the inode is evicted and reloaded, its i_size ends up being incorrect
(an inode's i_size is set to the disk_i_size field when an inode is
loaded). This happens when btrfs_truncate_inode_items() doesn't find
any file extent items to drop - in this case it never makes a call to
btrfs_ordered_update_i_size() in order to update the inode's disk_i_size.
Example reproducer:
$ mkfs.btrfs -O no-holes -f /dev/sdd
$ mount /dev/sdd /mnt
# Create our test file with some data and durably persist it.
$ xfs_io -f -c "pwrite -S 0xaa 0 128K" /mnt/foo
$ sync
# Append some data to the file, increasing its size, and leave a hole
# between the old size and the start offset if the following write. So
# our file gets a hole in the range [128Kb, 256Kb[.
$ xfs_io -c "truncate 160K" /mnt/foo
# We expect to see our file with a size of 160Kb, with the first 128Kb
# of data all having the value 0xaa and the remaining 32Kb of data all
# having the value 0x00.
$ od -t x1 /mnt/foo
0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa
*
0400000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
*
0500000
# Now cleanly unmount and mount again the filesystem.
$ umount /mnt
$ mount /dev/sdd /mnt
# We expect to get the same result as before, a file with a size of
# 160Kb, with the first 128Kb of data all having the value 0xaa and the
# remaining 32Kb of data all having the value 0x00.
$ od -t x1 /mnt/foo
0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa
*
0400000
In the example above the file size/data do not match what they were before
the remount.
Fix this by always calling btrfs_ordered_update_i_size() with a size
matching the size the file was truncated to if btrfs_truncate_inode_items()
is not called for a log tree and no file extent items were dropped. This
ensures the same behaviour as when the no_holes feature is not enabled.
A test case for fstests follows soon.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-06-21 01:20:09 +08:00
|
|
|
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
|
2013-08-30 04:43:28 +08:00
|
|
|
btrfs_ordered_update_i_size(inode, last_size, NULL);
|
2014-12-18 01:41:04 +08:00
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
btrfs_free_path(path);
|
2014-12-18 01:41:04 +08:00
|
|
|
|
2015-12-15 00:42:10 +08:00
|
|
|
if (be_nice && bytes_deleted > SZ_32M) {
|
2014-12-18 01:41:04 +08:00
|
|
|
unsigned long updates = trans->delayed_ref_updates;
|
|
|
|
if (updates) {
|
|
|
|
trans->delayed_ref_updates = 0;
|
|
|
|
ret = btrfs_run_delayed_refs(trans, root, updates * 2);
|
|
|
|
if (ret && !err)
|
|
|
|
err = ret;
|
|
|
|
}
|
|
|
|
}
|
2009-11-12 17:35:36 +08:00
|
|
|
return err;
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2012-08-30 02:27:18 +08:00
|
|
|
* btrfs_truncate_page - read, zero a chunk and write a page
|
|
|
|
* @inode - inode that we're zeroing
|
|
|
|
* @from - the offset to start zeroing
|
|
|
|
* @len - the length to zero, 0 to zero the entire range respective to the
|
|
|
|
* offset
|
|
|
|
* @front - zero up to the offset instead of from the offset on
|
|
|
|
*
|
|
|
|
* This will find the page for the "from" offset and cow the page and zero the
|
|
|
|
* part we want to zero. This is used with truncate and hole punching.
|
2007-06-12 18:35:45 +08:00
|
|
|
*/
|
2012-08-30 02:27:18 +08:00
|
|
|
int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
|
|
|
|
int front)
|
2007-06-12 18:35:45 +08:00
|
|
|
{
|
2012-08-30 02:27:18 +08:00
|
|
|
struct address_space *mapping = inode->i_mapping;
|
2007-10-16 04:15:53 +08:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2008-07-18 00:53:50 +08:00
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
|
|
|
struct btrfs_ordered_extent *ordered;
|
2010-02-04 03:33:23 +08:00
|
|
|
struct extent_state *cached_state = NULL;
|
2008-07-18 00:53:50 +08:00
|
|
|
char *kaddr;
|
2007-10-16 04:15:53 +08:00
|
|
|
u32 blocksize = root->sectorsize;
|
2007-06-12 18:35:45 +08:00
|
|
|
pgoff_t index = from >> PAGE_CACHE_SHIFT;
|
|
|
|
unsigned offset = from & (PAGE_CACHE_SIZE-1);
|
|
|
|
struct page *page;
|
2011-09-22 03:05:58 +08:00
|
|
|
gfp_t mask = btrfs_alloc_write_mask(mapping);
|
2007-06-12 18:35:45 +08:00
|
|
|
int ret = 0;
|
2007-08-28 04:49:44 +08:00
|
|
|
u64 page_start;
|
2008-07-18 00:53:50 +08:00
|
|
|
u64 page_end;
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2012-08-30 02:27:18 +08:00
|
|
|
if ((offset & (blocksize - 1)) == 0 &&
|
|
|
|
(!len || ((len & (blocksize - 1)) == 0)))
|
2007-06-12 18:35:45 +08:00
|
|
|
goto out;
|
2015-09-08 17:25:55 +08:00
|
|
|
ret = btrfs_delalloc_reserve_space(inode,
|
2015-09-08 17:25:54 +08:00
|
|
|
round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE);
|
2009-10-14 04:46:49 +08:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2008-05-15 21:13:45 +08:00
|
|
|
again:
|
2011-09-22 03:05:58 +08:00
|
|
|
page = find_or_create_page(mapping, index, mask);
|
2009-10-14 04:46:49 +08:00
|
|
|
if (!page) {
|
2015-09-08 17:25:55 +08:00
|
|
|
btrfs_delalloc_release_space(inode,
|
2015-09-08 17:25:54 +08:00
|
|
|
round_down(from, PAGE_CACHE_SIZE),
|
|
|
|
PAGE_CACHE_SIZE);
|
2012-12-05 18:56:13 +08:00
|
|
|
ret = -ENOMEM;
|
2007-06-12 18:35:45 +08:00
|
|
|
goto out;
|
2009-10-14 04:46:49 +08:00
|
|
|
}
|
2008-07-18 00:53:50 +08:00
|
|
|
|
|
|
|
page_start = page_offset(page);
|
|
|
|
page_end = page_start + PAGE_CACHE_SIZE - 1;
|
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
if (!PageUptodate(page)) {
|
2007-06-16 01:50:00 +08:00
|
|
|
ret = btrfs_readpage(NULL, page);
|
2007-06-12 18:35:45 +08:00
|
|
|
lock_page(page);
|
2008-05-15 21:13:45 +08:00
|
|
|
if (page->mapping != mapping) {
|
|
|
|
unlock_page(page);
|
|
|
|
page_cache_release(page);
|
|
|
|
goto again;
|
|
|
|
}
|
2007-06-12 18:35:45 +08:00
|
|
|
if (!PageUptodate(page)) {
|
|
|
|
ret = -EIO;
|
2008-07-24 21:41:53 +08:00
|
|
|
goto out_unlock;
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
|
|
|
}
|
2008-05-15 21:13:45 +08:00
|
|
|
wait_on_page_writeback(page);
|
2008-07-18 00:53:50 +08:00
|
|
|
|
2015-12-03 21:30:40 +08:00
|
|
|
lock_extent_bits(io_tree, page_start, page_end, &cached_state);
|
2008-07-18 00:53:50 +08:00
|
|
|
set_page_extent_mapped(page);
|
|
|
|
|
|
|
|
ordered = btrfs_lookup_ordered_extent(inode, page_start);
|
|
|
|
if (ordered) {
|
2010-02-04 03:33:23 +08:00
|
|
|
unlock_extent_cached(io_tree, page_start, page_end,
|
|
|
|
&cached_state, GFP_NOFS);
|
2008-07-18 00:53:50 +08:00
|
|
|
unlock_page(page);
|
|
|
|
page_cache_release(page);
|
2008-07-18 01:53:27 +08:00
|
|
|
btrfs_start_ordered_extent(inode, ordered, 1);
|
2008-07-18 00:53:50 +08:00
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
|
2010-02-04 03:33:23 +08:00
|
|
|
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
|
2012-09-06 09:10:51 +08:00
|
|
|
EXTENT_DIRTY | EXTENT_DELALLOC |
|
|
|
|
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
|
2010-02-04 03:33:23 +08:00
|
|
|
0, 0, &cached_state, GFP_NOFS);
|
2009-10-14 04:46:49 +08:00
|
|
|
|
2010-02-04 03:33:23 +08:00
|
|
|
ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
|
|
|
|
&cached_state);
|
2009-09-12 04:12:44 +08:00
|
|
|
if (ret) {
|
2010-02-04 03:33:23 +08:00
|
|
|
unlock_extent_cached(io_tree, page_start, page_end,
|
|
|
|
&cached_state, GFP_NOFS);
|
2009-09-12 04:12:44 +08:00
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
2008-07-18 00:53:50 +08:00
|
|
|
if (offset != PAGE_CACHE_SIZE) {
|
2012-08-30 02:27:18 +08:00
|
|
|
if (!len)
|
|
|
|
len = PAGE_CACHE_SIZE - offset;
|
2008-07-18 00:53:50 +08:00
|
|
|
kaddr = kmap(page);
|
2012-08-30 02:27:18 +08:00
|
|
|
if (front)
|
|
|
|
memset(kaddr, 0, offset);
|
|
|
|
else
|
|
|
|
memset(kaddr + offset, 0, len);
|
2008-07-18 00:53:50 +08:00
|
|
|
flush_dcache_page(page);
|
|
|
|
kunmap(page);
|
|
|
|
}
|
2008-07-18 00:53:51 +08:00
|
|
|
ClearPageChecked(page);
|
2008-07-18 00:53:50 +08:00
|
|
|
set_page_dirty(page);
|
2010-02-04 03:33:23 +08:00
|
|
|
unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
|
|
|
|
GFP_NOFS);
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2008-07-24 21:41:53 +08:00
|
|
|
out_unlock:
|
2009-10-14 04:46:49 +08:00
|
|
|
if (ret)
|
2015-09-08 17:25:55 +08:00
|
|
|
btrfs_delalloc_release_space(inode, page_start,
|
|
|
|
PAGE_CACHE_SIZE);
|
2007-06-12 18:35:45 +08:00
|
|
|
unlock_page(page);
|
|
|
|
page_cache_release(page);
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-10-23 00:18:51 +08:00
|
|
|
static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
|
|
|
|
u64 offset, u64 len)
|
|
|
|
{
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Still need to make sure the inode looks like it's been updated so
|
|
|
|
* that any holes get logged if we fsync.
|
|
|
|
*/
|
|
|
|
if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) {
|
|
|
|
BTRFS_I(inode)->last_trans = root->fs_info->generation;
|
|
|
|
BTRFS_I(inode)->last_sub_trans = root->log_transid;
|
|
|
|
BTRFS_I(inode)->last_log_commit = root->last_log_commit;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* 1 - for the one we're dropping
|
|
|
|
* 1 - for the one we're adding
|
|
|
|
* 1 - for updating the inode.
|
|
|
|
*/
|
|
|
|
trans = btrfs_start_transaction(root, 3);
|
|
|
|
if (IS_ERR(trans))
|
|
|
|
return PTR_ERR(trans);
|
|
|
|
|
|
|
|
ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
|
|
|
|
if (ret) {
|
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
|
|
|
btrfs_end_transaction(trans, root);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
|
|
|
|
0, 0, len, 0, len, 0, 0, 0);
|
|
|
|
if (ret)
|
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
|
|
|
else
|
|
|
|
btrfs_update_inode(trans, root, inode);
|
|
|
|
btrfs_end_transaction(trans, root);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2011-03-05 04:46:53 +08:00
|
|
|
/*
|
|
|
|
* This function puts in dummy file extents for the area we're creating a hole
|
|
|
|
* for. So if we are truncating this file to a larger size we need to insert
|
|
|
|
* these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
|
|
|
|
* the range between oldsize and size
|
|
|
|
*/
|
2011-02-01 04:30:16 +08:00
|
|
|
int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
|
2007-06-12 18:35:45 +08:00
|
|
|
{
|
2008-10-31 02:19:41 +08:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
2010-05-16 22:48:46 +08:00
|
|
|
struct extent_map *em = NULL;
|
2010-02-04 03:33:23 +08:00
|
|
|
struct extent_state *cached_state = NULL;
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-18 01:14:17 +08:00
|
|
|
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
|
2013-02-26 16:10:22 +08:00
|
|
|
u64 hole_start = ALIGN(oldsize, root->sectorsize);
|
|
|
|
u64 block_end = ALIGN(size, root->sectorsize);
|
2008-10-31 02:19:41 +08:00
|
|
|
u64 last_byte;
|
|
|
|
u64 cur_offset;
|
|
|
|
u64 hole_size;
|
2009-09-12 04:12:44 +08:00
|
|
|
int err = 0;
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2013-06-18 05:14:39 +08:00
|
|
|
/*
|
|
|
|
* If our size started in the middle of a page we need to zero out the
|
|
|
|
* rest of the page before we expand the i_size, otherwise we could
|
|
|
|
* expose stale data.
|
|
|
|
*/
|
|
|
|
err = btrfs_truncate_page(inode, oldsize, 0, 0);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
2008-10-31 02:19:41 +08:00
|
|
|
if (size <= hole_start)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
struct btrfs_ordered_extent *ordered;
|
Btrfs: improve jitter performance of the sequential buffered write
The performance was slowed down sometimes when we ran sysbench to measure
the performance of the sequential buffered write by 2 or more threads.
It was because the write order of the test threads might be confused
by the task scheduler, and the coming write would be beyond the end of
the file, in this case, we need insert dummy file extents and create
a hole for the area we skip. But in order to avoid the ongoing ordered
extents which are in the area, we need wait for them. Unfortunately,
the current code doesn't check if there are ordered extents in the area
or not, try to find and flush the dirty pages directly, but in fact,
there is no dirty page in that area, this step of the current code is
unnecessary, and just wastes time. Sometimes, it would increase
the contention of some locks, and makes the performance slow down suddenly.
So we remove the ordered extent flush function before the check, and flush
the dirty pages and wait for the ordered extents only when we find them.
According to my test, we got 1-2 times of the performance regression when
we ran the test by 10 times before applying this patch. After applying
this patch, the regression went away.
Test Environment:
CPU: 1CPU * 4Cores
Memory: 6GB
Partition: 20GB
Test Command:
# sysbench --test=fileio --file-total-size=16G --file-test-mode=seqwr \
> --num-threads=512 --file-block-size=16384 --max-time=60 --max-requests=0 run
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-09-26 13:15:27 +08:00
|
|
|
|
2015-12-03 21:30:40 +08:00
|
|
|
lock_extent_bits(io_tree, hole_start, block_end - 1,
|
2012-03-01 21:57:19 +08:00
|
|
|
&cached_state);
|
Btrfs: improve jitter performance of the sequential buffered write
The performance was slowed down sometimes when we ran sysbench to measure
the performance of the sequential buffered write by 2 or more threads.
It was because the write order of the test threads might be confused
by the task scheduler, and the coming write would be beyond the end of
the file, in this case, we need insert dummy file extents and create
a hole for the area we skip. But in order to avoid the ongoing ordered
extents which are in the area, we need wait for them. Unfortunately,
the current code doesn't check if there are ordered extents in the area
or not, try to find and flush the dirty pages directly, but in fact,
there is no dirty page in that area, this step of the current code is
unnecessary, and just wastes time. Sometimes, it would increase
the contention of some locks, and makes the performance slow down suddenly.
So we remove the ordered extent flush function before the check, and flush
the dirty pages and wait for the ordered extents only when we find them.
According to my test, we got 1-2 times of the performance regression when
we ran the test by 10 times before applying this patch. After applying
this patch, the regression went away.
Test Environment:
CPU: 1CPU * 4Cores
Memory: 6GB
Partition: 20GB
Test Command:
# sysbench --test=fileio --file-total-size=16G --file-test-mode=seqwr \
> --num-threads=512 --file-block-size=16384 --max-time=60 --max-requests=0 run
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-09-26 13:15:27 +08:00
|
|
|
ordered = btrfs_lookup_ordered_range(inode, hole_start,
|
|
|
|
block_end - hole_start);
|
2008-10-31 02:19:41 +08:00
|
|
|
if (!ordered)
|
|
|
|
break;
|
2010-02-04 03:33:23 +08:00
|
|
|
unlock_extent_cached(io_tree, hole_start, block_end - 1,
|
|
|
|
&cached_state, GFP_NOFS);
|
Btrfs: improve jitter performance of the sequential buffered write
The performance was slowed down sometimes when we ran sysbench to measure
the performance of the sequential buffered write by 2 or more threads.
It was because the write order of the test threads might be confused
by the task scheduler, and the coming write would be beyond the end of
the file, in this case, we need insert dummy file extents and create
a hole for the area we skip. But in order to avoid the ongoing ordered
extents which are in the area, we need wait for them. Unfortunately,
the current code doesn't check if there are ordered extents in the area
or not, try to find and flush the dirty pages directly, but in fact,
there is no dirty page in that area, this step of the current code is
unnecessary, and just wastes time. Sometimes, it would increase
the contention of some locks, and makes the performance slow down suddenly.
So we remove the ordered extent flush function before the check, and flush
the dirty pages and wait for the ordered extents only when we find them.
According to my test, we got 1-2 times of the performance regression when
we ran the test by 10 times before applying this patch. After applying
this patch, the regression went away.
Test Environment:
CPU: 1CPU * 4Cores
Memory: 6GB
Partition: 20GB
Test Command:
# sysbench --test=fileio --file-total-size=16G --file-test-mode=seqwr \
> --num-threads=512 --file-block-size=16384 --max-time=60 --max-requests=0 run
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-09-26 13:15:27 +08:00
|
|
|
btrfs_start_ordered_extent(inode, ordered, 1);
|
2008-10-31 02:19:41 +08:00
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
}
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2008-10-31 02:19:41 +08:00
|
|
|
cur_offset = hole_start;
|
|
|
|
while (1) {
|
|
|
|
em = btrfs_get_extent(inode, NULL, 0, cur_offset,
|
|
|
|
block_end - cur_offset, 0);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (IS_ERR(em)) {
|
|
|
|
err = PTR_ERR(em);
|
2013-01-09 03:37:58 +08:00
|
|
|
em = NULL;
|
2012-03-12 23:03:00 +08:00
|
|
|
break;
|
|
|
|
}
|
2008-10-31 02:19:41 +08:00
|
|
|
last_byte = min(extent_map_end(em), block_end);
|
2013-02-26 16:10:22 +08:00
|
|
|
last_byte = ALIGN(last_byte , root->sectorsize);
|
2009-11-12 17:35:36 +08:00
|
|
|
if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-18 01:14:17 +08:00
|
|
|
struct extent_map *hole_em;
|
2008-10-31 02:19:41 +08:00
|
|
|
hole_size = last_byte - cur_offset;
|
2009-09-12 04:12:44 +08:00
|
|
|
|
2013-10-23 00:18:51 +08:00
|
|
|
err = maybe_insert_hole(root, inode, cur_offset,
|
|
|
|
hole_size);
|
|
|
|
if (err)
|
2011-02-01 05:03:11 +08:00
|
|
|
break;
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-18 01:14:17 +08:00
|
|
|
btrfs_drop_extent_cache(inode, cur_offset,
|
|
|
|
cur_offset + hole_size - 1, 0);
|
|
|
|
hole_em = alloc_extent_map();
|
|
|
|
if (!hole_em) {
|
|
|
|
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
|
|
|
|
&BTRFS_I(inode)->runtime_flags);
|
|
|
|
goto next;
|
|
|
|
}
|
|
|
|
hole_em->start = cur_offset;
|
|
|
|
hole_em->len = hole_size;
|
|
|
|
hole_em->orig_start = cur_offset;
|
2009-11-12 17:35:36 +08:00
|
|
|
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-18 01:14:17 +08:00
|
|
|
hole_em->block_start = EXTENT_MAP_HOLE;
|
|
|
|
hole_em->block_len = 0;
|
2012-12-03 23:31:19 +08:00
|
|
|
hole_em->orig_block_len = 0;
|
2013-04-05 02:31:27 +08:00
|
|
|
hole_em->ram_bytes = hole_size;
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-18 01:14:17 +08:00
|
|
|
hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
|
|
|
|
hole_em->compress_type = BTRFS_COMPRESS_NONE;
|
2013-10-23 00:18:51 +08:00
|
|
|
hole_em->generation = root->fs_info->generation;
|
2009-11-12 17:35:36 +08:00
|
|
|
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-18 01:14:17 +08:00
|
|
|
while (1) {
|
|
|
|
write_lock(&em_tree->lock);
|
2013-04-06 04:51:15 +08:00
|
|
|
err = add_extent_mapping(em_tree, hole_em, 1);
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-18 01:14:17 +08:00
|
|
|
write_unlock(&em_tree->lock);
|
|
|
|
if (err != -EEXIST)
|
|
|
|
break;
|
|
|
|
btrfs_drop_extent_cache(inode, cur_offset,
|
|
|
|
cur_offset +
|
|
|
|
hole_size - 1, 0);
|
|
|
|
}
|
|
|
|
free_extent_map(hole_em);
|
2008-10-31 02:19:41 +08:00
|
|
|
}
|
2013-10-23 00:18:51 +08:00
|
|
|
next:
|
2008-10-31 02:19:41 +08:00
|
|
|
free_extent_map(em);
|
2010-05-16 22:48:46 +08:00
|
|
|
em = NULL;
|
2008-10-31 02:19:41 +08:00
|
|
|
cur_offset = last_byte;
|
2009-11-12 17:35:36 +08:00
|
|
|
if (cur_offset >= block_end)
|
2008-10-31 02:19:41 +08:00
|
|
|
break;
|
|
|
|
}
|
2010-05-16 22:48:46 +08:00
|
|
|
free_extent_map(em);
|
2010-02-04 03:33:23 +08:00
|
|
|
unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
|
|
|
|
GFP_NOFS);
|
2008-10-31 02:19:41 +08:00
|
|
|
return err;
|
|
|
|
}
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2013-01-12 10:57:22 +08:00
|
|
|
static int btrfs_setsize(struct inode *inode, struct iattr *attr)
|
2009-11-12 17:35:36 +08:00
|
|
|
{
|
2011-12-15 09:12:01 +08:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_trans_handle *trans;
|
2011-02-01 04:30:16 +08:00
|
|
|
loff_t oldsize = i_size_read(inode);
|
2013-01-12 10:57:22 +08:00
|
|
|
loff_t newsize = attr->ia_size;
|
|
|
|
int mask = attr->ia_valid;
|
2009-11-12 17:35:36 +08:00
|
|
|
int ret;
|
|
|
|
|
2013-01-12 10:57:22 +08:00
|
|
|
/*
|
|
|
|
* The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
|
|
|
|
* special case where we need to update the times despite not having
|
|
|
|
* these flags set. For all other operations the VFS set these flags
|
|
|
|
* explicitly if it wants a timestamp update.
|
|
|
|
*/
|
2013-11-19 23:17:07 +08:00
|
|
|
if (newsize != oldsize) {
|
|
|
|
inode_inc_iversion(inode);
|
|
|
|
if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
|
|
|
|
inode->i_ctime = inode->i_mtime =
|
|
|
|
current_fs_time(inode->i_sb);
|
|
|
|
}
|
2013-01-12 10:57:22 +08:00
|
|
|
|
2011-02-01 04:30:16 +08:00
|
|
|
if (newsize > oldsize) {
|
2013-09-13 06:13:56 +08:00
|
|
|
truncate_pagecache(inode, newsize);
|
Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain about it.
For example, if we perform the following file operations:
$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
-c "pwrite -S 0xaa -b 32K 0 32K" \
-c "fsync" \
-c "pwrite -S 0xbb -b 32770 16K 32770" \
-c "truncate 90123" \
/mnt/foobar
and the snapshot creation ioctl was just called before the second write, we often
can get the following inode items in the snapshot's btree:
item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0
There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[
for which there's no file extent item covering it. This is because the file write
and file truncate operations happened both right after the snapshot creation ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the
ordered extent that matches the write and, in btrfs_setsize(), we were able to call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possibe to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.
Btrfs' fsck tool complains about such cases with a message like the following:
"root 331 inode 257 errors 100, file extent discount"
>From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:
1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured
But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-10-29 19:57:59 +08:00
|
|
|
/*
|
|
|
|
* Don't do an expanding truncate while snapshoting is ongoing.
|
|
|
|
* This is to ensure the snapshot captures a fully consistent
|
|
|
|
* state of this file - if the snapshot captures this expanding
|
|
|
|
* truncation, it must capture all writes that happened before
|
|
|
|
* this truncation.
|
|
|
|
*/
|
2016-01-06 18:56:36 +08:00
|
|
|
btrfs_wait_for_snapshot_creation(root);
|
2011-02-01 04:30:16 +08:00
|
|
|
ret = btrfs_cont_expand(inode, oldsize, newsize);
|
Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain about it.
For example, if we perform the following file operations:
$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
-c "pwrite -S 0xaa -b 32K 0 32K" \
-c "fsync" \
-c "pwrite -S 0xbb -b 32770 16K 32770" \
-c "truncate 90123" \
/mnt/foobar
and the snapshot creation ioctl was just called before the second write, we often
can get the following inode items in the snapshot's btree:
item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0
There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[
for which there's no file extent item covering it. This is because the file write
and file truncate operations happened both right after the snapshot creation ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the
ordered extent that matches the write and, in btrfs_setsize(), we were able to call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possibe to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.
Btrfs' fsck tool complains about such cases with a message like the following:
"root 331 inode 257 errors 100, file extent discount"
>From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:
1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured
But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-10-29 19:57:59 +08:00
|
|
|
if (ret) {
|
|
|
|
btrfs_end_write_no_snapshoting(root);
|
2009-11-12 17:35:36 +08:00
|
|
|
return ret;
|
Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain about it.
For example, if we perform the following file operations:
$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
-c "pwrite -S 0xaa -b 32K 0 32K" \
-c "fsync" \
-c "pwrite -S 0xbb -b 32770 16K 32770" \
-c "truncate 90123" \
/mnt/foobar
and the snapshot creation ioctl was just called before the second write, we often
can get the following inode items in the snapshot's btree:
item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0
There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[
for which there's no file extent item covering it. This is because the file write
and file truncate operations happened both right after the snapshot creation ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the
ordered extent that matches the write and, in btrfs_setsize(), we were able to call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possibe to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.
Btrfs' fsck tool complains about such cases with a message like the following:
"root 331 inode 257 errors 100, file extent discount"
>From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:
1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured
But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-10-29 19:57:59 +08:00
|
|
|
}
|
2009-11-12 17:35:36 +08:00
|
|
|
|
2011-12-15 09:12:01 +08:00
|
|
|
trans = btrfs_start_transaction(root, 1);
|
Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain about it.
For example, if we perform the following file operations:
$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
-c "pwrite -S 0xaa -b 32K 0 32K" \
-c "fsync" \
-c "pwrite -S 0xbb -b 32770 16K 32770" \
-c "truncate 90123" \
/mnt/foobar
and the snapshot creation ioctl was just called before the second write, we often
can get the following inode items in the snapshot's btree:
item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0
There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[
for which there's no file extent item covering it. This is because the file write
and file truncate operations happened both right after the snapshot creation ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the
ordered extent that matches the write and, in btrfs_setsize(), we were able to call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possibe to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.
Btrfs' fsck tool complains about such cases with a message like the following:
"root 331 inode 257 errors 100, file extent discount"
>From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:
1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured
But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-10-29 19:57:59 +08:00
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
btrfs_end_write_no_snapshoting(root);
|
2011-12-15 09:12:01 +08:00
|
|
|
return PTR_ERR(trans);
|
Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain about it.
For example, if we perform the following file operations:
$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
-c "pwrite -S 0xaa -b 32K 0 32K" \
-c "fsync" \
-c "pwrite -S 0xbb -b 32770 16K 32770" \
-c "truncate 90123" \
/mnt/foobar
and the snapshot creation ioctl was just called before the second write, we often
can get the following inode items in the snapshot's btree:
item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0
There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[
for which there's no file extent item covering it. This is because the file write
and file truncate operations happened both right after the snapshot creation ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the
ordered extent that matches the write and, in btrfs_setsize(), we were able to call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possibe to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.
Btrfs' fsck tool complains about such cases with a message like the following:
"root 331 inode 257 errors 100, file extent discount"
>From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:
1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured
But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-10-29 19:57:59 +08:00
|
|
|
}
|
2011-12-15 09:12:01 +08:00
|
|
|
|
|
|
|
i_size_write(inode, newsize);
|
|
|
|
btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
|
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
Btrfs: fix snapshot inconsistency after a file write followed by truncate
If right after starting the snapshot creation ioctl we perform a write against a
file followed by a truncate, with both operations increasing the file's size, we
can get a snapshot tree that reflects a state of the source subvolume's tree where
the file truncation happened but the write operation didn't. This leaves a gap
between 2 file extent items of the inode, which makes btrfs' fsck complain about it.
For example, if we perform the following file operations:
$ mkfs.btrfs -f /dev/vdd
$ mount /dev/vdd /mnt
$ xfs_io -f \
-c "pwrite -S 0xaa -b 32K 0 32K" \
-c "fsync" \
-c "pwrite -S 0xbb -b 32770 16K 32770" \
-c "truncate 90123" \
/mnt/foobar
and the snapshot creation ioctl was just called before the second write, we often
can get the following inode items in the snapshot's btree:
item 120 key (257 INODE_ITEM 0) itemoff 7987 itemsize 160
inode generation 146 transid 7 size 90123 block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 flags 0x0
item 121 key (257 INODE_REF 256) itemoff 7967 itemsize 20
inode ref index 282 namelen 10 name: foobar
item 122 key (257 EXTENT_DATA 0) itemoff 7914 itemsize 53
extent data disk byte 1104855040 nr 32768
extent data offset 0 nr 32768 ram 32768
extent compression 0
item 123 key (257 EXTENT_DATA 53248) itemoff 7861 itemsize 53
extent data disk byte 0 nr 0
extent data offset 0 nr 40960 ram 40960
extent compression 0
There's a file range, corresponding to the interval [32K; ALIGN(16K + 32770, 4096)[
for which there's no file extent item covering it. This is because the file write
and file truncate operations happened both right after the snapshot creation ioctl
called btrfs_start_delalloc_inodes(), which means we didn't start and wait for the
ordered extent that matches the write and, in btrfs_setsize(), we were able to call
btrfs_cont_expand() before being able to commit the current transaction in the
snapshot creation ioctl. So this made it possibe to insert the hole file extent
item in the source subvolume (which represents the region added by the truncate)
right before the transaction commit from the snapshot creation ioctl.
Btrfs' fsck tool complains about such cases with a message like the following:
"root 331 inode 257 errors 100, file extent discount"
>From a user perspective, the expectation when a snapshot is created while those
file operations are being performed is that the snapshot will have a file that
either:
1) is empty
2) only the first write was captured
3) only the 2 writes were captured
4) both writes and the truncation were captured
But never capture a state where only the first write and the truncation were
captured (since the second write was performed before the truncation).
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-10-29 19:57:59 +08:00
|
|
|
btrfs_end_write_no_snapshoting(root);
|
2012-01-13 08:10:12 +08:00
|
|
|
btrfs_end_transaction(trans, root);
|
2011-02-01 04:30:16 +08:00
|
|
|
} else {
|
2009-11-12 17:35:36 +08:00
|
|
|
|
2011-02-01 04:30:16 +08:00
|
|
|
/*
|
|
|
|
* We're truncating a file that used to have good data down to
|
|
|
|
* zero. Make sure it gets into the ordered flush list so that
|
|
|
|
* any new writes get down to disk quickly.
|
|
|
|
*/
|
|
|
|
if (newsize == 0)
|
2012-05-24 02:13:11 +08:00
|
|
|
set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
|
|
|
|
&BTRFS_I(inode)->runtime_flags);
|
2009-11-12 17:35:36 +08:00
|
|
|
|
2013-01-08 06:03:21 +08:00
|
|
|
/*
|
|
|
|
* 1 for the orphan item we're going to add
|
|
|
|
* 1 for the orphan item deletion.
|
|
|
|
*/
|
|
|
|
trans = btrfs_start_transaction(root, 2);
|
|
|
|
if (IS_ERR(trans))
|
|
|
|
return PTR_ERR(trans);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We need to do this in case we fail at _any_ point during the
|
|
|
|
* actual truncate. Once we do the truncate_setsize we could
|
|
|
|
* invalidate pages which forces any outstanding ordered io to
|
|
|
|
* be instantly completed which will give us extents that need
|
|
|
|
* to be truncated. If we fail to get an orphan inode down we
|
|
|
|
* could have left over extents that were never meant to live,
|
|
|
|
* so we need to garuntee from this point on that everything
|
|
|
|
* will be consistent.
|
|
|
|
*/
|
|
|
|
ret = btrfs_orphan_add(trans, inode);
|
|
|
|
btrfs_end_transaction(trans, root);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2011-02-01 04:30:16 +08:00
|
|
|
/* we don't support swapfiles, so vmtruncate shouldn't fail */
|
|
|
|
truncate_setsize(inode, newsize);
|
2013-02-08 15:01:08 +08:00
|
|
|
|
|
|
|
/* Disable nonlocked read DIO to avoid the end less truncate */
|
|
|
|
btrfs_inode_block_unlocked_dio(inode);
|
|
|
|
inode_dio_wait(inode);
|
|
|
|
btrfs_inode_resume_unlocked_dio(inode);
|
|
|
|
|
2011-02-01 04:30:16 +08:00
|
|
|
ret = btrfs_truncate(inode);
|
2013-08-30 04:43:28 +08:00
|
|
|
if (ret && inode->i_nlink) {
|
|
|
|
int err;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* failed to truncate, disk_i_size is only adjusted down
|
|
|
|
* as we remove extents, so it should represent the true
|
|
|
|
* size of the inode, so reset the in memory size and
|
|
|
|
* delete our orphan entry.
|
|
|
|
*/
|
|
|
|
trans = btrfs_join_transaction(root);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
btrfs_orphan_del(NULL, inode);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
i_size_write(inode, BTRFS_I(inode)->disk_i_size);
|
|
|
|
err = btrfs_orphan_del(trans, inode);
|
|
|
|
if (err)
|
|
|
|
btrfs_abort_transaction(trans, root, err);
|
|
|
|
btrfs_end_transaction(trans, root);
|
|
|
|
}
|
2009-11-12 17:35:36 +08:00
|
|
|
}
|
|
|
|
|
2011-02-01 04:30:16 +08:00
|
|
|
return ret;
|
2009-11-12 17:35:36 +08:00
|
|
|
}
|
|
|
|
|
2008-10-31 02:19:41 +08:00
|
|
|
static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
|
|
|
|
{
|
2015-03-18 06:25:59 +08:00
|
|
|
struct inode *inode = d_inode(dentry);
|
2010-12-20 16:04:08 +08:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2008-10-31 02:19:41 +08:00
|
|
|
int err;
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2010-12-20 16:04:08 +08:00
|
|
|
if (btrfs_root_readonly(root))
|
|
|
|
return -EROFS;
|
|
|
|
|
2008-10-31 02:19:41 +08:00
|
|
|
err = inode_change_ok(inode, attr);
|
|
|
|
if (err)
|
|
|
|
return err;
|
2007-08-30 23:54:02 +08:00
|
|
|
|
2009-04-01 01:27:11 +08:00
|
|
|
if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
|
2013-01-12 10:57:22 +08:00
|
|
|
err = btrfs_setsize(inode, attr);
|
2009-11-12 17:35:36 +08:00
|
|
|
if (err)
|
|
|
|
return err;
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
2008-10-31 02:19:41 +08:00
|
|
|
|
2010-06-04 17:30:02 +08:00
|
|
|
if (attr->ia_valid) {
|
|
|
|
setattr_copy(inode, attr);
|
2012-04-06 03:03:02 +08:00
|
|
|
inode_inc_iversion(inode);
|
2011-11-30 23:45:38 +08:00
|
|
|
err = btrfs_dirty_inode(inode);
|
2010-06-04 17:30:02 +08:00
|
|
|
|
2011-11-30 23:45:38 +08:00
|
|
|
if (!err && attr->ia_valid & ATTR_MODE)
|
2013-12-20 21:16:43 +08:00
|
|
|
err = posix_acl_chmod(inode, inode->i_mode);
|
2010-06-04 17:30:02 +08:00
|
|
|
}
|
2008-07-25 00:16:36 +08:00
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
return err;
|
|
|
|
}
|
2008-01-15 05:24:38 +08:00
|
|
|
|
2013-11-20 06:29:35 +08:00
|
|
|
/*
|
|
|
|
* While truncating the inode pages during eviction, we get the VFS calling
|
|
|
|
* btrfs_invalidatepage() against each page of the inode. This is slow because
|
|
|
|
* the calls to btrfs_invalidatepage() result in a huge amount of calls to
|
|
|
|
* lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
|
|
|
|
* extent_state structures over and over, wasting lots of time.
|
|
|
|
*
|
|
|
|
* Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
|
|
|
|
* those expensive operations on a per page basis and do only the ordered io
|
|
|
|
* finishing, while we release here the extent_map and extent_state structures,
|
|
|
|
* without the excessive merging and splitting.
|
|
|
|
*/
|
|
|
|
static void evict_inode_truncate_pages(struct inode *inode)
|
|
|
|
{
|
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
|
|
|
struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
|
|
|
|
struct rb_node *node;
|
|
|
|
|
|
|
|
ASSERT(inode->i_state & I_FREEING);
|
2014-04-04 05:47:49 +08:00
|
|
|
truncate_inode_pages_final(&inode->i_data);
|
2013-11-20 06:29:35 +08:00
|
|
|
|
|
|
|
write_lock(&map_tree->lock);
|
|
|
|
while (!RB_EMPTY_ROOT(&map_tree->map)) {
|
|
|
|
struct extent_map *em;
|
|
|
|
|
|
|
|
node = rb_first(&map_tree->map);
|
|
|
|
em = rb_entry(node, struct extent_map, rb_node);
|
2013-12-14 15:27:31 +08:00
|
|
|
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
|
|
|
|
clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
|
2013-11-20 06:29:35 +08:00
|
|
|
remove_extent_mapping(map_tree, em);
|
|
|
|
free_extent_map(em);
|
2014-08-08 09:47:05 +08:00
|
|
|
if (need_resched()) {
|
|
|
|
write_unlock(&map_tree->lock);
|
|
|
|
cond_resched();
|
|
|
|
write_lock(&map_tree->lock);
|
|
|
|
}
|
2013-11-20 06:29:35 +08:00
|
|
|
}
|
|
|
|
write_unlock(&map_tree->lock);
|
|
|
|
|
Btrfs: fix hang during inode eviction due to concurrent readahead
Zygo Blaxell and other users have reported occasional hangs while an
inode is being evicted, leading to traces like the following:
[ 5281.972322] INFO: task rm:20488 blocked for more than 120 seconds.
[ 5281.973836] Not tainted 4.0.0-rc5-btrfs-next-9+ #2
[ 5281.974818] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 5281.976364] rm D ffff8800724cfc38 0 20488 7747 0x00000000
[ 5281.977506] ffff8800724cfc38 ffff8800724cfc38 ffff880065da5c50 0000000000000001
[ 5281.978461] ffff8800724cffd8 ffff8801540a5f50 0000000000000008 ffff8801540a5f78
[ 5281.979541] ffff8801540a5f50 ffff8800724cfc58 ffffffff8143107e 0000000000000123
[ 5281.981396] Call Trace:
[ 5281.982066] [<ffffffff8143107e>] schedule+0x74/0x83
[ 5281.983341] [<ffffffffa03b33cf>] wait_on_state+0xac/0xcd [btrfs]
[ 5281.985127] [<ffffffff81075cd6>] ? signal_pending_state+0x31/0x31
[ 5281.986715] [<ffffffffa03b4b71>] wait_extent_bit.constprop.32+0x7c/0xde [btrfs]
[ 5281.988680] [<ffffffffa03b540b>] lock_extent_bits+0x5d/0x88 [btrfs]
[ 5281.990200] [<ffffffffa03a621d>] btrfs_evict_inode+0x24e/0x5be [btrfs]
[ 5281.991781] [<ffffffff8116964d>] evict+0xa0/0x148
[ 5281.992735] [<ffffffff8116a43d>] iput+0x18f/0x1e5
[ 5281.993796] [<ffffffff81160d4a>] do_unlinkat+0x15b/0x1fa
[ 5281.994806] [<ffffffff81435b54>] ? ret_from_sys_call+0x1d/0x58
[ 5281.996120] [<ffffffff8107d314>] ? trace_hardirqs_on_caller+0x18f/0x1ab
[ 5281.997562] [<ffffffff8123960b>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[ 5281.998815] [<ffffffff81161a16>] SyS_unlinkat+0x29/0x2b
[ 5281.999920] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[ 5282.001299] 1 lock held by rm/20488:
[ 5282.002066] #0: (sb_writers#12){.+.+.+}, at: [<ffffffff8116dd81>] mnt_want_write+0x24/0x4b
This happens when we have readahead, which calls readpages(), happening
right before the inode eviction handler is invoked. So the reason is
essentially:
1) readpages() is called while a reference on the inode is held, so
eviction can not be triggered before readpages() returns. It also
locks one or more ranges in the inode's io_tree (which is done at
extent_io.c:__do_contiguous_readpages());
2) readpages() submits several read bios, all with an end io callback
that runs extent_io.c:end_bio_extent_readpage() and that is executed
by other task when a bio finishes, corresponding to a work queue
(fs_info->end_io_workers) worker kthread. This callback unlocks
the ranges in the inode's io_tree that were previously locked in
step 1;
3) readpages() returns, the reference on the inode is dropped;
4) One or more of the read bios previously submitted are still not
complete (their end io callback was not yet invoked or has not
yet finished execution);
5) Inode eviction is triggered (through an unlink call for example).
The inode reference count was not incremented before submitting
the read bios, therefore this is possible;
6) The eviction handler starts executing and enters the loop that
iterates over all extent states in the inode's io_tree;
7) The loop picks one extent state record and uses its ->start and
->end fields, after releasing the inode's io_tree spinlock, to
call lock_extent_bits() and clear_extent_bit(). The call to lock
the range [state->start, state->end] blocks because the whole
range or a part of it was locked by the previous call to
readpages() and the corresponding end io callback, which unlocks
the range was not yet executed;
8) The end io callback for the read bio is executed and unlocks the
range [state->start, state->end] (or a superset of that range).
And at clear_extent_bit() the extent_state record state is used
as a second argument to split_state(), which sets state->start to
a larger value;
9) The task executing the eviction handler is woken up by the task
executing the bio's end io callback (through clear_state_bit) and
the eviction handler locks the range
[old value for state->start, state->end]. Shortly after, when
calling clear_extent_bit(), it unlocks the range
[new value for state->start, state->end], so it ends up unlocking
only part of the range that it locked, leaving an extent state
record in the io_tree that represents the unlocked subrange;
10) The eviction handler loop, in its next iteration, gets the
extent_state record for the subrange that it did not unlock in the
previous step and then tries to lock it, resulting in an hang.
So fix this by not using the ->start and ->end fields of an existing
extent_state record. This is a simple solution, and an alternative
could be to bump the inode's reference count before submitting each
read bio and having it dropped in the bio's end io callback. But that
would be a more invasive/complex change and would not protect against
other possible places that are not holding a reference on the inode
as well. Something to consider in the future.
Many thanks to Zygo Blaxell for reporting, in the mailing list, the
issue, a set of scripts to trigger it and testing this fix.
Reported-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Tested-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-26 07:55:42 +08:00
|
|
|
/*
|
|
|
|
* Keep looping until we have no more ranges in the io tree.
|
|
|
|
* We can have ongoing bios started by readpages (called from readahead)
|
2015-06-10 19:55:41 +08:00
|
|
|
* that have their endio callback (extent_io.c:end_bio_extent_readpage)
|
|
|
|
* still in progress (unlocked the pages in the bio but did not yet
|
|
|
|
* unlocked the ranges in the io tree). Therefore this means some
|
Btrfs: fix hang during inode eviction due to concurrent readahead
Zygo Blaxell and other users have reported occasional hangs while an
inode is being evicted, leading to traces like the following:
[ 5281.972322] INFO: task rm:20488 blocked for more than 120 seconds.
[ 5281.973836] Not tainted 4.0.0-rc5-btrfs-next-9+ #2
[ 5281.974818] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 5281.976364] rm D ffff8800724cfc38 0 20488 7747 0x00000000
[ 5281.977506] ffff8800724cfc38 ffff8800724cfc38 ffff880065da5c50 0000000000000001
[ 5281.978461] ffff8800724cffd8 ffff8801540a5f50 0000000000000008 ffff8801540a5f78
[ 5281.979541] ffff8801540a5f50 ffff8800724cfc58 ffffffff8143107e 0000000000000123
[ 5281.981396] Call Trace:
[ 5281.982066] [<ffffffff8143107e>] schedule+0x74/0x83
[ 5281.983341] [<ffffffffa03b33cf>] wait_on_state+0xac/0xcd [btrfs]
[ 5281.985127] [<ffffffff81075cd6>] ? signal_pending_state+0x31/0x31
[ 5281.986715] [<ffffffffa03b4b71>] wait_extent_bit.constprop.32+0x7c/0xde [btrfs]
[ 5281.988680] [<ffffffffa03b540b>] lock_extent_bits+0x5d/0x88 [btrfs]
[ 5281.990200] [<ffffffffa03a621d>] btrfs_evict_inode+0x24e/0x5be [btrfs]
[ 5281.991781] [<ffffffff8116964d>] evict+0xa0/0x148
[ 5281.992735] [<ffffffff8116a43d>] iput+0x18f/0x1e5
[ 5281.993796] [<ffffffff81160d4a>] do_unlinkat+0x15b/0x1fa
[ 5281.994806] [<ffffffff81435b54>] ? ret_from_sys_call+0x1d/0x58
[ 5281.996120] [<ffffffff8107d314>] ? trace_hardirqs_on_caller+0x18f/0x1ab
[ 5281.997562] [<ffffffff8123960b>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[ 5281.998815] [<ffffffff81161a16>] SyS_unlinkat+0x29/0x2b
[ 5281.999920] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[ 5282.001299] 1 lock held by rm/20488:
[ 5282.002066] #0: (sb_writers#12){.+.+.+}, at: [<ffffffff8116dd81>] mnt_want_write+0x24/0x4b
This happens when we have readahead, which calls readpages(), happening
right before the inode eviction handler is invoked. So the reason is
essentially:
1) readpages() is called while a reference on the inode is held, so
eviction can not be triggered before readpages() returns. It also
locks one or more ranges in the inode's io_tree (which is done at
extent_io.c:__do_contiguous_readpages());
2) readpages() submits several read bios, all with an end io callback
that runs extent_io.c:end_bio_extent_readpage() and that is executed
by other task when a bio finishes, corresponding to a work queue
(fs_info->end_io_workers) worker kthread. This callback unlocks
the ranges in the inode's io_tree that were previously locked in
step 1;
3) readpages() returns, the reference on the inode is dropped;
4) One or more of the read bios previously submitted are still not
complete (their end io callback was not yet invoked or has not
yet finished execution);
5) Inode eviction is triggered (through an unlink call for example).
The inode reference count was not incremented before submitting
the read bios, therefore this is possible;
6) The eviction handler starts executing and enters the loop that
iterates over all extent states in the inode's io_tree;
7) The loop picks one extent state record and uses its ->start and
->end fields, after releasing the inode's io_tree spinlock, to
call lock_extent_bits() and clear_extent_bit(). The call to lock
the range [state->start, state->end] blocks because the whole
range or a part of it was locked by the previous call to
readpages() and the corresponding end io callback, which unlocks
the range was not yet executed;
8) The end io callback for the read bio is executed and unlocks the
range [state->start, state->end] (or a superset of that range).
And at clear_extent_bit() the extent_state record state is used
as a second argument to split_state(), which sets state->start to
a larger value;
9) The task executing the eviction handler is woken up by the task
executing the bio's end io callback (through clear_state_bit) and
the eviction handler locks the range
[old value for state->start, state->end]. Shortly after, when
calling clear_extent_bit(), it unlocks the range
[new value for state->start, state->end], so it ends up unlocking
only part of the range that it locked, leaving an extent state
record in the io_tree that represents the unlocked subrange;
10) The eviction handler loop, in its next iteration, gets the
extent_state record for the subrange that it did not unlock in the
previous step and then tries to lock it, resulting in an hang.
So fix this by not using the ->start and ->end fields of an existing
extent_state record. This is a simple solution, and an alternative
could be to bump the inode's reference count before submitting each
read bio and having it dropped in the bio's end io callback. But that
would be a more invasive/complex change and would not protect against
other possible places that are not holding a reference on the inode
as well. Something to consider in the future.
Many thanks to Zygo Blaxell for reporting, in the mailing list, the
issue, a set of scripts to trigger it and testing this fix.
Reported-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Tested-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-26 07:55:42 +08:00
|
|
|
* ranges can still be locked and eviction started because before
|
|
|
|
* submitting those bios, which are executed by a separate task (work
|
|
|
|
* queue kthread), inode references (inode->i_count) were not taken
|
|
|
|
* (which would be dropped in the end io callback of each bio).
|
|
|
|
* Therefore here we effectively end up waiting for those bios and
|
|
|
|
* anyone else holding locked ranges without having bumped the inode's
|
|
|
|
* reference count - if we don't do it, when they access the inode's
|
|
|
|
* io_tree to unlock a range it may be too late, leading to an
|
|
|
|
* use-after-free issue.
|
|
|
|
*/
|
2013-11-20 06:29:35 +08:00
|
|
|
spin_lock(&io_tree->lock);
|
|
|
|
while (!RB_EMPTY_ROOT(&io_tree->state)) {
|
|
|
|
struct extent_state *state;
|
|
|
|
struct extent_state *cached_state = NULL;
|
Btrfs: fix hang during inode eviction due to concurrent readahead
Zygo Blaxell and other users have reported occasional hangs while an
inode is being evicted, leading to traces like the following:
[ 5281.972322] INFO: task rm:20488 blocked for more than 120 seconds.
[ 5281.973836] Not tainted 4.0.0-rc5-btrfs-next-9+ #2
[ 5281.974818] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 5281.976364] rm D ffff8800724cfc38 0 20488 7747 0x00000000
[ 5281.977506] ffff8800724cfc38 ffff8800724cfc38 ffff880065da5c50 0000000000000001
[ 5281.978461] ffff8800724cffd8 ffff8801540a5f50 0000000000000008 ffff8801540a5f78
[ 5281.979541] ffff8801540a5f50 ffff8800724cfc58 ffffffff8143107e 0000000000000123
[ 5281.981396] Call Trace:
[ 5281.982066] [<ffffffff8143107e>] schedule+0x74/0x83
[ 5281.983341] [<ffffffffa03b33cf>] wait_on_state+0xac/0xcd [btrfs]
[ 5281.985127] [<ffffffff81075cd6>] ? signal_pending_state+0x31/0x31
[ 5281.986715] [<ffffffffa03b4b71>] wait_extent_bit.constprop.32+0x7c/0xde [btrfs]
[ 5281.988680] [<ffffffffa03b540b>] lock_extent_bits+0x5d/0x88 [btrfs]
[ 5281.990200] [<ffffffffa03a621d>] btrfs_evict_inode+0x24e/0x5be [btrfs]
[ 5281.991781] [<ffffffff8116964d>] evict+0xa0/0x148
[ 5281.992735] [<ffffffff8116a43d>] iput+0x18f/0x1e5
[ 5281.993796] [<ffffffff81160d4a>] do_unlinkat+0x15b/0x1fa
[ 5281.994806] [<ffffffff81435b54>] ? ret_from_sys_call+0x1d/0x58
[ 5281.996120] [<ffffffff8107d314>] ? trace_hardirqs_on_caller+0x18f/0x1ab
[ 5281.997562] [<ffffffff8123960b>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[ 5281.998815] [<ffffffff81161a16>] SyS_unlinkat+0x29/0x2b
[ 5281.999920] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[ 5282.001299] 1 lock held by rm/20488:
[ 5282.002066] #0: (sb_writers#12){.+.+.+}, at: [<ffffffff8116dd81>] mnt_want_write+0x24/0x4b
This happens when we have readahead, which calls readpages(), happening
right before the inode eviction handler is invoked. So the reason is
essentially:
1) readpages() is called while a reference on the inode is held, so
eviction can not be triggered before readpages() returns. It also
locks one or more ranges in the inode's io_tree (which is done at
extent_io.c:__do_contiguous_readpages());
2) readpages() submits several read bios, all with an end io callback
that runs extent_io.c:end_bio_extent_readpage() and that is executed
by other task when a bio finishes, corresponding to a work queue
(fs_info->end_io_workers) worker kthread. This callback unlocks
the ranges in the inode's io_tree that were previously locked in
step 1;
3) readpages() returns, the reference on the inode is dropped;
4) One or more of the read bios previously submitted are still not
complete (their end io callback was not yet invoked or has not
yet finished execution);
5) Inode eviction is triggered (through an unlink call for example).
The inode reference count was not incremented before submitting
the read bios, therefore this is possible;
6) The eviction handler starts executing and enters the loop that
iterates over all extent states in the inode's io_tree;
7) The loop picks one extent state record and uses its ->start and
->end fields, after releasing the inode's io_tree spinlock, to
call lock_extent_bits() and clear_extent_bit(). The call to lock
the range [state->start, state->end] blocks because the whole
range or a part of it was locked by the previous call to
readpages() and the corresponding end io callback, which unlocks
the range was not yet executed;
8) The end io callback for the read bio is executed and unlocks the
range [state->start, state->end] (or a superset of that range).
And at clear_extent_bit() the extent_state record state is used
as a second argument to split_state(), which sets state->start to
a larger value;
9) The task executing the eviction handler is woken up by the task
executing the bio's end io callback (through clear_state_bit) and
the eviction handler locks the range
[old value for state->start, state->end]. Shortly after, when
calling clear_extent_bit(), it unlocks the range
[new value for state->start, state->end], so it ends up unlocking
only part of the range that it locked, leaving an extent state
record in the io_tree that represents the unlocked subrange;
10) The eviction handler loop, in its next iteration, gets the
extent_state record for the subrange that it did not unlock in the
previous step and then tries to lock it, resulting in an hang.
So fix this by not using the ->start and ->end fields of an existing
extent_state record. This is a simple solution, and an alternative
could be to bump the inode's reference count before submitting each
read bio and having it dropped in the bio's end io callback. But that
would be a more invasive/complex change and would not protect against
other possible places that are not holding a reference on the inode
as well. Something to consider in the future.
Many thanks to Zygo Blaxell for reporting, in the mailing list, the
issue, a set of scripts to trigger it and testing this fix.
Reported-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Tested-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-26 07:55:42 +08:00
|
|
|
u64 start;
|
|
|
|
u64 end;
|
2013-11-20 06:29:35 +08:00
|
|
|
|
|
|
|
node = rb_first(&io_tree->state);
|
|
|
|
state = rb_entry(node, struct extent_state, rb_node);
|
Btrfs: fix hang during inode eviction due to concurrent readahead
Zygo Blaxell and other users have reported occasional hangs while an
inode is being evicted, leading to traces like the following:
[ 5281.972322] INFO: task rm:20488 blocked for more than 120 seconds.
[ 5281.973836] Not tainted 4.0.0-rc5-btrfs-next-9+ #2
[ 5281.974818] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 5281.976364] rm D ffff8800724cfc38 0 20488 7747 0x00000000
[ 5281.977506] ffff8800724cfc38 ffff8800724cfc38 ffff880065da5c50 0000000000000001
[ 5281.978461] ffff8800724cffd8 ffff8801540a5f50 0000000000000008 ffff8801540a5f78
[ 5281.979541] ffff8801540a5f50 ffff8800724cfc58 ffffffff8143107e 0000000000000123
[ 5281.981396] Call Trace:
[ 5281.982066] [<ffffffff8143107e>] schedule+0x74/0x83
[ 5281.983341] [<ffffffffa03b33cf>] wait_on_state+0xac/0xcd [btrfs]
[ 5281.985127] [<ffffffff81075cd6>] ? signal_pending_state+0x31/0x31
[ 5281.986715] [<ffffffffa03b4b71>] wait_extent_bit.constprop.32+0x7c/0xde [btrfs]
[ 5281.988680] [<ffffffffa03b540b>] lock_extent_bits+0x5d/0x88 [btrfs]
[ 5281.990200] [<ffffffffa03a621d>] btrfs_evict_inode+0x24e/0x5be [btrfs]
[ 5281.991781] [<ffffffff8116964d>] evict+0xa0/0x148
[ 5281.992735] [<ffffffff8116a43d>] iput+0x18f/0x1e5
[ 5281.993796] [<ffffffff81160d4a>] do_unlinkat+0x15b/0x1fa
[ 5281.994806] [<ffffffff81435b54>] ? ret_from_sys_call+0x1d/0x58
[ 5281.996120] [<ffffffff8107d314>] ? trace_hardirqs_on_caller+0x18f/0x1ab
[ 5281.997562] [<ffffffff8123960b>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[ 5281.998815] [<ffffffff81161a16>] SyS_unlinkat+0x29/0x2b
[ 5281.999920] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[ 5282.001299] 1 lock held by rm/20488:
[ 5282.002066] #0: (sb_writers#12){.+.+.+}, at: [<ffffffff8116dd81>] mnt_want_write+0x24/0x4b
This happens when we have readahead, which calls readpages(), happening
right before the inode eviction handler is invoked. So the reason is
essentially:
1) readpages() is called while a reference on the inode is held, so
eviction can not be triggered before readpages() returns. It also
locks one or more ranges in the inode's io_tree (which is done at
extent_io.c:__do_contiguous_readpages());
2) readpages() submits several read bios, all with an end io callback
that runs extent_io.c:end_bio_extent_readpage() and that is executed
by other task when a bio finishes, corresponding to a work queue
(fs_info->end_io_workers) worker kthread. This callback unlocks
the ranges in the inode's io_tree that were previously locked in
step 1;
3) readpages() returns, the reference on the inode is dropped;
4) One or more of the read bios previously submitted are still not
complete (their end io callback was not yet invoked or has not
yet finished execution);
5) Inode eviction is triggered (through an unlink call for example).
The inode reference count was not incremented before submitting
the read bios, therefore this is possible;
6) The eviction handler starts executing and enters the loop that
iterates over all extent states in the inode's io_tree;
7) The loop picks one extent state record and uses its ->start and
->end fields, after releasing the inode's io_tree spinlock, to
call lock_extent_bits() and clear_extent_bit(). The call to lock
the range [state->start, state->end] blocks because the whole
range or a part of it was locked by the previous call to
readpages() and the corresponding end io callback, which unlocks
the range was not yet executed;
8) The end io callback for the read bio is executed and unlocks the
range [state->start, state->end] (or a superset of that range).
And at clear_extent_bit() the extent_state record state is used
as a second argument to split_state(), which sets state->start to
a larger value;
9) The task executing the eviction handler is woken up by the task
executing the bio's end io callback (through clear_state_bit) and
the eviction handler locks the range
[old value for state->start, state->end]. Shortly after, when
calling clear_extent_bit(), it unlocks the range
[new value for state->start, state->end], so it ends up unlocking
only part of the range that it locked, leaving an extent state
record in the io_tree that represents the unlocked subrange;
10) The eviction handler loop, in its next iteration, gets the
extent_state record for the subrange that it did not unlock in the
previous step and then tries to lock it, resulting in an hang.
So fix this by not using the ->start and ->end fields of an existing
extent_state record. This is a simple solution, and an alternative
could be to bump the inode's reference count before submitting each
read bio and having it dropped in the bio's end io callback. But that
would be a more invasive/complex change and would not protect against
other possible places that are not holding a reference on the inode
as well. Something to consider in the future.
Many thanks to Zygo Blaxell for reporting, in the mailing list, the
issue, a set of scripts to trigger it and testing this fix.
Reported-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Tested-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-26 07:55:42 +08:00
|
|
|
start = state->start;
|
|
|
|
end = state->end;
|
2013-11-20 06:29:35 +08:00
|
|
|
spin_unlock(&io_tree->lock);
|
|
|
|
|
2015-12-03 21:30:40 +08:00
|
|
|
lock_extent_bits(io_tree, start, end, &cached_state);
|
2015-09-29 10:35:16 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If still has DELALLOC flag, the extent didn't reach disk,
|
|
|
|
* and its reserved space won't be freed by delayed_ref.
|
|
|
|
* So we need to free its reserved space here.
|
|
|
|
* (Refer to comment in btrfs_invalidatepage, case 2)
|
|
|
|
*
|
|
|
|
* Note, end is the bytenr of last byte, so we need + 1 here.
|
|
|
|
*/
|
|
|
|
if (state->state & EXTENT_DELALLOC)
|
|
|
|
btrfs_qgroup_free_data(inode, start, end - start + 1);
|
|
|
|
|
Btrfs: fix hang during inode eviction due to concurrent readahead
Zygo Blaxell and other users have reported occasional hangs while an
inode is being evicted, leading to traces like the following:
[ 5281.972322] INFO: task rm:20488 blocked for more than 120 seconds.
[ 5281.973836] Not tainted 4.0.0-rc5-btrfs-next-9+ #2
[ 5281.974818] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 5281.976364] rm D ffff8800724cfc38 0 20488 7747 0x00000000
[ 5281.977506] ffff8800724cfc38 ffff8800724cfc38 ffff880065da5c50 0000000000000001
[ 5281.978461] ffff8800724cffd8 ffff8801540a5f50 0000000000000008 ffff8801540a5f78
[ 5281.979541] ffff8801540a5f50 ffff8800724cfc58 ffffffff8143107e 0000000000000123
[ 5281.981396] Call Trace:
[ 5281.982066] [<ffffffff8143107e>] schedule+0x74/0x83
[ 5281.983341] [<ffffffffa03b33cf>] wait_on_state+0xac/0xcd [btrfs]
[ 5281.985127] [<ffffffff81075cd6>] ? signal_pending_state+0x31/0x31
[ 5281.986715] [<ffffffffa03b4b71>] wait_extent_bit.constprop.32+0x7c/0xde [btrfs]
[ 5281.988680] [<ffffffffa03b540b>] lock_extent_bits+0x5d/0x88 [btrfs]
[ 5281.990200] [<ffffffffa03a621d>] btrfs_evict_inode+0x24e/0x5be [btrfs]
[ 5281.991781] [<ffffffff8116964d>] evict+0xa0/0x148
[ 5281.992735] [<ffffffff8116a43d>] iput+0x18f/0x1e5
[ 5281.993796] [<ffffffff81160d4a>] do_unlinkat+0x15b/0x1fa
[ 5281.994806] [<ffffffff81435b54>] ? ret_from_sys_call+0x1d/0x58
[ 5281.996120] [<ffffffff8107d314>] ? trace_hardirqs_on_caller+0x18f/0x1ab
[ 5281.997562] [<ffffffff8123960b>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[ 5281.998815] [<ffffffff81161a16>] SyS_unlinkat+0x29/0x2b
[ 5281.999920] [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[ 5282.001299] 1 lock held by rm/20488:
[ 5282.002066] #0: (sb_writers#12){.+.+.+}, at: [<ffffffff8116dd81>] mnt_want_write+0x24/0x4b
This happens when we have readahead, which calls readpages(), happening
right before the inode eviction handler is invoked. So the reason is
essentially:
1) readpages() is called while a reference on the inode is held, so
eviction can not be triggered before readpages() returns. It also
locks one or more ranges in the inode's io_tree (which is done at
extent_io.c:__do_contiguous_readpages());
2) readpages() submits several read bios, all with an end io callback
that runs extent_io.c:end_bio_extent_readpage() and that is executed
by other task when a bio finishes, corresponding to a work queue
(fs_info->end_io_workers) worker kthread. This callback unlocks
the ranges in the inode's io_tree that were previously locked in
step 1;
3) readpages() returns, the reference on the inode is dropped;
4) One or more of the read bios previously submitted are still not
complete (their end io callback was not yet invoked or has not
yet finished execution);
5) Inode eviction is triggered (through an unlink call for example).
The inode reference count was not incremented before submitting
the read bios, therefore this is possible;
6) The eviction handler starts executing and enters the loop that
iterates over all extent states in the inode's io_tree;
7) The loop picks one extent state record and uses its ->start and
->end fields, after releasing the inode's io_tree spinlock, to
call lock_extent_bits() and clear_extent_bit(). The call to lock
the range [state->start, state->end] blocks because the whole
range or a part of it was locked by the previous call to
readpages() and the corresponding end io callback, which unlocks
the range was not yet executed;
8) The end io callback for the read bio is executed and unlocks the
range [state->start, state->end] (or a superset of that range).
And at clear_extent_bit() the extent_state record state is used
as a second argument to split_state(), which sets state->start to
a larger value;
9) The task executing the eviction handler is woken up by the task
executing the bio's end io callback (through clear_state_bit) and
the eviction handler locks the range
[old value for state->start, state->end]. Shortly after, when
calling clear_extent_bit(), it unlocks the range
[new value for state->start, state->end], so it ends up unlocking
only part of the range that it locked, leaving an extent state
record in the io_tree that represents the unlocked subrange;
10) The eviction handler loop, in its next iteration, gets the
extent_state record for the subrange that it did not unlock in the
previous step and then tries to lock it, resulting in an hang.
So fix this by not using the ->start and ->end fields of an existing
extent_state record. This is a simple solution, and an alternative
could be to bump the inode's reference count before submitting each
read bio and having it dropped in the bio's end io callback. But that
would be a more invasive/complex change and would not protect against
other possible places that are not holding a reference on the inode
as well. Something to consider in the future.
Many thanks to Zygo Blaxell for reporting, in the mailing list, the
issue, a set of scripts to trigger it and testing this fix.
Reported-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Tested-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-05-26 07:55:42 +08:00
|
|
|
clear_extent_bit(io_tree, start, end,
|
2013-11-20 06:29:35 +08:00
|
|
|
EXTENT_LOCKED | EXTENT_DIRTY |
|
|
|
|
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
|
|
|
|
EXTENT_DEFRAG, 1, 1,
|
|
|
|
&cached_state, GFP_NOFS);
|
|
|
|
|
2014-08-08 09:47:05 +08:00
|
|
|
cond_resched();
|
2013-11-20 06:29:35 +08:00
|
|
|
spin_lock(&io_tree->lock);
|
|
|
|
}
|
|
|
|
spin_unlock(&io_tree->lock);
|
|
|
|
}
|
|
|
|
|
2010-06-07 23:35:40 +08:00
|
|
|
void btrfs_evict_inode(struct inode *inode)
|
2007-06-12 18:35:45 +08:00
|
|
|
{
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2011-09-27 03:46:06 +08:00
|
|
|
struct btrfs_block_rsv *rsv, *global_rsv;
|
2015-02-25 04:35:51 +08:00
|
|
|
int steal_from_global = 0;
|
2011-08-19 22:29:59 +08:00
|
|
|
u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
|
2007-06-12 18:35:45 +08:00
|
|
|
int ret;
|
|
|
|
|
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordere_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are cirtical resourses and produce a lot of corner cases during writeback,
so it is valuable to know how page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when a inode is created, when a inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
Chunk is a link between physical offset and logical offset, and stands for space
infomation in btrfs, and these are helpful on tracing space things.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 19:18:59 +08:00
|
|
|
trace_btrfs_inode_evict(inode);
|
|
|
|
|
2013-11-20 06:29:35 +08:00
|
|
|
evict_inode_truncate_pages(inode);
|
|
|
|
|
2013-09-05 22:58:43 +08:00
|
|
|
if (inode->i_nlink &&
|
|
|
|
((btrfs_root_refs(&root->root_item) != 0 &&
|
|
|
|
root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
|
|
|
|
btrfs_is_free_space_inode(inode)))
|
2010-06-07 23:35:40 +08:00
|
|
|
goto no_delete;
|
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
if (is_bad_inode(inode)) {
|
2008-07-25 00:17:14 +08:00
|
|
|
btrfs_orphan_del(NULL, inode);
|
2007-06-12 18:35:45 +08:00
|
|
|
goto no_delete;
|
|
|
|
}
|
2010-06-07 23:35:40 +08:00
|
|
|
/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
|
2015-09-12 09:44:17 +08:00
|
|
|
if (!special_file(inode->i_mode))
|
|
|
|
btrfs_wait_ordered_range(inode, 0, (u64)-1);
|
2007-10-16 04:14:19 +08:00
|
|
|
|
Btrfs: cleanup the read failure record after write or when the inode is freeing
After the data is written successfully, we should cleanup the read failure record
in that range because
- If we set data COW for the file, the range that the failure record pointed to is
mapped to a new place, so it is invalid.
- If we set no data COW for the file, and if there is no error during writting,
the corrupted data is corrected, so the failure record can be removed. And if
some errors happen on the mirrors, we also needn't worry about it because the
failure record will be recreated if we read the same place again.
Sometimes, we may fail to correct the data, so the failure records will be left
in the tree, we need free them when we free the inode or the memory leak happens.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-09-12 18:44:04 +08:00
|
|
|
btrfs_free_io_failure_record(inode, 0, (u64)-1);
|
|
|
|
|
2009-11-12 17:34:40 +08:00
|
|
|
if (root->fs_info->log_root_recovering) {
|
2012-06-26 11:59:09 +08:00
|
|
|
BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
|
2012-05-24 02:26:42 +08:00
|
|
|
&BTRFS_I(inode)->runtime_flags));
|
2009-11-12 17:34:40 +08:00
|
|
|
goto no_delete;
|
|
|
|
}
|
|
|
|
|
2009-09-22 04:00:26 +08:00
|
|
|
if (inode->i_nlink > 0) {
|
2013-09-05 22:58:43 +08:00
|
|
|
BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
|
|
|
|
root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
|
2009-09-22 04:00:26 +08:00
|
|
|
goto no_delete;
|
|
|
|
}
|
|
|
|
|
2012-12-19 14:59:51 +08:00
|
|
|
ret = btrfs_commit_inode_delayed_inode(inode);
|
|
|
|
if (ret) {
|
|
|
|
btrfs_orphan_del(NULL, inode);
|
|
|
|
goto no_delete;
|
|
|
|
}
|
|
|
|
|
2012-09-06 18:02:28 +08:00
|
|
|
rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
|
2011-08-06 01:22:24 +08:00
|
|
|
if (!rsv) {
|
|
|
|
btrfs_orphan_del(NULL, inode);
|
|
|
|
goto no_delete;
|
|
|
|
}
|
2011-08-29 23:01:31 +08:00
|
|
|
rsv->size = min_size;
|
2012-08-28 05:48:15 +08:00
|
|
|
rsv->failfast = 1;
|
2011-09-27 03:46:06 +08:00
|
|
|
global_rsv = &root->fs_info->global_block_rsv;
|
2011-08-06 01:22:24 +08:00
|
|
|
|
2008-07-18 00:54:05 +08:00
|
|
|
btrfs_i_size_write(inode, 0);
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2011-08-06 01:22:24 +08:00
|
|
|
/*
|
Btrfs: fix corrupted metadata in the snapshot
When we delete a inode, we will remove all the delayed items including delayed
inode update, and then truncate all the relative metadata. If there is lots of
metadata, we will end the current transaction, and start a new transaction to
truncate the left metadata. In this way, we will leave a inode item that its
link counter is > 0, and also may leave some directory index items in fs/file tree
after the current transaction ends. In other words, the metadata in this fs/file tree
is inconsistent. If we create a snapshot for this tree now, we will find a inode with
corrupted metadata in the new snapshot, and we won't continue to drop the left metadata,
because its link counter is not 0.
We fix this problem by updating the inode item before the current transaction ends.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
2012-09-07 15:43:32 +08:00
|
|
|
* This is a bit simpler than btrfs_truncate since we've already
|
|
|
|
* reserved our space for our orphan item in the unlink, so we just
|
|
|
|
* need to reserve some slack space in case we add bytes and update
|
|
|
|
* inode item when doing the truncate.
|
2011-08-06 01:22:24 +08:00
|
|
|
*/
|
2009-11-12 17:35:36 +08:00
|
|
|
while (1) {
|
Btrfs: improve the noflush reservation
In some places(such as: evicting inode), we just can not flush the reserved
space of delalloc, flushing the delayed directory index and delayed inode
is OK, but we don't try to flush those things and just go back when there is
no enough space to be reserved. This patch fixes this problem.
We defined 3 types of the flush operations: NO_FLUSH, FLUSH_LIMIT and FLUSH_ALL.
If we can in the transaction, we should not flush anything, or the deadlock
would happen, so use NO_FLUSH. If we flushing the reserved space of delalloc
would cause deadlock, use FLUSH_LIMIT. In the other cases, FLUSH_ALL is used,
and we will flush all things.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2012-10-16 19:33:38 +08:00
|
|
|
ret = btrfs_block_rsv_refill(root, rsv, min_size,
|
|
|
|
BTRFS_RESERVE_FLUSH_LIMIT);
|
2011-09-27 03:46:06 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Try and steal from the global reserve since we will
|
|
|
|
* likely not use this space anyway, we want to try as
|
|
|
|
* hard as possible to get this to work.
|
|
|
|
*/
|
|
|
|
if (ret)
|
2015-02-25 04:35:51 +08:00
|
|
|
steal_from_global++;
|
|
|
|
else
|
|
|
|
steal_from_global = 0;
|
|
|
|
ret = 0;
|
2010-05-16 22:49:58 +08:00
|
|
|
|
2015-02-25 04:35:51 +08:00
|
|
|
/*
|
|
|
|
* steal_from_global == 0: we reserved stuff, hooray!
|
|
|
|
* steal_from_global == 1: we didn't reserve stuff, boo!
|
|
|
|
* steal_from_global == 2: we've committed, still not a lot of
|
|
|
|
* room but maybe we'll have room in the global reserve this
|
|
|
|
* time.
|
|
|
|
* steal_from_global == 3: abandon all hope!
|
|
|
|
*/
|
|
|
|
if (steal_from_global > 2) {
|
2013-03-20 06:41:23 +08:00
|
|
|
btrfs_warn(root->fs_info,
|
|
|
|
"Could not get space for a delete, will truncate on mount %d",
|
|
|
|
ret);
|
2011-08-06 01:22:24 +08:00
|
|
|
btrfs_orphan_del(NULL, inode);
|
|
|
|
btrfs_free_block_rsv(root, rsv);
|
|
|
|
goto no_delete;
|
2010-05-16 22:49:58 +08:00
|
|
|
}
|
2008-07-25 00:17:14 +08:00
|
|
|
|
2012-12-19 14:59:51 +08:00
|
|
|
trans = btrfs_join_transaction(root);
|
2011-08-06 01:22:24 +08:00
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
btrfs_orphan_del(NULL, inode);
|
|
|
|
btrfs_free_block_rsv(root, rsv);
|
|
|
|
goto no_delete;
|
2010-05-16 22:49:58 +08:00
|
|
|
}
|
2008-07-25 00:17:14 +08:00
|
|
|
|
2015-02-25 04:35:51 +08:00
|
|
|
/*
|
|
|
|
* We can't just steal from the global reserve, we need tomake
|
|
|
|
* sure there is room to do it, if not we need to commit and try
|
|
|
|
* again.
|
|
|
|
*/
|
|
|
|
if (steal_from_global) {
|
|
|
|
if (!btrfs_check_space_for_delayed_refs(trans, root))
|
|
|
|
ret = btrfs_block_rsv_migrate(global_rsv, rsv,
|
|
|
|
min_size);
|
|
|
|
else
|
|
|
|
ret = -ENOSPC;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Couldn't steal from the global reserve, we have too much
|
|
|
|
* pending stuff built up, commit the transaction and try it
|
|
|
|
* again.
|
|
|
|
*/
|
|
|
|
if (ret) {
|
|
|
|
ret = btrfs_commit_transaction(trans, root);
|
|
|
|
if (ret) {
|
|
|
|
btrfs_orphan_del(NULL, inode);
|
|
|
|
btrfs_free_block_rsv(root, rsv);
|
|
|
|
goto no_delete;
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
} else {
|
|
|
|
steal_from_global = 0;
|
|
|
|
}
|
|
|
|
|
2011-08-06 01:22:24 +08:00
|
|
|
trans->block_rsv = rsv;
|
|
|
|
|
2010-05-16 22:49:58 +08:00
|
|
|
ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
|
2014-12-18 01:41:04 +08:00
|
|
|
if (ret != -ENOSPC && ret != -EAGAIN)
|
2009-11-12 17:35:36 +08:00
|
|
|
break;
|
2008-01-30 04:11:36 +08:00
|
|
|
|
Btrfs: fix corrupted metadata in the snapshot
When we delete a inode, we will remove all the delayed items including delayed
inode update, and then truncate all the relative metadata. If there is lots of
metadata, we will end the current transaction, and start a new transaction to
truncate the left metadata. In this way, we will leave a inode item that its
link counter is > 0, and also may leave some directory index items in fs/file tree
after the current transaction ends. In other words, the metadata in this fs/file tree
is inconsistent. If we create a snapshot for this tree now, we will find a inode with
corrupted metadata in the new snapshot, and we won't continue to drop the left metadata,
because its link counter is not 0.
We fix this problem by updating the inode item before the current transaction ends.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
2012-09-07 15:43:32 +08:00
|
|
|
trans->block_rsv = &root->fs_info->trans_block_rsv;
|
2009-11-12 17:35:36 +08:00
|
|
|
btrfs_end_transaction(trans, root);
|
|
|
|
trans = NULL;
|
2012-11-14 22:34:34 +08:00
|
|
|
btrfs_btree_balance_dirty(root);
|
2009-11-12 17:35:36 +08:00
|
|
|
}
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2011-08-06 01:22:24 +08:00
|
|
|
btrfs_free_block_rsv(root, rsv);
|
|
|
|
|
2013-08-14 02:10:08 +08:00
|
|
|
/*
|
|
|
|
* Errors here aren't a big deal, it just means we leave orphan items
|
|
|
|
* in the tree. They will be cleaned up on the next mount.
|
|
|
|
*/
|
2009-11-12 17:35:36 +08:00
|
|
|
if (ret == 0) {
|
2011-08-06 01:22:24 +08:00
|
|
|
trans->block_rsv = root->orphan_block_rsv;
|
2013-08-14 02:10:08 +08:00
|
|
|
btrfs_orphan_del(trans, inode);
|
|
|
|
} else {
|
|
|
|
btrfs_orphan_del(NULL, inode);
|
2009-11-12 17:35:36 +08:00
|
|
|
}
|
2007-06-23 02:16:25 +08:00
|
|
|
|
2011-08-06 01:22:24 +08:00
|
|
|
trans->block_rsv = &root->fs_info->trans_block_rsv;
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 10:06:11 +08:00
|
|
|
if (!(root == root->fs_info->tree_root ||
|
|
|
|
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
|
2011-04-20 10:31:50 +08:00
|
|
|
btrfs_return_ino(root, btrfs_ino(inode));
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 10:06:11 +08:00
|
|
|
|
2007-06-23 02:16:25 +08:00
|
|
|
btrfs_end_transaction(trans, root);
|
2012-11-14 22:34:34 +08:00
|
|
|
btrfs_btree_balance_dirty(root);
|
2007-06-12 18:35:45 +08:00
|
|
|
no_delete:
|
2013-05-15 15:48:15 +08:00
|
|
|
btrfs_remove_delayed_node(inode);
|
2012-05-03 20:48:02 +08:00
|
|
|
clear_inode(inode);
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* this returns the key found in the dir entry in the location pointer.
|
|
|
|
* If no dir entries were found, location->objectid is 0.
|
|
|
|
*/
|
|
|
|
static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
|
|
|
|
struct btrfs_key *location)
|
|
|
|
{
|
|
|
|
const char *name = dentry->d_name.name;
|
|
|
|
int namelen = dentry->d_name.len;
|
|
|
|
struct btrfs_dir_item *di;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
2007-10-26 03:48:28 +08:00
|
|
|
int ret = 0;
|
2007-06-12 18:35:45 +08:00
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-14 01:38:47 +08:00
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
2007-12-13 03:38:19 +08:00
|
|
|
|
2011-04-20 10:31:50 +08:00
|
|
|
di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
|
2007-06-12 18:35:45 +08:00
|
|
|
namelen, 0);
|
2007-10-26 03:48:28 +08:00
|
|
|
if (IS_ERR(di))
|
|
|
|
ret = PTR_ERR(di);
|
2009-01-06 10:25:51 +08:00
|
|
|
|
2011-04-20 00:00:01 +08:00
|
|
|
if (IS_ERR_OR_NULL(di))
|
2007-12-13 03:38:19 +08:00
|
|
|
goto out_err;
|
2009-01-06 10:25:51 +08:00
|
|
|
|
2007-10-16 04:14:19 +08:00
|
|
|
btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
|
2007-06-12 18:35:45 +08:00
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
2007-12-13 03:38:19 +08:00
|
|
|
out_err:
|
|
|
|
location->objectid = 0;
|
|
|
|
goto out;
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* when we hit a tree root in a directory, the btrfs part of the inode
|
|
|
|
* needs to be changed to reflect the root directory of the tree root. This
|
|
|
|
* is kind of like crossing a mount point.
|
|
|
|
*/
|
|
|
|
static int fixup_tree_root_location(struct btrfs_root *root,
|
2009-09-22 03:56:00 +08:00
|
|
|
struct inode *dir,
|
|
|
|
struct dentry *dentry,
|
|
|
|
struct btrfs_key *location,
|
|
|
|
struct btrfs_root **sub_root)
|
2007-06-12 18:35:45 +08:00
|
|
|
{
|
2009-09-22 03:56:00 +08:00
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_root *new_root;
|
|
|
|
struct btrfs_root_ref *ref;
|
|
|
|
struct extent_buffer *leaf;
|
2015-01-03 02:36:14 +08:00
|
|
|
struct btrfs_key key;
|
2009-09-22 03:56:00 +08:00
|
|
|
int ret;
|
|
|
|
int err = 0;
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2009-09-22 03:56:00 +08:00
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path) {
|
|
|
|
err = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2009-09-22 03:56:00 +08:00
|
|
|
err = -ENOENT;
|
2015-01-03 02:36:14 +08:00
|
|
|
key.objectid = BTRFS_I(dir)->root->root_key.objectid;
|
|
|
|
key.type = BTRFS_ROOT_REF_KEY;
|
|
|
|
key.offset = location->objectid;
|
|
|
|
|
|
|
|
ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, path,
|
|
|
|
0, 0);
|
2009-09-22 03:56:00 +08:00
|
|
|
if (ret) {
|
|
|
|
if (ret < 0)
|
|
|
|
err = ret;
|
|
|
|
goto out;
|
|
|
|
}
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2009-09-22 03:56:00 +08:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
|
2011-04-20 10:31:50 +08:00
|
|
|
if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
|
2009-09-22 03:56:00 +08:00
|
|
|
btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
|
|
|
|
goto out;
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2009-09-22 03:56:00 +08:00
|
|
|
ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
|
|
|
|
(unsigned long)(ref + 1),
|
|
|
|
dentry->d_name.len);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
2011-04-21 07:20:15 +08:00
|
|
|
btrfs_release_path(path);
|
2009-09-22 03:56:00 +08:00
|
|
|
|
|
|
|
new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
|
|
|
|
if (IS_ERR(new_root)) {
|
|
|
|
err = PTR_ERR(new_root);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
*sub_root = new_root;
|
|
|
|
location->objectid = btrfs_root_dirid(&new_root->root_item);
|
|
|
|
location->type = BTRFS_INODE_ITEM_KEY;
|
|
|
|
location->offset = 0;
|
|
|
|
err = 0;
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return err;
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
static void inode_tree_add(struct inode *inode)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_inode *entry;
|
2009-08-21 16:09:44 +08:00
|
|
|
struct rb_node **p;
|
|
|
|
struct rb_node *parent;
|
2013-09-02 19:19:13 +08:00
|
|
|
struct rb_node *new = &BTRFS_I(inode)->rb_node;
|
2011-04-20 10:31:50 +08:00
|
|
|
u64 ino = btrfs_ino(inode);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
2010-10-24 03:19:20 +08:00
|
|
|
if (inode_unhashed(inode))
|
2009-09-22 04:00:26 +08:00
|
|
|
return;
|
2013-05-15 15:48:16 +08:00
|
|
|
parent = NULL;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
spin_lock(&root->inode_lock);
|
2013-05-15 15:48:16 +08:00
|
|
|
p = &root->inode_tree.rb_node;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
while (*p) {
|
|
|
|
parent = *p;
|
|
|
|
entry = rb_entry(parent, struct btrfs_inode, rb_node);
|
|
|
|
|
2011-04-20 10:31:50 +08:00
|
|
|
if (ino < btrfs_ino(&entry->vfs_inode))
|
2009-08-21 16:09:44 +08:00
|
|
|
p = &parent->rb_left;
|
2011-04-20 10:31:50 +08:00
|
|
|
else if (ino > btrfs_ino(&entry->vfs_inode))
|
2009-08-21 16:09:44 +08:00
|
|
|
p = &parent->rb_right;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
else {
|
|
|
|
WARN_ON(!(entry->vfs_inode.i_state &
|
2010-06-03 05:38:30 +08:00
|
|
|
(I_WILL_FREE | I_FREEING)));
|
2013-09-02 19:19:13 +08:00
|
|
|
rb_replace_node(parent, new, &root->inode_tree);
|
2009-08-21 16:09:44 +08:00
|
|
|
RB_CLEAR_NODE(parent);
|
|
|
|
spin_unlock(&root->inode_lock);
|
2013-09-02 19:19:13 +08:00
|
|
|
return;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
}
|
|
|
|
}
|
2013-09-02 19:19:13 +08:00
|
|
|
rb_link_node(new, parent, p);
|
|
|
|
rb_insert_color(new, &root->inode_tree);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
spin_unlock(&root->inode_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void inode_tree_del(struct inode *inode)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2009-09-22 04:00:26 +08:00
|
|
|
int empty = 0;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
|
2009-08-21 16:09:44 +08:00
|
|
|
spin_lock(&root->inode_lock);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
|
|
|
|
rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
|
|
|
|
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
|
2009-09-22 04:00:26 +08:00
|
|
|
empty = RB_EMPTY_ROOT(&root->inode_tree);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
}
|
2009-08-21 16:09:44 +08:00
|
|
|
spin_unlock(&root->inode_lock);
|
2009-09-22 04:00:26 +08:00
|
|
|
|
2013-09-05 22:58:43 +08:00
|
|
|
if (empty && btrfs_root_refs(&root->root_item) == 0) {
|
2009-09-22 04:00:26 +08:00
|
|
|
synchronize_srcu(&root->fs_info->subvol_srcu);
|
|
|
|
spin_lock(&root->inode_lock);
|
|
|
|
empty = RB_EMPTY_ROOT(&root->inode_tree);
|
|
|
|
spin_unlock(&root->inode_lock);
|
|
|
|
if (empty)
|
|
|
|
btrfs_add_dead_root(root);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-03-01 21:56:26 +08:00
|
|
|
void btrfs_invalidate_inodes(struct btrfs_root *root)
|
2009-09-22 04:00:26 +08:00
|
|
|
{
|
|
|
|
struct rb_node *node;
|
|
|
|
struct rb_node *prev;
|
|
|
|
struct btrfs_inode *entry;
|
|
|
|
struct inode *inode;
|
|
|
|
u64 objectid = 0;
|
|
|
|
|
2014-02-10 17:37:25 +08:00
|
|
|
if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
|
|
|
|
WARN_ON(btrfs_root_refs(&root->root_item) != 0);
|
2009-09-22 04:00:26 +08:00
|
|
|
|
|
|
|
spin_lock(&root->inode_lock);
|
|
|
|
again:
|
|
|
|
node = root->inode_tree.rb_node;
|
|
|
|
prev = NULL;
|
|
|
|
while (node) {
|
|
|
|
prev = node;
|
|
|
|
entry = rb_entry(node, struct btrfs_inode, rb_node);
|
|
|
|
|
2011-04-20 10:31:50 +08:00
|
|
|
if (objectid < btrfs_ino(&entry->vfs_inode))
|
2009-09-22 04:00:26 +08:00
|
|
|
node = node->rb_left;
|
2011-04-20 10:31:50 +08:00
|
|
|
else if (objectid > btrfs_ino(&entry->vfs_inode))
|
2009-09-22 04:00:26 +08:00
|
|
|
node = node->rb_right;
|
|
|
|
else
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (!node) {
|
|
|
|
while (prev) {
|
|
|
|
entry = rb_entry(prev, struct btrfs_inode, rb_node);
|
2011-04-20 10:31:50 +08:00
|
|
|
if (objectid <= btrfs_ino(&entry->vfs_inode)) {
|
2009-09-22 04:00:26 +08:00
|
|
|
node = prev;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
prev = rb_next(prev);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
while (node) {
|
|
|
|
entry = rb_entry(node, struct btrfs_inode, rb_node);
|
2011-04-20 10:31:50 +08:00
|
|
|
objectid = btrfs_ino(&entry->vfs_inode) + 1;
|
2009-09-22 04:00:26 +08:00
|
|
|
inode = igrab(&entry->vfs_inode);
|
|
|
|
if (inode) {
|
|
|
|
spin_unlock(&root->inode_lock);
|
|
|
|
if (atomic_read(&inode->i_count) > 1)
|
|
|
|
d_prune_aliases(inode);
|
|
|
|
/*
|
2010-06-08 01:43:19 +08:00
|
|
|
* btrfs_drop_inode will have it removed from
|
2009-09-22 04:00:26 +08:00
|
|
|
* the inode cache when its usage count
|
|
|
|
* hits zero.
|
|
|
|
*/
|
|
|
|
iput(inode);
|
|
|
|
cond_resched();
|
|
|
|
spin_lock(&root->inode_lock);
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (cond_resched_lock(&root->inode_lock))
|
|
|
|
goto again;
|
|
|
|
|
|
|
|
node = rb_next(node);
|
|
|
|
}
|
|
|
|
spin_unlock(&root->inode_lock);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
}
|
|
|
|
|
2008-09-06 04:13:11 +08:00
|
|
|
static int btrfs_init_locked_inode(struct inode *inode, void *p)
|
|
|
|
{
|
|
|
|
struct btrfs_iget_args *args = p;
|
2014-01-10 09:28:00 +08:00
|
|
|
inode->i_ino = args->location->objectid;
|
|
|
|
memcpy(&BTRFS_I(inode)->location, args->location,
|
|
|
|
sizeof(*args->location));
|
2008-09-06 04:13:11 +08:00
|
|
|
BTRFS_I(inode)->root = args->root;
|
2007-06-12 18:35:45 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int btrfs_find_actor(struct inode *inode, void *opaque)
|
|
|
|
{
|
|
|
|
struct btrfs_iget_args *args = opaque;
|
2014-01-10 09:28:00 +08:00
|
|
|
return args->location->objectid == BTRFS_I(inode)->location.objectid &&
|
2009-01-06 10:25:51 +08:00
|
|
|
args->root == BTRFS_I(inode)->root;
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
static struct inode *btrfs_iget_locked(struct super_block *s,
|
2014-01-10 09:28:00 +08:00
|
|
|
struct btrfs_key *location,
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
struct btrfs_root *root)
|
2007-06-12 18:35:45 +08:00
|
|
|
{
|
|
|
|
struct inode *inode;
|
|
|
|
struct btrfs_iget_args args;
|
2014-01-10 09:28:00 +08:00
|
|
|
unsigned long hashval = btrfs_inode_hash(location->objectid, root);
|
Btrfs: improve inode hash function/inode lookup
Currently the hash value used for adding an inode to the VFS's inode
hash table consists of the plain inode number, which is a 64 bits
integer. This results in hash table buckets (hlist_head lists) with
too many elements for at least 2 important scenarios:
1) When we have many subvolumes. Each subvolume has its own btree
where its files and directories are added to, and each has its
own objectid (inode number) namespace. This means that if we have
N subvolumes, and all have inode number X associated to a file or
directory, the corresponding inodes all map to the same hash table
entry, resulting in a bucket (hlist_head list) with N elements;
2) On 32 bits machines. Th VFS hash values are unsigned longs, which
are 32 bits wide on 32 bits machines, and the inode (objectid)
numbers are 64 bits unsigned integers. We simply cast the inode
numbers to hash values, which means that for all inodes with the
same 32 bits lower half, the same hash bucket is used for all of
them. For example, all inodes with a number (objectid) between
0x0000_0000_ffff_ffff and 0xffff_ffff_ffff_ffff will end up in
the same hash table bucket.
This change ensures the inode's hash value depends both on the
objectid (inode number) and its subvolume's (btree root) objectid.
For 32 bits machines, this change gives better entropy by making
the hash value depend on both the upper and lower 32 bits of the
64 bits hash previously computed.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-10-07 05:22:33 +08:00
|
|
|
|
2014-01-10 09:28:00 +08:00
|
|
|
args.location = location;
|
2007-06-12 18:35:45 +08:00
|
|
|
args.root = root;
|
|
|
|
|
Btrfs: improve inode hash function/inode lookup
Currently the hash value used for adding an inode to the VFS's inode
hash table consists of the plain inode number, which is a 64 bits
integer. This results in hash table buckets (hlist_head lists) with
too many elements for at least 2 important scenarios:
1) When we have many subvolumes. Each subvolume has its own btree
where its files and directories are added to, and each has its
own objectid (inode number) namespace. This means that if we have
N subvolumes, and all have inode number X associated to a file or
directory, the corresponding inodes all map to the same hash table
entry, resulting in a bucket (hlist_head list) with N elements;
2) On 32 bits machines. Th VFS hash values are unsigned longs, which
are 32 bits wide on 32 bits machines, and the inode (objectid)
numbers are 64 bits unsigned integers. We simply cast the inode
numbers to hash values, which means that for all inodes with the
same 32 bits lower half, the same hash bucket is used for all of
them. For example, all inodes with a number (objectid) between
0x0000_0000_ffff_ffff and 0xffff_ffff_ffff_ffff will end up in
the same hash table bucket.
This change ensures the inode's hash value depends both on the
objectid (inode number) and its subvolume's (btree root) objectid.
For 32 bits machines, this change gives better entropy by making
the hash value depend on both the upper and lower 32 bits of the
64 bits hash previously computed.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-10-07 05:22:33 +08:00
|
|
|
inode = iget5_locked(s, hashval, btrfs_find_actor,
|
2007-06-12 18:35:45 +08:00
|
|
|
btrfs_init_locked_inode,
|
|
|
|
(void *)&args);
|
|
|
|
return inode;
|
|
|
|
}
|
|
|
|
|
2008-07-21 04:31:04 +08:00
|
|
|
/* Get an inode object given its location and corresponding root.
|
|
|
|
* Returns in *is_new if the inode was read from disk
|
|
|
|
*/
|
|
|
|
struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
|
Btrfs: change how we mount subvolumes
This work is in preperation for being able to set a different root as the
default mounting root.
There is currently a problem with how we mount subvolumes. We cannot currently
mount a subvolume of a subvolume, you can only mount subvolumes/snapshots of the
default subvolume. So say you take a snapshot of the default subvolume and call
it snap1, and then take a snapshot of snap1 and call it snap2, so now you have
/
/snap1
/snap1/snap2
as your available volumes. Currently you can only mount / and /snap1,
you cannot mount /snap1/snap2. To fix this problem instead of passing
subvolid=<name> you must pass in subvolid=<treeid>, where <treeid> is
the tree id that gets spit out via the subvolume listing you get from
the subvolume listing patches (btrfs filesystem list). This allows us
to mount /, /snap1 and /snap1/snap2 as the root volume.
In addition to the above, we also now read the default dir item in the
tree root to get the root key that it points to. For now this just
points at what has always been the default subvolme, but later on I plan
to change it to point at whatever root you want to be the new default
root, so you can just set the default mount and not have to mount with
-o subvolid=<treeid>. I tested this out with the above scenario and it
worked perfectly. Thanks,
mount -o subvol operates inside the selected subvolid. For example:
mount -o subvol=snap1,subvolid=256 /dev/xxx /mnt
/mnt will have the snap1 directory for the subvolume with id
256.
mount -o subvol=snap /dev/xxx /mnt
/mnt will be the snap directory of whatever the default subvolume
is.
Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-12-05 01:38:27 +08:00
|
|
|
struct btrfs_root *root, int *new)
|
2008-07-21 04:31:04 +08:00
|
|
|
{
|
|
|
|
struct inode *inode;
|
|
|
|
|
2014-01-10 09:28:00 +08:00
|
|
|
inode = btrfs_iget_locked(s, location, root);
|
2008-07-21 04:31:04 +08:00
|
|
|
if (!inode)
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
return ERR_PTR(-ENOMEM);
|
2008-07-21 04:31:04 +08:00
|
|
|
|
|
|
|
if (inode->i_state & I_NEW) {
|
|
|
|
btrfs_read_locked_inode(inode);
|
2011-07-13 02:25:31 +08:00
|
|
|
if (!is_bad_inode(inode)) {
|
|
|
|
inode_tree_add(inode);
|
|
|
|
unlock_new_inode(inode);
|
|
|
|
if (new)
|
|
|
|
*new = 1;
|
|
|
|
} else {
|
2011-09-11 22:52:24 +08:00
|
|
|
unlock_new_inode(inode);
|
|
|
|
iput(inode);
|
|
|
|
inode = ERR_PTR(-ESTALE);
|
2011-07-13 02:25:31 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-07-21 04:31:04 +08:00
|
|
|
return inode;
|
|
|
|
}
|
|
|
|
|
2009-09-22 03:56:00 +08:00
|
|
|
static struct inode *new_simple_dir(struct super_block *s,
|
|
|
|
struct btrfs_key *key,
|
|
|
|
struct btrfs_root *root)
|
|
|
|
{
|
|
|
|
struct inode *inode = new_inode(s);
|
|
|
|
|
|
|
|
if (!inode)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
BTRFS_I(inode)->root = root;
|
|
|
|
memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
|
2012-05-24 02:13:11 +08:00
|
|
|
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
|
2009-09-22 03:56:00 +08:00
|
|
|
|
|
|
|
inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
|
2012-02-21 17:04:28 +08:00
|
|
|
inode->i_op = &btrfs_dir_ro_inode_operations;
|
2009-09-22 03:56:00 +08:00
|
|
|
inode->i_fop = &simple_dir_operations;
|
|
|
|
inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
|
2012-07-04 15:18:07 +08:00
|
|
|
inode->i_mtime = CURRENT_TIME;
|
|
|
|
inode->i_atime = inode->i_mtime;
|
|
|
|
inode->i_ctime = inode->i_mtime;
|
|
|
|
BTRFS_I(inode)->i_otime = inode->i_mtime;
|
2009-09-22 03:56:00 +08:00
|
|
|
|
|
|
|
return inode;
|
|
|
|
}
|
|
|
|
|
2008-11-18 10:02:50 +08:00
|
|
|
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
|
2007-06-12 18:35:45 +08:00
|
|
|
{
|
2009-01-06 10:25:51 +08:00
|
|
|
struct inode *inode;
|
2009-09-22 03:56:00 +08:00
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
2007-06-12 18:35:45 +08:00
|
|
|
struct btrfs_root *sub_root = root;
|
|
|
|
struct btrfs_key location;
|
2009-09-22 04:00:26 +08:00
|
|
|
int index;
|
2011-06-29 04:18:59 +08:00
|
|
|
int ret = 0;
|
2007-06-12 18:35:45 +08:00
|
|
|
|
|
|
|
if (dentry->d_name.len > BTRFS_NAME_LEN)
|
|
|
|
return ERR_PTR(-ENAMETOOLONG);
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2012-11-29 00:30:53 +08:00
|
|
|
ret = btrfs_inode_by_name(dir, dentry, &location);
|
2007-06-12 18:35:45 +08:00
|
|
|
if (ret < 0)
|
|
|
|
return ERR_PTR(ret);
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2009-09-22 03:56:00 +08:00
|
|
|
if (location.objectid == 0)
|
2013-12-13 08:51:42 +08:00
|
|
|
return ERR_PTR(-ENOENT);
|
2009-09-22 03:56:00 +08:00
|
|
|
|
|
|
|
if (location.type == BTRFS_INODE_ITEM_KEY) {
|
Btrfs: change how we mount subvolumes
This work is in preperation for being able to set a different root as the
default mounting root.
There is currently a problem with how we mount subvolumes. We cannot currently
mount a subvolume of a subvolume, you can only mount subvolumes/snapshots of the
default subvolume. So say you take a snapshot of the default subvolume and call
it snap1, and then take a snapshot of snap1 and call it snap2, so now you have
/
/snap1
/snap1/snap2
as your available volumes. Currently you can only mount / and /snap1,
you cannot mount /snap1/snap2. To fix this problem instead of passing
subvolid=<name> you must pass in subvolid=<treeid>, where <treeid> is
the tree id that gets spit out via the subvolume listing you get from
the subvolume listing patches (btrfs filesystem list). This allows us
to mount /, /snap1 and /snap1/snap2 as the root volume.
In addition to the above, we also now read the default dir item in the
tree root to get the root key that it points to. For now this just
points at what has always been the default subvolme, but later on I plan
to change it to point at whatever root you want to be the new default
root, so you can just set the default mount and not have to mount with
-o subvolid=<treeid>. I tested this out with the above scenario and it
worked perfectly. Thanks,
mount -o subvol operates inside the selected subvolid. For example:
mount -o subvol=snap1,subvolid=256 /dev/xxx /mnt
/mnt will have the snap1 directory for the subvolume with id
256.
mount -o subvol=snap /dev/xxx /mnt
/mnt will be the snap directory of whatever the default subvolume
is.
Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-12-05 01:38:27 +08:00
|
|
|
inode = btrfs_iget(dir->i_sb, &location, root, NULL);
|
2009-09-22 03:56:00 +08:00
|
|
|
return inode;
|
|
|
|
}
|
|
|
|
|
|
|
|
BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
|
|
|
|
|
2009-09-22 04:00:26 +08:00
|
|
|
index = srcu_read_lock(&root->fs_info->subvol_srcu);
|
2009-09-22 03:56:00 +08:00
|
|
|
ret = fixup_tree_root_location(root, dir, dentry,
|
|
|
|
&location, &sub_root);
|
|
|
|
if (ret < 0) {
|
|
|
|
if (ret != -ENOENT)
|
|
|
|
inode = ERR_PTR(ret);
|
|
|
|
else
|
|
|
|
inode = new_simple_dir(dir->i_sb, &location, sub_root);
|
|
|
|
} else {
|
Btrfs: change how we mount subvolumes
This work is in preperation for being able to set a different root as the
default mounting root.
There is currently a problem with how we mount subvolumes. We cannot currently
mount a subvolume of a subvolume, you can only mount subvolumes/snapshots of the
default subvolume. So say you take a snapshot of the default subvolume and call
it snap1, and then take a snapshot of snap1 and call it snap2, so now you have
/
/snap1
/snap1/snap2
as your available volumes. Currently you can only mount / and /snap1,
you cannot mount /snap1/snap2. To fix this problem instead of passing
subvolid=<name> you must pass in subvolid=<treeid>, where <treeid> is
the tree id that gets spit out via the subvolume listing you get from
the subvolume listing patches (btrfs filesystem list). This allows us
to mount /, /snap1 and /snap1/snap2 as the root volume.
In addition to the above, we also now read the default dir item in the
tree root to get the root key that it points to. For now this just
points at what has always been the default subvolme, but later on I plan
to change it to point at whatever root you want to be the new default
root, so you can just set the default mount and not have to mount with
-o subvolid=<treeid>. I tested this out with the above scenario and it
worked perfectly. Thanks,
mount -o subvol operates inside the selected subvolid. For example:
mount -o subvol=snap1,subvolid=256 /dev/xxx /mnt
/mnt will have the snap1 directory for the subvolume with id
256.
mount -o subvol=snap /dev/xxx /mnt
/mnt will be the snap directory of whatever the default subvolume
is.
Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-12-05 01:38:27 +08:00
|
|
|
inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
2009-09-22 04:00:26 +08:00
|
|
|
srcu_read_unlock(&root->fs_info->subvol_srcu, index);
|
|
|
|
|
2011-01-25 03:55:19 +08:00
|
|
|
if (!IS_ERR(inode) && root != sub_root) {
|
2009-11-12 17:34:40 +08:00
|
|
|
down_read(&root->fs_info->cleanup_work_sem);
|
|
|
|
if (!(inode->i_sb->s_flags & MS_RDONLY))
|
2011-02-01 05:22:42 +08:00
|
|
|
ret = btrfs_orphan_cleanup(sub_root);
|
2009-11-12 17:34:40 +08:00
|
|
|
up_read(&root->fs_info->cleanup_work_sem);
|
2013-06-04 09:39:49 +08:00
|
|
|
if (ret) {
|
|
|
|
iput(inode);
|
2011-02-01 05:22:42 +08:00
|
|
|
inode = ERR_PTR(ret);
|
2013-06-04 09:39:49 +08:00
|
|
|
}
|
2009-11-12 17:34:40 +08:00
|
|
|
}
|
|
|
|
|
2008-11-18 10:02:50 +08:00
|
|
|
return inode;
|
|
|
|
}
|
|
|
|
|
2011-01-07 14:49:23 +08:00
|
|
|
static int btrfs_dentry_delete(const struct dentry *dentry)
|
2009-09-22 04:00:26 +08:00
|
|
|
{
|
|
|
|
struct btrfs_root *root;
|
2015-03-18 06:25:59 +08:00
|
|
|
struct inode *inode = d_inode(dentry);
|
2009-09-22 04:00:26 +08:00
|
|
|
|
2012-02-21 17:04:28 +08:00
|
|
|
if (!inode && !IS_ROOT(dentry))
|
2015-03-18 06:25:59 +08:00
|
|
|
inode = d_inode(dentry->d_parent);
|
2009-09-22 04:00:26 +08:00
|
|
|
|
2012-02-21 17:04:28 +08:00
|
|
|
if (inode) {
|
|
|
|
root = BTRFS_I(inode)->root;
|
2009-10-09 21:25:16 +08:00
|
|
|
if (btrfs_root_refs(&root->root_item) == 0)
|
|
|
|
return 1;
|
2012-02-21 17:04:28 +08:00
|
|
|
|
|
|
|
if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
|
|
|
|
return 1;
|
2009-10-09 21:25:16 +08:00
|
|
|
}
|
2009-09-22 04:00:26 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2011-06-29 04:18:59 +08:00
|
|
|
static void btrfs_dentry_release(struct dentry *dentry)
|
|
|
|
{
|
2014-04-14 14:37:02 +08:00
|
|
|
kfree(dentry->d_fsdata);
|
2011-06-29 04:18:59 +08:00
|
|
|
}
|
|
|
|
|
2008-11-18 10:02:50 +08:00
|
|
|
static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
|
2012-06-11 05:13:09 +08:00
|
|
|
unsigned int flags)
|
2008-11-18 10:02:50 +08:00
|
|
|
{
|
2013-12-13 08:51:42 +08:00
|
|
|
struct inode *inode;
|
2011-09-18 22:34:03 +08:00
|
|
|
|
2013-12-13 08:51:42 +08:00
|
|
|
inode = btrfs_lookup_dentry(dir, dentry);
|
|
|
|
if (IS_ERR(inode)) {
|
|
|
|
if (PTR_ERR(inode) == -ENOENT)
|
|
|
|
inode = NULL;
|
|
|
|
else
|
|
|
|
return ERR_CAST(inode);
|
|
|
|
}
|
|
|
|
|
2014-10-13 10:24:21 +08:00
|
|
|
return d_splice_alias(inode, dentry);
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
|
|
|
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 18:12:22 +08:00
|
|
|
unsigned char btrfs_filetype_table[] = {
|
2007-06-12 18:35:45 +08:00
|
|
|
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
|
|
|
|
};
|
|
|
|
|
2013-05-23 04:48:09 +08:00
|
|
|
static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
|
2007-06-12 18:35:45 +08:00
|
|
|
{
|
2013-05-23 04:48:09 +08:00
|
|
|
struct inode *inode = file_inode(file);
|
2007-06-12 18:35:45 +08:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_item *item;
|
|
|
|
struct btrfs_dir_item *di;
|
|
|
|
struct btrfs_key key;
|
2007-10-16 04:14:19 +08:00
|
|
|
struct btrfs_key found_key;
|
2007-06-12 18:35:45 +08:00
|
|
|
struct btrfs_path *path;
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 18:12:22 +08:00
|
|
|
struct list_head ins_list;
|
|
|
|
struct list_head del_list;
|
2007-06-12 18:35:45 +08:00
|
|
|
int ret;
|
2007-10-16 04:14:19 +08:00
|
|
|
struct extent_buffer *leaf;
|
2007-06-12 18:35:45 +08:00
|
|
|
int slot;
|
|
|
|
unsigned char d_type;
|
|
|
|
int over = 0;
|
|
|
|
u32 di_cur;
|
|
|
|
u32 di_total;
|
|
|
|
u32 di_len;
|
|
|
|
int key_type = BTRFS_DIR_INDEX_KEY;
|
2007-10-16 04:14:19 +08:00
|
|
|
char tmp_name[32];
|
|
|
|
char *name_ptr;
|
|
|
|
int name_len;
|
2013-05-23 04:48:09 +08:00
|
|
|
int is_curr = 0; /* ctx->pos points to the current index? */
|
2007-06-12 18:35:45 +08:00
|
|
|
|
|
|
|
/* FIXME, use a real flag for deciding about the key type */
|
|
|
|
if (root->fs_info->tree_root == root)
|
|
|
|
key_type = BTRFS_DIR_ITEM_KEY;
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2013-05-23 04:48:09 +08:00
|
|
|
if (!dir_emit_dots(file, ctx))
|
|
|
|
return 0;
|
|
|
|
|
2008-08-18 00:08:36 +08:00
|
|
|
path = btrfs_alloc_path();
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 18:12:22 +08:00
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
2011-05-28 19:00:39 +08:00
|
|
|
|
2015-11-27 23:31:35 +08:00
|
|
|
path->reada = READA_FORWARD;
|
2008-08-18 00:08:36 +08:00
|
|
|
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 18:12:22 +08:00
|
|
|
if (key_type == BTRFS_DIR_INDEX_KEY) {
|
|
|
|
INIT_LIST_HEAD(&ins_list);
|
|
|
|
INIT_LIST_HEAD(&del_list);
|
|
|
|
btrfs_get_delayed_items(inode, &ins_list, &del_list);
|
|
|
|
}
|
|
|
|
|
2014-06-05 00:41:45 +08:00
|
|
|
key.type = key_type;
|
2013-05-23 04:48:09 +08:00
|
|
|
key.offset = ctx->pos;
|
2011-04-20 10:31:50 +08:00
|
|
|
key.objectid = btrfs_ino(inode);
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto err;
|
2008-08-18 00:08:36 +08:00
|
|
|
|
|
|
|
while (1) {
|
2007-10-16 04:14:19 +08:00
|
|
|
leaf = path->nodes[0];
|
2007-06-12 18:35:45 +08:00
|
|
|
slot = path->slots[0];
|
2011-03-23 10:43:58 +08:00
|
|
|
if (slot >= btrfs_header_nritems(leaf)) {
|
|
|
|
ret = btrfs_next_leaf(root, path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto err;
|
|
|
|
else if (ret > 0)
|
|
|
|
break;
|
|
|
|
continue;
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
2008-11-18 10:02:50 +08:00
|
|
|
|
2013-09-16 22:58:09 +08:00
|
|
|
item = btrfs_item_nr(slot);
|
2007-10-16 04:14:19 +08:00
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, slot);
|
|
|
|
|
|
|
|
if (found_key.objectid != key.objectid)
|
2007-06-12 18:35:45 +08:00
|
|
|
break;
|
2014-06-05 00:41:45 +08:00
|
|
|
if (found_key.type != key_type)
|
2007-06-12 18:35:45 +08:00
|
|
|
break;
|
2013-05-23 04:48:09 +08:00
|
|
|
if (found_key.offset < ctx->pos)
|
2011-03-23 10:43:58 +08:00
|
|
|
goto next;
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 18:12:22 +08:00
|
|
|
if (key_type == BTRFS_DIR_INDEX_KEY &&
|
|
|
|
btrfs_should_delete_dir_index(&del_list,
|
|
|
|
found_key.offset))
|
|
|
|
goto next;
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2013-05-23 04:48:09 +08:00
|
|
|
ctx->pos = found_key.offset;
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 18:12:22 +08:00
|
|
|
is_curr = 1;
|
2008-08-18 00:08:36 +08:00
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
|
|
|
|
di_cur = 0;
|
2007-10-16 04:14:19 +08:00
|
|
|
di_total = btrfs_item_size(leaf, item);
|
2008-08-18 00:08:36 +08:00
|
|
|
|
|
|
|
while (di_cur < di_total) {
|
2007-10-16 04:14:19 +08:00
|
|
|
struct btrfs_key location;
|
|
|
|
|
2011-03-17 04:47:17 +08:00
|
|
|
if (verify_dir_item(root, leaf, di))
|
|
|
|
break;
|
|
|
|
|
2007-10-16 04:14:19 +08:00
|
|
|
name_len = btrfs_dir_name_len(leaf, di);
|
2008-08-18 00:08:36 +08:00
|
|
|
if (name_len <= sizeof(tmp_name)) {
|
2007-10-16 04:14:19 +08:00
|
|
|
name_ptr = tmp_name;
|
|
|
|
} else {
|
|
|
|
name_ptr = kmalloc(name_len, GFP_NOFS);
|
2008-08-18 00:08:36 +08:00
|
|
|
if (!name_ptr) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto err;
|
|
|
|
}
|
2007-10-16 04:14:19 +08:00
|
|
|
}
|
|
|
|
read_extent_buffer(leaf, name_ptr,
|
|
|
|
(unsigned long)(di + 1), name_len);
|
|
|
|
|
|
|
|
d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
|
|
|
|
btrfs_dir_item_key_to_cpu(leaf, di, &location);
|
2008-11-18 10:02:50 +08:00
|
|
|
|
2012-04-28 02:23:22 +08:00
|
|
|
|
2008-11-18 10:02:50 +08:00
|
|
|
/* is this a reference to our own snapshot? If so
|
2012-02-25 16:09:30 +08:00
|
|
|
* skip it.
|
|
|
|
*
|
|
|
|
* In contrast to old kernels, we insert the snapshot's
|
|
|
|
* dir item and dir index after it has been created, so
|
|
|
|
* we won't find a reference to our own snapshot. We
|
|
|
|
* still keep the following code for backward
|
|
|
|
* compatibility.
|
2008-11-18 10:02:50 +08:00
|
|
|
*/
|
|
|
|
if (location.type == BTRFS_ROOT_ITEM_KEY &&
|
|
|
|
location.objectid == root->root_key.objectid) {
|
|
|
|
over = 0;
|
|
|
|
goto skip;
|
|
|
|
}
|
2013-05-23 04:48:09 +08:00
|
|
|
over = !dir_emit(ctx, name_ptr, name_len,
|
|
|
|
location.objectid, d_type);
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2008-11-18 10:02:50 +08:00
|
|
|
skip:
|
2007-10-16 04:14:19 +08:00
|
|
|
if (name_ptr != tmp_name)
|
|
|
|
kfree(name_ptr);
|
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
if (over)
|
|
|
|
goto nopos;
|
2007-11-17 00:45:54 +08:00
|
|
|
di_len = btrfs_dir_name_len(leaf, di) +
|
2008-08-18 00:08:36 +08:00
|
|
|
btrfs_dir_data_len(leaf, di) + sizeof(*di);
|
2007-06-12 18:35:45 +08:00
|
|
|
di_cur += di_len;
|
|
|
|
di = (struct btrfs_dir_item *)((char *)di + di_len);
|
|
|
|
}
|
2011-03-23 10:43:58 +08:00
|
|
|
next:
|
|
|
|
path->slots[0]++;
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
2008-08-18 00:08:36 +08:00
|
|
|
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 18:12:22 +08:00
|
|
|
if (key_type == BTRFS_DIR_INDEX_KEY) {
|
|
|
|
if (is_curr)
|
2013-05-23 04:48:09 +08:00
|
|
|
ctx->pos++;
|
|
|
|
ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 18:12:22 +08:00
|
|
|
if (ret)
|
|
|
|
goto nopos;
|
|
|
|
}
|
|
|
|
|
2008-08-18 00:08:36 +08:00
|
|
|
/* Reached end of directory/root. Bump pos past the last item. */
|
2013-07-12 07:19:42 +08:00
|
|
|
ctx->pos++;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Stop new entries from being returned after we return the last
|
|
|
|
* entry.
|
|
|
|
*
|
|
|
|
* New directory entries are assigned a strictly increasing
|
|
|
|
* offset. This means that new entries created during readdir
|
|
|
|
* are *guaranteed* to be seen in the future by that readdir.
|
|
|
|
* This has broken buggy programs which operate on names as
|
|
|
|
* they're returned by readdir. Until we re-use freed offsets
|
|
|
|
* we have this hack to stop new entries from being returned
|
|
|
|
* under the assumption that they'll never reach this huge
|
|
|
|
* offset.
|
|
|
|
*
|
|
|
|
* This is being careful not to overflow 32bit loff_t unless the
|
|
|
|
* last entry requires it because doing so has broken 32bit apps
|
|
|
|
* in the past.
|
|
|
|
*/
|
|
|
|
if (key_type == BTRFS_DIR_INDEX_KEY) {
|
|
|
|
if (ctx->pos >= INT_MAX)
|
|
|
|
ctx->pos = LLONG_MAX;
|
|
|
|
else
|
|
|
|
ctx->pos = INT_MAX;
|
|
|
|
}
|
2007-06-12 18:35:45 +08:00
|
|
|
nopos:
|
|
|
|
ret = 0;
|
|
|
|
err:
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 18:12:22 +08:00
|
|
|
if (key_type == BTRFS_DIR_INDEX_KEY)
|
|
|
|
btrfs_put_delayed_items(&ins_list, &del_list);
|
2007-06-12 18:35:45 +08:00
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2010-03-05 16:21:37 +08:00
|
|
|
int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
|
2007-06-12 18:35:45 +08:00
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
int ret = 0;
|
2010-06-22 02:48:16 +08:00
|
|
|
bool nolock = false;
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2012-05-24 02:13:11 +08:00
|
|
|
if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
|
2008-08-06 01:30:48 +08:00
|
|
|
return 0;
|
|
|
|
|
2012-07-10 19:28:39 +08:00
|
|
|
if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode))
|
2011-04-20 10:33:24 +08:00
|
|
|
nolock = true;
|
2010-06-22 02:48:16 +08:00
|
|
|
|
2010-03-05 16:21:37 +08:00
|
|
|
if (wbc->sync_mode == WB_SYNC_ALL) {
|
2010-06-22 02:48:16 +08:00
|
|
|
if (nolock)
|
2011-04-14 00:54:33 +08:00
|
|
|
trans = btrfs_join_transaction_nolock(root);
|
2010-06-22 02:48:16 +08:00
|
|
|
else
|
2011-04-14 00:54:33 +08:00
|
|
|
trans = btrfs_join_transaction(root);
|
2011-01-25 10:51:38 +08:00
|
|
|
if (IS_ERR(trans))
|
|
|
|
return PTR_ERR(trans);
|
2012-09-20 15:51:59 +08:00
|
|
|
ret = btrfs_commit_transaction(trans, root);
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2007-06-23 02:16:25 +08:00
|
|
|
* This is somewhat expensive, updating the tree every time the
|
2007-06-12 18:35:45 +08:00
|
|
|
* inode changes. But, it is most likely to find the inode in cache.
|
|
|
|
* FIXME, needs more benchmarking...there are no reasons other than performance
|
|
|
|
* to keep or drop this code.
|
|
|
|
*/
|
2013-04-26 04:41:01 +08:00
|
|
|
static int btrfs_dirty_inode(struct inode *inode)
|
2007-06-12 18:35:45 +08:00
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_trans_handle *trans;
|
2010-05-16 22:49:58 +08:00
|
|
|
int ret;
|
|
|
|
|
2012-05-24 02:13:11 +08:00
|
|
|
if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
|
2011-11-30 23:45:38 +08:00
|
|
|
return 0;
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2011-04-14 00:54:33 +08:00
|
|
|
trans = btrfs_join_transaction(root);
|
2011-11-30 23:45:38 +08:00
|
|
|
if (IS_ERR(trans))
|
|
|
|
return PTR_ERR(trans);
|
2010-05-16 22:49:58 +08:00
|
|
|
|
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
2010-05-26 23:02:00 +08:00
|
|
|
if (ret && ret == -ENOSPC) {
|
|
|
|
/* whoops, lets try again with the full transaction */
|
|
|
|
btrfs_end_transaction(trans, root);
|
|
|
|
trans = btrfs_start_transaction(root, 1);
|
2011-11-30 23:45:38 +08:00
|
|
|
if (IS_ERR(trans))
|
|
|
|
return PTR_ERR(trans);
|
2010-05-16 22:49:58 +08:00
|
|
|
|
2010-05-26 23:02:00 +08:00
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
|
|
|
}
|
2007-06-12 18:35:45 +08:00
|
|
|
btrfs_end_transaction(trans, root);
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 18:12:22 +08:00
|
|
|
if (BTRFS_I(inode)->delayed_node)
|
|
|
|
btrfs_balance_delayed_items(root);
|
2011-11-30 23:45:38 +08:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is a copy of file_update_time. We need this so we can return error on
|
|
|
|
* ENOSPC for updating the inode in the case of file write and mmap writes.
|
|
|
|
*/
|
2012-03-26 21:46:47 +08:00
|
|
|
static int btrfs_update_time(struct inode *inode, struct timespec *now,
|
|
|
|
int flags)
|
2011-11-30 23:45:38 +08:00
|
|
|
{
|
2012-06-15 15:49:33 +08:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
|
|
|
|
if (btrfs_root_readonly(root))
|
|
|
|
return -EROFS;
|
|
|
|
|
2012-03-26 21:46:47 +08:00
|
|
|
if (flags & S_VERSION)
|
2011-11-30 23:45:38 +08:00
|
|
|
inode_inc_iversion(inode);
|
2012-03-26 21:46:47 +08:00
|
|
|
if (flags & S_CTIME)
|
|
|
|
inode->i_ctime = *now;
|
|
|
|
if (flags & S_MTIME)
|
|
|
|
inode->i_mtime = *now;
|
|
|
|
if (flags & S_ATIME)
|
|
|
|
inode->i_atime = *now;
|
|
|
|
return btrfs_dirty_inode(inode);
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
|
|
|
|
2008-09-30 03:18:18 +08:00
|
|
|
/*
|
|
|
|
* find the highest existing sequence number in a directory
|
|
|
|
* and then set the in-memory index_cnt variable to reflect
|
|
|
|
* free sequence numbers
|
|
|
|
*/
|
2008-07-25 00:12:38 +08:00
|
|
|
static int btrfs_set_inode_index_count(struct inode *inode)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_key key, found_key;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
int ret;
|
|
|
|
|
2011-04-20 10:31:50 +08:00
|
|
|
key.objectid = btrfs_ino(inode);
|
2014-06-05 00:41:45 +08:00
|
|
|
key.type = BTRFS_DIR_INDEX_KEY;
|
2008-07-25 00:12:38 +08:00
|
|
|
key.offset = (u64)-1;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
/* FIXME: we should be able to handle this */
|
|
|
|
if (ret == 0)
|
|
|
|
goto out;
|
|
|
|
ret = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* MAGIC NUMBER EXPLANATION:
|
|
|
|
* since we search a directory based on f_pos we have to start at 2
|
|
|
|
* since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
|
|
|
|
* else has to start at 2
|
|
|
|
*/
|
|
|
|
if (path->slots[0] == 0) {
|
|
|
|
BTRFS_I(inode)->index_cnt = 2;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
path->slots[0]--;
|
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
|
|
|
|
|
2011-04-20 10:31:50 +08:00
|
|
|
if (found_key.objectid != btrfs_ino(inode) ||
|
2014-06-05 00:41:45 +08:00
|
|
|
found_key.type != BTRFS_DIR_INDEX_KEY) {
|
2008-07-25 00:12:38 +08:00
|
|
|
BTRFS_I(inode)->index_cnt = 2;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
BTRFS_I(inode)->index_cnt = found_key.offset + 1;
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2008-09-30 03:18:18 +08:00
|
|
|
/*
|
|
|
|
* helper to find a free sequence number in a given directory. This current
|
|
|
|
* code is very simple, later versions will do smarter things in the btree
|
|
|
|
*/
|
2008-11-18 10:02:50 +08:00
|
|
|
int btrfs_set_inode_index(struct inode *dir, u64 *index)
|
2008-07-25 00:12:38 +08:00
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
if (BTRFS_I(dir)->index_cnt == (u64)-1) {
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 18:12:22 +08:00
|
|
|
ret = btrfs_inode_delayed_dir_index_count(dir);
|
|
|
|
if (ret) {
|
|
|
|
ret = btrfs_set_inode_index_count(dir);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
2008-07-25 00:12:38 +08:00
|
|
|
}
|
|
|
|
|
2008-08-05 23:18:09 +08:00
|
|
|
*index = BTRFS_I(dir)->index_cnt;
|
2008-07-25 00:12:38 +08:00
|
|
|
BTRFS_I(dir)->index_cnt++;
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2014-09-09 04:08:51 +08:00
|
|
|
static int btrfs_insert_inode_locked(struct inode *inode)
|
|
|
|
{
|
|
|
|
struct btrfs_iget_args args;
|
|
|
|
args.location = &BTRFS_I(inode)->location;
|
|
|
|
args.root = BTRFS_I(inode)->root;
|
|
|
|
|
|
|
|
return insert_inode_locked4(inode,
|
|
|
|
btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
|
|
|
|
btrfs_find_actor, &args);
|
|
|
|
}
|
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
2008-07-25 00:12:38 +08:00
|
|
|
struct inode *dir,
|
2008-01-30 04:15:18 +08:00
|
|
|
const char *name, int name_len,
|
2011-07-26 15:30:54 +08:00
|
|
|
u64 ref_objectid, u64 objectid,
|
|
|
|
umode_t mode, u64 *index)
|
2007-06-12 18:35:45 +08:00
|
|
|
{
|
|
|
|
struct inode *inode;
|
2007-10-16 04:14:19 +08:00
|
|
|
struct btrfs_inode_item *inode_item;
|
2007-06-12 18:35:45 +08:00
|
|
|
struct btrfs_key *location;
|
2007-10-16 04:14:19 +08:00
|
|
|
struct btrfs_path *path;
|
2008-01-30 04:15:18 +08:00
|
|
|
struct btrfs_inode_ref *ref;
|
|
|
|
struct btrfs_key key[2];
|
|
|
|
u32 sizes[2];
|
2014-04-28 03:40:45 +08:00
|
|
|
int nitems = name ? 2 : 1;
|
2008-01-30 04:15:18 +08:00
|
|
|
unsigned long ptr;
|
2007-06-12 18:35:45 +08:00
|
|
|
int ret;
|
|
|
|
|
2007-10-16 04:14:19 +08:00
|
|
|
path = btrfs_alloc_path();
|
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-14 01:38:47 +08:00
|
|
|
if (!path)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
inode = new_inode(root->fs_info->sb);
|
2011-04-09 10:30:07 +08:00
|
|
|
if (!inode) {
|
|
|
|
btrfs_free_path(path);
|
2007-06-12 18:35:45 +08:00
|
|
|
return ERR_PTR(-ENOMEM);
|
2011-04-09 10:30:07 +08:00
|
|
|
}
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2014-08-01 07:10:32 +08:00
|
|
|
/*
|
|
|
|
* O_TMPFILE, set link count to 0, so that after this point,
|
|
|
|
* we fill in an inode item with the correct link count.
|
|
|
|
*/
|
|
|
|
if (!name)
|
|
|
|
set_nlink(inode, 0);
|
|
|
|
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 10:06:11 +08:00
|
|
|
/*
|
|
|
|
* we have to initialize this early, so we can reclaim the inode
|
|
|
|
* number if we fail afterwards in this function.
|
|
|
|
*/
|
|
|
|
inode->i_ino = objectid;
|
|
|
|
|
2014-04-28 03:40:45 +08:00
|
|
|
if (dir && name) {
|
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordere_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are cirtical resourses and produce a lot of corner cases during writeback,
so it is valuable to know how page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when a inode is created, when a inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
Chunk is a link between physical offset and logical offset, and stands for space
infomation in btrfs, and these are helpful on tracing space things.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 19:18:59 +08:00
|
|
|
trace_btrfs_inode_request(dir);
|
|
|
|
|
2008-11-18 10:02:50 +08:00
|
|
|
ret = btrfs_set_inode_index(dir, index);
|
2009-04-03 04:46:06 +08:00
|
|
|
if (ret) {
|
2011-04-09 10:30:07 +08:00
|
|
|
btrfs_free_path(path);
|
2009-04-03 04:46:06 +08:00
|
|
|
iput(inode);
|
2008-07-25 00:12:38 +08:00
|
|
|
return ERR_PTR(ret);
|
2009-04-03 04:46:06 +08:00
|
|
|
}
|
2014-04-28 03:40:45 +08:00
|
|
|
} else if (dir) {
|
|
|
|
*index = 0;
|
2008-07-25 00:12:38 +08:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* index_cnt is ignored for everything but a dir,
|
|
|
|
* btrfs_get_inode_index_count has an explanation for the magic
|
|
|
|
* number
|
|
|
|
*/
|
|
|
|
BTRFS_I(inode)->index_cnt = 2;
|
2013-12-26 13:07:06 +08:00
|
|
|
BTRFS_I(inode)->dir_index = *index;
|
2007-06-12 18:35:45 +08:00
|
|
|
BTRFS_I(inode)->root = root;
|
2008-09-06 04:13:11 +08:00
|
|
|
BTRFS_I(inode)->generation = trans->transid;
|
2010-11-19 10:18:02 +08:00
|
|
|
inode->i_generation = BTRFS_I(inode)->generation;
|
2007-08-28 04:49:44 +08:00
|
|
|
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-18 01:14:17 +08:00
|
|
|
/*
|
|
|
|
* We could have gotten an inode number from somebody who was fsynced
|
|
|
|
* and then removed in this same transaction, so let's just set full
|
|
|
|
* sync since it will be a full sync anyway and this will blow away the
|
|
|
|
* old info in the log.
|
|
|
|
*/
|
|
|
|
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
|
|
|
|
|
2008-01-30 04:15:18 +08:00
|
|
|
key[0].objectid = objectid;
|
2014-06-05 00:41:45 +08:00
|
|
|
key[0].type = BTRFS_INODE_ITEM_KEY;
|
2008-01-30 04:15:18 +08:00
|
|
|
key[0].offset = 0;
|
|
|
|
|
|
|
|
sizes[0] = sizeof(struct btrfs_inode_item);
|
2014-04-28 03:40:45 +08:00
|
|
|
|
|
|
|
if (name) {
|
|
|
|
/*
|
|
|
|
* Start new inodes with an inode_ref. This is slightly more
|
|
|
|
* efficient for small numbers of hard links since they will
|
|
|
|
* be packed into one item. Extended refs will kick in if we
|
|
|
|
* add more hard links than can fit in the ref item.
|
|
|
|
*/
|
|
|
|
key[1].objectid = objectid;
|
2014-06-05 00:41:45 +08:00
|
|
|
key[1].type = BTRFS_INODE_REF_KEY;
|
2014-04-28 03:40:45 +08:00
|
|
|
key[1].offset = ref_objectid;
|
|
|
|
|
|
|
|
sizes[1] = name_len + sizeof(*ref);
|
|
|
|
}
|
2008-01-30 04:15:18 +08:00
|
|
|
|
2014-09-09 04:08:51 +08:00
|
|
|
location = &BTRFS_I(inode)->location;
|
|
|
|
location->objectid = objectid;
|
|
|
|
location->offset = 0;
|
2014-06-05 00:41:45 +08:00
|
|
|
location->type = BTRFS_INODE_ITEM_KEY;
|
2014-09-09 04:08:51 +08:00
|
|
|
|
|
|
|
ret = btrfs_insert_inode_locked(inode);
|
|
|
|
if (ret < 0)
|
|
|
|
goto fail;
|
|
|
|
|
2009-03-13 23:00:37 +08:00
|
|
|
path->leave_spinning = 1;
|
2014-04-28 03:40:45 +08:00
|
|
|
ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
|
2008-01-30 04:15:18 +08:00
|
|
|
if (ret != 0)
|
2014-09-09 04:08:51 +08:00
|
|
|
goto fail_unlock;
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2010-03-04 22:31:47 +08:00
|
|
|
inode_init_owner(inode, dir, mode);
|
2008-10-09 23:46:29 +08:00
|
|
|
inode_set_bytes(inode, 0);
|
2012-07-04 15:18:07 +08:00
|
|
|
|
|
|
|
inode->i_mtime = CURRENT_TIME;
|
|
|
|
inode->i_atime = inode->i_mtime;
|
|
|
|
inode->i_ctime = inode->i_mtime;
|
|
|
|
BTRFS_I(inode)->i_otime = inode->i_mtime;
|
|
|
|
|
2007-10-16 04:14:19 +08:00
|
|
|
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
|
|
|
|
struct btrfs_inode_item);
|
2012-07-10 14:58:58 +08:00
|
|
|
memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
|
|
|
|
sizeof(*inode_item));
|
2008-09-06 04:13:11 +08:00
|
|
|
fill_inode_item(trans, path->nodes[0], inode_item, inode);
|
2008-01-30 04:15:18 +08:00
|
|
|
|
2014-04-28 03:40:45 +08:00
|
|
|
if (name) {
|
|
|
|
ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
|
|
|
|
struct btrfs_inode_ref);
|
|
|
|
btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
|
|
|
|
btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
|
|
|
|
ptr = (unsigned long)(ref + 1);
|
|
|
|
write_extent_buffer(path->nodes[0], name, ptr, name_len);
|
|
|
|
}
|
2008-01-30 04:15:18 +08:00
|
|
|
|
2007-10-16 04:14:19 +08:00
|
|
|
btrfs_mark_buffer_dirty(path->nodes[0]);
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
2009-04-17 16:37:41 +08:00
|
|
|
btrfs_inherit_iflags(inode, dir);
|
|
|
|
|
2011-07-25 05:08:40 +08:00
|
|
|
if (S_ISREG(mode)) {
|
2009-07-03 00:26:06 +08:00
|
|
|
if (btrfs_test_opt(root, NODATASUM))
|
|
|
|
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
|
2012-09-11 22:33:50 +08:00
|
|
|
if (btrfs_test_opt(root, NODATACOW))
|
2013-02-22 04:28:28 +08:00
|
|
|
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
|
|
|
|
BTRFS_INODE_NODATASUM;
|
2009-07-03 00:26:06 +08:00
|
|
|
}
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
inode_tree_add(inode);
|
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordere_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are cirtical resourses and produce a lot of corner cases during writeback,
so it is valuable to know how page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when a inode is created, when a inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
Chunk is a link between physical offset and logical offset, and stands for space
infomation in btrfs, and these are helpful on tracing space things.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 19:18:59 +08:00
|
|
|
|
|
|
|
trace_btrfs_inode_new(inode);
|
2011-06-25 01:13:29 +08:00
|
|
|
btrfs_set_inode_last_trans(trans, inode);
|
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordere_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are cirtical resourses and produce a lot of corner cases during writeback,
so it is valuable to know how page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when a inode is created, when a inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
Chunk is a link between physical offset and logical offset, and stands for space
infomation in btrfs, and these are helpful on tracing space things.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 19:18:59 +08:00
|
|
|
|
2012-07-25 23:35:53 +08:00
|
|
|
btrfs_update_root_times(trans, root);
|
|
|
|
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
The unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essencialy because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir intex items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 19:47:46 +08:00
|
|
|
ret = btrfs_inode_inherit_props(trans, inode, dir);
|
|
|
|
if (ret)
|
|
|
|
btrfs_err(root->fs_info,
|
|
|
|
"error inheriting props for ino %llu (root %llu): %d",
|
|
|
|
btrfs_ino(inode), root->root_key.objectid, ret);
|
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
return inode;
|
2014-09-09 04:08:51 +08:00
|
|
|
|
|
|
|
fail_unlock:
|
|
|
|
unlock_new_inode(inode);
|
2007-10-16 04:14:19 +08:00
|
|
|
fail:
|
2014-04-28 03:40:45 +08:00
|
|
|
if (dir && name)
|
2008-07-25 00:12:38 +08:00
|
|
|
BTRFS_I(dir)->index_cnt--;
|
2007-10-16 04:14:19 +08:00
|
|
|
btrfs_free_path(path);
|
2009-04-03 04:46:06 +08:00
|
|
|
iput(inode);
|
2007-10-16 04:14:19 +08:00
|
|
|
return ERR_PTR(ret);
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline u8 btrfs_inode_type(struct inode *inode)
|
|
|
|
{
|
|
|
|
return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
|
|
|
|
}
|
|
|
|
|
2008-09-30 03:18:18 +08:00
|
|
|
/*
|
|
|
|
* utility function to add 'inode' into 'parent_inode' with
|
|
|
|
* a give name and a given sequence number.
|
|
|
|
* if 'add_backref' is true, also insert a backref from the
|
|
|
|
* inode to the parent directory.
|
|
|
|
*/
|
2008-09-06 04:13:11 +08:00
|
|
|
int btrfs_add_link(struct btrfs_trans_handle *trans,
|
|
|
|
struct inode *parent_inode, struct inode *inode,
|
|
|
|
const char *name, int name_len, int add_backref, u64 index)
|
2007-06-12 18:35:45 +08:00
|
|
|
{
|
2009-09-22 03:56:00 +08:00
|
|
|
int ret = 0;
|
2007-06-12 18:35:45 +08:00
|
|
|
struct btrfs_key key;
|
2008-09-06 04:13:11 +08:00
|
|
|
struct btrfs_root *root = BTRFS_I(parent_inode)->root;
|
2011-04-20 10:31:50 +08:00
|
|
|
u64 ino = btrfs_ino(inode);
|
|
|
|
u64 parent_ino = btrfs_ino(parent_inode);
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2011-04-20 10:31:50 +08:00
|
|
|
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
|
2009-09-22 03:56:00 +08:00
|
|
|
memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
|
|
|
|
} else {
|
2011-04-20 10:31:50 +08:00
|
|
|
key.objectid = ino;
|
2014-06-05 00:41:45 +08:00
|
|
|
key.type = BTRFS_INODE_ITEM_KEY;
|
2009-09-22 03:56:00 +08:00
|
|
|
key.offset = 0;
|
|
|
|
}
|
|
|
|
|
2011-04-20 10:31:50 +08:00
|
|
|
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
|
2009-09-22 03:56:00 +08:00
|
|
|
ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
|
|
|
|
key.objectid, root->root_key.objectid,
|
2011-04-20 10:31:50 +08:00
|
|
|
parent_ino, index, name, name_len);
|
2009-09-22 03:56:00 +08:00
|
|
|
} else if (add_backref) {
|
2011-04-20 10:31:50 +08:00
|
|
|
ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
|
|
|
|
parent_ino, index);
|
2009-09-22 03:56:00 +08:00
|
|
|
}
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2012-03-12 23:03:00 +08:00
|
|
|
/* Nothing to clean up yet */
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
2009-09-22 03:56:00 +08:00
|
|
|
|
2012-03-12 23:03:00 +08:00
|
|
|
ret = btrfs_insert_dir_item(trans, root, name, name_len,
|
|
|
|
parent_inode, &key,
|
|
|
|
btrfs_inode_type(inode), index);
|
2012-12-18 03:26:57 +08:00
|
|
|
if (ret == -EEXIST || ret == -EOVERFLOW)
|
2012-03-12 23:03:00 +08:00
|
|
|
goto fail_dir_item;
|
|
|
|
else if (ret) {
|
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
|
|
|
return ret;
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
2012-03-12 23:03:00 +08:00
|
|
|
|
|
|
|
btrfs_i_size_write(parent_inode, parent_inode->i_size +
|
|
|
|
name_len * 2);
|
2012-04-06 03:03:02 +08:00
|
|
|
inode_inc_iversion(parent_inode);
|
2012-03-12 23:03:00 +08:00
|
|
|
parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
|
|
|
|
ret = btrfs_update_inode(trans, root, parent_inode);
|
|
|
|
if (ret)
|
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
2007-06-12 18:35:45 +08:00
|
|
|
return ret;
|
2012-02-20 21:40:56 +08:00
|
|
|
|
|
|
|
fail_dir_item:
|
|
|
|
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
|
|
|
|
u64 local_index;
|
|
|
|
int err;
|
|
|
|
err = btrfs_del_root_ref(trans, root->fs_info->tree_root,
|
|
|
|
key.objectid, root->root_key.objectid,
|
|
|
|
parent_ino, &local_index, name, name_len);
|
|
|
|
|
|
|
|
} else if (add_backref) {
|
|
|
|
u64 local_index;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
err = btrfs_del_inode_ref(trans, root, name, name_len,
|
|
|
|
ino, parent_ino, &local_index);
|
|
|
|
}
|
|
|
|
return ret;
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
|
2010-11-20 04:36:11 +08:00
|
|
|
struct inode *dir, struct dentry *dentry,
|
|
|
|
struct inode *inode, int backref, u64 index)
|
2007-06-12 18:35:45 +08:00
|
|
|
{
|
2010-11-20 04:36:11 +08:00
|
|
|
int err = btrfs_add_link(trans, dir, inode,
|
|
|
|
dentry->d_name.name, dentry->d_name.len,
|
|
|
|
backref, index);
|
2007-06-12 18:35:45 +08:00
|
|
|
if (err > 0)
|
|
|
|
err = -EEXIST;
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2007-07-11 22:18:17 +08:00
|
|
|
static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
|
2011-07-26 13:52:52 +08:00
|
|
|
umode_t mode, dev_t rdev)
|
2007-07-11 22:18:17 +08:00
|
|
|
{
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
2007-12-22 05:27:21 +08:00
|
|
|
struct inode *inode = NULL;
|
2007-07-11 22:18:17 +08:00
|
|
|
int err;
|
|
|
|
int drop_inode = 0;
|
|
|
|
u64 objectid;
|
2008-08-05 23:18:09 +08:00
|
|
|
u64 index = 0;
|
2007-07-11 22:18:17 +08:00
|
|
|
|
2009-09-12 04:12:44 +08:00
|
|
|
/*
|
|
|
|
* 2 for inode item and ref
|
|
|
|
* 2 for dir items
|
|
|
|
* 1 for xattr if selinux is on
|
|
|
|
*/
|
2010-05-16 22:48:46 +08:00
|
|
|
trans = btrfs_start_transaction(root, 5);
|
|
|
|
if (IS_ERR(trans))
|
|
|
|
return PTR_ERR(trans);
|
2007-12-22 05:27:21 +08:00
|
|
|
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 10:06:11 +08:00
|
|
|
err = btrfs_find_free_ino(root, &objectid);
|
|
|
|
if (err)
|
|
|
|
goto out_unlock;
|
|
|
|
|
2008-07-25 00:12:38 +08:00
|
|
|
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
|
2011-04-20 10:31:50 +08:00
|
|
|
dentry->d_name.len, btrfs_ino(dir), objectid,
|
2011-05-12 03:26:06 +08:00
|
|
|
mode, &index);
|
2011-04-26 07:43:53 +08:00
|
|
|
if (IS_ERR(inode)) {
|
|
|
|
err = PTR_ERR(inode);
|
2007-07-11 22:18:17 +08:00
|
|
|
goto out_unlock;
|
2011-04-26 07:43:53 +08:00
|
|
|
}
|
2007-07-11 22:18:17 +08:00
|
|
|
|
2011-12-15 23:09:07 +08:00
|
|
|
/*
|
|
|
|
* If the active LSM wants to access the inode during
|
|
|
|
* d_instantiate it needs these. Smack checks to see
|
|
|
|
* if the filesystem supports xattrs by looking at the
|
|
|
|
* ops vector.
|
|
|
|
*/
|
|
|
|
inode->i_op = &btrfs_special_inode_operations;
|
2014-09-09 04:08:51 +08:00
|
|
|
init_special_inode(inode, inode->i_mode, rdev);
|
|
|
|
|
|
|
|
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
|
2007-07-11 22:18:17 +08:00
|
|
|
if (err)
|
2014-09-09 04:08:51 +08:00
|
|
|
goto out_unlock_inode;
|
|
|
|
|
|
|
|
err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
|
|
|
|
if (err) {
|
|
|
|
goto out_unlock_inode;
|
|
|
|
} else {
|
2007-08-29 21:11:44 +08:00
|
|
|
btrfs_update_inode(trans, root, inode);
|
2014-09-09 04:08:51 +08:00
|
|
|
unlock_new_inode(inode);
|
2011-12-23 20:58:13 +08:00
|
|
|
d_instantiate(dentry, inode);
|
2007-07-11 22:18:17 +08:00
|
|
|
}
|
2014-09-09 04:08:51 +08:00
|
|
|
|
2007-07-11 22:18:17 +08:00
|
|
|
out_unlock:
|
2012-01-13 08:10:12 +08:00
|
|
|
btrfs_end_transaction(trans, root);
|
2014-02-07 05:06:06 +08:00
|
|
|
btrfs_balance_delayed_items(root);
|
2012-11-14 22:34:34 +08:00
|
|
|
btrfs_btree_balance_dirty(root);
|
2007-07-11 22:18:17 +08:00
|
|
|
if (drop_inode) {
|
|
|
|
inode_dec_link_count(inode);
|
|
|
|
iput(inode);
|
|
|
|
}
|
|
|
|
return err;
|
2014-09-09 04:08:51 +08:00
|
|
|
|
|
|
|
out_unlock_inode:
|
|
|
|
drop_inode = 1;
|
|
|
|
unlock_new_inode(inode);
|
|
|
|
goto out_unlock;
|
|
|
|
|
2007-07-11 22:18:17 +08:00
|
|
|
}
|
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
static int btrfs_create(struct inode *dir, struct dentry *dentry,
|
2012-06-11 06:05:36 +08:00
|
|
|
umode_t mode, bool excl)
|
2007-06-12 18:35:45 +08:00
|
|
|
{
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
2007-12-22 05:27:21 +08:00
|
|
|
struct inode *inode = NULL;
|
2012-11-30 11:40:09 +08:00
|
|
|
int drop_inode_on_err = 0;
|
2010-05-16 22:48:46 +08:00
|
|
|
int err;
|
2007-06-12 18:35:45 +08:00
|
|
|
u64 objectid;
|
2008-08-05 23:18:09 +08:00
|
|
|
u64 index = 0;
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2009-09-12 04:12:44 +08:00
|
|
|
/*
|
|
|
|
* 2 for inode item and ref
|
|
|
|
* 2 for dir items
|
|
|
|
* 1 for xattr if selinux is on
|
|
|
|
*/
|
2010-05-16 22:48:46 +08:00
|
|
|
trans = btrfs_start_transaction(root, 5);
|
|
|
|
if (IS_ERR(trans))
|
|
|
|
return PTR_ERR(trans);
|
2009-09-12 04:12:44 +08:00
|
|
|
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 10:06:11 +08:00
|
|
|
err = btrfs_find_free_ino(root, &objectid);
|
|
|
|
if (err)
|
|
|
|
goto out_unlock;
|
|
|
|
|
2008-07-25 00:12:38 +08:00
|
|
|
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
|
2011-04-20 10:31:50 +08:00
|
|
|
dentry->d_name.len, btrfs_ino(dir), objectid,
|
2011-05-12 03:26:06 +08:00
|
|
|
mode, &index);
|
2011-04-26 07:43:53 +08:00
|
|
|
if (IS_ERR(inode)) {
|
|
|
|
err = PTR_ERR(inode);
|
2007-06-12 18:35:45 +08:00
|
|
|
goto out_unlock;
|
2011-04-26 07:43:53 +08:00
|
|
|
}
|
2012-11-30 11:40:09 +08:00
|
|
|
drop_inode_on_err = 1;
|
2011-12-15 23:09:07 +08:00
|
|
|
/*
|
|
|
|
* If the active LSM wants to access the inode during
|
|
|
|
* d_instantiate it needs these. Smack checks to see
|
|
|
|
* if the filesystem supports xattrs by looking at the
|
|
|
|
* ops vector.
|
|
|
|
*/
|
|
|
|
inode->i_fop = &btrfs_file_operations;
|
|
|
|
inode->i_op = &btrfs_file_inode_operations;
|
2014-09-09 04:08:51 +08:00
|
|
|
inode->i_mapping->a_ops = &btrfs_aops;
|
|
|
|
|
|
|
|
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
|
|
|
|
if (err)
|
|
|
|
goto out_unlock_inode;
|
|
|
|
|
|
|
|
err = btrfs_update_inode(trans, root, inode);
|
|
|
|
if (err)
|
|
|
|
goto out_unlock_inode;
|
2011-12-15 23:09:07 +08:00
|
|
|
|
2010-11-20 04:36:11 +08:00
|
|
|
err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
|
2007-06-12 18:35:45 +08:00
|
|
|
if (err)
|
2014-09-09 04:08:51 +08:00
|
|
|
goto out_unlock_inode;
|
2012-11-30 11:40:09 +08:00
|
|
|
|
|
|
|
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
|
2014-09-09 04:08:51 +08:00
|
|
|
unlock_new_inode(inode);
|
2012-11-30 11:40:09 +08:00
|
|
|
d_instantiate(dentry, inode);
|
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
out_unlock:
|
2012-01-13 08:10:12 +08:00
|
|
|
btrfs_end_transaction(trans, root);
|
2012-11-30 11:40:09 +08:00
|
|
|
if (err && drop_inode_on_err) {
|
2007-06-12 18:35:45 +08:00
|
|
|
inode_dec_link_count(inode);
|
|
|
|
iput(inode);
|
|
|
|
}
|
2014-02-07 05:06:06 +08:00
|
|
|
btrfs_balance_delayed_items(root);
|
2012-11-14 22:34:34 +08:00
|
|
|
btrfs_btree_balance_dirty(root);
|
2007-06-12 18:35:45 +08:00
|
|
|
return err;
|
2014-09-09 04:08:51 +08:00
|
|
|
|
|
|
|
out_unlock_inode:
|
|
|
|
unlock_new_inode(inode);
|
|
|
|
goto out_unlock;
|
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
|
|
|
|
struct dentry *dentry)
|
|
|
|
{
|
2016-01-06 00:24:05 +08:00
|
|
|
struct btrfs_trans_handle *trans = NULL;
|
2007-06-12 18:35:45 +08:00
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
2015-03-18 06:25:59 +08:00
|
|
|
struct inode *inode = d_inode(old_dentry);
|
2008-08-05 23:18:09 +08:00
|
|
|
u64 index;
|
2007-06-12 18:35:45 +08:00
|
|
|
int err;
|
|
|
|
int drop_inode = 0;
|
|
|
|
|
2009-11-12 15:14:26 +08:00
|
|
|
/* do not allow sys_link's with other subvols of the same device */
|
|
|
|
if (root->objectid != BTRFS_I(inode)->root->objectid)
|
2011-03-23 01:20:26 +08:00
|
|
|
return -EXDEV;
|
2009-11-12 15:14:26 +08:00
|
|
|
|
2012-08-09 02:32:27 +08:00
|
|
|
if (inode->i_nlink >= BTRFS_LINK_MAX)
|
2011-03-05 01:15:18 +08:00
|
|
|
return -EMLINK;
|
2009-11-12 15:14:26 +08:00
|
|
|
|
2008-11-18 10:02:50 +08:00
|
|
|
err = btrfs_set_inode_index(dir, &index);
|
2008-07-25 00:12:38 +08:00
|
|
|
if (err)
|
|
|
|
goto fail;
|
|
|
|
|
2010-05-16 22:48:46 +08:00
|
|
|
/*
|
2011-02-18 17:21:17 +08:00
|
|
|
* 2 items for inode and inode ref
|
2010-05-16 22:48:46 +08:00
|
|
|
* 2 items for dir items
|
2011-02-18 17:21:17 +08:00
|
|
|
* 1 item for parent inode
|
2010-05-16 22:48:46 +08:00
|
|
|
*/
|
2011-02-18 17:21:17 +08:00
|
|
|
trans = btrfs_start_transaction(root, 5);
|
2010-05-16 22:48:46 +08:00
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
err = PTR_ERR(trans);
|
2016-01-06 00:24:05 +08:00
|
|
|
trans = NULL;
|
2010-05-16 22:48:46 +08:00
|
|
|
goto fail;
|
|
|
|
}
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2013-12-26 13:07:06 +08:00
|
|
|
/* There are several dir indexes for this inode, clear the cache. */
|
|
|
|
BTRFS_I(inode)->dir_index = 0ULL;
|
2013-10-17 03:10:34 +08:00
|
|
|
inc_nlink(inode);
|
2012-04-06 03:03:02 +08:00
|
|
|
inode_inc_iversion(inode);
|
2011-04-13 13:19:21 +08:00
|
|
|
inode->i_ctime = CURRENT_TIME;
|
2010-10-23 23:11:40 +08:00
|
|
|
ihold(inode);
|
2012-10-12 03:53:56 +08:00
|
|
|
set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
|
2008-07-25 00:12:38 +08:00
|
|
|
|
2010-11-20 04:36:11 +08:00
|
|
|
err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2009-09-24 21:17:31 +08:00
|
|
|
if (err) {
|
2007-06-23 02:16:25 +08:00
|
|
|
drop_inode = 1;
|
2009-09-24 21:17:31 +08:00
|
|
|
} else {
|
2011-07-17 11:09:10 +08:00
|
|
|
struct dentry *parent = dentry->d_parent;
|
2009-09-24 21:17:31 +08:00
|
|
|
err = btrfs_update_inode(trans, root, inode);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (err)
|
|
|
|
goto fail;
|
2014-04-28 03:40:45 +08:00
|
|
|
if (inode->i_nlink == 1) {
|
|
|
|
/*
|
|
|
|
* If new hard link count is 1, it's a file created
|
|
|
|
* with open(2) O_TMPFILE flag.
|
|
|
|
*/
|
|
|
|
err = btrfs_orphan_del(trans, inode);
|
|
|
|
if (err)
|
|
|
|
goto fail;
|
|
|
|
}
|
2011-12-23 20:58:13 +08:00
|
|
|
d_instantiate(dentry, inode);
|
2010-11-20 17:48:00 +08:00
|
|
|
btrfs_log_new_name(trans, inode, NULL, parent);
|
2009-09-24 21:17:31 +08:00
|
|
|
}
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2014-02-07 05:06:06 +08:00
|
|
|
btrfs_balance_delayed_items(root);
|
2007-12-22 05:27:21 +08:00
|
|
|
fail:
|
2016-01-06 00:24:05 +08:00
|
|
|
if (trans)
|
|
|
|
btrfs_end_transaction(trans, root);
|
2007-06-12 18:35:45 +08:00
|
|
|
if (drop_inode) {
|
|
|
|
inode_dec_link_count(inode);
|
|
|
|
iput(inode);
|
|
|
|
}
|
2012-11-14 22:34:34 +08:00
|
|
|
btrfs_btree_balance_dirty(root);
|
2007-06-12 18:35:45 +08:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2011-07-26 13:41:39 +08:00
|
|
|
static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
|
2007-06-12 18:35:45 +08:00
|
|
|
{
|
2008-05-03 04:13:49 +08:00
|
|
|
struct inode *inode = NULL;
|
2007-06-12 18:35:45 +08:00
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
|
|
|
int err = 0;
|
|
|
|
int drop_on_err = 0;
|
2008-05-03 04:13:49 +08:00
|
|
|
u64 objectid = 0;
|
2008-08-05 23:18:09 +08:00
|
|
|
u64 index = 0;
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2009-09-12 04:12:44 +08:00
|
|
|
/*
|
|
|
|
* 2 items for inode and ref
|
|
|
|
* 2 items for dir items
|
|
|
|
* 1 for xattr if selinux is on
|
|
|
|
*/
|
2010-05-16 22:48:46 +08:00
|
|
|
trans = btrfs_start_transaction(root, 5);
|
|
|
|
if (IS_ERR(trans))
|
|
|
|
return PTR_ERR(trans);
|
2007-06-12 18:35:45 +08:00
|
|
|
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 10:06:11 +08:00
|
|
|
err = btrfs_find_free_ino(root, &objectid);
|
|
|
|
if (err)
|
|
|
|
goto out_fail;
|
|
|
|
|
2008-07-25 00:12:38 +08:00
|
|
|
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
|
2011-04-20 10:31:50 +08:00
|
|
|
dentry->d_name.len, btrfs_ino(dir), objectid,
|
2011-05-12 03:26:06 +08:00
|
|
|
S_IFDIR | mode, &index);
|
2007-06-12 18:35:45 +08:00
|
|
|
if (IS_ERR(inode)) {
|
|
|
|
err = PTR_ERR(inode);
|
|
|
|
goto out_fail;
|
|
|
|
}
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
drop_on_err = 1;
|
2014-09-09 04:08:51 +08:00
|
|
|
/* these must be set before we unlock the inode */
|
|
|
|
inode->i_op = &btrfs_dir_inode_operations;
|
|
|
|
inode->i_fop = &btrfs_dir_file_operations;
|
2008-07-25 00:16:36 +08:00
|
|
|
|
2011-02-02 00:05:39 +08:00
|
|
|
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
|
2008-07-25 00:16:36 +08:00
|
|
|
if (err)
|
2014-09-09 04:08:51 +08:00
|
|
|
goto out_fail_inode;
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2008-07-18 00:54:05 +08:00
|
|
|
btrfs_i_size_write(inode, 0);
|
2007-06-12 18:35:45 +08:00
|
|
|
err = btrfs_update_inode(trans, root, inode);
|
|
|
|
if (err)
|
2014-09-09 04:08:51 +08:00
|
|
|
goto out_fail_inode;
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2010-11-20 04:36:11 +08:00
|
|
|
err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
|
|
|
|
dentry->d_name.len, 0, index);
|
2007-06-12 18:35:45 +08:00
|
|
|
if (err)
|
2014-09-09 04:08:51 +08:00
|
|
|
goto out_fail_inode;
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
d_instantiate(dentry, inode);
|
2014-09-09 04:08:51 +08:00
|
|
|
/*
|
|
|
|
* mkdir is special. We're unlocking after we call d_instantiate
|
|
|
|
* to avoid a race with nfsd calling d_instantiate.
|
|
|
|
*/
|
|
|
|
unlock_new_inode(inode);
|
2007-06-12 18:35:45 +08:00
|
|
|
drop_on_err = 0;
|
|
|
|
|
|
|
|
out_fail:
|
2012-01-13 08:10:12 +08:00
|
|
|
btrfs_end_transaction(trans, root);
|
2014-12-24 14:45:30 +08:00
|
|
|
if (drop_on_err) {
|
|
|
|
inode_dec_link_count(inode);
|
2007-06-12 18:35:45 +08:00
|
|
|
iput(inode);
|
2014-12-24 14:45:30 +08:00
|
|
|
}
|
2014-02-07 05:06:06 +08:00
|
|
|
btrfs_balance_delayed_items(root);
|
2012-11-14 22:34:34 +08:00
|
|
|
btrfs_btree_balance_dirty(root);
|
2007-06-12 18:35:45 +08:00
|
|
|
return err;
|
2014-09-09 04:08:51 +08:00
|
|
|
|
|
|
|
out_fail_inode:
|
|
|
|
unlock_new_inode(inode);
|
|
|
|
goto out_fail;
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
|
|
|
|
btrfs: Fix and enhance merge_extent_mapping() to insert best fitted extent map
The following commit enhanced the merge_extent_mapping() to reduce
fragment in extent map tree, but it can't handle case which existing
lies before map_start:
51f39 btrfs: Use right extent length when inserting overlap extent map.
[BUG]
When existing extent map's start is before map_start,
the em->len will be minus, which will corrupt the extent map and fail to
insert the new extent map.
This will happen when someone get a large extent map, but when it is
going to insert it into extent map tree, some one has already commit
some write and split the huge extent into small parts.
[REPRODUCER]
It is very easy to tiger using filebench with randomrw personality.
It is about 100% to reproduce when using 8G preallocated file in 60s
randonrw test.
[FIX]
This patch can now handle any existing extent position.
Since it does not directly use existing->start, now it will find the
previous and next extent around map_start.
So the old existing->start < map_start bug will never happen again.
[ENHANCE]
This patch will insert the best fitted extent map into extent map tree,
other than the oldest [map_start, map_start + sectorsize) or the
relatively newer but not perfect [map_start, existing->start).
The patch will first search existing extent that does not intersects with
the desired map range [map_start, map_start + len).
The existing extent will be either before or behind map_start, and based
on the existing extent, we can find out the previous and next extent
around map_start.
So the best fitted extent would be [prev->end, next->start).
For prev or next is not found, em->start would be prev->end and em->end
wold be next->start.
With this patch, the fragment in extent map tree should be reduced much
more than the 51f39 commit and reduce an unneeded extent map tree search.
Reported-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-09-17 11:53:35 +08:00
|
|
|
/* Find next extent map of a given extent map, caller needs to ensure locks */
|
|
|
|
static struct extent_map *next_extent_map(struct extent_map *em)
|
|
|
|
{
|
|
|
|
struct rb_node *next;
|
|
|
|
|
|
|
|
next = rb_next(&em->rb_node);
|
|
|
|
if (!next)
|
|
|
|
return NULL;
|
|
|
|
return container_of(next, struct extent_map, rb_node);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct extent_map *prev_extent_map(struct extent_map *em)
|
|
|
|
{
|
|
|
|
struct rb_node *prev;
|
|
|
|
|
|
|
|
prev = rb_prev(&em->rb_node);
|
|
|
|
if (!prev)
|
|
|
|
return NULL;
|
|
|
|
return container_of(prev, struct extent_map, rb_node);
|
|
|
|
}
|
|
|
|
|
2008-09-30 03:18:18 +08:00
|
|
|
/* helper for btfs_get_extent. Given an existing extent in the tree,
|
btrfs: Fix and enhance merge_extent_mapping() to insert best fitted extent map
The following commit enhanced the merge_extent_mapping() to reduce
fragment in extent map tree, but it can't handle case which existing
lies before map_start:
51f39 btrfs: Use right extent length when inserting overlap extent map.
[BUG]
When existing extent map's start is before map_start,
the em->len will be minus, which will corrupt the extent map and fail to
insert the new extent map.
This will happen when someone get a large extent map, but when it is
going to insert it into extent map tree, some one has already commit
some write and split the huge extent into small parts.
[REPRODUCER]
It is very easy to tiger using filebench with randomrw personality.
It is about 100% to reproduce when using 8G preallocated file in 60s
randonrw test.
[FIX]
This patch can now handle any existing extent position.
Since it does not directly use existing->start, now it will find the
previous and next extent around map_start.
So the old existing->start < map_start bug will never happen again.
[ENHANCE]
This patch will insert the best fitted extent map into extent map tree,
other than the oldest [map_start, map_start + sectorsize) or the
relatively newer but not perfect [map_start, existing->start).
The patch will first search existing extent that does not intersects with
the desired map range [map_start, map_start + len).
The existing extent will be either before or behind map_start, and based
on the existing extent, we can find out the previous and next extent
around map_start.
So the best fitted extent would be [prev->end, next->start).
For prev or next is not found, em->start would be prev->end and em->end
wold be next->start.
With this patch, the fragment in extent map tree should be reduced much
more than the 51f39 commit and reduce an unneeded extent map tree search.
Reported-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-09-17 11:53:35 +08:00
|
|
|
* the existing extent is the nearest extent to map_start,
|
2008-09-30 03:18:18 +08:00
|
|
|
* and an extent that you want to insert, deal with overlap and insert
|
btrfs: Fix and enhance merge_extent_mapping() to insert best fitted extent map
The following commit enhanced the merge_extent_mapping() to reduce
fragment in extent map tree, but it can't handle case which existing
lies before map_start:
51f39 btrfs: Use right extent length when inserting overlap extent map.
[BUG]
When existing extent map's start is before map_start,
the em->len will be minus, which will corrupt the extent map and fail to
insert the new extent map.
This will happen when someone get a large extent map, but when it is
going to insert it into extent map tree, some one has already commit
some write and split the huge extent into small parts.
[REPRODUCER]
It is very easy to tiger using filebench with randomrw personality.
It is about 100% to reproduce when using 8G preallocated file in 60s
randonrw test.
[FIX]
This patch can now handle any existing extent position.
Since it does not directly use existing->start, now it will find the
previous and next extent around map_start.
So the old existing->start < map_start bug will never happen again.
[ENHANCE]
This patch will insert the best fitted extent map into extent map tree,
other than the oldest [map_start, map_start + sectorsize) or the
relatively newer but not perfect [map_start, existing->start).
The patch will first search existing extent that does not intersects with
the desired map range [map_start, map_start + len).
The existing extent will be either before or behind map_start, and based
on the existing extent, we can find out the previous and next extent
around map_start.
So the best fitted extent would be [prev->end, next->start).
For prev or next is not found, em->start would be prev->end and em->end
wold be next->start.
With this patch, the fragment in extent map tree should be reduced much
more than the 51f39 commit and reduce an unneeded extent map tree search.
Reported-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-09-17 11:53:35 +08:00
|
|
|
* the best fitted new extent into the tree.
|
2008-09-30 03:18:18 +08:00
|
|
|
*/
|
2008-04-17 23:29:12 +08:00
|
|
|
static int merge_extent_mapping(struct extent_map_tree *em_tree,
|
|
|
|
struct extent_map *existing,
|
2008-07-18 00:53:50 +08:00
|
|
|
struct extent_map *em,
|
btrfs: Use right extent length when inserting overlap extent map.
When current btrfs finds that a new extent map is going to be insereted
but failed with -EEXIST, it will try again to insert the extent map
but with the length of sectorsize.
This is OK if we don't enable 'no-holes' feature since all extent space
is continuous, we will not go into the not found->insert routine.
But if we enable 'no-holes' feature, it will make things out of control.
e.g. in 4K sectorsize, we pass the following args to btrfs_get_extent():
btrfs_get_extent() args: start: 27874 len 4100
28672 27874 28672 27874+4100 32768
|-----------------------|
|---------hole--------------------|---------data----------|
1) not found and insert
Since no extent map containing the range, btrfs_get_extent() will go
into the not_found and insert routine, which will try to insert the
extent map (27874, 27847 + 4100).
2) first overlap
But it overlaps with (28672, 32768) extent, so -EEXIST will be returned
by add_extent_mapping().
3) retry but still overlap
After catching the -EEXIST, then btrfs_get_extent() will try insert it
again but with 4K length, which still overlaps, so -EEXIST will be
returned.
This makes the following patch fail to punch hole.
d77815461f047e561f77a07754ae923ade597d4e btrfs: Avoid trucating page or punching hole in a already existed hole.
This patch will use the right length, which is the (exsisting->start -
em->start) to insert, making the above patch works in 'no-holes' mode.
Also, some small code style problems in above patch is fixed too.
Reported-by: Filipe David Manana <fdmanana@gmail.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe David Manana <fdmanana@suse.com>
Tested-by: Filipe David Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-08 13:06:20 +08:00
|
|
|
u64 map_start)
|
2008-04-17 23:29:12 +08:00
|
|
|
{
|
btrfs: Fix and enhance merge_extent_mapping() to insert best fitted extent map
The following commit enhanced the merge_extent_mapping() to reduce
fragment in extent map tree, but it can't handle case which existing
lies before map_start:
51f39 btrfs: Use right extent length when inserting overlap extent map.
[BUG]
When existing extent map's start is before map_start,
the em->len will be minus, which will corrupt the extent map and fail to
insert the new extent map.
This will happen when someone get a large extent map, but when it is
going to insert it into extent map tree, some one has already commit
some write and split the huge extent into small parts.
[REPRODUCER]
It is very easy to tiger using filebench with randomrw personality.
It is about 100% to reproduce when using 8G preallocated file in 60s
randonrw test.
[FIX]
This patch can now handle any existing extent position.
Since it does not directly use existing->start, now it will find the
previous and next extent around map_start.
So the old existing->start < map_start bug will never happen again.
[ENHANCE]
This patch will insert the best fitted extent map into extent map tree,
other than the oldest [map_start, map_start + sectorsize) or the
relatively newer but not perfect [map_start, existing->start).
The patch will first search existing extent that does not intersects with
the desired map range [map_start, map_start + len).
The existing extent will be either before or behind map_start, and based
on the existing extent, we can find out the previous and next extent
around map_start.
So the best fitted extent would be [prev->end, next->start).
For prev or next is not found, em->start would be prev->end and em->end
wold be next->start.
With this patch, the fragment in extent map tree should be reduced much
more than the 51f39 commit and reduce an unneeded extent map tree search.
Reported-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-09-17 11:53:35 +08:00
|
|
|
struct extent_map *prev;
|
|
|
|
struct extent_map *next;
|
|
|
|
u64 start;
|
|
|
|
u64 end;
|
2008-04-17 23:29:12 +08:00
|
|
|
u64 start_diff;
|
|
|
|
|
2008-07-18 00:53:50 +08:00
|
|
|
BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
|
btrfs: Fix and enhance merge_extent_mapping() to insert best fitted extent map
The following commit enhanced the merge_extent_mapping() to reduce
fragment in extent map tree, but it can't handle case which existing
lies before map_start:
51f39 btrfs: Use right extent length when inserting overlap extent map.
[BUG]
When existing extent map's start is before map_start,
the em->len will be minus, which will corrupt the extent map and fail to
insert the new extent map.
This will happen when someone get a large extent map, but when it is
going to insert it into extent map tree, some one has already commit
some write and split the huge extent into small parts.
[REPRODUCER]
It is very easy to tiger using filebench with randomrw personality.
It is about 100% to reproduce when using 8G preallocated file in 60s
randonrw test.
[FIX]
This patch can now handle any existing extent position.
Since it does not directly use existing->start, now it will find the
previous and next extent around map_start.
So the old existing->start < map_start bug will never happen again.
[ENHANCE]
This patch will insert the best fitted extent map into extent map tree,
other than the oldest [map_start, map_start + sectorsize) or the
relatively newer but not perfect [map_start, existing->start).
The patch will first search existing extent that does not intersects with
the desired map range [map_start, map_start + len).
The existing extent will be either before or behind map_start, and based
on the existing extent, we can find out the previous and next extent
around map_start.
So the best fitted extent would be [prev->end, next->start).
For prev or next is not found, em->start would be prev->end and em->end
wold be next->start.
With this patch, the fragment in extent map tree should be reduced much
more than the 51f39 commit and reduce an unneeded extent map tree search.
Reported-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-09-17 11:53:35 +08:00
|
|
|
|
|
|
|
if (existing->start > map_start) {
|
|
|
|
next = existing;
|
|
|
|
prev = prev_extent_map(next);
|
|
|
|
} else {
|
|
|
|
prev = existing;
|
|
|
|
next = next_extent_map(prev);
|
|
|
|
}
|
|
|
|
|
|
|
|
start = prev ? extent_map_end(prev) : em->start;
|
|
|
|
start = max_t(u64, start, em->start);
|
|
|
|
end = next ? next->start : extent_map_end(em);
|
|
|
|
end = min_t(u64, end, extent_map_end(em));
|
|
|
|
start_diff = start - em->start;
|
|
|
|
em->start = start;
|
|
|
|
em->len = end - start;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
if (em->block_start < EXTENT_MAP_LAST_BYTE &&
|
|
|
|
!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
|
2008-07-18 00:53:50 +08:00
|
|
|
em->block_start += start_diff;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
em->block_len -= start_diff;
|
|
|
|
}
|
2013-04-06 04:51:15 +08:00
|
|
|
return add_extent_mapping(em_tree, em, 0);
|
2008-04-17 23:29:12 +08:00
|
|
|
}
|
|
|
|
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
static noinline int uncompress_inline(struct btrfs_path *path,
|
2015-05-19 22:46:45 +08:00
|
|
|
struct page *page,
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
size_t pg_offset, u64 extent_offset,
|
|
|
|
struct btrfs_file_extent_item *item)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct extent_buffer *leaf = path->nodes[0];
|
|
|
|
char *tmp;
|
|
|
|
size_t max_size;
|
|
|
|
unsigned long inline_size;
|
|
|
|
unsigned long ptr;
|
2010-12-17 14:21:50 +08:00
|
|
|
int compress_type;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
|
|
|
|
WARN_ON(pg_offset != 0);
|
2010-12-17 14:21:50 +08:00
|
|
|
compress_type = btrfs_file_extent_compression(leaf, item);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
max_size = btrfs_file_extent_ram_bytes(leaf, item);
|
|
|
|
inline_size = btrfs_file_extent_inline_item_len(leaf,
|
2013-09-16 22:58:09 +08:00
|
|
|
btrfs_item_nr(path->slots[0]));
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
tmp = kmalloc(inline_size, GFP_NOFS);
|
2011-04-26 07:43:52 +08:00
|
|
|
if (!tmp)
|
|
|
|
return -ENOMEM;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
ptr = btrfs_file_extent_inline_start(item);
|
|
|
|
|
|
|
|
read_extent_buffer(leaf, tmp, ptr, inline_size);
|
|
|
|
|
2008-11-11 22:34:41 +08:00
|
|
|
max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
|
2010-12-17 14:21:50 +08:00
|
|
|
ret = btrfs_decompress(compress_type, tmp, page,
|
|
|
|
extent_offset, inline_size, max_size);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
kfree(tmp);
|
2014-05-10 05:15:10 +08:00
|
|
|
return ret;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
}
|
|
|
|
|
2008-09-30 03:18:18 +08:00
|
|
|
/*
|
|
|
|
* a bit scary, this does extent mapping from logical file offset to the disk.
|
2009-01-06 10:25:51 +08:00
|
|
|
* the ugly parts come from merging extents from the disk with the in-ram
|
|
|
|
* representation. This gets more complex because of the data=ordered code,
|
2008-09-30 03:18:18 +08:00
|
|
|
* where the in-ram extents might be locked pending data=ordered completion.
|
|
|
|
*
|
|
|
|
* This also copies inline extents directly into the page.
|
|
|
|
*/
|
2009-01-06 10:25:51 +08:00
|
|
|
|
2007-08-28 04:49:44 +08:00
|
|
|
struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
|
2008-01-29 22:59:12 +08:00
|
|
|
size_t pg_offset, u64 start, u64 len,
|
2007-08-28 04:49:44 +08:00
|
|
|
int create)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
int err = 0;
|
|
|
|
u64 extent_start = 0;
|
|
|
|
u64 extent_end = 0;
|
2011-04-20 10:31:50 +08:00
|
|
|
u64 objectid = btrfs_ino(inode);
|
2007-08-28 04:49:44 +08:00
|
|
|
u32 found_type;
|
2008-07-22 23:18:09 +08:00
|
|
|
struct btrfs_path *path = NULL;
|
2007-08-28 04:49:44 +08:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_file_extent_item *item;
|
2007-10-16 04:14:19 +08:00
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct btrfs_key found_key;
|
2007-08-28 04:49:44 +08:00
|
|
|
struct extent_map *em = NULL;
|
|
|
|
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
|
2008-01-25 05:13:08 +08:00
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
2007-08-28 04:49:44 +08:00
|
|
|
struct btrfs_trans_handle *trans = NULL;
|
2014-06-09 10:48:05 +08:00
|
|
|
const bool new_inline = !page || create;
|
2007-08-28 04:49:44 +08:00
|
|
|
|
|
|
|
again:
|
2009-09-03 04:24:52 +08:00
|
|
|
read_lock(&em_tree->lock);
|
2008-01-25 05:13:08 +08:00
|
|
|
em = lookup_extent_mapping(em_tree, start, len);
|
2008-05-07 23:43:44 +08:00
|
|
|
if (em)
|
|
|
|
em->bdev = root->fs_info->fs_devices->latest_bdev;
|
2009-09-03 04:24:52 +08:00
|
|
|
read_unlock(&em_tree->lock);
|
2008-01-25 05:13:08 +08:00
|
|
|
|
2007-08-28 04:49:44 +08:00
|
|
|
if (em) {
|
2008-04-23 01:26:46 +08:00
|
|
|
if (em->start > start || em->start + em->len <= start)
|
|
|
|
free_extent_map(em);
|
|
|
|
else if (em->block_start == EXTENT_MAP_INLINE && page)
|
2008-01-29 22:59:12 +08:00
|
|
|
free_extent_map(em);
|
|
|
|
else
|
|
|
|
goto out;
|
2007-08-28 04:49:44 +08:00
|
|
|
}
|
2011-04-21 06:48:27 +08:00
|
|
|
em = alloc_extent_map();
|
2007-08-28 04:49:44 +08:00
|
|
|
if (!em) {
|
2008-01-25 05:13:08 +08:00
|
|
|
err = -ENOMEM;
|
|
|
|
goto out;
|
2007-08-28 04:49:44 +08:00
|
|
|
}
|
2008-07-18 00:53:50 +08:00
|
|
|
em->bdev = root->fs_info->fs_devices->latest_bdev;
|
2008-01-25 05:13:08 +08:00
|
|
|
em->start = EXTENT_MAP_HOLE;
|
2008-11-11 00:53:33 +08:00
|
|
|
em->orig_start = EXTENT_MAP_HOLE;
|
2008-01-25 05:13:08 +08:00
|
|
|
em->len = (u64)-1;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
em->block_len = (u64)-1;
|
2008-07-22 23:18:09 +08:00
|
|
|
|
|
|
|
if (!path) {
|
|
|
|
path = btrfs_alloc_path();
|
2011-05-13 22:32:11 +08:00
|
|
|
if (!path) {
|
|
|
|
err = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Chances are we'll be called again, so go ahead and do
|
|
|
|
* readahead
|
|
|
|
*/
|
2015-11-27 23:31:35 +08:00
|
|
|
path->reada = READA_FORWARD;
|
2008-07-22 23:18:09 +08:00
|
|
|
}
|
|
|
|
|
2007-11-01 23:28:41 +08:00
|
|
|
ret = btrfs_lookup_file_extent(trans, root, path,
|
|
|
|
objectid, start, trans != NULL);
|
2007-08-28 04:49:44 +08:00
|
|
|
if (ret < 0) {
|
|
|
|
err = ret;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ret != 0) {
|
|
|
|
if (path->slots[0] == 0)
|
|
|
|
goto not_found;
|
|
|
|
path->slots[0]--;
|
|
|
|
}
|
|
|
|
|
2007-10-16 04:14:19 +08:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
item = btrfs_item_ptr(leaf, path->slots[0],
|
2007-08-28 04:49:44 +08:00
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
/* are we inside the extent that was found? */
|
2007-10-16 04:14:19 +08:00
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
|
2014-06-05 00:41:45 +08:00
|
|
|
found_type = found_key.type;
|
2007-10-16 04:14:19 +08:00
|
|
|
if (found_key.objectid != objectid ||
|
2007-08-28 04:49:44 +08:00
|
|
|
found_type != BTRFS_EXTENT_DATA_KEY) {
|
2013-10-15 00:08:38 +08:00
|
|
|
/*
|
|
|
|
* If we backup past the first extent we want to move forward
|
|
|
|
* and see if there is an extent in front of us, otherwise we'll
|
|
|
|
* say there is a hole for our whole search range which can
|
|
|
|
* cause problems.
|
|
|
|
*/
|
|
|
|
extent_end = start;
|
|
|
|
goto next;
|
2007-08-28 04:49:44 +08:00
|
|
|
}
|
|
|
|
|
2007-10-16 04:14:19 +08:00
|
|
|
found_type = btrfs_file_extent_type(leaf, item);
|
|
|
|
extent_start = found_key.offset;
|
2008-10-31 02:25:28 +08:00
|
|
|
if (found_type == BTRFS_FILE_EXTENT_REG ||
|
|
|
|
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
|
2007-08-28 04:49:44 +08:00
|
|
|
extent_end = extent_start +
|
2007-10-16 04:15:53 +08:00
|
|
|
btrfs_file_extent_num_bytes(leaf, item);
|
2008-10-31 02:19:41 +08:00
|
|
|
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
|
|
|
|
size_t size;
|
2014-01-04 13:07:00 +08:00
|
|
|
size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
|
2013-02-26 16:10:22 +08:00
|
|
|
extent_end = ALIGN(extent_start + size, root->sectorsize);
|
2008-10-31 02:19:41 +08:00
|
|
|
}
|
2013-10-15 00:08:38 +08:00
|
|
|
next:
|
2008-10-31 02:19:41 +08:00
|
|
|
if (start >= extent_end) {
|
|
|
|
path->slots[0]++;
|
|
|
|
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
|
|
|
|
ret = btrfs_next_leaf(root, path);
|
|
|
|
if (ret < 0) {
|
|
|
|
err = ret;
|
|
|
|
goto out;
|
2007-08-28 04:49:44 +08:00
|
|
|
}
|
2008-10-31 02:19:41 +08:00
|
|
|
if (ret > 0)
|
|
|
|
goto not_found;
|
|
|
|
leaf = path->nodes[0];
|
2007-08-28 04:49:44 +08:00
|
|
|
}
|
2008-10-31 02:19:41 +08:00
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
|
|
|
|
if (found_key.objectid != objectid ||
|
|
|
|
found_key.type != BTRFS_EXTENT_DATA_KEY)
|
|
|
|
goto not_found;
|
|
|
|
if (start + len <= found_key.offset)
|
|
|
|
goto not_found;
|
2014-07-17 11:44:14 +08:00
|
|
|
if (start > found_key.offset)
|
|
|
|
goto next;
|
2008-10-31 02:19:41 +08:00
|
|
|
em->start = start;
|
2012-10-12 04:54:30 +08:00
|
|
|
em->orig_start = start;
|
2008-10-31 02:19:41 +08:00
|
|
|
em->len = found_key.offset - start;
|
|
|
|
goto not_found_em;
|
|
|
|
}
|
|
|
|
|
2014-06-09 10:48:05 +08:00
|
|
|
btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em);
|
|
|
|
|
2008-10-31 02:25:28 +08:00
|
|
|
if (found_type == BTRFS_FILE_EXTENT_REG ||
|
|
|
|
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
|
2007-08-28 04:49:44 +08:00
|
|
|
goto insert;
|
|
|
|
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
|
2007-10-16 04:14:19 +08:00
|
|
|
unsigned long ptr;
|
2007-08-28 04:49:44 +08:00
|
|
|
char *map;
|
2007-10-16 04:18:25 +08:00
|
|
|
size_t size;
|
|
|
|
size_t extent_offset;
|
|
|
|
size_t copy_size;
|
2007-08-28 04:49:44 +08:00
|
|
|
|
2014-06-09 10:48:05 +08:00
|
|
|
if (new_inline)
|
2007-10-29 23:41:07 +08:00
|
|
|
goto out;
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2014-01-04 13:07:00 +08:00
|
|
|
size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
|
2008-10-31 02:19:41 +08:00
|
|
|
extent_offset = page_offset(page) + pg_offset - extent_start;
|
2008-01-29 22:59:12 +08:00
|
|
|
copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
|
2007-10-16 04:18:25 +08:00
|
|
|
size - extent_offset);
|
|
|
|
em->start = extent_start + extent_offset;
|
2013-02-26 16:10:22 +08:00
|
|
|
em->len = ALIGN(copy_size, root->sectorsize);
|
2012-12-03 23:31:19 +08:00
|
|
|
em->orig_block_len = em->len;
|
2012-10-12 04:54:30 +08:00
|
|
|
em->orig_start = em->start;
|
2007-10-29 23:41:07 +08:00
|
|
|
ptr = btrfs_file_extent_inline_start(item) + extent_offset;
|
2007-11-01 23:28:41 +08:00
|
|
|
if (create == 0 && !PageUptodate(page)) {
|
2010-12-17 14:21:50 +08:00
|
|
|
if (btrfs_file_extent_compression(leaf, item) !=
|
|
|
|
BTRFS_COMPRESS_NONE) {
|
2015-05-19 22:46:45 +08:00
|
|
|
ret = uncompress_inline(path, page, pg_offset,
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
extent_offset, item);
|
2014-05-10 05:15:10 +08:00
|
|
|
if (ret) {
|
|
|
|
err = ret;
|
|
|
|
goto out;
|
|
|
|
}
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
} else {
|
|
|
|
map = kmap(page);
|
|
|
|
read_extent_buffer(leaf, map + pg_offset, ptr,
|
|
|
|
copy_size);
|
2009-09-12 00:36:29 +08:00
|
|
|
if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
|
|
|
|
memset(map + pg_offset + copy_size, 0,
|
|
|
|
PAGE_CACHE_SIZE - pg_offset -
|
|
|
|
copy_size);
|
|
|
|
}
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
kunmap(page);
|
|
|
|
}
|
2007-11-01 23:28:41 +08:00
|
|
|
flush_dcache_page(page);
|
|
|
|
} else if (create && PageUptodate(page)) {
|
2011-12-01 21:35:19 +08:00
|
|
|
BUG();
|
2007-11-01 23:28:41 +08:00
|
|
|
if (!trans) {
|
|
|
|
kunmap(page);
|
|
|
|
free_extent_map(em);
|
|
|
|
em = NULL;
|
2011-05-28 19:00:39 +08:00
|
|
|
|
2011-04-21 07:20:15 +08:00
|
|
|
btrfs_release_path(path);
|
2011-04-14 00:54:33 +08:00
|
|
|
trans = btrfs_join_transaction(root);
|
2011-05-28 19:00:39 +08:00
|
|
|
|
2011-01-25 10:51:38 +08:00
|
|
|
if (IS_ERR(trans))
|
|
|
|
return ERR_CAST(trans);
|
2007-11-01 23:28:41 +08:00
|
|
|
goto again;
|
|
|
|
}
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
map = kmap(page);
|
2008-01-29 22:59:12 +08:00
|
|
|
write_extent_buffer(leaf, map + pg_offset, ptr,
|
2007-11-01 23:28:41 +08:00
|
|
|
copy_size);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
kunmap(page);
|
2007-11-01 23:28:41 +08:00
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
2007-08-28 04:49:44 +08:00
|
|
|
}
|
2008-01-25 05:13:08 +08:00
|
|
|
set_extent_uptodate(io_tree, em->start,
|
2011-04-06 18:02:20 +08:00
|
|
|
extent_map_end(em) - 1, NULL, GFP_NOFS);
|
2007-08-28 04:49:44 +08:00
|
|
|
goto insert;
|
|
|
|
}
|
|
|
|
not_found:
|
|
|
|
em->start = start;
|
2012-10-12 04:54:30 +08:00
|
|
|
em->orig_start = start;
|
2008-01-25 05:13:08 +08:00
|
|
|
em->len = len;
|
2007-08-28 04:49:44 +08:00
|
|
|
not_found_em:
|
2007-10-16 04:14:19 +08:00
|
|
|
em->block_start = EXTENT_MAP_HOLE;
|
2008-10-31 02:19:41 +08:00
|
|
|
set_bit(EXTENT_FLAG_VACANCY, &em->flags);
|
2007-08-28 04:49:44 +08:00
|
|
|
insert:
|
2011-04-21 07:20:15 +08:00
|
|
|
btrfs_release_path(path);
|
2008-01-25 05:13:08 +08:00
|
|
|
if (em->start > start || extent_map_end(em) <= start) {
|
2013-03-20 06:41:23 +08:00
|
|
|
btrfs_err(root->fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]",
|
2013-08-20 19:20:07 +08:00
|
|
|
em->start, em->len, start, len);
|
2007-08-28 04:49:44 +08:00
|
|
|
err = -EIO;
|
|
|
|
goto out;
|
|
|
|
}
|
2008-01-25 05:13:08 +08:00
|
|
|
|
|
|
|
err = 0;
|
2009-09-03 04:24:52 +08:00
|
|
|
write_lock(&em_tree->lock);
|
2013-04-06 04:51:15 +08:00
|
|
|
ret = add_extent_mapping(em_tree, em, 0);
|
2008-04-17 23:29:12 +08:00
|
|
|
/* it is possible that someone inserted the extent into the tree
|
|
|
|
* while we had the lock dropped. It is also possible that
|
|
|
|
* an overlapping map exists in the tree
|
|
|
|
*/
|
2007-08-28 04:49:44 +08:00
|
|
|
if (ret == -EEXIST) {
|
2008-04-17 23:29:12 +08:00
|
|
|
struct extent_map *existing;
|
2008-07-18 00:53:50 +08:00
|
|
|
|
|
|
|
ret = 0;
|
|
|
|
|
btrfs: Fix and enhance merge_extent_mapping() to insert best fitted extent map
The following commit enhanced the merge_extent_mapping() to reduce
fragment in extent map tree, but it can't handle case which existing
lies before map_start:
51f39 btrfs: Use right extent length when inserting overlap extent map.
[BUG]
When existing extent map's start is before map_start,
the em->len will be minus, which will corrupt the extent map and fail to
insert the new extent map.
This will happen when someone get a large extent map, but when it is
going to insert it into extent map tree, some one has already commit
some write and split the huge extent into small parts.
[REPRODUCER]
It is very easy to tiger using filebench with randomrw personality.
It is about 100% to reproduce when using 8G preallocated file in 60s
randonrw test.
[FIX]
This patch can now handle any existing extent position.
Since it does not directly use existing->start, now it will find the
previous and next extent around map_start.
So the old existing->start < map_start bug will never happen again.
[ENHANCE]
This patch will insert the best fitted extent map into extent map tree,
other than the oldest [map_start, map_start + sectorsize) or the
relatively newer but not perfect [map_start, existing->start).
The patch will first search existing extent that does not intersects with
the desired map range [map_start, map_start + len).
The existing extent will be either before or behind map_start, and based
on the existing extent, we can find out the previous and next extent
around map_start.
So the best fitted extent would be [prev->end, next->start).
For prev or next is not found, em->start would be prev->end and em->end
wold be next->start.
With this patch, the fragment in extent map tree should be reduced much
more than the 51f39 commit and reduce an unneeded extent map tree search.
Reported-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-09-17 11:53:35 +08:00
|
|
|
existing = search_extent_mapping(em_tree, start, len);
|
|
|
|
/*
|
|
|
|
* existing will always be non-NULL, since there must be
|
|
|
|
* extent causing the -EEXIST.
|
|
|
|
*/
|
|
|
|
if (start >= extent_map_end(existing) ||
|
2014-09-22 09:13:03 +08:00
|
|
|
start <= existing->start) {
|
btrfs: Fix and enhance merge_extent_mapping() to insert best fitted extent map
The following commit enhanced the merge_extent_mapping() to reduce
fragment in extent map tree, but it can't handle case which existing
lies before map_start:
51f39 btrfs: Use right extent length when inserting overlap extent map.
[BUG]
When existing extent map's start is before map_start,
the em->len will be minus, which will corrupt the extent map and fail to
insert the new extent map.
This will happen when someone get a large extent map, but when it is
going to insert it into extent map tree, some one has already commit
some write and split the huge extent into small parts.
[REPRODUCER]
It is very easy to tiger using filebench with randomrw personality.
It is about 100% to reproduce when using 8G preallocated file in 60s
randonrw test.
[FIX]
This patch can now handle any existing extent position.
Since it does not directly use existing->start, now it will find the
previous and next extent around map_start.
So the old existing->start < map_start bug will never happen again.
[ENHANCE]
This patch will insert the best fitted extent map into extent map tree,
other than the oldest [map_start, map_start + sectorsize) or the
relatively newer but not perfect [map_start, existing->start).
The patch will first search existing extent that does not intersects with
the desired map range [map_start, map_start + len).
The existing extent will be either before or behind map_start, and based
on the existing extent, we can find out the previous and next extent
around map_start.
So the best fitted extent would be [prev->end, next->start).
For prev or next is not found, em->start would be prev->end and em->end
wold be next->start.
With this patch, the fragment in extent map tree should be reduced much
more than the 51f39 commit and reduce an unneeded extent map tree search.
Reported-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-09-17 11:53:35 +08:00
|
|
|
/*
|
|
|
|
* The existing extent map is the one nearest to
|
|
|
|
* the [start, start + len) range which overlaps
|
|
|
|
*/
|
|
|
|
err = merge_extent_mapping(em_tree, existing,
|
|
|
|
em, start);
|
2008-04-23 01:26:46 +08:00
|
|
|
free_extent_map(existing);
|
btrfs: Fix and enhance merge_extent_mapping() to insert best fitted extent map
The following commit enhanced the merge_extent_mapping() to reduce
fragment in extent map tree, but it can't handle case which existing
lies before map_start:
51f39 btrfs: Use right extent length when inserting overlap extent map.
[BUG]
When existing extent map's start is before map_start,
the em->len will be minus, which will corrupt the extent map and fail to
insert the new extent map.
This will happen when someone get a large extent map, but when it is
going to insert it into extent map tree, some one has already commit
some write and split the huge extent into small parts.
[REPRODUCER]
It is very easy to tiger using filebench with randomrw personality.
It is about 100% to reproduce when using 8G preallocated file in 60s
randonrw test.
[FIX]
This patch can now handle any existing extent position.
Since it does not directly use existing->start, now it will find the
previous and next extent around map_start.
So the old existing->start < map_start bug will never happen again.
[ENHANCE]
This patch will insert the best fitted extent map into extent map tree,
other than the oldest [map_start, map_start + sectorsize) or the
relatively newer but not perfect [map_start, existing->start).
The patch will first search existing extent that does not intersects with
the desired map range [map_start, map_start + len).
The existing extent will be either before or behind map_start, and based
on the existing extent, we can find out the previous and next extent
around map_start.
So the best fitted extent would be [prev->end, next->start).
For prev or next is not found, em->start would be prev->end and em->end
wold be next->start.
With this patch, the fragment in extent map tree should be reduced much
more than the 51f39 commit and reduce an unneeded extent map tree search.
Reported-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-09-17 11:53:35 +08:00
|
|
|
if (err) {
|
2008-04-17 23:29:12 +08:00
|
|
|
free_extent_map(em);
|
|
|
|
em = NULL;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
free_extent_map(em);
|
|
|
|
em = existing;
|
2008-07-18 00:53:50 +08:00
|
|
|
err = 0;
|
2007-08-28 04:49:44 +08:00
|
|
|
}
|
|
|
|
}
|
2009-09-03 04:24:52 +08:00
|
|
|
write_unlock(&em_tree->lock);
|
2007-08-28 04:49:44 +08:00
|
|
|
out:
|
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordere_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are cirtical resourses and produce a lot of corner cases during writeback,
so it is valuable to know how page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when a inode is created, when a inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
Chunk is a link between physical offset and logical offset, and stands for space
infomation in btrfs, and these are helpful on tracing space things.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 19:18:59 +08:00
|
|
|
|
2013-11-15 11:57:29 +08:00
|
|
|
trace_btrfs_get_extent(root, em);
|
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordere_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are cirtical resourses and produce a lot of corner cases during writeback,
so it is valuable to know how page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when a inode is created, when a inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
Chunk is a link between physical offset and logical offset, and stands for space
infomation in btrfs, and these are helpful on tracing space things.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 19:18:59 +08:00
|
|
|
|
2015-08-19 13:55:00 +08:00
|
|
|
btrfs_free_path(path);
|
2007-08-28 04:49:44 +08:00
|
|
|
if (trans) {
|
|
|
|
ret = btrfs_end_transaction(trans, root);
|
2009-01-06 10:25:51 +08:00
|
|
|
if (!err)
|
2007-08-28 04:49:44 +08:00
|
|
|
err = ret;
|
|
|
|
}
|
|
|
|
if (err) {
|
|
|
|
free_extent_map(em);
|
|
|
|
return ERR_PTR(err);
|
|
|
|
}
|
2012-03-12 23:03:00 +08:00
|
|
|
BUG_ON(!em); /* Error is always set */
|
2007-08-28 04:49:44 +08:00
|
|
|
return em;
|
|
|
|
}
|
|
|
|
|
2011-02-24 05:23:20 +08:00
|
|
|
struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
|
|
|
|
size_t pg_offset, u64 start, u64 len,
|
|
|
|
int create)
|
|
|
|
{
|
|
|
|
struct extent_map *em;
|
|
|
|
struct extent_map *hole_em = NULL;
|
|
|
|
u64 range_start = start;
|
|
|
|
u64 end;
|
|
|
|
u64 found;
|
|
|
|
u64 found_end;
|
|
|
|
int err = 0;
|
|
|
|
|
|
|
|
em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
|
|
|
|
if (IS_ERR(em))
|
|
|
|
return em;
|
|
|
|
if (em) {
|
|
|
|
/*
|
2013-01-07 18:10:12 +08:00
|
|
|
* if our em maps to
|
|
|
|
* - a hole or
|
|
|
|
* - a pre-alloc extent,
|
|
|
|
* there might actually be delalloc bytes behind it.
|
2011-02-24 05:23:20 +08:00
|
|
|
*/
|
2013-01-07 18:10:12 +08:00
|
|
|
if (em->block_start != EXTENT_MAP_HOLE &&
|
|
|
|
!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
|
2011-02-24 05:23:20 +08:00
|
|
|
return em;
|
|
|
|
else
|
|
|
|
hole_em = em;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* check to see if we've wrapped (len == -1 or similar) */
|
|
|
|
end = start + len;
|
|
|
|
if (end < start)
|
|
|
|
end = (u64)-1;
|
|
|
|
else
|
|
|
|
end -= 1;
|
|
|
|
|
|
|
|
em = NULL;
|
|
|
|
|
|
|
|
/* ok, we didn't find anything, lets look for delalloc */
|
|
|
|
found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
|
|
|
|
end, len, EXTENT_DELALLOC, 1);
|
|
|
|
found_end = range_start + found;
|
|
|
|
if (found_end < range_start)
|
|
|
|
found_end = (u64)-1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* we didn't find anything useful, return
|
|
|
|
* the original results from get_extent()
|
|
|
|
*/
|
|
|
|
if (range_start > end || found_end <= start) {
|
|
|
|
em = hole_em;
|
|
|
|
hole_em = NULL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* adjust the range_start to make sure it doesn't
|
|
|
|
* go backwards from the start they passed in
|
|
|
|
*/
|
2013-10-31 13:03:04 +08:00
|
|
|
range_start = max(start, range_start);
|
2011-02-24 05:23:20 +08:00
|
|
|
found = found_end - range_start;
|
|
|
|
|
|
|
|
if (found > 0) {
|
|
|
|
u64 hole_start = start;
|
|
|
|
u64 hole_len = len;
|
|
|
|
|
2011-04-21 06:48:27 +08:00
|
|
|
em = alloc_extent_map();
|
2011-02-24 05:23:20 +08:00
|
|
|
if (!em) {
|
|
|
|
err = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* when btrfs_get_extent can't find anything it
|
|
|
|
* returns one huge hole
|
|
|
|
*
|
|
|
|
* make sure what it found really fits our range, and
|
|
|
|
* adjust to make sure it is based on the start from
|
|
|
|
* the caller
|
|
|
|
*/
|
|
|
|
if (hole_em) {
|
|
|
|
u64 calc_end = extent_map_end(hole_em);
|
|
|
|
|
|
|
|
if (calc_end <= start || (hole_em->start > end)) {
|
|
|
|
free_extent_map(hole_em);
|
|
|
|
hole_em = NULL;
|
|
|
|
} else {
|
|
|
|
hole_start = max(hole_em->start, start);
|
|
|
|
hole_len = calc_end - hole_start;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
em->bdev = NULL;
|
|
|
|
if (hole_em && range_start > hole_start) {
|
|
|
|
/* our hole starts before our delalloc, so we
|
|
|
|
* have to return just the parts of the hole
|
|
|
|
* that go until the delalloc starts
|
|
|
|
*/
|
|
|
|
em->len = min(hole_len,
|
|
|
|
range_start - hole_start);
|
|
|
|
em->start = hole_start;
|
|
|
|
em->orig_start = hole_start;
|
|
|
|
/*
|
|
|
|
* don't adjust block start at all,
|
|
|
|
* it is fixed at EXTENT_MAP_HOLE
|
|
|
|
*/
|
|
|
|
em->block_start = hole_em->block_start;
|
|
|
|
em->block_len = hole_len;
|
2013-01-07 18:10:12 +08:00
|
|
|
if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
|
|
|
|
set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
|
2011-02-24 05:23:20 +08:00
|
|
|
} else {
|
|
|
|
em->start = range_start;
|
|
|
|
em->len = found;
|
|
|
|
em->orig_start = range_start;
|
|
|
|
em->block_start = EXTENT_MAP_DELALLOC;
|
|
|
|
em->block_len = found;
|
|
|
|
}
|
|
|
|
} else if (hole_em) {
|
|
|
|
return hole_em;
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
|
|
|
|
free_extent_map(hole_em);
|
|
|
|
if (err) {
|
|
|
|
free_extent_map(em);
|
|
|
|
return ERR_PTR(err);
|
|
|
|
}
|
|
|
|
return em;
|
|
|
|
}
|
|
|
|
|
2010-05-23 23:00:55 +08:00
|
|
|
static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
|
|
|
|
u64 start, u64 len)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2012-10-12 04:54:30 +08:00
|
|
|
struct extent_map *em;
|
2010-05-23 23:00:55 +08:00
|
|
|
struct btrfs_key ins;
|
|
|
|
u64 alloc_hint;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
alloc_hint = get_extent_allocation_hint(inode, start, len);
|
2013-08-15 02:02:47 +08:00
|
|
|
ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
alloc_hint, &ins, 1, 1);
|
2013-08-15 02:02:47 +08:00
|
|
|
if (ret)
|
|
|
|
return ERR_PTR(ret);
|
2010-05-23 23:00:55 +08:00
|
|
|
|
2012-10-12 04:54:30 +08:00
|
|
|
em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
|
2013-04-05 02:31:27 +08:00
|
|
|
ins.offset, ins.offset, ins.offset, 0);
|
2013-08-15 02:02:47 +08:00
|
|
|
if (IS_ERR(em)) {
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
|
2013-08-15 02:02:47 +08:00
|
|
|
return em;
|
|
|
|
}
|
2010-05-23 23:00:55 +08:00
|
|
|
|
|
|
|
ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
|
|
|
|
ins.offset, ins.offset, 0);
|
|
|
|
if (ret) {
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
|
2013-08-15 02:02:47 +08:00
|
|
|
free_extent_map(em);
|
|
|
|
return ERR_PTR(ret);
|
2010-05-23 23:00:55 +08:00
|
|
|
}
|
2013-08-15 02:02:47 +08:00
|
|
|
|
2010-05-23 23:00:55 +08:00
|
|
|
return em;
|
|
|
|
}
|
|
|
|
|
2010-05-26 23:04:10 +08:00
|
|
|
/*
|
|
|
|
* returns 1 when the nocow is safe, < 1 on error, 0 if the
|
|
|
|
* block must be cow'd
|
|
|
|
*/
|
2013-08-15 02:02:47 +08:00
|
|
|
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
|
2013-06-22 04:37:03 +08:00
|
|
|
u64 *orig_start, u64 *orig_block_len,
|
|
|
|
u64 *ram_bytes)
|
2010-05-26 23:04:10 +08:00
|
|
|
{
|
2013-08-15 02:02:47 +08:00
|
|
|
struct btrfs_trans_handle *trans;
|
2010-05-26 23:04:10 +08:00
|
|
|
struct btrfs_path *path;
|
|
|
|
int ret;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2014-02-27 13:58:05 +08:00
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
2010-05-26 23:04:10 +08:00
|
|
|
struct btrfs_file_extent_item *fi;
|
|
|
|
struct btrfs_key key;
|
|
|
|
u64 disk_bytenr;
|
|
|
|
u64 backref_offset;
|
|
|
|
u64 extent_end;
|
|
|
|
u64 num_bytes;
|
|
|
|
int slot;
|
|
|
|
int found_type;
|
2013-06-22 04:37:03 +08:00
|
|
|
bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
|
2013-12-27 21:11:50 +08:00
|
|
|
|
2010-05-26 23:04:10 +08:00
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2013-08-15 02:02:47 +08:00
|
|
|
ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
|
2010-05-26 23:04:10 +08:00
|
|
|
offset, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
slot = path->slots[0];
|
|
|
|
if (ret == 1) {
|
|
|
|
if (slot == 0) {
|
|
|
|
/* can't find the item, must cow */
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
slot--;
|
|
|
|
}
|
|
|
|
ret = 0;
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(leaf, &key, slot);
|
2011-04-20 10:31:50 +08:00
|
|
|
if (key.objectid != btrfs_ino(inode) ||
|
2010-05-26 23:04:10 +08:00
|
|
|
key.type != BTRFS_EXTENT_DATA_KEY) {
|
|
|
|
/* not our file or wrong item type, must cow */
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (key.offset > offset) {
|
|
|
|
/* Wrong offset, must cow */
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
|
|
|
|
found_type = btrfs_file_extent_type(leaf, fi);
|
|
|
|
if (found_type != BTRFS_FILE_EXTENT_REG &&
|
|
|
|
found_type != BTRFS_FILE_EXTENT_PREALLOC) {
|
|
|
|
/* not a regular extent, must cow */
|
|
|
|
goto out;
|
|
|
|
}
|
2013-06-22 04:37:03 +08:00
|
|
|
|
|
|
|
if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
|
|
|
|
goto out;
|
|
|
|
|
2013-12-27 21:11:50 +08:00
|
|
|
extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
|
|
|
|
if (extent_end <= offset)
|
|
|
|
goto out;
|
|
|
|
|
2010-05-26 23:04:10 +08:00
|
|
|
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
|
2013-06-22 04:37:03 +08:00
|
|
|
if (disk_bytenr == 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (btrfs_file_extent_compression(leaf, fi) ||
|
|
|
|
btrfs_file_extent_encryption(leaf, fi) ||
|
|
|
|
btrfs_file_extent_other_encoding(leaf, fi))
|
|
|
|
goto out;
|
|
|
|
|
2010-05-26 23:04:10 +08:00
|
|
|
backref_offset = btrfs_file_extent_offset(leaf, fi);
|
|
|
|
|
2013-06-22 04:37:03 +08:00
|
|
|
if (orig_start) {
|
|
|
|
*orig_start = key.offset - backref_offset;
|
|
|
|
*orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
|
|
|
|
*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
|
|
|
|
}
|
2013-04-25 04:32:55 +08:00
|
|
|
|
2010-05-26 23:04:10 +08:00
|
|
|
if (btrfs_extent_readonly(root, disk_bytenr))
|
|
|
|
goto out;
|
2014-02-27 13:58:05 +08:00
|
|
|
|
|
|
|
num_bytes = min(offset + *len, extent_end) - offset;
|
|
|
|
if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
|
|
|
|
u64 range_end;
|
|
|
|
|
|
|
|
range_end = round_up(offset + num_bytes, root->sectorsize) - 1;
|
|
|
|
ret = test_range_bit(io_tree, offset, range_end,
|
|
|
|
EXTENT_DELALLOC, 0, NULL);
|
|
|
|
if (ret) {
|
|
|
|
ret = -EAGAIN;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-10-19 00:10:36 +08:00
|
|
|
btrfs_release_path(path);
|
2010-05-26 23:04:10 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* look for other files referencing this extent, if we
|
|
|
|
* find any we must cow
|
|
|
|
*/
|
2013-08-15 02:02:47 +08:00
|
|
|
trans = btrfs_join_transaction(root);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = 0;
|
2010-05-26 23:04:10 +08:00
|
|
|
goto out;
|
2013-08-15 02:02:47 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
|
|
|
|
key.offset - backref_offset, disk_bytenr);
|
|
|
|
btrfs_end_transaction(trans, root);
|
|
|
|
if (ret) {
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
2010-05-26 23:04:10 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* adjust disk_bytenr and num_bytes to cover just the bytes
|
|
|
|
* in this extent we are about to write. If there
|
|
|
|
* are any csums in that range we have to cow in order
|
|
|
|
* to keep the csums correct
|
|
|
|
*/
|
|
|
|
disk_bytenr += backref_offset;
|
|
|
|
disk_bytenr += offset - key.offset;
|
|
|
|
if (csum_exist_in_range(root, disk_bytenr, num_bytes))
|
|
|
|
goto out;
|
|
|
|
/*
|
|
|
|
* all of the above have passed, it is safe to overwrite this extent
|
|
|
|
* without cow
|
|
|
|
*/
|
2013-04-25 04:32:55 +08:00
|
|
|
*len = num_bytes;
|
2010-05-26 23:04:10 +08:00
|
|
|
ret = 1;
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2014-05-21 04:07:56 +08:00
|
|
|
bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
|
|
|
|
{
|
|
|
|
struct radix_tree_root *root = &inode->i_mapping->page_tree;
|
|
|
|
int found = false;
|
|
|
|
void **pagep = NULL;
|
|
|
|
struct page *page = NULL;
|
|
|
|
int start_idx;
|
|
|
|
int end_idx;
|
|
|
|
|
|
|
|
start_idx = start >> PAGE_CACHE_SHIFT;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* end is the last byte in the last page. end == start is legal
|
|
|
|
*/
|
|
|
|
end_idx = end >> PAGE_CACHE_SHIFT;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
|
|
|
|
/* Most of the code in this while loop is lifted from
|
|
|
|
* find_get_page. It's been modified to begin searching from a
|
|
|
|
* page and return just the first page found in that range. If the
|
|
|
|
* found idx is less than or equal to the end idx then we know that
|
|
|
|
* a page exists. If no pages are found or if those pages are
|
|
|
|
* outside of the range then we're fine (yay!) */
|
|
|
|
while (page == NULL &&
|
|
|
|
radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) {
|
|
|
|
page = radix_tree_deref_slot(pagep);
|
|
|
|
if (unlikely(!page))
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (radix_tree_exception(page)) {
|
2014-06-05 20:22:25 +08:00
|
|
|
if (radix_tree_deref_retry(page)) {
|
|
|
|
page = NULL;
|
2014-05-21 04:07:56 +08:00
|
|
|
continue;
|
2014-06-05 20:22:25 +08:00
|
|
|
}
|
2014-05-21 04:07:56 +08:00
|
|
|
/*
|
|
|
|
* Otherwise, shmem/tmpfs must be storing a swap entry
|
|
|
|
* here as an exceptional entry: so return it without
|
|
|
|
* attempting to raise page count.
|
|
|
|
*/
|
2014-06-05 20:22:26 +08:00
|
|
|
page = NULL;
|
2014-05-21 04:07:56 +08:00
|
|
|
break; /* TODO: Is this relevant for this use case? */
|
|
|
|
}
|
|
|
|
|
2014-06-05 20:22:24 +08:00
|
|
|
if (!page_cache_get_speculative(page)) {
|
|
|
|
page = NULL;
|
2014-05-21 04:07:56 +08:00
|
|
|
continue;
|
2014-06-05 20:22:24 +08:00
|
|
|
}
|
2014-05-21 04:07:56 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Has the page moved?
|
|
|
|
* This is part of the lockless pagecache protocol. See
|
|
|
|
* include/linux/pagemap.h for details.
|
|
|
|
*/
|
|
|
|
if (unlikely(page != *pagep)) {
|
|
|
|
page_cache_release(page);
|
|
|
|
page = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (page) {
|
|
|
|
if (page->index <= end_idx)
|
|
|
|
found = true;
|
|
|
|
page_cache_release(page);
|
|
|
|
}
|
|
|
|
|
|
|
|
rcu_read_unlock();
|
|
|
|
return found;
|
|
|
|
}
|
|
|
|
|
2012-08-01 04:28:48 +08:00
|
|
|
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
|
|
|
|
struct extent_state **cached_state, int writing)
|
|
|
|
{
|
|
|
|
struct btrfs_ordered_extent *ordered;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
|
2015-12-03 21:30:40 +08:00
|
|
|
cached_state);
|
2012-08-01 04:28:48 +08:00
|
|
|
/*
|
|
|
|
* We're concerned with the entire range that we're going to be
|
|
|
|
* doing DIO to, so we need to make sure theres no ordered
|
|
|
|
* extents in this range.
|
|
|
|
*/
|
|
|
|
ordered = btrfs_lookup_ordered_range(inode, lockstart,
|
|
|
|
lockend - lockstart + 1);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We need to make sure there are no buffered pages in this
|
|
|
|
* range either, we could have raced between the invalidate in
|
|
|
|
* generic_file_direct_write and locking the extent. The
|
|
|
|
* invalidate needs to happen so that reads after a write do not
|
|
|
|
* get stale data.
|
|
|
|
*/
|
2014-05-21 04:07:56 +08:00
|
|
|
if (!ordered &&
|
|
|
|
(!writing ||
|
|
|
|
!btrfs_page_exists_in_range(inode, lockstart, lockend)))
|
2012-08-01 04:28:48 +08:00
|
|
|
break;
|
|
|
|
|
|
|
|
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
|
|
|
|
cached_state, GFP_NOFS);
|
|
|
|
|
|
|
|
if (ordered) {
|
|
|
|
btrfs_start_ordered_extent(inode, ordered, 1);
|
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
} else {
|
|
|
|
/*
|
Btrfs: fix deadlock between direct IO write and defrag/readpages
If readpages() (triggered by defrag or buffered reads) is called while a
direct IO write is in progress, we have a small time window where we can
deadlock, resulting in traces like the following being generated:
[84723.212993] INFO: task fio:2849 blocked for more than 120 seconds.
[84723.214310] Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1
[84723.215640] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[84723.217313] fio D ffff88023ec75218 0 2849 2835 0x00000000
[84723.218778] ffff880122dfb6e8 0000000000000092 0000000000000000 ffff88023ec75200
[84723.220458] ffff88000e05d2c0 ffff880122dfc000 ffff88023ec75200 7fffffffffffffff
[84723.230597] 0000000000000002 ffffffff8147891a ffff880122dfb700 ffffffff8147856a
[84723.232085] Call Trace:
[84723.232625] [<ffffffff8147891a>] ? bit_wait+0x3c/0x3c
[84723.233529] [<ffffffff8147856a>] schedule+0x7d/0x95
[84723.234398] [<ffffffff8147baa3>] schedule_timeout+0x43/0x10b
[84723.235384] [<ffffffff810f82eb>] ? time_hardirqs_on+0x15/0x28
[84723.236426] [<ffffffff8108a23d>] ? trace_hardirqs_on+0xd/0xf
[84723.237502] [<ffffffff810af8a3>] ? read_seqcount_begin.constprop.20+0x57/0x6d
[84723.238807] [<ffffffff8108a09b>] ? trace_hardirqs_on_caller+0x16/0x1ab
[84723.242012] [<ffffffff8108a23d>] ? trace_hardirqs_on+0xd/0xf
[84723.243064] [<ffffffff810af2ad>] ? timekeeping_get_ns+0xe/0x33
[84723.244116] [<ffffffff810afa2e>] ? ktime_get+0x41/0x52
[84723.245029] [<ffffffff81477cff>] io_schedule_timeout+0xb7/0x12b
[84723.245942] [<ffffffff81477cff>] ? io_schedule_timeout+0xb7/0x12b
[84723.246596] [<ffffffff81478953>] bit_wait_io+0x39/0x45
[84723.247503] [<ffffffff81478b93>] __wait_on_bit_lock+0x49/0x8d
[84723.248540] [<ffffffff8111684f>] __lock_page+0x66/0x68
[84723.249558] [<ffffffff81081c9b>] ? autoremove_wake_function+0x3a/0x3a
[84723.250844] [<ffffffff81124a04>] lock_page+0x2c/0x2f
[84723.251871] [<ffffffff81124afc>] invalidate_inode_pages2_range+0xf5/0x2aa
[84723.253274] [<ffffffff81117c34>] ? filemap_fdatawait_range+0x12d/0x146
[84723.254757] [<ffffffff81118191>] ? filemap_fdatawrite_range+0x13/0x15
[84723.256378] [<ffffffffa05139a2>] btrfs_get_blocks_direct+0x1b0/0x664 [btrfs]
[84723.258556] [<ffffffff8119e3f9>] ? submit_page_section+0x7b/0x111
[84723.260064] [<ffffffff8119eb90>] do_blockdev_direct_IO+0x658/0xbdb
[84723.261479] [<ffffffffa05137f2>] ? btrfs_page_exists_in_range+0x1a9/0x1a9 [btrfs]
[84723.262961] [<ffffffffa050a8a6>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[84723.264449] [<ffffffff8119f144>] __blockdev_direct_IO+0x31/0x33
[84723.265614] [<ffffffff8119f144>] ? __blockdev_direct_IO+0x31/0x33
[84723.266769] [<ffffffffa050a8a6>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[84723.268264] [<ffffffffa050935d>] btrfs_direct_IO+0x1b9/0x259 [btrfs]
[84723.270954] [<ffffffffa050a8a6>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[84723.272465] [<ffffffff8111878c>] generic_file_direct_write+0xb3/0x128
[84723.273734] [<ffffffffa051955c>] btrfs_file_write_iter+0x228/0x404 [btrfs]
[84723.275101] [<ffffffff8116ca6f>] __vfs_write+0x7c/0xa5
[84723.276200] [<ffffffff8116cfab>] vfs_write+0xa0/0xe4
[84723.277298] [<ffffffff8116d79d>] SyS_write+0x50/0x7e
[84723.278327] [<ffffffff8147cd97>] entry_SYSCALL_64_fastpath+0x12/0x6f
[84723.279595] INFO: lockdep is turned off.
[84723.379035] INFO: task btrfs:2923 blocked for more than 120 seconds.
[84723.380323] Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1
[84723.381608] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[84723.383003] btrfs D ffff88023ed75218 0 2923 2859 0x00000000
[84723.384277] ffff88001311f860 0000000000000082 ffff88001311f840 ffff88023ed75200
[84723.385748] ffff88012c6751c0 ffff880013120000 ffff88012042fe68 ffff88012042fe30
[84723.387152] ffff880221571c88 0000000000000001 ffff88001311f878 ffffffff8147856a
[84723.388620] Call Trace:
[84723.389105] [<ffffffff8147856a>] schedule+0x7d/0x95
[84723.391882] [<ffffffffa051da32>] btrfs_start_ordered_extent+0x161/0x1fa [btrfs]
[84723.393718] [<ffffffff81081c61>] ? signal_pending_state+0x31/0x31
[84723.395659] [<ffffffffa0522c5b>] __do_contiguous_readpages.constprop.21+0x81/0xdc [btrfs]
[84723.397383] [<ffffffffa050ac96>] ? btrfs_submit_direct+0x3f0/0x3f0 [btrfs]
[84723.398852] [<ffffffffa0522da3>] __extent_readpages.constprop.20+0xed/0x100 [btrfs]
[84723.400561] [<ffffffff81123f6c>] ? __lru_cache_add+0x5d/0x72
[84723.401787] [<ffffffffa0523896>] extent_readpages+0x111/0x1a7 [btrfs]
[84723.403121] [<ffffffffa050ac96>] ? btrfs_submit_direct+0x3f0/0x3f0 [btrfs]
[84723.404583] [<ffffffffa05088fa>] btrfs_readpages+0x1f/0x21 [btrfs]
[84723.406007] [<ffffffff811226df>] __do_page_cache_readahead+0x168/0x1f4
[84723.407502] [<ffffffff81122988>] ondemand_readahead+0x21d/0x22e
[84723.408937] [<ffffffff81122988>] ? ondemand_readahead+0x21d/0x22e
[84723.410487] [<ffffffff81122af1>] page_cache_sync_readahead+0x3d/0x3f
[84723.411710] [<ffffffffa0535388>] btrfs_defrag_file+0x419/0xaaf [btrfs]
[84723.413007] [<ffffffffa0531db0>] ? kzalloc+0xf/0x11 [btrfs]
[84723.414085] [<ffffffffa0535b43>] btrfs_ioctl_defrag+0x125/0x14e [btrfs]
[84723.415307] [<ffffffffa0536753>] btrfs_ioctl+0x746/0x24c6 [btrfs]
[84723.416532] [<ffffffff81087481>] ? arch_local_irq_save+0x9/0xc
[84723.417731] [<ffffffff8113ad61>] ? __might_fault+0x4c/0xa7
[84723.418699] [<ffffffff8113ad61>] ? __might_fault+0x4c/0xa7
[84723.421532] [<ffffffff8113adba>] ? __might_fault+0xa5/0xa7
[84723.422629] [<ffffffff81171139>] ? cp_new_stat+0x15d/0x174
[84723.423712] [<ffffffff8117c610>] do_vfs_ioctl+0x427/0x4e6
[84723.424801] [<ffffffff81171175>] ? SYSC_newfstat+0x25/0x2e
[84723.425968] [<ffffffff8118574d>] ? __fget_light+0x4d/0x71
[84723.427063] [<ffffffff8117c726>] SyS_ioctl+0x57/0x79
[84723.428138] [<ffffffff8147cd97>] entry_SYSCALL_64_fastpath+0x12/0x6f
Consider the following logical and physical file layout:
logical: ... [ prealloc extent A ] [ prealloc extent B ] [ extent C ] ...
4K 8K 16K
physical: ... 12853248 12857344 1103101952 ...
(= 12853248 + 4K)
Extents A and B are physically adjacent. The following diagram shows a
sequence of events that lead to the deadlock when we attempt to do a
direct IO write against the file range [4K, 16K[ and a defrag is triggered
simultaneously.
CPU 1 CPU 2
btrfs_direct_IO()
btrfs_get_blocks_direct()
creates ordered extent A, covering
the 4k prealloc extent A (range [4K, 8K[)
btrfs_defrag_file()
page_cache_sync_readahead([0K, 1M[)
btrfs_readpages()
extent_readpages()
locks all pages in the file
range [0K, 128K[ through calls
to add_to_page_cache_lru()
__do_contiguous_readpages()
finds ordered extent A
waits for it to complete
btrfs_get_blocks_direct() called again
lock_extent_direct(range [8K, 16K[)
finds a page in range [8K, 16K[ through
btrfs_page_exists_in_range()
invalidate_inode_pages2_range([8K, 16K[)
--> tries to lock pages that are already
locked by the task at CPU 2
--> our task, running __blockdev_direct_IO(),
hangs waiting to lock the pages and the
submit bio callback, btrfs_submit_direct(),
ends up never being called, resulting in the
ordered extent A never completing (because a
corresponding bio is never submitted) and
CPU 2 will wait for it forever while holding
the pages locked
---> deadlock!
Fix this by removing the page invalidation approach when attempting to
lock the range for IO from the callback btrfs_get_blocks_direct() and
falling back buffered IO. This was a rare case anyway and well behaved
applications do not mix concurrent direct IO writes with buffered reads
anyway, being a concurrent defrag the only normal case that could lead
to the deadlock.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-12-09 00:23:16 +08:00
|
|
|
* We could trigger writeback for this range (and wait
|
|
|
|
* for it to complete) and then invalidate the pages for
|
|
|
|
* this range (through invalidate_inode_pages2_range()),
|
|
|
|
* but that can lead us to a deadlock with a concurrent
|
|
|
|
* call to readpages() (a buffered read or a defrag call
|
|
|
|
* triggered a readahead) on a page lock due to an
|
|
|
|
* ordered dio extent we created before but did not have
|
|
|
|
* yet a corresponding bio submitted (whence it can not
|
|
|
|
* complete), which makes readpages() wait for that
|
|
|
|
* ordered extent to complete while holding a lock on
|
|
|
|
* that page.
|
2012-08-01 04:28:48 +08:00
|
|
|
*/
|
Btrfs: fix deadlock between direct IO write and defrag/readpages
If readpages() (triggered by defrag or buffered reads) is called while a
direct IO write is in progress, we have a small time window where we can
deadlock, resulting in traces like the following being generated:
[84723.212993] INFO: task fio:2849 blocked for more than 120 seconds.
[84723.214310] Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1
[84723.215640] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[84723.217313] fio D ffff88023ec75218 0 2849 2835 0x00000000
[84723.218778] ffff880122dfb6e8 0000000000000092 0000000000000000 ffff88023ec75200
[84723.220458] ffff88000e05d2c0 ffff880122dfc000 ffff88023ec75200 7fffffffffffffff
[84723.230597] 0000000000000002 ffffffff8147891a ffff880122dfb700 ffffffff8147856a
[84723.232085] Call Trace:
[84723.232625] [<ffffffff8147891a>] ? bit_wait+0x3c/0x3c
[84723.233529] [<ffffffff8147856a>] schedule+0x7d/0x95
[84723.234398] [<ffffffff8147baa3>] schedule_timeout+0x43/0x10b
[84723.235384] [<ffffffff810f82eb>] ? time_hardirqs_on+0x15/0x28
[84723.236426] [<ffffffff8108a23d>] ? trace_hardirqs_on+0xd/0xf
[84723.237502] [<ffffffff810af8a3>] ? read_seqcount_begin.constprop.20+0x57/0x6d
[84723.238807] [<ffffffff8108a09b>] ? trace_hardirqs_on_caller+0x16/0x1ab
[84723.242012] [<ffffffff8108a23d>] ? trace_hardirqs_on+0xd/0xf
[84723.243064] [<ffffffff810af2ad>] ? timekeeping_get_ns+0xe/0x33
[84723.244116] [<ffffffff810afa2e>] ? ktime_get+0x41/0x52
[84723.245029] [<ffffffff81477cff>] io_schedule_timeout+0xb7/0x12b
[84723.245942] [<ffffffff81477cff>] ? io_schedule_timeout+0xb7/0x12b
[84723.246596] [<ffffffff81478953>] bit_wait_io+0x39/0x45
[84723.247503] [<ffffffff81478b93>] __wait_on_bit_lock+0x49/0x8d
[84723.248540] [<ffffffff8111684f>] __lock_page+0x66/0x68
[84723.249558] [<ffffffff81081c9b>] ? autoremove_wake_function+0x3a/0x3a
[84723.250844] [<ffffffff81124a04>] lock_page+0x2c/0x2f
[84723.251871] [<ffffffff81124afc>] invalidate_inode_pages2_range+0xf5/0x2aa
[84723.253274] [<ffffffff81117c34>] ? filemap_fdatawait_range+0x12d/0x146
[84723.254757] [<ffffffff81118191>] ? filemap_fdatawrite_range+0x13/0x15
[84723.256378] [<ffffffffa05139a2>] btrfs_get_blocks_direct+0x1b0/0x664 [btrfs]
[84723.258556] [<ffffffff8119e3f9>] ? submit_page_section+0x7b/0x111
[84723.260064] [<ffffffff8119eb90>] do_blockdev_direct_IO+0x658/0xbdb
[84723.261479] [<ffffffffa05137f2>] ? btrfs_page_exists_in_range+0x1a9/0x1a9 [btrfs]
[84723.262961] [<ffffffffa050a8a6>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[84723.264449] [<ffffffff8119f144>] __blockdev_direct_IO+0x31/0x33
[84723.265614] [<ffffffff8119f144>] ? __blockdev_direct_IO+0x31/0x33
[84723.266769] [<ffffffffa050a8a6>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[84723.268264] [<ffffffffa050935d>] btrfs_direct_IO+0x1b9/0x259 [btrfs]
[84723.270954] [<ffffffffa050a8a6>] ? btrfs_writepage_start_hook+0xce/0xce [btrfs]
[84723.272465] [<ffffffff8111878c>] generic_file_direct_write+0xb3/0x128
[84723.273734] [<ffffffffa051955c>] btrfs_file_write_iter+0x228/0x404 [btrfs]
[84723.275101] [<ffffffff8116ca6f>] __vfs_write+0x7c/0xa5
[84723.276200] [<ffffffff8116cfab>] vfs_write+0xa0/0xe4
[84723.277298] [<ffffffff8116d79d>] SyS_write+0x50/0x7e
[84723.278327] [<ffffffff8147cd97>] entry_SYSCALL_64_fastpath+0x12/0x6f
[84723.279595] INFO: lockdep is turned off.
[84723.379035] INFO: task btrfs:2923 blocked for more than 120 seconds.
[84723.380323] Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1
[84723.381608] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[84723.383003] btrfs D ffff88023ed75218 0 2923 2859 0x00000000
[84723.384277] ffff88001311f860 0000000000000082 ffff88001311f840 ffff88023ed75200
[84723.385748] ffff88012c6751c0 ffff880013120000 ffff88012042fe68 ffff88012042fe30
[84723.387152] ffff880221571c88 0000000000000001 ffff88001311f878 ffffffff8147856a
[84723.388620] Call Trace:
[84723.389105] [<ffffffff8147856a>] schedule+0x7d/0x95
[84723.391882] [<ffffffffa051da32>] btrfs_start_ordered_extent+0x161/0x1fa [btrfs]
[84723.393718] [<ffffffff81081c61>] ? signal_pending_state+0x31/0x31
[84723.395659] [<ffffffffa0522c5b>] __do_contiguous_readpages.constprop.21+0x81/0xdc [btrfs]
[84723.397383] [<ffffffffa050ac96>] ? btrfs_submit_direct+0x3f0/0x3f0 [btrfs]
[84723.398852] [<ffffffffa0522da3>] __extent_readpages.constprop.20+0xed/0x100 [btrfs]
[84723.400561] [<ffffffff81123f6c>] ? __lru_cache_add+0x5d/0x72
[84723.401787] [<ffffffffa0523896>] extent_readpages+0x111/0x1a7 [btrfs]
[84723.403121] [<ffffffffa050ac96>] ? btrfs_submit_direct+0x3f0/0x3f0 [btrfs]
[84723.404583] [<ffffffffa05088fa>] btrfs_readpages+0x1f/0x21 [btrfs]
[84723.406007] [<ffffffff811226df>] __do_page_cache_readahead+0x168/0x1f4
[84723.407502] [<ffffffff81122988>] ondemand_readahead+0x21d/0x22e
[84723.408937] [<ffffffff81122988>] ? ondemand_readahead+0x21d/0x22e
[84723.410487] [<ffffffff81122af1>] page_cache_sync_readahead+0x3d/0x3f
[84723.411710] [<ffffffffa0535388>] btrfs_defrag_file+0x419/0xaaf [btrfs]
[84723.413007] [<ffffffffa0531db0>] ? kzalloc+0xf/0x11 [btrfs]
[84723.414085] [<ffffffffa0535b43>] btrfs_ioctl_defrag+0x125/0x14e [btrfs]
[84723.415307] [<ffffffffa0536753>] btrfs_ioctl+0x746/0x24c6 [btrfs]
[84723.416532] [<ffffffff81087481>] ? arch_local_irq_save+0x9/0xc
[84723.417731] [<ffffffff8113ad61>] ? __might_fault+0x4c/0xa7
[84723.418699] [<ffffffff8113ad61>] ? __might_fault+0x4c/0xa7
[84723.421532] [<ffffffff8113adba>] ? __might_fault+0xa5/0xa7
[84723.422629] [<ffffffff81171139>] ? cp_new_stat+0x15d/0x174
[84723.423712] [<ffffffff8117c610>] do_vfs_ioctl+0x427/0x4e6
[84723.424801] [<ffffffff81171175>] ? SYSC_newfstat+0x25/0x2e
[84723.425968] [<ffffffff8118574d>] ? __fget_light+0x4d/0x71
[84723.427063] [<ffffffff8117c726>] SyS_ioctl+0x57/0x79
[84723.428138] [<ffffffff8147cd97>] entry_SYSCALL_64_fastpath+0x12/0x6f
Consider the following logical and physical file layout:
logical: ... [ prealloc extent A ] [ prealloc extent B ] [ extent C ] ...
4K 8K 16K
physical: ... 12853248 12857344 1103101952 ...
(= 12853248 + 4K)
Extents A and B are physically adjacent. The following diagram shows a
sequence of events that lead to the deadlock when we attempt to do a
direct IO write against the file range [4K, 16K[ and a defrag is triggered
simultaneously.
CPU 1 CPU 2
btrfs_direct_IO()
btrfs_get_blocks_direct()
creates ordered extent A, covering
the 4k prealloc extent A (range [4K, 8K[)
btrfs_defrag_file()
page_cache_sync_readahead([0K, 1M[)
btrfs_readpages()
extent_readpages()
locks all pages in the file
range [0K, 128K[ through calls
to add_to_page_cache_lru()
__do_contiguous_readpages()
finds ordered extent A
waits for it to complete
btrfs_get_blocks_direct() called again
lock_extent_direct(range [8K, 16K[)
finds a page in range [8K, 16K[ through
btrfs_page_exists_in_range()
invalidate_inode_pages2_range([8K, 16K[)
--> tries to lock pages that are already
locked by the task at CPU 2
--> our task, running __blockdev_direct_IO(),
hangs waiting to lock the pages and the
submit bio callback, btrfs_submit_direct(),
ends up never being called, resulting in the
ordered extent A never completing (because a
corresponding bio is never submitted) and
CPU 2 will wait for it forever while holding
the pages locked
---> deadlock!
Fix this by removing the page invalidation approach when attempting to
lock the range for IO from the callback btrfs_get_blocks_direct() and
falling back buffered IO. This was a rare case anyway and well behaved
applications do not mix concurrent direct IO writes with buffered reads
anyway, being a concurrent defrag the only normal case that could lead
to the deadlock.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-12-09 00:23:16 +08:00
|
|
|
ret = -ENOTBLK;
|
|
|
|
break;
|
2012-08-01 04:28:48 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
cond_resched();
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-09-12 03:40:07 +08:00
|
|
|
static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
|
|
|
|
u64 len, u64 orig_start,
|
|
|
|
u64 block_start, u64 block_len,
|
2013-04-05 02:31:27 +08:00
|
|
|
u64 orig_block_len, u64 ram_bytes,
|
|
|
|
int type)
|
2012-09-12 03:40:07 +08:00
|
|
|
{
|
|
|
|
struct extent_map_tree *em_tree;
|
|
|
|
struct extent_map *em;
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
em_tree = &BTRFS_I(inode)->extent_tree;
|
|
|
|
em = alloc_extent_map();
|
|
|
|
if (!em)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
em->start = start;
|
|
|
|
em->orig_start = orig_start;
|
2012-10-13 03:27:49 +08:00
|
|
|
em->mod_start = start;
|
|
|
|
em->mod_len = len;
|
2012-09-12 03:40:07 +08:00
|
|
|
em->len = len;
|
|
|
|
em->block_len = block_len;
|
|
|
|
em->block_start = block_start;
|
|
|
|
em->bdev = root->fs_info->fs_devices->latest_bdev;
|
2012-12-03 23:31:19 +08:00
|
|
|
em->orig_block_len = orig_block_len;
|
2013-04-05 02:31:27 +08:00
|
|
|
em->ram_bytes = ram_bytes;
|
2012-10-12 04:54:30 +08:00
|
|
|
em->generation = -1;
|
2012-09-12 03:40:07 +08:00
|
|
|
set_bit(EXTENT_FLAG_PINNED, &em->flags);
|
|
|
|
if (type == BTRFS_ORDERED_PREALLOC)
|
2012-12-03 23:58:15 +08:00
|
|
|
set_bit(EXTENT_FLAG_FILLING, &em->flags);
|
2012-09-12 03:40:07 +08:00
|
|
|
|
|
|
|
do {
|
|
|
|
btrfs_drop_extent_cache(inode, em->start,
|
|
|
|
em->start + em->len - 1, 0);
|
|
|
|
write_lock(&em_tree->lock);
|
2013-04-06 04:51:15 +08:00
|
|
|
ret = add_extent_mapping(em_tree, em, 1);
|
2012-09-12 03:40:07 +08:00
|
|
|
write_unlock(&em_tree->lock);
|
|
|
|
} while (ret == -EEXIST);
|
|
|
|
|
|
|
|
if (ret) {
|
|
|
|
free_extent_map(em);
|
|
|
|
return ERR_PTR(ret);
|
|
|
|
}
|
|
|
|
|
|
|
|
return em;
|
|
|
|
}
|
|
|
|
|
Btrfs: fix extent accounting for partial direct IO writes
When doing a write using direct IO we can end up not doing the whole write
operation using the direct IO path, in that case we fallback to a buffered
write to do the remaining IO. This happens for example if the range we are
writing to contains a compressed extent.
When we do a partial write and fallback to buffered IO, due to the
existence of a compressed extent for example, we end up not adjusting the
outstanding extents counter of our inode which ends up getting decremented
twice, once by the DIO ordered extent for the partial write and once again
by btrfs_direct_IO(), resulting in an arithmetic underflow at
extent-tree.c:drop_outstanding_extent(). For example if we have:
extents [ prealloc extent ] [ compressed extent ]
offsets A B C D E
and at the moment our inode's outstanding extents counter is 0, if we do a
direct IO write against the range [B, D[ (which has a length smaller than
128Mb), we end up bumping our inode's outstanding extents counter to 1, we
create a DIO ordered extent for the range [B, C[ and then fallback to a
buffered write for the range [C, D[. The direct IO handler
(inode.c:btrfs_direct_IO()) decrements the outstanding extents counter by
1, leaving it with a value of 0, through a call to
btrfs_delalloc_release_space() and then shortly after the DIO ordered
extent finishes and calls btrfs_delalloc_release_metadata() which ends
up to attempt to decrement the inode's outstanding extents counter by 1,
resulting in an assertion failure at drop_outstanding_extent() because
the operation would result in an arithmetic underflow (0 - 1). This
produces the following trace:
[125471.336838] BTRFS: assertion failed: BTRFS_I(inode)->outstanding_extents >= num_extents, file: fs/btrfs/extent-tree.c, line: 5526
[125471.338844] ------------[ cut here ]------------
[125471.340745] kernel BUG at fs/btrfs/ctree.h:4173!
[125471.340745] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[125471.340745] Modules linked in: btrfs f2fs xfs libcrc32c dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse parport_pc acpi_cpufreq psmouse i2c_piix4 parport pcspkr serio_raw microcode processor evdev i2c_core button ext4 crc16 jbd2 mbcache sd_mod sg sr_mod cdrom ata_generic virtio_scsi ata_piix virtio_pci virtio_ring floppy libata virtio e1000 scsi_mod [last unloaded: btrfs]
[125471.340745] CPU: 10 PID: 23649 Comm: kworker/u32:1 Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1
[125471.340745] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014
[125471.340745] Workqueue: btrfs-endio-write btrfs_endio_write_helper [btrfs]
[125471.340745] task: ffff8804244fcf80 ti: ffff88040a118000 task.ti: ffff88040a118000
[125471.340745] RIP: 0010:[<ffffffffa0550da1>] [<ffffffffa0550da1>] assfail.constprop.46+0x1e/0x20 [btrfs]
[125471.340745] RSP: 0018:ffff88040a11bc78 EFLAGS: 00010296
[125471.340745] RAX: 0000000000000075 RBX: 0000000000005000 RCX: 0000000000000000
[125471.340745] RDX: ffffffff81098f93 RSI: ffffffff8147c619 RDI: 00000000ffffffff
[125471.340745] RBP: ffff88040a11bc78 R08: 0000000000000001 R09: 0000000000000000
[125471.340745] R10: ffff88040a11bc08 R11: ffffffff81651000 R12: ffff8803efb4a000
[125471.340745] R13: ffff8803efb4a000 R14: 0000000000000000 R15: ffff8802f8e33c88
[125471.340745] FS: 0000000000000000(0000) GS:ffff88043dd40000(0000) knlGS:0000000000000000
[125471.340745] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[125471.340745] CR2: 00007fae7ca86095 CR3: 0000000001a0b000 CR4: 00000000000006e0
[125471.340745] Stack:
[125471.340745] ffff88040a11bc88 ffffffffa04ca0cd ffff88040a11bcc8 ffffffffa04ceeb1
[125471.340745] ffff8802f8e33940 ffff8802c93eadb0 ffff8802f8e0bf50 ffff8803efb4a000
[125471.340745] 0000000000000000 ffff8802f8e33c88 ffff88040a11bd38 ffffffffa04eccfa
[125471.340745] Call Trace:
[125471.340745] [<ffffffffa04ca0cd>] drop_outstanding_extent+0x3d/0x6d [btrfs]
[125471.340745] [<ffffffffa04ceeb1>] btrfs_delalloc_release_metadata+0x51/0xdd [btrfs]
[125471.340745] [<ffffffffa04eccfa>] btrfs_finish_ordered_io+0x420/0x4eb [btrfs]
[125471.340745] [<ffffffffa04ecdda>] finish_ordered_fn+0x15/0x17 [btrfs]
[125471.340745] [<ffffffffa050e6e8>] normal_work_helper+0x14c/0x32a [btrfs]
[125471.340745] [<ffffffffa050e9c8>] btrfs_endio_write_helper+0x12/0x14 [btrfs]
[125471.340745] [<ffffffff81063b23>] process_one_work+0x24a/0x4ac
[125471.340745] [<ffffffff81064285>] worker_thread+0x206/0x2c2
[125471.340745] [<ffffffff8106407f>] ? rescuer_thread+0x2cb/0x2cb
[125471.340745] [<ffffffff8106407f>] ? rescuer_thread+0x2cb/0x2cb
[125471.340745] [<ffffffff8106904d>] kthread+0xef/0xf7
[125471.340745] [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24
[125471.340745] [<ffffffff8147d10f>] ret_from_fork+0x3f/0x70
[125471.340745] [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24
[125471.340745] Code: a5 55 a0 48 89 e5 e8 42 50 bc e0 0f 0b 55 89 f1 48 c7 c2 f0 a8 55 a0 48 89 fe 31 c0 48 c7 c7 14 aa 55 a0 48 89 e5 e8 22 50 bc e0 <0f> 0b 0f 1f 44 00 00 55 31 c9 ba 18 00 00 00 48 89 e5 41 56 41
[125471.340745] RIP [<ffffffffa0550da1>] assfail.constprop.46+0x1e/0x20 [btrfs]
[125471.340745] RSP <ffff88040a11bc78>
[125471.539620] ---[ end trace 144259f7838b4aa4 ]---
So fix this by ensuring we adjust the outstanding extents counter when we
do the fallback just like we do for the case where the whole write can be
done through the direct IO path.
We were also adjusting the outstanding extents counter by a constant value
of 1, which is incorrect because we were ignorning that we account extents
in BTRFS_MAX_EXTENT_SIZE units, o fix that as well.
The following test case for fstests reproduces this issue:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_xfs_io_command "falloc"
rm -f $seqres.full
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount "-o compress"
# Create a compressed extent covering the range [700K, 800K[.
$XFS_IO_PROG -f -s -c "pwrite -S 0xaa -b 100K 700K 100K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Create prealloc extent covering the range [600K, 700K[.
$XFS_IO_PROG -c "falloc 600K 100K" $SCRATCH_MNT/foo
# Write 80K of data to the range [640K, 720K[ using direct IO. This
# range covers both the prealloc extent and the compressed extent.
# Because there's a compressed extent in the range we are writing to,
# the DIO write code path ends up only writing the first 60k of data,
# which goes to the prealloc extent, and then falls back to buffered IO
# for writing the remaining 20K of data - because that remaining data
# maps to a file range containing a compressed extent.
# When falling back to buffered IO, we used to trigger an assertion when
# releasing reserved space due to bad accounting of the inode's
# outstanding extents counter, which was set to 1 but we ended up
# decrementing it by 1 twice, once through the ordered extent for the
# 60K of data we wrote using direct IO, and once through the main direct
# IO handler (inode.cbtrfs_direct_IO()) because the direct IO write
# wrote less than 80K of data (60K).
$XFS_IO_PROG -d -c "pwrite -S 0xbb -b 80K 640K 80K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Now similar test as above but for very large write operations. This
# triggers special cases for an inode's outstanding extents accounting,
# as internally btrfs logically splits extents into 128Mb units.
$XFS_IO_PROG -f -s \
-c "pwrite -S 0xaa -b 128M 258M 128M" \
-c "falloc 0 258M" \
$SCRATCH_MNT/bar | _filter_xfs_io
$XFS_IO_PROG -d -c "pwrite -S 0xbb -b 256M 3M 256M" $SCRATCH_MNT/bar \
| _filter_xfs_io
# Now verify the file contents are correct and that they are the same
# even after unmounting and mounting the fs again (or evicting the page
# cache).
#
# For file foo, all bytes in the range [0, 640K[ must have a value of
# 0x00, all bytes in the range [640K, 720K[ must have a value of 0xbb
# and all bytes in the range [720K, 800K[ must have a value of 0xaa.
#
# For file bar, all bytes in the range [0, 3M[ must havea value of 0x00,
# all bytes in the range [3M, 259M[ must have a value of 0xbb and all
# bytes in the range [259M, 386M[ must have a value of 0xaa.
#
echo "File digests before remounting the file system:"
md5sum $SCRATCH_MNT/foo | _filter_scratch
md5sum $SCRATCH_MNT/bar | _filter_scratch
_scratch_remount
echo "File digests after remounting the file system:"
md5sum $SCRATCH_MNT/foo | _filter_scratch
md5sum $SCRATCH_MNT/bar | _filter_scratch
status=0
exit
Fixes: e1cbbfa5f5aa ("Btrfs: fix outstanding_extents accounting in DIO")
Fixes: 3e05bde8c3c2 ("Btrfs: only adjust outstanding_extents when we do a short write")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-11-04 17:52:04 +08:00
|
|
|
static void adjust_dio_outstanding_extents(struct inode *inode,
|
|
|
|
struct btrfs_dio_data *dio_data,
|
|
|
|
const u64 len)
|
|
|
|
{
|
|
|
|
unsigned num_extents;
|
|
|
|
|
|
|
|
num_extents = (unsigned) div64_u64(len + BTRFS_MAX_EXTENT_SIZE - 1,
|
|
|
|
BTRFS_MAX_EXTENT_SIZE);
|
|
|
|
/*
|
|
|
|
* If we have an outstanding_extents count still set then we're
|
|
|
|
* within our reservation, otherwise we need to adjust our inode
|
|
|
|
* counter appropriately.
|
|
|
|
*/
|
|
|
|
if (dio_data->outstanding_extents) {
|
|
|
|
dio_data->outstanding_extents -= num_extents;
|
|
|
|
} else {
|
|
|
|
spin_lock(&BTRFS_I(inode)->lock);
|
|
|
|
BTRFS_I(inode)->outstanding_extents += num_extents;
|
|
|
|
spin_unlock(&BTRFS_I(inode)->lock);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-05-23 23:00:55 +08:00
|
|
|
static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
|
|
|
|
struct buffer_head *bh_result, int create)
|
|
|
|
{
|
|
|
|
struct extent_map *em;
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2012-08-01 04:28:48 +08:00
|
|
|
struct extent_state *cached_state = NULL;
|
2015-08-28 23:40:13 +08:00
|
|
|
struct btrfs_dio_data *dio_data = NULL;
|
2010-05-23 23:00:55 +08:00
|
|
|
u64 start = iblock << inode->i_blkbits;
|
2012-08-01 04:28:48 +08:00
|
|
|
u64 lockstart, lockend;
|
2010-05-23 23:00:55 +08:00
|
|
|
u64 len = bh_result->b_size;
|
2012-08-01 04:28:48 +08:00
|
|
|
int unlock_bits = EXTENT_LOCKED;
|
2013-02-07 18:12:07 +08:00
|
|
|
int ret = 0;
|
2012-08-01 04:28:48 +08:00
|
|
|
|
Btrfs: fix wrong outstanding_extents when doing DIO write
When running the 083th case of xfstests on the filesystem with
"compress-force=lzo", the following WARNINGs were triggered.
WARNING: at fs/btrfs/inode.c:7908
WARNING: at fs/btrfs/inode.c:7909
WARNING: at fs/btrfs/inode.c:7911
WARNING: at fs/btrfs/extent-tree.c:4510
WARNING: at fs/btrfs/extent-tree.c:4511
This problem was introduced by the patch "Btrfs: fix deadlock due
to unsubmitted". In this patch, there are two bugs which caused
the above problem.
The 1st one is a off-by-one bug, if the DIO write return 0, it is
also a short write, we need release the reserved space for it. But
we didn't do it in that patch. Fix it by change "ret > 0" to
"ret >= 0".
The 2nd one is ->outstanding_extents was increased twice when
a short write happened. As we know, ->outstanding_extents is
a counter to keep track of the number of extent items we may
use duo to delalloc, when we reserve the free space for a
delalloc write, we assume that the write will introduce just
one extent item, so we increase ->outstanding_extents by 1 at
that time. And then we will increase it every time we split the
write, it is done at the beginning of btrfs_get_blocks_direct().
So when a short write happens, we needn't increase
->outstanding_extents again. But this patch done.
In order to fix the 2nd problem, I re-write the logic for
->outstanding_extents operation. We don't increase it at the
beginning of btrfs_get_blocks_direct(), instead, we just
increase it when the split actually happens.
Reported-by: Mitch Harder <mitch.harder@sabayonlinux.org>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-02-21 17:48:22 +08:00
|
|
|
if (create)
|
2015-02-12 04:08:58 +08:00
|
|
|
unlock_bits |= EXTENT_DIRTY;
|
Btrfs: fix wrong outstanding_extents when doing DIO write
When running the 083th case of xfstests on the filesystem with
"compress-force=lzo", the following WARNINGs were triggered.
WARNING: at fs/btrfs/inode.c:7908
WARNING: at fs/btrfs/inode.c:7909
WARNING: at fs/btrfs/inode.c:7911
WARNING: at fs/btrfs/extent-tree.c:4510
WARNING: at fs/btrfs/extent-tree.c:4511
This problem was introduced by the patch "Btrfs: fix deadlock due
to unsubmitted". In this patch, there are two bugs which caused
the above problem.
The 1st one is a off-by-one bug, if the DIO write return 0, it is
also a short write, we need release the reserved space for it. But
we didn't do it in that patch. Fix it by change "ret > 0" to
"ret >= 0".
The 2nd one is ->outstanding_extents was increased twice when
a short write happened. As we know, ->outstanding_extents is
a counter to keep track of the number of extent items we may
use duo to delalloc, when we reserve the free space for a
delalloc write, we assume that the write will introduce just
one extent item, so we increase ->outstanding_extents by 1 at
that time. And then we will increase it every time we split the
write, it is done at the beginning of btrfs_get_blocks_direct().
So when a short write happens, we needn't increase
->outstanding_extents again. But this patch done.
In order to fix the 2nd problem, I re-write the logic for
->outstanding_extents operation. We don't increase it at the
beginning of btrfs_get_blocks_direct(), instead, we just
increase it when the split actually happens.
Reported-by: Mitch Harder <mitch.harder@sabayonlinux.org>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-02-21 17:48:22 +08:00
|
|
|
else
|
2012-08-04 04:49:19 +08:00
|
|
|
len = min_t(u64, len, root->sectorsize);
|
2012-08-01 04:28:48 +08:00
|
|
|
|
2012-08-04 04:49:19 +08:00
|
|
|
lockstart = start;
|
|
|
|
lockend = start + len - 1;
|
|
|
|
|
2015-03-17 22:52:28 +08:00
|
|
|
if (current->journal_info) {
|
|
|
|
/*
|
|
|
|
* Need to pull our outstanding extents and set journal_info to NULL so
|
|
|
|
* that anything that needs to check if there's a transction doesn't get
|
|
|
|
* confused.
|
|
|
|
*/
|
2015-08-28 23:40:13 +08:00
|
|
|
dio_data = current->journal_info;
|
2015-03-17 22:52:28 +08:00
|
|
|
current->journal_info = NULL;
|
|
|
|
}
|
|
|
|
|
2012-08-01 04:28:48 +08:00
|
|
|
/*
|
|
|
|
* If this errors out it's because we couldn't invalidate pagecache for
|
|
|
|
* this range and we need to fallback to buffered.
|
|
|
|
*/
|
Btrfs: fix extent accounting for partial direct IO writes
When doing a write using direct IO we can end up not doing the whole write
operation using the direct IO path, in that case we fallback to a buffered
write to do the remaining IO. This happens for example if the range we are
writing to contains a compressed extent.
When we do a partial write and fallback to buffered IO, due to the
existence of a compressed extent for example, we end up not adjusting the
outstanding extents counter of our inode which ends up getting decremented
twice, once by the DIO ordered extent for the partial write and once again
by btrfs_direct_IO(), resulting in an arithmetic underflow at
extent-tree.c:drop_outstanding_extent(). For example if we have:
extents [ prealloc extent ] [ compressed extent ]
offsets A B C D E
and at the moment our inode's outstanding extents counter is 0, if we do a
direct IO write against the range [B, D[ (which has a length smaller than
128Mb), we end up bumping our inode's outstanding extents counter to 1, we
create a DIO ordered extent for the range [B, C[ and then fallback to a
buffered write for the range [C, D[. The direct IO handler
(inode.c:btrfs_direct_IO()) decrements the outstanding extents counter by
1, leaving it with a value of 0, through a call to
btrfs_delalloc_release_space() and then shortly after the DIO ordered
extent finishes and calls btrfs_delalloc_release_metadata() which ends
up to attempt to decrement the inode's outstanding extents counter by 1,
resulting in an assertion failure at drop_outstanding_extent() because
the operation would result in an arithmetic underflow (0 - 1). This
produces the following trace:
[125471.336838] BTRFS: assertion failed: BTRFS_I(inode)->outstanding_extents >= num_extents, file: fs/btrfs/extent-tree.c, line: 5526
[125471.338844] ------------[ cut here ]------------
[125471.340745] kernel BUG at fs/btrfs/ctree.h:4173!
[125471.340745] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[125471.340745] Modules linked in: btrfs f2fs xfs libcrc32c dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse parport_pc acpi_cpufreq psmouse i2c_piix4 parport pcspkr serio_raw microcode processor evdev i2c_core button ext4 crc16 jbd2 mbcache sd_mod sg sr_mod cdrom ata_generic virtio_scsi ata_piix virtio_pci virtio_ring floppy libata virtio e1000 scsi_mod [last unloaded: btrfs]
[125471.340745] CPU: 10 PID: 23649 Comm: kworker/u32:1 Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1
[125471.340745] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014
[125471.340745] Workqueue: btrfs-endio-write btrfs_endio_write_helper [btrfs]
[125471.340745] task: ffff8804244fcf80 ti: ffff88040a118000 task.ti: ffff88040a118000
[125471.340745] RIP: 0010:[<ffffffffa0550da1>] [<ffffffffa0550da1>] assfail.constprop.46+0x1e/0x20 [btrfs]
[125471.340745] RSP: 0018:ffff88040a11bc78 EFLAGS: 00010296
[125471.340745] RAX: 0000000000000075 RBX: 0000000000005000 RCX: 0000000000000000
[125471.340745] RDX: ffffffff81098f93 RSI: ffffffff8147c619 RDI: 00000000ffffffff
[125471.340745] RBP: ffff88040a11bc78 R08: 0000000000000001 R09: 0000000000000000
[125471.340745] R10: ffff88040a11bc08 R11: ffffffff81651000 R12: ffff8803efb4a000
[125471.340745] R13: ffff8803efb4a000 R14: 0000000000000000 R15: ffff8802f8e33c88
[125471.340745] FS: 0000000000000000(0000) GS:ffff88043dd40000(0000) knlGS:0000000000000000
[125471.340745] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[125471.340745] CR2: 00007fae7ca86095 CR3: 0000000001a0b000 CR4: 00000000000006e0
[125471.340745] Stack:
[125471.340745] ffff88040a11bc88 ffffffffa04ca0cd ffff88040a11bcc8 ffffffffa04ceeb1
[125471.340745] ffff8802f8e33940 ffff8802c93eadb0 ffff8802f8e0bf50 ffff8803efb4a000
[125471.340745] 0000000000000000 ffff8802f8e33c88 ffff88040a11bd38 ffffffffa04eccfa
[125471.340745] Call Trace:
[125471.340745] [<ffffffffa04ca0cd>] drop_outstanding_extent+0x3d/0x6d [btrfs]
[125471.340745] [<ffffffffa04ceeb1>] btrfs_delalloc_release_metadata+0x51/0xdd [btrfs]
[125471.340745] [<ffffffffa04eccfa>] btrfs_finish_ordered_io+0x420/0x4eb [btrfs]
[125471.340745] [<ffffffffa04ecdda>] finish_ordered_fn+0x15/0x17 [btrfs]
[125471.340745] [<ffffffffa050e6e8>] normal_work_helper+0x14c/0x32a [btrfs]
[125471.340745] [<ffffffffa050e9c8>] btrfs_endio_write_helper+0x12/0x14 [btrfs]
[125471.340745] [<ffffffff81063b23>] process_one_work+0x24a/0x4ac
[125471.340745] [<ffffffff81064285>] worker_thread+0x206/0x2c2
[125471.340745] [<ffffffff8106407f>] ? rescuer_thread+0x2cb/0x2cb
[125471.340745] [<ffffffff8106407f>] ? rescuer_thread+0x2cb/0x2cb
[125471.340745] [<ffffffff8106904d>] kthread+0xef/0xf7
[125471.340745] [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24
[125471.340745] [<ffffffff8147d10f>] ret_from_fork+0x3f/0x70
[125471.340745] [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24
[125471.340745] Code: a5 55 a0 48 89 e5 e8 42 50 bc e0 0f 0b 55 89 f1 48 c7 c2 f0 a8 55 a0 48 89 fe 31 c0 48 c7 c7 14 aa 55 a0 48 89 e5 e8 22 50 bc e0 <0f> 0b 0f 1f 44 00 00 55 31 c9 ba 18 00 00 00 48 89 e5 41 56 41
[125471.340745] RIP [<ffffffffa0550da1>] assfail.constprop.46+0x1e/0x20 [btrfs]
[125471.340745] RSP <ffff88040a11bc78>
[125471.539620] ---[ end trace 144259f7838b4aa4 ]---
So fix this by ensuring we adjust the outstanding extents counter when we
do the fallback just like we do for the case where the whole write can be
done through the direct IO path.
We were also adjusting the outstanding extents counter by a constant value
of 1, which is incorrect because we were ignorning that we account extents
in BTRFS_MAX_EXTENT_SIZE units, o fix that as well.
The following test case for fstests reproduces this issue:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_xfs_io_command "falloc"
rm -f $seqres.full
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount "-o compress"
# Create a compressed extent covering the range [700K, 800K[.
$XFS_IO_PROG -f -s -c "pwrite -S 0xaa -b 100K 700K 100K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Create prealloc extent covering the range [600K, 700K[.
$XFS_IO_PROG -c "falloc 600K 100K" $SCRATCH_MNT/foo
# Write 80K of data to the range [640K, 720K[ using direct IO. This
# range covers both the prealloc extent and the compressed extent.
# Because there's a compressed extent in the range we are writing to,
# the DIO write code path ends up only writing the first 60k of data,
# which goes to the prealloc extent, and then falls back to buffered IO
# for writing the remaining 20K of data - because that remaining data
# maps to a file range containing a compressed extent.
# When falling back to buffered IO, we used to trigger an assertion when
# releasing reserved space due to bad accounting of the inode's
# outstanding extents counter, which was set to 1 but we ended up
# decrementing it by 1 twice, once through the ordered extent for the
# 60K of data we wrote using direct IO, and once through the main direct
# IO handler (inode.cbtrfs_direct_IO()) because the direct IO write
# wrote less than 80K of data (60K).
$XFS_IO_PROG -d -c "pwrite -S 0xbb -b 80K 640K 80K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Now similar test as above but for very large write operations. This
# triggers special cases for an inode's outstanding extents accounting,
# as internally btrfs logically splits extents into 128Mb units.
$XFS_IO_PROG -f -s \
-c "pwrite -S 0xaa -b 128M 258M 128M" \
-c "falloc 0 258M" \
$SCRATCH_MNT/bar | _filter_xfs_io
$XFS_IO_PROG -d -c "pwrite -S 0xbb -b 256M 3M 256M" $SCRATCH_MNT/bar \
| _filter_xfs_io
# Now verify the file contents are correct and that they are the same
# even after unmounting and mounting the fs again (or evicting the page
# cache).
#
# For file foo, all bytes in the range [0, 640K[ must have a value of
# 0x00, all bytes in the range [640K, 720K[ must have a value of 0xbb
# and all bytes in the range [720K, 800K[ must have a value of 0xaa.
#
# For file bar, all bytes in the range [0, 3M[ must havea value of 0x00,
# all bytes in the range [3M, 259M[ must have a value of 0xbb and all
# bytes in the range [259M, 386M[ must have a value of 0xaa.
#
echo "File digests before remounting the file system:"
md5sum $SCRATCH_MNT/foo | _filter_scratch
md5sum $SCRATCH_MNT/bar | _filter_scratch
_scratch_remount
echo "File digests after remounting the file system:"
md5sum $SCRATCH_MNT/foo | _filter_scratch
md5sum $SCRATCH_MNT/bar | _filter_scratch
status=0
exit
Fixes: e1cbbfa5f5aa ("Btrfs: fix outstanding_extents accounting in DIO")
Fixes: 3e05bde8c3c2 ("Btrfs: only adjust outstanding_extents when we do a short write")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-11-04 17:52:04 +08:00
|
|
|
if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
|
|
|
|
create)) {
|
|
|
|
ret = -ENOTBLK;
|
|
|
|
goto err;
|
|
|
|
}
|
2012-08-01 04:28:48 +08:00
|
|
|
|
2010-05-23 23:00:55 +08:00
|
|
|
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
|
2012-08-01 04:28:48 +08:00
|
|
|
if (IS_ERR(em)) {
|
|
|
|
ret = PTR_ERR(em);
|
|
|
|
goto unlock_err;
|
|
|
|
}
|
2010-05-23 23:00:55 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Ok for INLINE and COMPRESSED extents we need to fallback on buffered
|
|
|
|
* io. INLINE is special, and we could probably kludge it in here, but
|
|
|
|
* it's still buffered so for safety lets just fall back to the generic
|
|
|
|
* buffered path.
|
|
|
|
*
|
|
|
|
* For COMPRESSED we _have_ to read the entire extent in so we can
|
|
|
|
* decompress it, so there will be buffering required no matter what we
|
|
|
|
* do, so go ahead and fallback to buffered.
|
|
|
|
*
|
|
|
|
* We return -ENOTBLK because thats what makes DIO go ahead and go back
|
|
|
|
* to buffered IO. Don't blame me, this is the price we pay for using
|
|
|
|
* the generic code.
|
|
|
|
*/
|
|
|
|
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
|
|
|
|
em->block_start == EXTENT_MAP_INLINE) {
|
|
|
|
free_extent_map(em);
|
2012-08-01 04:28:48 +08:00
|
|
|
ret = -ENOTBLK;
|
|
|
|
goto unlock_err;
|
2010-05-23 23:00:55 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Just a good old fashioned hole, return */
|
|
|
|
if (!create && (em->block_start == EXTENT_MAP_HOLE ||
|
|
|
|
test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
|
|
|
|
free_extent_map(em);
|
2012-08-01 04:28:48 +08:00
|
|
|
goto unlock_err;
|
2010-05-23 23:00:55 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We don't allocate a new extent in the following cases
|
|
|
|
*
|
|
|
|
* 1) The inode is marked as NODATACOW. In this case we'll just use the
|
|
|
|
* existing extent.
|
|
|
|
* 2) The extent is marked as PREALLOC. We're good to go here and can
|
|
|
|
* just use the extent.
|
|
|
|
*
|
|
|
|
*/
|
2010-05-26 23:04:10 +08:00
|
|
|
if (!create) {
|
2012-08-01 04:28:48 +08:00
|
|
|
len = min(len, em->len - (start - em->start));
|
|
|
|
lockstart = start + len;
|
|
|
|
goto unlock;
|
2010-05-26 23:04:10 +08:00
|
|
|
}
|
2010-05-23 23:00:55 +08:00
|
|
|
|
|
|
|
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
|
|
|
|
((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
|
|
|
|
em->block_start != EXTENT_MAP_HOLE)) {
|
|
|
|
int type;
|
2013-04-25 04:32:55 +08:00
|
|
|
u64 block_start, orig_start, orig_block_len, ram_bytes;
|
2010-05-23 23:00:55 +08:00
|
|
|
|
|
|
|
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
|
|
|
|
type = BTRFS_ORDERED_PREALLOC;
|
|
|
|
else
|
|
|
|
type = BTRFS_ORDERED_NOCOW;
|
2010-05-26 23:04:10 +08:00
|
|
|
len = min(len, em->len - (start - em->start));
|
2010-05-23 23:00:55 +08:00
|
|
|
block_start = em->block_start + (start - em->start);
|
2010-05-26 23:04:10 +08:00
|
|
|
|
2013-08-15 02:02:47 +08:00
|
|
|
if (can_nocow_extent(inode, start, &len, &orig_start,
|
2013-06-22 04:37:03 +08:00
|
|
|
&orig_block_len, &ram_bytes) == 1) {
|
2012-09-12 03:40:07 +08:00
|
|
|
if (type == BTRFS_ORDERED_PREALLOC) {
|
|
|
|
free_extent_map(em);
|
|
|
|
em = create_pinned_em(inode, start, len,
|
|
|
|
orig_start,
|
2012-12-03 23:31:19 +08:00
|
|
|
block_start, len,
|
2013-04-05 02:31:27 +08:00
|
|
|
orig_block_len,
|
|
|
|
ram_bytes, type);
|
2014-07-07 19:35:21 +08:00
|
|
|
if (IS_ERR(em)) {
|
|
|
|
ret = PTR_ERR(em);
|
2012-09-12 03:40:07 +08:00
|
|
|
goto unlock_err;
|
2014-07-07 19:35:21 +08:00
|
|
|
}
|
2012-09-12 03:40:07 +08:00
|
|
|
}
|
|
|
|
|
2010-05-26 23:04:10 +08:00
|
|
|
ret = btrfs_add_ordered_extent_dio(inode, start,
|
|
|
|
block_start, len, len, type);
|
|
|
|
if (ret) {
|
|
|
|
free_extent_map(em);
|
2012-08-01 04:28:48 +08:00
|
|
|
goto unlock_err;
|
2010-05-26 23:04:10 +08:00
|
|
|
}
|
|
|
|
goto unlock;
|
2010-05-23 23:00:55 +08:00
|
|
|
}
|
|
|
|
}
|
2013-08-15 02:02:47 +08:00
|
|
|
|
2010-05-26 23:04:10 +08:00
|
|
|
/*
|
|
|
|
* this will cow the extent, reset the len in case we changed
|
|
|
|
* it above
|
|
|
|
*/
|
|
|
|
len = bh_result->b_size;
|
2012-10-12 04:54:30 +08:00
|
|
|
free_extent_map(em);
|
|
|
|
em = btrfs_new_extent_direct(inode, start, len);
|
2012-08-01 04:28:48 +08:00
|
|
|
if (IS_ERR(em)) {
|
|
|
|
ret = PTR_ERR(em);
|
|
|
|
goto unlock_err;
|
|
|
|
}
|
2010-05-26 23:04:10 +08:00
|
|
|
len = min(len, em->len - (start - em->start));
|
|
|
|
unlock:
|
2010-05-23 23:00:55 +08:00
|
|
|
bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
|
|
|
|
inode->i_blkbits;
|
2010-05-26 23:04:10 +08:00
|
|
|
bh_result->b_size = len;
|
2010-05-23 23:00:55 +08:00
|
|
|
bh_result->b_bdev = em->bdev;
|
|
|
|
set_buffer_mapped(bh_result);
|
2012-06-19 22:59:00 +08:00
|
|
|
if (create) {
|
|
|
|
if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
|
|
|
|
set_buffer_new(bh_result);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Need to update the i_size under the extent lock so buffered
|
|
|
|
* readers will get the updated i_size when we unlock.
|
|
|
|
*/
|
|
|
|
if (start + len > i_size_read(inode))
|
|
|
|
i_size_write(inode, start + len);
|
2013-02-07 18:12:07 +08:00
|
|
|
|
Btrfs: fix extent accounting for partial direct IO writes
When doing a write using direct IO we can end up not doing the whole write
operation using the direct IO path, in that case we fallback to a buffered
write to do the remaining IO. This happens for example if the range we are
writing to contains a compressed extent.
When we do a partial write and fallback to buffered IO, due to the
existence of a compressed extent for example, we end up not adjusting the
outstanding extents counter of our inode which ends up getting decremented
twice, once by the DIO ordered extent for the partial write and once again
by btrfs_direct_IO(), resulting in an arithmetic underflow at
extent-tree.c:drop_outstanding_extent(). For example if we have:
extents [ prealloc extent ] [ compressed extent ]
offsets A B C D E
and at the moment our inode's outstanding extents counter is 0, if we do a
direct IO write against the range [B, D[ (which has a length smaller than
128Mb), we end up bumping our inode's outstanding extents counter to 1, we
create a DIO ordered extent for the range [B, C[ and then fallback to a
buffered write for the range [C, D[. The direct IO handler
(inode.c:btrfs_direct_IO()) decrements the outstanding extents counter by
1, leaving it with a value of 0, through a call to
btrfs_delalloc_release_space() and then shortly after the DIO ordered
extent finishes and calls btrfs_delalloc_release_metadata() which ends
up to attempt to decrement the inode's outstanding extents counter by 1,
resulting in an assertion failure at drop_outstanding_extent() because
the operation would result in an arithmetic underflow (0 - 1). This
produces the following trace:
[125471.336838] BTRFS: assertion failed: BTRFS_I(inode)->outstanding_extents >= num_extents, file: fs/btrfs/extent-tree.c, line: 5526
[125471.338844] ------------[ cut here ]------------
[125471.340745] kernel BUG at fs/btrfs/ctree.h:4173!
[125471.340745] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[125471.340745] Modules linked in: btrfs f2fs xfs libcrc32c dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse parport_pc acpi_cpufreq psmouse i2c_piix4 parport pcspkr serio_raw microcode processor evdev i2c_core button ext4 crc16 jbd2 mbcache sd_mod sg sr_mod cdrom ata_generic virtio_scsi ata_piix virtio_pci virtio_ring floppy libata virtio e1000 scsi_mod [last unloaded: btrfs]
[125471.340745] CPU: 10 PID: 23649 Comm: kworker/u32:1 Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1
[125471.340745] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014
[125471.340745] Workqueue: btrfs-endio-write btrfs_endio_write_helper [btrfs]
[125471.340745] task: ffff8804244fcf80 ti: ffff88040a118000 task.ti: ffff88040a118000
[125471.340745] RIP: 0010:[<ffffffffa0550da1>] [<ffffffffa0550da1>] assfail.constprop.46+0x1e/0x20 [btrfs]
[125471.340745] RSP: 0018:ffff88040a11bc78 EFLAGS: 00010296
[125471.340745] RAX: 0000000000000075 RBX: 0000000000005000 RCX: 0000000000000000
[125471.340745] RDX: ffffffff81098f93 RSI: ffffffff8147c619 RDI: 00000000ffffffff
[125471.340745] RBP: ffff88040a11bc78 R08: 0000000000000001 R09: 0000000000000000
[125471.340745] R10: ffff88040a11bc08 R11: ffffffff81651000 R12: ffff8803efb4a000
[125471.340745] R13: ffff8803efb4a000 R14: 0000000000000000 R15: ffff8802f8e33c88
[125471.340745] FS: 0000000000000000(0000) GS:ffff88043dd40000(0000) knlGS:0000000000000000
[125471.340745] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[125471.340745] CR2: 00007fae7ca86095 CR3: 0000000001a0b000 CR4: 00000000000006e0
[125471.340745] Stack:
[125471.340745] ffff88040a11bc88 ffffffffa04ca0cd ffff88040a11bcc8 ffffffffa04ceeb1
[125471.340745] ffff8802f8e33940 ffff8802c93eadb0 ffff8802f8e0bf50 ffff8803efb4a000
[125471.340745] 0000000000000000 ffff8802f8e33c88 ffff88040a11bd38 ffffffffa04eccfa
[125471.340745] Call Trace:
[125471.340745] [<ffffffffa04ca0cd>] drop_outstanding_extent+0x3d/0x6d [btrfs]
[125471.340745] [<ffffffffa04ceeb1>] btrfs_delalloc_release_metadata+0x51/0xdd [btrfs]
[125471.340745] [<ffffffffa04eccfa>] btrfs_finish_ordered_io+0x420/0x4eb [btrfs]
[125471.340745] [<ffffffffa04ecdda>] finish_ordered_fn+0x15/0x17 [btrfs]
[125471.340745] [<ffffffffa050e6e8>] normal_work_helper+0x14c/0x32a [btrfs]
[125471.340745] [<ffffffffa050e9c8>] btrfs_endio_write_helper+0x12/0x14 [btrfs]
[125471.340745] [<ffffffff81063b23>] process_one_work+0x24a/0x4ac
[125471.340745] [<ffffffff81064285>] worker_thread+0x206/0x2c2
[125471.340745] [<ffffffff8106407f>] ? rescuer_thread+0x2cb/0x2cb
[125471.340745] [<ffffffff8106407f>] ? rescuer_thread+0x2cb/0x2cb
[125471.340745] [<ffffffff8106904d>] kthread+0xef/0xf7
[125471.340745] [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24
[125471.340745] [<ffffffff8147d10f>] ret_from_fork+0x3f/0x70
[125471.340745] [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24
[125471.340745] Code: a5 55 a0 48 89 e5 e8 42 50 bc e0 0f 0b 55 89 f1 48 c7 c2 f0 a8 55 a0 48 89 fe 31 c0 48 c7 c7 14 aa 55 a0 48 89 e5 e8 22 50 bc e0 <0f> 0b 0f 1f 44 00 00 55 31 c9 ba 18 00 00 00 48 89 e5 41 56 41
[125471.340745] RIP [<ffffffffa0550da1>] assfail.constprop.46+0x1e/0x20 [btrfs]
[125471.340745] RSP <ffff88040a11bc78>
[125471.539620] ---[ end trace 144259f7838b4aa4 ]---
So fix this by ensuring we adjust the outstanding extents counter when we
do the fallback just like we do for the case where the whole write can be
done through the direct IO path.
We were also adjusting the outstanding extents counter by a constant value
of 1, which is incorrect because we were ignorning that we account extents
in BTRFS_MAX_EXTENT_SIZE units, o fix that as well.
The following test case for fstests reproduces this issue:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_xfs_io_command "falloc"
rm -f $seqres.full
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount "-o compress"
# Create a compressed extent covering the range [700K, 800K[.
$XFS_IO_PROG -f -s -c "pwrite -S 0xaa -b 100K 700K 100K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Create prealloc extent covering the range [600K, 700K[.
$XFS_IO_PROG -c "falloc 600K 100K" $SCRATCH_MNT/foo
# Write 80K of data to the range [640K, 720K[ using direct IO. This
# range covers both the prealloc extent and the compressed extent.
# Because there's a compressed extent in the range we are writing to,
# the DIO write code path ends up only writing the first 60k of data,
# which goes to the prealloc extent, and then falls back to buffered IO
# for writing the remaining 20K of data - because that remaining data
# maps to a file range containing a compressed extent.
# When falling back to buffered IO, we used to trigger an assertion when
# releasing reserved space due to bad accounting of the inode's
# outstanding extents counter, which was set to 1 but we ended up
# decrementing it by 1 twice, once through the ordered extent for the
# 60K of data we wrote using direct IO, and once through the main direct
# IO handler (inode.cbtrfs_direct_IO()) because the direct IO write
# wrote less than 80K of data (60K).
$XFS_IO_PROG -d -c "pwrite -S 0xbb -b 80K 640K 80K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Now similar test as above but for very large write operations. This
# triggers special cases for an inode's outstanding extents accounting,
# as internally btrfs logically splits extents into 128Mb units.
$XFS_IO_PROG -f -s \
-c "pwrite -S 0xaa -b 128M 258M 128M" \
-c "falloc 0 258M" \
$SCRATCH_MNT/bar | _filter_xfs_io
$XFS_IO_PROG -d -c "pwrite -S 0xbb -b 256M 3M 256M" $SCRATCH_MNT/bar \
| _filter_xfs_io
# Now verify the file contents are correct and that they are the same
# even after unmounting and mounting the fs again (or evicting the page
# cache).
#
# For file foo, all bytes in the range [0, 640K[ must have a value of
# 0x00, all bytes in the range [640K, 720K[ must have a value of 0xbb
# and all bytes in the range [720K, 800K[ must have a value of 0xaa.
#
# For file bar, all bytes in the range [0, 3M[ must havea value of 0x00,
# all bytes in the range [3M, 259M[ must have a value of 0xbb and all
# bytes in the range [259M, 386M[ must have a value of 0xaa.
#
echo "File digests before remounting the file system:"
md5sum $SCRATCH_MNT/foo | _filter_scratch
md5sum $SCRATCH_MNT/bar | _filter_scratch
_scratch_remount
echo "File digests after remounting the file system:"
md5sum $SCRATCH_MNT/foo | _filter_scratch
md5sum $SCRATCH_MNT/bar | _filter_scratch
status=0
exit
Fixes: e1cbbfa5f5aa ("Btrfs: fix outstanding_extents accounting in DIO")
Fixes: 3e05bde8c3c2 ("Btrfs: only adjust outstanding_extents when we do a short write")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-11-04 17:52:04 +08:00
|
|
|
adjust_dio_outstanding_extents(inode, dio_data, len);
|
2015-09-08 17:25:55 +08:00
|
|
|
btrfs_free_reserved_data_space(inode, start, len);
|
2015-08-28 23:40:13 +08:00
|
|
|
WARN_ON(dio_data->reserve < len);
|
|
|
|
dio_data->reserve -= len;
|
2015-12-09 03:23:20 +08:00
|
|
|
dio_data->unsubmitted_oe_range_end = start + len;
|
2015-08-28 23:40:13 +08:00
|
|
|
current->journal_info = dio_data;
|
2012-06-19 22:59:00 +08:00
|
|
|
}
|
2010-05-23 23:00:55 +08:00
|
|
|
|
2012-08-01 04:28:48 +08:00
|
|
|
/*
|
|
|
|
* In the case of write we need to clear and unlock the entire range,
|
|
|
|
* in the case of read we need to unlock only the end area that we
|
|
|
|
* aren't using if there is any left over space.
|
|
|
|
*/
|
2012-08-23 10:10:38 +08:00
|
|
|
if (lockstart < lockend) {
|
2013-02-07 18:12:07 +08:00
|
|
|
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
|
|
|
|
lockend, unlock_bits, 1, 0,
|
|
|
|
&cached_state, GFP_NOFS);
|
2012-08-23 10:10:38 +08:00
|
|
|
} else {
|
2012-08-01 04:28:48 +08:00
|
|
|
free_extent_state(cached_state);
|
2012-08-23 10:10:38 +08:00
|
|
|
}
|
2012-08-01 04:28:48 +08:00
|
|
|
|
2010-05-23 23:00:55 +08:00
|
|
|
free_extent_map(em);
|
|
|
|
|
|
|
|
return 0;
|
2012-08-01 04:28:48 +08:00
|
|
|
|
|
|
|
unlock_err:
|
|
|
|
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
|
|
|
|
unlock_bits, 1, 0, &cached_state, GFP_NOFS);
|
Btrfs: fix extent accounting for partial direct IO writes
When doing a write using direct IO we can end up not doing the whole write
operation using the direct IO path, in that case we fallback to a buffered
write to do the remaining IO. This happens for example if the range we are
writing to contains a compressed extent.
When we do a partial write and fallback to buffered IO, due to the
existence of a compressed extent for example, we end up not adjusting the
outstanding extents counter of our inode which ends up getting decremented
twice, once by the DIO ordered extent for the partial write and once again
by btrfs_direct_IO(), resulting in an arithmetic underflow at
extent-tree.c:drop_outstanding_extent(). For example if we have:
extents [ prealloc extent ] [ compressed extent ]
offsets A B C D E
and at the moment our inode's outstanding extents counter is 0, if we do a
direct IO write against the range [B, D[ (which has a length smaller than
128Mb), we end up bumping our inode's outstanding extents counter to 1, we
create a DIO ordered extent for the range [B, C[ and then fallback to a
buffered write for the range [C, D[. The direct IO handler
(inode.c:btrfs_direct_IO()) decrements the outstanding extents counter by
1, leaving it with a value of 0, through a call to
btrfs_delalloc_release_space() and then shortly after the DIO ordered
extent finishes and calls btrfs_delalloc_release_metadata() which ends
up to attempt to decrement the inode's outstanding extents counter by 1,
resulting in an assertion failure at drop_outstanding_extent() because
the operation would result in an arithmetic underflow (0 - 1). This
produces the following trace:
[125471.336838] BTRFS: assertion failed: BTRFS_I(inode)->outstanding_extents >= num_extents, file: fs/btrfs/extent-tree.c, line: 5526
[125471.338844] ------------[ cut here ]------------
[125471.340745] kernel BUG at fs/btrfs/ctree.h:4173!
[125471.340745] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[125471.340745] Modules linked in: btrfs f2fs xfs libcrc32c dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse parport_pc acpi_cpufreq psmouse i2c_piix4 parport pcspkr serio_raw microcode processor evdev i2c_core button ext4 crc16 jbd2 mbcache sd_mod sg sr_mod cdrom ata_generic virtio_scsi ata_piix virtio_pci virtio_ring floppy libata virtio e1000 scsi_mod [last unloaded: btrfs]
[125471.340745] CPU: 10 PID: 23649 Comm: kworker/u32:1 Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1
[125471.340745] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014
[125471.340745] Workqueue: btrfs-endio-write btrfs_endio_write_helper [btrfs]
[125471.340745] task: ffff8804244fcf80 ti: ffff88040a118000 task.ti: ffff88040a118000
[125471.340745] RIP: 0010:[<ffffffffa0550da1>] [<ffffffffa0550da1>] assfail.constprop.46+0x1e/0x20 [btrfs]
[125471.340745] RSP: 0018:ffff88040a11bc78 EFLAGS: 00010296
[125471.340745] RAX: 0000000000000075 RBX: 0000000000005000 RCX: 0000000000000000
[125471.340745] RDX: ffffffff81098f93 RSI: ffffffff8147c619 RDI: 00000000ffffffff
[125471.340745] RBP: ffff88040a11bc78 R08: 0000000000000001 R09: 0000000000000000
[125471.340745] R10: ffff88040a11bc08 R11: ffffffff81651000 R12: ffff8803efb4a000
[125471.340745] R13: ffff8803efb4a000 R14: 0000000000000000 R15: ffff8802f8e33c88
[125471.340745] FS: 0000000000000000(0000) GS:ffff88043dd40000(0000) knlGS:0000000000000000
[125471.340745] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[125471.340745] CR2: 00007fae7ca86095 CR3: 0000000001a0b000 CR4: 00000000000006e0
[125471.340745] Stack:
[125471.340745] ffff88040a11bc88 ffffffffa04ca0cd ffff88040a11bcc8 ffffffffa04ceeb1
[125471.340745] ffff8802f8e33940 ffff8802c93eadb0 ffff8802f8e0bf50 ffff8803efb4a000
[125471.340745] 0000000000000000 ffff8802f8e33c88 ffff88040a11bd38 ffffffffa04eccfa
[125471.340745] Call Trace:
[125471.340745] [<ffffffffa04ca0cd>] drop_outstanding_extent+0x3d/0x6d [btrfs]
[125471.340745] [<ffffffffa04ceeb1>] btrfs_delalloc_release_metadata+0x51/0xdd [btrfs]
[125471.340745] [<ffffffffa04eccfa>] btrfs_finish_ordered_io+0x420/0x4eb [btrfs]
[125471.340745] [<ffffffffa04ecdda>] finish_ordered_fn+0x15/0x17 [btrfs]
[125471.340745] [<ffffffffa050e6e8>] normal_work_helper+0x14c/0x32a [btrfs]
[125471.340745] [<ffffffffa050e9c8>] btrfs_endio_write_helper+0x12/0x14 [btrfs]
[125471.340745] [<ffffffff81063b23>] process_one_work+0x24a/0x4ac
[125471.340745] [<ffffffff81064285>] worker_thread+0x206/0x2c2
[125471.340745] [<ffffffff8106407f>] ? rescuer_thread+0x2cb/0x2cb
[125471.340745] [<ffffffff8106407f>] ? rescuer_thread+0x2cb/0x2cb
[125471.340745] [<ffffffff8106904d>] kthread+0xef/0xf7
[125471.340745] [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24
[125471.340745] [<ffffffff8147d10f>] ret_from_fork+0x3f/0x70
[125471.340745] [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24
[125471.340745] Code: a5 55 a0 48 89 e5 e8 42 50 bc e0 0f 0b 55 89 f1 48 c7 c2 f0 a8 55 a0 48 89 fe 31 c0 48 c7 c7 14 aa 55 a0 48 89 e5 e8 22 50 bc e0 <0f> 0b 0f 1f 44 00 00 55 31 c9 ba 18 00 00 00 48 89 e5 41 56 41
[125471.340745] RIP [<ffffffffa0550da1>] assfail.constprop.46+0x1e/0x20 [btrfs]
[125471.340745] RSP <ffff88040a11bc78>
[125471.539620] ---[ end trace 144259f7838b4aa4 ]---
So fix this by ensuring we adjust the outstanding extents counter when we
do the fallback just like we do for the case where the whole write can be
done through the direct IO path.
We were also adjusting the outstanding extents counter by a constant value
of 1, which is incorrect because we were ignorning that we account extents
in BTRFS_MAX_EXTENT_SIZE units, o fix that as well.
The following test case for fstests reproduces this issue:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_xfs_io_command "falloc"
rm -f $seqres.full
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount "-o compress"
# Create a compressed extent covering the range [700K, 800K[.
$XFS_IO_PROG -f -s -c "pwrite -S 0xaa -b 100K 700K 100K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Create prealloc extent covering the range [600K, 700K[.
$XFS_IO_PROG -c "falloc 600K 100K" $SCRATCH_MNT/foo
# Write 80K of data to the range [640K, 720K[ using direct IO. This
# range covers both the prealloc extent and the compressed extent.
# Because there's a compressed extent in the range we are writing to,
# the DIO write code path ends up only writing the first 60k of data,
# which goes to the prealloc extent, and then falls back to buffered IO
# for writing the remaining 20K of data - because that remaining data
# maps to a file range containing a compressed extent.
# When falling back to buffered IO, we used to trigger an assertion when
# releasing reserved space due to bad accounting of the inode's
# outstanding extents counter, which was set to 1 but we ended up
# decrementing it by 1 twice, once through the ordered extent for the
# 60K of data we wrote using direct IO, and once through the main direct
# IO handler (inode.cbtrfs_direct_IO()) because the direct IO write
# wrote less than 80K of data (60K).
$XFS_IO_PROG -d -c "pwrite -S 0xbb -b 80K 640K 80K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Now similar test as above but for very large write operations. This
# triggers special cases for an inode's outstanding extents accounting,
# as internally btrfs logically splits extents into 128Mb units.
$XFS_IO_PROG -f -s \
-c "pwrite -S 0xaa -b 128M 258M 128M" \
-c "falloc 0 258M" \
$SCRATCH_MNT/bar | _filter_xfs_io
$XFS_IO_PROG -d -c "pwrite -S 0xbb -b 256M 3M 256M" $SCRATCH_MNT/bar \
| _filter_xfs_io
# Now verify the file contents are correct and that they are the same
# even after unmounting and mounting the fs again (or evicting the page
# cache).
#
# For file foo, all bytes in the range [0, 640K[ must have a value of
# 0x00, all bytes in the range [640K, 720K[ must have a value of 0xbb
# and all bytes in the range [720K, 800K[ must have a value of 0xaa.
#
# For file bar, all bytes in the range [0, 3M[ must havea value of 0x00,
# all bytes in the range [3M, 259M[ must have a value of 0xbb and all
# bytes in the range [259M, 386M[ must have a value of 0xaa.
#
echo "File digests before remounting the file system:"
md5sum $SCRATCH_MNT/foo | _filter_scratch
md5sum $SCRATCH_MNT/bar | _filter_scratch
_scratch_remount
echo "File digests after remounting the file system:"
md5sum $SCRATCH_MNT/foo | _filter_scratch
md5sum $SCRATCH_MNT/bar | _filter_scratch
status=0
exit
Fixes: e1cbbfa5f5aa ("Btrfs: fix outstanding_extents accounting in DIO")
Fixes: 3e05bde8c3c2 ("Btrfs: only adjust outstanding_extents when we do a short write")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-11-04 17:52:04 +08:00
|
|
|
err:
|
2015-08-28 23:40:13 +08:00
|
|
|
if (dio_data)
|
|
|
|
current->journal_info = dio_data;
|
Btrfs: fix extent accounting for partial direct IO writes
When doing a write using direct IO we can end up not doing the whole write
operation using the direct IO path, in that case we fallback to a buffered
write to do the remaining IO. This happens for example if the range we are
writing to contains a compressed extent.
When we do a partial write and fallback to buffered IO, due to the
existence of a compressed extent for example, we end up not adjusting the
outstanding extents counter of our inode which ends up getting decremented
twice, once by the DIO ordered extent for the partial write and once again
by btrfs_direct_IO(), resulting in an arithmetic underflow at
extent-tree.c:drop_outstanding_extent(). For example if we have:
extents [ prealloc extent ] [ compressed extent ]
offsets A B C D E
and at the moment our inode's outstanding extents counter is 0, if we do a
direct IO write against the range [B, D[ (which has a length smaller than
128Mb), we end up bumping our inode's outstanding extents counter to 1, we
create a DIO ordered extent for the range [B, C[ and then fallback to a
buffered write for the range [C, D[. The direct IO handler
(inode.c:btrfs_direct_IO()) decrements the outstanding extents counter by
1, leaving it with a value of 0, through a call to
btrfs_delalloc_release_space() and then shortly after the DIO ordered
extent finishes and calls btrfs_delalloc_release_metadata() which ends
up to attempt to decrement the inode's outstanding extents counter by 1,
resulting in an assertion failure at drop_outstanding_extent() because
the operation would result in an arithmetic underflow (0 - 1). This
produces the following trace:
[125471.336838] BTRFS: assertion failed: BTRFS_I(inode)->outstanding_extents >= num_extents, file: fs/btrfs/extent-tree.c, line: 5526
[125471.338844] ------------[ cut here ]------------
[125471.340745] kernel BUG at fs/btrfs/ctree.h:4173!
[125471.340745] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[125471.340745] Modules linked in: btrfs f2fs xfs libcrc32c dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse parport_pc acpi_cpufreq psmouse i2c_piix4 parport pcspkr serio_raw microcode processor evdev i2c_core button ext4 crc16 jbd2 mbcache sd_mod sg sr_mod cdrom ata_generic virtio_scsi ata_piix virtio_pci virtio_ring floppy libata virtio e1000 scsi_mod [last unloaded: btrfs]
[125471.340745] CPU: 10 PID: 23649 Comm: kworker/u32:1 Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1
[125471.340745] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014
[125471.340745] Workqueue: btrfs-endio-write btrfs_endio_write_helper [btrfs]
[125471.340745] task: ffff8804244fcf80 ti: ffff88040a118000 task.ti: ffff88040a118000
[125471.340745] RIP: 0010:[<ffffffffa0550da1>] [<ffffffffa0550da1>] assfail.constprop.46+0x1e/0x20 [btrfs]
[125471.340745] RSP: 0018:ffff88040a11bc78 EFLAGS: 00010296
[125471.340745] RAX: 0000000000000075 RBX: 0000000000005000 RCX: 0000000000000000
[125471.340745] RDX: ffffffff81098f93 RSI: ffffffff8147c619 RDI: 00000000ffffffff
[125471.340745] RBP: ffff88040a11bc78 R08: 0000000000000001 R09: 0000000000000000
[125471.340745] R10: ffff88040a11bc08 R11: ffffffff81651000 R12: ffff8803efb4a000
[125471.340745] R13: ffff8803efb4a000 R14: 0000000000000000 R15: ffff8802f8e33c88
[125471.340745] FS: 0000000000000000(0000) GS:ffff88043dd40000(0000) knlGS:0000000000000000
[125471.340745] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[125471.340745] CR2: 00007fae7ca86095 CR3: 0000000001a0b000 CR4: 00000000000006e0
[125471.340745] Stack:
[125471.340745] ffff88040a11bc88 ffffffffa04ca0cd ffff88040a11bcc8 ffffffffa04ceeb1
[125471.340745] ffff8802f8e33940 ffff8802c93eadb0 ffff8802f8e0bf50 ffff8803efb4a000
[125471.340745] 0000000000000000 ffff8802f8e33c88 ffff88040a11bd38 ffffffffa04eccfa
[125471.340745] Call Trace:
[125471.340745] [<ffffffffa04ca0cd>] drop_outstanding_extent+0x3d/0x6d [btrfs]
[125471.340745] [<ffffffffa04ceeb1>] btrfs_delalloc_release_metadata+0x51/0xdd [btrfs]
[125471.340745] [<ffffffffa04eccfa>] btrfs_finish_ordered_io+0x420/0x4eb [btrfs]
[125471.340745] [<ffffffffa04ecdda>] finish_ordered_fn+0x15/0x17 [btrfs]
[125471.340745] [<ffffffffa050e6e8>] normal_work_helper+0x14c/0x32a [btrfs]
[125471.340745] [<ffffffffa050e9c8>] btrfs_endio_write_helper+0x12/0x14 [btrfs]
[125471.340745] [<ffffffff81063b23>] process_one_work+0x24a/0x4ac
[125471.340745] [<ffffffff81064285>] worker_thread+0x206/0x2c2
[125471.340745] [<ffffffff8106407f>] ? rescuer_thread+0x2cb/0x2cb
[125471.340745] [<ffffffff8106407f>] ? rescuer_thread+0x2cb/0x2cb
[125471.340745] [<ffffffff8106904d>] kthread+0xef/0xf7
[125471.340745] [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24
[125471.340745] [<ffffffff8147d10f>] ret_from_fork+0x3f/0x70
[125471.340745] [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24
[125471.340745] Code: a5 55 a0 48 89 e5 e8 42 50 bc e0 0f 0b 55 89 f1 48 c7 c2 f0 a8 55 a0 48 89 fe 31 c0 48 c7 c7 14 aa 55 a0 48 89 e5 e8 22 50 bc e0 <0f> 0b 0f 1f 44 00 00 55 31 c9 ba 18 00 00 00 48 89 e5 41 56 41
[125471.340745] RIP [<ffffffffa0550da1>] assfail.constprop.46+0x1e/0x20 [btrfs]
[125471.340745] RSP <ffff88040a11bc78>
[125471.539620] ---[ end trace 144259f7838b4aa4 ]---
So fix this by ensuring we adjust the outstanding extents counter when we
do the fallback just like we do for the case where the whole write can be
done through the direct IO path.
We were also adjusting the outstanding extents counter by a constant value
of 1, which is incorrect because we were ignorning that we account extents
in BTRFS_MAX_EXTENT_SIZE units, o fix that as well.
The following test case for fstests reproduces this issue:
seq=`basename $0`
seqres=$RESULT_DIR/$seq
echo "QA output created by $seq"
tmp=/tmp/$$
status=1 # failure is the default!
trap "_cleanup; exit \$status" 0 1 2 3 15
_cleanup()
{
rm -f $tmp.*
}
# get standard environment, filters and checks
. ./common/rc
. ./common/filter
# real QA test starts here
_need_to_be_root
_supported_fs btrfs
_supported_os Linux
_require_scratch
_require_xfs_io_command "falloc"
rm -f $seqres.full
_scratch_mkfs >>$seqres.full 2>&1
_scratch_mount "-o compress"
# Create a compressed extent covering the range [700K, 800K[.
$XFS_IO_PROG -f -s -c "pwrite -S 0xaa -b 100K 700K 100K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Create prealloc extent covering the range [600K, 700K[.
$XFS_IO_PROG -c "falloc 600K 100K" $SCRATCH_MNT/foo
# Write 80K of data to the range [640K, 720K[ using direct IO. This
# range covers both the prealloc extent and the compressed extent.
# Because there's a compressed extent in the range we are writing to,
# the DIO write code path ends up only writing the first 60k of data,
# which goes to the prealloc extent, and then falls back to buffered IO
# for writing the remaining 20K of data - because that remaining data
# maps to a file range containing a compressed extent.
# When falling back to buffered IO, we used to trigger an assertion when
# releasing reserved space due to bad accounting of the inode's
# outstanding extents counter, which was set to 1 but we ended up
# decrementing it by 1 twice, once through the ordered extent for the
# 60K of data we wrote using direct IO, and once through the main direct
# IO handler (inode.cbtrfs_direct_IO()) because the direct IO write
# wrote less than 80K of data (60K).
$XFS_IO_PROG -d -c "pwrite -S 0xbb -b 80K 640K 80K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Now similar test as above but for very large write operations. This
# triggers special cases for an inode's outstanding extents accounting,
# as internally btrfs logically splits extents into 128Mb units.
$XFS_IO_PROG -f -s \
-c "pwrite -S 0xaa -b 128M 258M 128M" \
-c "falloc 0 258M" \
$SCRATCH_MNT/bar | _filter_xfs_io
$XFS_IO_PROG -d -c "pwrite -S 0xbb -b 256M 3M 256M" $SCRATCH_MNT/bar \
| _filter_xfs_io
# Now verify the file contents are correct and that they are the same
# even after unmounting and mounting the fs again (or evicting the page
# cache).
#
# For file foo, all bytes in the range [0, 640K[ must have a value of
# 0x00, all bytes in the range [640K, 720K[ must have a value of 0xbb
# and all bytes in the range [720K, 800K[ must have a value of 0xaa.
#
# For file bar, all bytes in the range [0, 3M[ must havea value of 0x00,
# all bytes in the range [3M, 259M[ must have a value of 0xbb and all
# bytes in the range [259M, 386M[ must have a value of 0xaa.
#
echo "File digests before remounting the file system:"
md5sum $SCRATCH_MNT/foo | _filter_scratch
md5sum $SCRATCH_MNT/bar | _filter_scratch
_scratch_remount
echo "File digests after remounting the file system:"
md5sum $SCRATCH_MNT/foo | _filter_scratch
md5sum $SCRATCH_MNT/bar | _filter_scratch
status=0
exit
Fixes: e1cbbfa5f5aa ("Btrfs: fix outstanding_extents accounting in DIO")
Fixes: 3e05bde8c3c2 ("Btrfs: only adjust outstanding_extents when we do a short write")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-11-04 17:52:04 +08:00
|
|
|
/*
|
|
|
|
* Compensate the delalloc release we do in btrfs_direct_IO() when we
|
|
|
|
* write less data then expected, so that we don't underflow our inode's
|
|
|
|
* outstanding extents counter.
|
|
|
|
*/
|
|
|
|
if (create && dio_data)
|
|
|
|
adjust_dio_outstanding_extents(inode, dio_data, len);
|
|
|
|
|
2012-08-01 04:28:48 +08:00
|
|
|
return ret;
|
2010-05-23 23:00:55 +08:00
|
|
|
}
|
|
|
|
|
2014-09-12 18:44:03 +08:00
|
|
|
static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio,
|
|
|
|
int rw, int mirror_num)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
BUG_ON(rw & REQ_WRITE);
|
|
|
|
|
|
|
|
bio_get(bio);
|
|
|
|
|
|
|
|
ret = btrfs_bio_wq_end_io(root->fs_info, bio,
|
|
|
|
BTRFS_WQ_ENDIO_DIO_REPAIR);
|
|
|
|
if (ret)
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
|
|
|
|
err:
|
|
|
|
bio_put(bio);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int btrfs_check_dio_repairable(struct inode *inode,
|
|
|
|
struct bio *failed_bio,
|
|
|
|
struct io_failure_record *failrec,
|
|
|
|
int failed_mirror)
|
|
|
|
{
|
|
|
|
int num_copies;
|
|
|
|
|
|
|
|
num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
|
|
|
|
failrec->logical, failrec->len);
|
|
|
|
if (num_copies == 1) {
|
|
|
|
/*
|
|
|
|
* we only have a single copy of the data, so don't bother with
|
|
|
|
* all the retry and error correction code that follows. no
|
|
|
|
* matter what the error is, it is very likely to persist.
|
|
|
|
*/
|
|
|
|
pr_debug("Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
|
|
|
|
num_copies, failrec->this_mirror, failed_mirror);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
failrec->failed_mirror = failed_mirror;
|
|
|
|
failrec->this_mirror++;
|
|
|
|
if (failrec->this_mirror == failed_mirror)
|
|
|
|
failrec->this_mirror++;
|
|
|
|
|
|
|
|
if (failrec->this_mirror > num_copies) {
|
|
|
|
pr_debug("Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
|
|
|
|
num_copies, failrec->this_mirror, failed_mirror);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int dio_read_error(struct inode *inode, struct bio *failed_bio,
|
|
|
|
struct page *page, u64 start, u64 end,
|
|
|
|
int failed_mirror, bio_end_io_t *repair_endio,
|
|
|
|
void *repair_arg)
|
|
|
|
{
|
|
|
|
struct io_failure_record *failrec;
|
|
|
|
struct bio *bio;
|
|
|
|
int isector;
|
|
|
|
int read_mode;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
BUG_ON(failed_bio->bi_rw & REQ_WRITE);
|
|
|
|
|
|
|
|
ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
|
|
|
|
failed_mirror);
|
|
|
|
if (!ret) {
|
|
|
|
free_io_failure(inode, failrec);
|
|
|
|
return -EIO;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (failed_bio->bi_vcnt > 1)
|
|
|
|
read_mode = READ_SYNC | REQ_FAILFAST_DEV;
|
|
|
|
else
|
|
|
|
read_mode = READ_SYNC;
|
|
|
|
|
|
|
|
isector = start - btrfs_io_bio(failed_bio)->logical;
|
|
|
|
isector >>= inode->i_sb->s_blocksize_bits;
|
|
|
|
bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
|
|
|
|
0, isector, repair_endio, repair_arg);
|
|
|
|
if (!bio) {
|
|
|
|
free_io_failure(inode, failrec);
|
|
|
|
return -EIO;
|
|
|
|
}
|
|
|
|
|
|
|
|
btrfs_debug(BTRFS_I(inode)->root->fs_info,
|
|
|
|
"Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n",
|
|
|
|
read_mode, failrec->this_mirror, failrec->in_validation);
|
|
|
|
|
|
|
|
ret = submit_dio_repair_bio(inode, bio, read_mode,
|
|
|
|
failrec->this_mirror);
|
|
|
|
if (ret) {
|
|
|
|
free_io_failure(inode, failrec);
|
|
|
|
bio_put(bio);
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct btrfs_retry_complete {
|
|
|
|
struct completion done;
|
|
|
|
struct inode *inode;
|
|
|
|
u64 start;
|
|
|
|
int uptodate;
|
|
|
|
};
|
|
|
|
|
2015-07-20 21:29:37 +08:00
|
|
|
static void btrfs_retry_endio_nocsum(struct bio *bio)
|
2014-09-12 18:44:03 +08:00
|
|
|
{
|
|
|
|
struct btrfs_retry_complete *done = bio->bi_private;
|
|
|
|
struct bio_vec *bvec;
|
|
|
|
int i;
|
|
|
|
|
2015-07-20 21:29:37 +08:00
|
|
|
if (bio->bi_error)
|
2014-09-12 18:44:03 +08:00
|
|
|
goto end;
|
|
|
|
|
|
|
|
done->uptodate = 1;
|
|
|
|
bio_for_each_segment_all(bvec, bio, i)
|
|
|
|
clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
|
|
|
|
end:
|
|
|
|
complete(&done->done);
|
|
|
|
bio_put(bio);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int __btrfs_correct_data_nocsum(struct inode *inode,
|
|
|
|
struct btrfs_io_bio *io_bio)
|
2010-05-23 23:00:55 +08:00
|
|
|
{
|
2013-11-08 04:20:26 +08:00
|
|
|
struct bio_vec *bvec;
|
2014-09-12 18:44:03 +08:00
|
|
|
struct btrfs_retry_complete done;
|
2010-05-23 23:00:55 +08:00
|
|
|
u64 start;
|
2013-11-08 04:20:26 +08:00
|
|
|
int i;
|
2014-09-12 18:43:56 +08:00
|
|
|
int ret;
|
2010-05-23 23:00:55 +08:00
|
|
|
|
2014-09-12 18:44:03 +08:00
|
|
|
start = io_bio->logical;
|
|
|
|
done.inode = inode;
|
|
|
|
|
|
|
|
bio_for_each_segment_all(bvec, &io_bio->bio, i) {
|
|
|
|
try_again:
|
|
|
|
done.uptodate = 0;
|
|
|
|
done.start = start;
|
|
|
|
init_completion(&done.done);
|
|
|
|
|
|
|
|
ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
|
|
|
|
start + bvec->bv_len - 1,
|
|
|
|
io_bio->mirror_num,
|
|
|
|
btrfs_retry_endio_nocsum, &done);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
wait_for_completion(&done.done);
|
|
|
|
|
|
|
|
if (!done.uptodate) {
|
|
|
|
/* We might have another mirror, so try again */
|
|
|
|
goto try_again;
|
|
|
|
}
|
|
|
|
|
|
|
|
start += bvec->bv_len;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-07-20 21:29:37 +08:00
|
|
|
static void btrfs_retry_endio(struct bio *bio)
|
2014-09-12 18:44:03 +08:00
|
|
|
{
|
|
|
|
struct btrfs_retry_complete *done = bio->bi_private;
|
|
|
|
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
|
|
|
|
struct bio_vec *bvec;
|
|
|
|
int uptodate;
|
|
|
|
int ret;
|
|
|
|
int i;
|
|
|
|
|
2015-07-20 21:29:37 +08:00
|
|
|
if (bio->bi_error)
|
2014-09-12 18:44:03 +08:00
|
|
|
goto end;
|
|
|
|
|
|
|
|
uptodate = 1;
|
|
|
|
bio_for_each_segment_all(bvec, bio, i) {
|
|
|
|
ret = __readpage_endio_check(done->inode, io_bio, i,
|
|
|
|
bvec->bv_page, 0,
|
|
|
|
done->start, bvec->bv_len);
|
|
|
|
if (!ret)
|
|
|
|
clean_io_failure(done->inode, done->start,
|
|
|
|
bvec->bv_page, 0);
|
|
|
|
else
|
|
|
|
uptodate = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
done->uptodate = uptodate;
|
|
|
|
end:
|
|
|
|
complete(&done->done);
|
|
|
|
bio_put(bio);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int __btrfs_subio_endio_read(struct inode *inode,
|
|
|
|
struct btrfs_io_bio *io_bio, int err)
|
|
|
|
{
|
|
|
|
struct bio_vec *bvec;
|
|
|
|
struct btrfs_retry_complete done;
|
|
|
|
u64 start;
|
|
|
|
u64 offset = 0;
|
|
|
|
int i;
|
|
|
|
int ret;
|
2014-09-12 18:43:55 +08:00
|
|
|
|
2014-09-12 18:44:03 +08:00
|
|
|
err = 0;
|
2014-09-12 18:43:56 +08:00
|
|
|
start = io_bio->logical;
|
2014-09-12 18:44:03 +08:00
|
|
|
done.inode = inode;
|
|
|
|
|
2014-09-12 18:43:56 +08:00
|
|
|
bio_for_each_segment_all(bvec, &io_bio->bio, i) {
|
2014-09-12 18:43:55 +08:00
|
|
|
ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
|
|
|
|
0, start, bvec->bv_len);
|
2014-09-12 18:44:03 +08:00
|
|
|
if (likely(!ret))
|
|
|
|
goto next;
|
|
|
|
try_again:
|
|
|
|
done.uptodate = 0;
|
|
|
|
done.start = start;
|
|
|
|
init_completion(&done.done);
|
|
|
|
|
|
|
|
ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
|
|
|
|
start + bvec->bv_len - 1,
|
|
|
|
io_bio->mirror_num,
|
|
|
|
btrfs_retry_endio, &done);
|
|
|
|
if (ret) {
|
|
|
|
err = ret;
|
|
|
|
goto next;
|
|
|
|
}
|
|
|
|
|
|
|
|
wait_for_completion(&done.done);
|
|
|
|
|
|
|
|
if (!done.uptodate) {
|
|
|
|
/* We might have another mirror, so try again */
|
|
|
|
goto try_again;
|
|
|
|
}
|
|
|
|
next:
|
|
|
|
offset += bvec->bv_len;
|
2010-05-23 23:00:55 +08:00
|
|
|
start += bvec->bv_len;
|
2013-11-08 04:20:26 +08:00
|
|
|
}
|
2014-09-12 18:43:56 +08:00
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2014-09-12 18:44:03 +08:00
|
|
|
static int btrfs_subio_endio_read(struct inode *inode,
|
|
|
|
struct btrfs_io_bio *io_bio, int err)
|
|
|
|
{
|
|
|
|
bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
|
|
|
|
|
|
|
|
if (skip_csum) {
|
|
|
|
if (unlikely(err))
|
|
|
|
return __btrfs_correct_data_nocsum(inode, io_bio);
|
|
|
|
else
|
|
|
|
return 0;
|
|
|
|
} else {
|
|
|
|
return __btrfs_subio_endio_read(inode, io_bio, err);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-07-20 21:29:37 +08:00
|
|
|
static void btrfs_endio_direct_read(struct bio *bio)
|
2014-09-12 18:43:56 +08:00
|
|
|
{
|
|
|
|
struct btrfs_dio_private *dip = bio->bi_private;
|
|
|
|
struct inode *inode = dip->inode;
|
|
|
|
struct bio *dio_bio;
|
|
|
|
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
|
2015-07-20 21:29:37 +08:00
|
|
|
int err = bio->bi_error;
|
2014-09-12 18:43:56 +08:00
|
|
|
|
2014-09-12 18:44:03 +08:00
|
|
|
if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
|
|
|
|
err = btrfs_subio_endio_read(inode, io_bio, err);
|
2014-09-12 18:43:56 +08:00
|
|
|
|
2010-05-23 23:00:55 +08:00
|
|
|
unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
|
2012-03-01 21:57:19 +08:00
|
|
|
dip->logical_offset + dip->bytes - 1);
|
2013-05-18 06:30:14 +08:00
|
|
|
dio_bio = dip->dio_bio;
|
2010-05-23 23:00:55 +08:00
|
|
|
|
|
|
|
kfree(dip);
|
2011-03-22 23:05:07 +08:00
|
|
|
|
2015-07-20 21:29:37 +08:00
|
|
|
dio_end_io(dio_bio, bio->bi_error);
|
2014-09-12 18:43:54 +08:00
|
|
|
|
|
|
|
if (io_bio->end_io)
|
|
|
|
io_bio->end_io(io_bio, err);
|
2013-05-18 06:30:14 +08:00
|
|
|
bio_put(bio);
|
2010-05-23 23:00:55 +08:00
|
|
|
}
|
|
|
|
|
2015-11-25 00:23:54 +08:00
|
|
|
static void btrfs_endio_direct_write_update_ordered(struct inode *inode,
|
|
|
|
const u64 offset,
|
|
|
|
const u64 bytes,
|
|
|
|
const int uptodate)
|
2010-05-23 23:00:55 +08:00
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_ordered_extent *ordered = NULL;
|
2015-11-25 00:23:54 +08:00
|
|
|
u64 ordered_offset = offset;
|
|
|
|
u64 ordered_bytes = bytes;
|
2010-05-23 23:00:55 +08:00
|
|
|
int ret;
|
|
|
|
|
2010-11-29 08:56:33 +08:00
|
|
|
again:
|
|
|
|
ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
|
|
|
|
&ordered_offset,
|
2015-07-20 21:29:37 +08:00
|
|
|
ordered_bytes,
|
2015-11-25 00:23:54 +08:00
|
|
|
uptodate);
|
2010-05-23 23:00:55 +08:00
|
|
|
if (!ret)
|
2010-11-29 08:56:33 +08:00
|
|
|
goto out_test;
|
2010-05-23 23:00:55 +08:00
|
|
|
|
Btrfs: fix task hang under heavy compressed write
This has been reported and discussed for a long time, and this hang occurs in
both 3.15 and 3.16.
Btrfs now migrates to use kernel workqueue, but it introduces this hang problem.
Btrfs has a kind of work queued as an ordered way, which means that its
ordered_func() must be processed in the way of FIFO, so it usually looks like --
normal_work_helper(arg)
work = container_of(arg, struct btrfs_work, normal_work);
work->func() <---- (we name it work X)
for ordered_work in wq->ordered_list
ordered_work->ordered_func()
ordered_work->ordered_free()
The hang is a rare case, first when we find free space, we get an uncached block
group, then we go to read its free space cache inode for free space information,
so it will
file a readahead request
btrfs_readpages()
for page that is not in page cache
__do_readpage()
submit_extent_page()
btrfs_submit_bio_hook()
btrfs_bio_wq_end_io()
submit_bio()
end_workqueue_bio() <--(ret by the 1st endio)
queue a work(named work Y) for the 2nd
also the real endio()
So the hang occurs when work Y's work_struct and work X's work_struct happens
to share the same address.
A bit more explanation,
A,B,C -- struct btrfs_work
arg -- struct work_struct
kthread:
worker_thread()
pick up a work_struct from @worklist
process_one_work(arg)
worker->current_work = arg; <-- arg is A->normal_work
worker->current_func(arg)
normal_work_helper(arg)
A = container_of(arg, struct btrfs_work, normal_work);
A->func()
A->ordered_func()
A->ordered_free() <-- A gets freed
B->ordered_func()
submit_compressed_extents()
find_free_extent()
load_free_space_inode()
... <-- (the above readhead stack)
end_workqueue_bio()
btrfs_queue_work(work C)
B->ordered_free()
As if work A has a high priority in wq->ordered_list and there are more ordered
works queued after it, such as B->ordered_func(), its memory could have been
freed before normal_work_helper() returns, which means that kernel workqueue
code worker_thread() still has worker->current_work pointer to be work
A->normal_work's, ie. arg's address.
Meanwhile, work C is allocated after work A is freed, work C->normal_work
and work A->normal_work are likely to share the same address(I confirmed this
with ftrace output, so I'm not just guessing, it's rare though).
When another kthread picks up work C->normal_work to process, and finds our
kthread is processing it(see find_worker_executing_work()), it'll think
work C as a collision and skip then, which ends up nobody processing work C.
So the situation is that our kthread is waiting forever on work C.
Besides, there're other cases that can lead to deadlock, but the real problem
is that all btrfs workqueue shares one work->func, -- normal_work_helper,
so this makes each workqueue to have its own helper function, but only a
wraper pf normal_work_helper.
With this patch, I no long hit the above hang.
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-15 23:36:53 +08:00
|
|
|
btrfs_init_work(&ordered->work, btrfs_endio_write_helper,
|
|
|
|
finish_ordered_fn, NULL, NULL);
|
2014-02-28 10:46:10 +08:00
|
|
|
btrfs_queue_work(root->fs_info->endio_write_workers,
|
|
|
|
&ordered->work);
|
2010-11-29 08:56:33 +08:00
|
|
|
out_test:
|
|
|
|
/*
|
|
|
|
* our bio might span multiple ordered extents. If we haven't
|
|
|
|
* completed the accounting for the whole dio, go back and try again
|
|
|
|
*/
|
2015-11-25 00:23:54 +08:00
|
|
|
if (ordered_offset < offset + bytes) {
|
|
|
|
ordered_bytes = offset + bytes - ordered_offset;
|
2012-05-03 02:00:54 +08:00
|
|
|
ordered = NULL;
|
2010-11-29 08:56:33 +08:00
|
|
|
goto again;
|
|
|
|
}
|
2015-11-25 00:23:54 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void btrfs_endio_direct_write(struct bio *bio)
|
|
|
|
{
|
|
|
|
struct btrfs_dio_private *dip = bio->bi_private;
|
|
|
|
struct bio *dio_bio = dip->dio_bio;
|
|
|
|
|
|
|
|
btrfs_endio_direct_write_update_ordered(dip->inode,
|
|
|
|
dip->logical_offset,
|
|
|
|
dip->bytes,
|
|
|
|
!bio->bi_error);
|
2010-05-23 23:00:55 +08:00
|
|
|
|
|
|
|
kfree(dip);
|
2011-03-22 23:05:07 +08:00
|
|
|
|
2015-07-20 21:29:37 +08:00
|
|
|
dio_end_io(dio_bio, bio->bi_error);
|
2013-05-18 06:30:14 +08:00
|
|
|
bio_put(bio);
|
2010-05-23 23:00:55 +08:00
|
|
|
}
|
|
|
|
|
2010-05-25 21:48:28 +08:00
|
|
|
static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
|
|
|
|
struct bio *bio, int mirror_num,
|
|
|
|
unsigned long bio_flags, u64 offset)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
|
2012-03-12 23:03:00 +08:00
|
|
|
BUG_ON(ret); /* -ENOMEM */
|
2010-05-25 21:48:28 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-07-20 21:29:37 +08:00
|
|
|
static void btrfs_end_dio_bio(struct bio *bio)
|
2010-11-22 11:04:43 +08:00
|
|
|
{
|
|
|
|
struct btrfs_dio_private *dip = bio->bi_private;
|
2015-07-20 21:29:37 +08:00
|
|
|
int err = bio->bi_error;
|
2010-11-22 11:04:43 +08:00
|
|
|
|
2014-09-12 18:44:03 +08:00
|
|
|
if (err)
|
|
|
|
btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
|
|
|
|
"direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
|
|
|
|
btrfs_ino(dip->inode), bio->bi_rw,
|
|
|
|
(unsigned long long)bio->bi_iter.bi_sector,
|
|
|
|
bio->bi_iter.bi_size, err);
|
|
|
|
|
|
|
|
if (dip->subio_endio)
|
|
|
|
err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
|
2014-09-12 18:43:56 +08:00
|
|
|
|
|
|
|
if (err) {
|
2010-11-22 11:04:43 +08:00
|
|
|
dip->errors = 1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* before atomic variable goto zero, we must make sure
|
|
|
|
* dip->errors is perceived to be set.
|
|
|
|
*/
|
2014-03-18 01:06:10 +08:00
|
|
|
smp_mb__before_atomic();
|
2010-11-22 11:04:43 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* if there are more bios still pending for this dio, just exit */
|
|
|
|
if (!atomic_dec_and_test(&dip->pending_bios))
|
|
|
|
goto out;
|
|
|
|
|
2013-05-18 06:30:14 +08:00
|
|
|
if (dip->errors) {
|
2010-11-22 11:04:43 +08:00
|
|
|
bio_io_error(dip->orig_bio);
|
2013-05-18 06:30:14 +08:00
|
|
|
} else {
|
2015-07-20 21:29:37 +08:00
|
|
|
dip->dio_bio->bi_error = 0;
|
|
|
|
bio_endio(dip->orig_bio);
|
2010-11-22 11:04:43 +08:00
|
|
|
}
|
|
|
|
out:
|
|
|
|
bio_put(bio);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
|
|
|
|
u64 first_sector, gfp_t gfp_flags)
|
|
|
|
{
|
2015-07-03 04:57:22 +08:00
|
|
|
struct bio *bio;
|
2015-09-06 06:14:43 +08:00
|
|
|
bio = btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags);
|
2015-07-03 04:57:22 +08:00
|
|
|
if (bio)
|
|
|
|
bio_associate_current(bio);
|
|
|
|
return bio;
|
2010-11-22 11:04:43 +08:00
|
|
|
}
|
|
|
|
|
2014-09-12 18:43:56 +08:00
|
|
|
static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
|
|
|
|
struct inode *inode,
|
|
|
|
struct btrfs_dio_private *dip,
|
|
|
|
struct bio *bio,
|
|
|
|
u64 file_offset)
|
|
|
|
{
|
|
|
|
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
|
|
|
|
struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We load all the csum data we need when we submit
|
|
|
|
* the first bio to reduce the csum tree search and
|
|
|
|
* contention.
|
|
|
|
*/
|
|
|
|
if (dip->logical_offset == file_offset) {
|
|
|
|
ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio,
|
|
|
|
file_offset);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (bio == dip->orig_bio)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
file_offset -= dip->logical_offset;
|
|
|
|
file_offset >>= inode->i_sb->s_blocksize_bits;
|
|
|
|
io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2010-11-22 11:04:43 +08:00
|
|
|
static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
|
|
|
|
int rw, u64 file_offset, int skip_sum,
|
2012-08-04 04:49:19 +08:00
|
|
|
int async_submit)
|
2010-11-22 11:04:43 +08:00
|
|
|
{
|
2013-07-25 19:22:34 +08:00
|
|
|
struct btrfs_dio_private *dip = bio->bi_private;
|
2010-11-22 11:04:43 +08:00
|
|
|
int write = rw & REQ_WRITE;
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
int ret;
|
|
|
|
|
2012-11-17 02:56:32 +08:00
|
|
|
if (async_submit)
|
|
|
|
async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
|
|
|
|
|
2010-11-22 11:04:43 +08:00
|
|
|
bio_get(bio);
|
2012-05-03 02:00:54 +08:00
|
|
|
|
|
|
|
if (!write) {
|
2014-07-30 06:25:45 +08:00
|
|
|
ret = btrfs_bio_wq_end_io(root->fs_info, bio,
|
|
|
|
BTRFS_WQ_ENDIO_DATA);
|
2012-05-03 02:00:54 +08:00
|
|
|
if (ret)
|
|
|
|
goto err;
|
|
|
|
}
|
2010-11-22 11:04:43 +08:00
|
|
|
|
2011-04-07 02:41:34 +08:00
|
|
|
if (skip_sum)
|
|
|
|
goto map;
|
|
|
|
|
|
|
|
if (write && async_submit) {
|
2010-11-22 11:04:43 +08:00
|
|
|
ret = btrfs_wq_submit_bio(root->fs_info,
|
|
|
|
inode, rw, bio, 0, 0,
|
|
|
|
file_offset,
|
|
|
|
__btrfs_submit_bio_start_direct_io,
|
|
|
|
__btrfs_submit_bio_done);
|
|
|
|
goto err;
|
2011-04-07 02:41:34 +08:00
|
|
|
} else if (write) {
|
|
|
|
/*
|
|
|
|
* If we aren't doing async submit, calculate the csum of the
|
|
|
|
* bio now.
|
|
|
|
*/
|
|
|
|
ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
|
|
|
|
if (ret)
|
|
|
|
goto err;
|
2014-09-12 18:43:54 +08:00
|
|
|
} else {
|
2014-09-12 18:43:56 +08:00
|
|
|
ret = btrfs_lookup_and_bind_dio_csum(root, inode, dip, bio,
|
|
|
|
file_offset);
|
2011-03-01 14:48:31 +08:00
|
|
|
if (ret)
|
|
|
|
goto err;
|
|
|
|
}
|
2011-04-07 02:41:34 +08:00
|
|
|
map:
|
|
|
|
ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
|
2010-11-22 11:04:43 +08:00
|
|
|
err:
|
|
|
|
bio_put(bio);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
|
|
|
|
int skip_sum)
|
|
|
|
{
|
|
|
|
struct inode *inode = dip->inode;
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct bio *bio;
|
|
|
|
struct bio *orig_bio = dip->orig_bio;
|
|
|
|
struct bio_vec *bvec = orig_bio->bi_io_vec;
|
2013-10-12 06:44:27 +08:00
|
|
|
u64 start_sector = orig_bio->bi_iter.bi_sector;
|
2010-11-22 11:04:43 +08:00
|
|
|
u64 file_offset = dip->logical_offset;
|
|
|
|
u64 submit_len = 0;
|
|
|
|
u64 map_length;
|
|
|
|
int nr_pages = 0;
|
2014-09-12 18:43:54 +08:00
|
|
|
int ret;
|
2011-04-07 02:41:34 +08:00
|
|
|
int async_submit = 0;
|
2010-11-22 11:04:43 +08:00
|
|
|
|
2013-10-12 06:44:27 +08:00
|
|
|
map_length = orig_bio->bi_iter.bi_size;
|
2013-01-30 07:40:14 +08:00
|
|
|
ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
|
2010-11-22 11:04:43 +08:00
|
|
|
&map_length, NULL, 0);
|
2014-06-17 18:58:59 +08:00
|
|
|
if (ret)
|
2010-11-22 11:04:43 +08:00
|
|
|
return -EIO;
|
2013-07-25 19:22:34 +08:00
|
|
|
|
2013-10-12 06:44:27 +08:00
|
|
|
if (map_length >= orig_bio->bi_iter.bi_size) {
|
2011-04-07 02:25:44 +08:00
|
|
|
bio = orig_bio;
|
2014-09-12 18:43:56 +08:00
|
|
|
dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
|
2011-04-07 02:25:44 +08:00
|
|
|
goto submit;
|
|
|
|
}
|
|
|
|
|
2013-01-30 07:40:14 +08:00
|
|
|
/* async crcs make it difficult to collect full stripe writes. */
|
2015-01-20 15:11:44 +08:00
|
|
|
if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK)
|
2013-01-30 07:40:14 +08:00
|
|
|
async_submit = 0;
|
|
|
|
else
|
|
|
|
async_submit = 1;
|
|
|
|
|
2011-04-07 02:25:44 +08:00
|
|
|
bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
|
|
|
|
if (!bio)
|
|
|
|
return -ENOMEM;
|
2014-06-17 18:58:59 +08:00
|
|
|
|
2011-04-07 02:25:44 +08:00
|
|
|
bio->bi_private = dip;
|
|
|
|
bio->bi_end_io = btrfs_end_dio_bio;
|
2014-09-12 18:43:56 +08:00
|
|
|
btrfs_io_bio(bio)->logical = file_offset;
|
2011-04-07 02:25:44 +08:00
|
|
|
atomic_inc(&dip->pending_bios);
|
|
|
|
|
2010-11-22 11:04:43 +08:00
|
|
|
while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
|
2014-09-30 07:33:33 +08:00
|
|
|
if (map_length < submit_len + bvec->bv_len ||
|
2010-11-22 11:04:43 +08:00
|
|
|
bio_add_page(bio, bvec->bv_page, bvec->bv_len,
|
2014-09-30 07:33:33 +08:00
|
|
|
bvec->bv_offset) < bvec->bv_len) {
|
2010-11-22 11:04:43 +08:00
|
|
|
/*
|
|
|
|
* inc the count before we submit the bio so
|
|
|
|
* we know the end IO handler won't happen before
|
|
|
|
* we inc the count. Otherwise, the dip might get freed
|
|
|
|
* before we're done setting it up
|
|
|
|
*/
|
|
|
|
atomic_inc(&dip->pending_bios);
|
|
|
|
ret = __btrfs_submit_dio_bio(bio, inode, rw,
|
|
|
|
file_offset, skip_sum,
|
2012-08-04 04:49:19 +08:00
|
|
|
async_submit);
|
2010-11-22 11:04:43 +08:00
|
|
|
if (ret) {
|
|
|
|
bio_put(bio);
|
|
|
|
atomic_dec(&dip->pending_bios);
|
|
|
|
goto out_err;
|
|
|
|
}
|
|
|
|
|
|
|
|
start_sector += submit_len >> 9;
|
|
|
|
file_offset += submit_len;
|
|
|
|
|
|
|
|
submit_len = 0;
|
|
|
|
nr_pages = 0;
|
|
|
|
|
|
|
|
bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
|
|
|
|
start_sector, GFP_NOFS);
|
|
|
|
if (!bio)
|
|
|
|
goto out_err;
|
|
|
|
bio->bi_private = dip;
|
|
|
|
bio->bi_end_io = btrfs_end_dio_bio;
|
2014-09-12 18:43:56 +08:00
|
|
|
btrfs_io_bio(bio)->logical = file_offset;
|
2010-11-22 11:04:43 +08:00
|
|
|
|
2013-10-12 06:44:27 +08:00
|
|
|
map_length = orig_bio->bi_iter.bi_size;
|
2013-01-30 07:40:14 +08:00
|
|
|
ret = btrfs_map_block(root->fs_info, rw,
|
2012-11-05 22:46:42 +08:00
|
|
|
start_sector << 9,
|
2010-11-22 11:04:43 +08:00
|
|
|
&map_length, NULL, 0);
|
|
|
|
if (ret) {
|
|
|
|
bio_put(bio);
|
|
|
|
goto out_err;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
submit_len += bvec->bv_len;
|
2013-10-31 13:03:04 +08:00
|
|
|
nr_pages++;
|
2010-11-22 11:04:43 +08:00
|
|
|
bvec++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-04-07 02:25:44 +08:00
|
|
|
submit:
|
2010-11-22 11:04:43 +08:00
|
|
|
ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
|
2012-08-04 04:49:19 +08:00
|
|
|
async_submit);
|
2010-11-22 11:04:43 +08:00
|
|
|
if (!ret)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
bio_put(bio);
|
|
|
|
out_err:
|
|
|
|
dip->errors = 1;
|
|
|
|
/*
|
|
|
|
* before atomic variable goto zero, we must
|
|
|
|
* make sure dip->errors is perceived to be set.
|
|
|
|
*/
|
2014-03-18 01:06:10 +08:00
|
|
|
smp_mb__before_atomic();
|
2010-11-22 11:04:43 +08:00
|
|
|
if (atomic_dec_and_test(&dip->pending_bios))
|
|
|
|
bio_io_error(dip->orig_bio);
|
|
|
|
|
|
|
|
/* bio_end_io() will handle error, so we needn't return it */
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-05-18 06:30:14 +08:00
|
|
|
static void btrfs_submit_direct(int rw, struct bio *dio_bio,
|
|
|
|
struct inode *inode, loff_t file_offset)
|
2010-05-23 23:00:55 +08:00
|
|
|
{
|
2015-07-01 19:13:10 +08:00
|
|
|
struct btrfs_dio_private *dip = NULL;
|
|
|
|
struct bio *io_bio = NULL;
|
2014-09-12 18:43:54 +08:00
|
|
|
struct btrfs_io_bio *btrfs_bio;
|
2010-05-23 23:00:55 +08:00
|
|
|
int skip_sum;
|
2010-08-08 00:20:39 +08:00
|
|
|
int write = rw & REQ_WRITE;
|
2010-05-23 23:00:55 +08:00
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
|
|
|
|
|
2013-05-18 06:30:14 +08:00
|
|
|
io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS);
|
|
|
|
if (!io_bio) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto free_ordered;
|
|
|
|
}
|
|
|
|
|
2014-09-12 18:43:56 +08:00
|
|
|
dip = kzalloc(sizeof(*dip), GFP_NOFS);
|
2010-05-23 23:00:55 +08:00
|
|
|
if (!dip) {
|
|
|
|
ret = -ENOMEM;
|
2015-07-01 19:13:10 +08:00
|
|
|
goto free_ordered;
|
2010-05-23 23:00:55 +08:00
|
|
|
}
|
|
|
|
|
2013-05-18 06:30:14 +08:00
|
|
|
dip->private = dio_bio->bi_private;
|
2010-05-23 23:00:55 +08:00
|
|
|
dip->inode = inode;
|
|
|
|
dip->logical_offset = file_offset;
|
2013-10-12 06:44:27 +08:00
|
|
|
dip->bytes = dio_bio->bi_iter.bi_size;
|
|
|
|
dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
|
2013-05-18 06:30:14 +08:00
|
|
|
io_bio->bi_private = dip;
|
|
|
|
dip->orig_bio = io_bio;
|
|
|
|
dip->dio_bio = dio_bio;
|
2010-11-22 11:04:43 +08:00
|
|
|
atomic_set(&dip->pending_bios, 0);
|
2014-09-12 18:43:56 +08:00
|
|
|
btrfs_bio = btrfs_io_bio(io_bio);
|
|
|
|
btrfs_bio->logical = file_offset;
|
2010-05-23 23:00:55 +08:00
|
|
|
|
2014-09-12 18:43:56 +08:00
|
|
|
if (write) {
|
2013-05-18 06:30:14 +08:00
|
|
|
io_bio->bi_end_io = btrfs_endio_direct_write;
|
2014-09-12 18:43:56 +08:00
|
|
|
} else {
|
2013-05-18 06:30:14 +08:00
|
|
|
io_bio->bi_end_io = btrfs_endio_direct_read;
|
2014-09-12 18:43:56 +08:00
|
|
|
dip->subio_endio = btrfs_subio_endio_read;
|
|
|
|
}
|
2010-05-23 23:00:55 +08:00
|
|
|
|
2015-12-09 03:23:20 +08:00
|
|
|
/*
|
|
|
|
* Reset the range for unsubmitted ordered extents (to a 0 length range)
|
|
|
|
* even if we fail to submit a bio, because in such case we do the
|
|
|
|
* corresponding error handling below and it must not be done a second
|
|
|
|
* time by btrfs_direct_IO().
|
|
|
|
*/
|
|
|
|
if (write) {
|
|
|
|
struct btrfs_dio_data *dio_data = current->journal_info;
|
|
|
|
|
|
|
|
dio_data->unsubmitted_oe_range_end = dip->logical_offset +
|
|
|
|
dip->bytes;
|
|
|
|
dio_data->unsubmitted_oe_range_start =
|
|
|
|
dio_data->unsubmitted_oe_range_end;
|
|
|
|
}
|
|
|
|
|
2010-11-22 11:04:43 +08:00
|
|
|
ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
|
|
|
|
if (!ret)
|
2010-05-25 21:48:28 +08:00
|
|
|
return;
|
2013-05-18 06:30:14 +08:00
|
|
|
|
2014-09-12 18:43:54 +08:00
|
|
|
if (btrfs_bio->end_io)
|
|
|
|
btrfs_bio->end_io(btrfs_bio, ret);
|
2013-05-18 06:30:14 +08:00
|
|
|
|
2010-05-23 23:00:55 +08:00
|
|
|
free_ordered:
|
|
|
|
/*
|
2015-07-01 19:13:10 +08:00
|
|
|
* If we arrived here it means either we failed to submit the dip
|
|
|
|
* or we either failed to clone the dio_bio or failed to allocate the
|
|
|
|
* dip. If we cloned the dio_bio and allocated the dip, we can just
|
|
|
|
* call bio_endio against our io_bio so that we get proper resource
|
|
|
|
* cleanup if we fail to submit the dip, otherwise, we must do the
|
|
|
|
* same as btrfs_endio_direct_[write|read] because we can't call these
|
|
|
|
* callbacks - they require an allocated dip and a clone of dio_bio.
|
2010-05-23 23:00:55 +08:00
|
|
|
*/
|
2015-07-01 19:13:10 +08:00
|
|
|
if (io_bio && dip) {
|
2015-07-20 21:29:37 +08:00
|
|
|
io_bio->bi_error = -EIO;
|
|
|
|
bio_endio(io_bio);
|
2015-07-01 19:13:10 +08:00
|
|
|
/*
|
|
|
|
* The end io callbacks free our dip, do the final put on io_bio
|
|
|
|
* and all the cleanup and final put for dio_bio (through
|
|
|
|
* dio_end_io()).
|
|
|
|
*/
|
|
|
|
dip = NULL;
|
|
|
|
io_bio = NULL;
|
|
|
|
} else {
|
2015-11-25 00:23:54 +08:00
|
|
|
if (write)
|
|
|
|
btrfs_endio_direct_write_update_ordered(inode,
|
|
|
|
file_offset,
|
|
|
|
dio_bio->bi_iter.bi_size,
|
|
|
|
0);
|
|
|
|
else
|
2015-07-01 19:13:10 +08:00
|
|
|
unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
|
|
|
|
file_offset + dio_bio->bi_iter.bi_size - 1);
|
2015-11-25 00:23:54 +08:00
|
|
|
|
2015-07-20 21:29:37 +08:00
|
|
|
dio_bio->bi_error = -EIO;
|
2015-07-01 19:13:10 +08:00
|
|
|
/*
|
|
|
|
* Releases and cleans up our dio_bio, no need to bio_put()
|
|
|
|
* nor bio_endio()/bio_io_error() against dio_bio.
|
|
|
|
*/
|
|
|
|
dio_end_io(dio_bio, ret);
|
2010-05-23 23:00:55 +08:00
|
|
|
}
|
2015-07-01 19:13:10 +08:00
|
|
|
if (io_bio)
|
|
|
|
bio_put(io_bio);
|
|
|
|
kfree(dip);
|
2010-05-23 23:00:55 +08:00
|
|
|
}
|
|
|
|
|
2015-03-16 19:33:52 +08:00
|
|
|
static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb,
|
2014-03-22 17:15:17 +08:00
|
|
|
const struct iov_iter *iter, loff_t offset)
|
2010-05-27 09:33:37 +08:00
|
|
|
{
|
|
|
|
int seg;
|
2011-04-08 23:51:18 +08:00
|
|
|
int i;
|
2010-05-27 09:33:37 +08:00
|
|
|
unsigned blocksize_mask = root->sectorsize - 1;
|
|
|
|
ssize_t retval = -EINVAL;
|
|
|
|
|
|
|
|
if (offset & blocksize_mask)
|
|
|
|
goto out;
|
|
|
|
|
2014-03-22 17:15:17 +08:00
|
|
|
if (iov_iter_alignment(iter) & blocksize_mask)
|
|
|
|
goto out;
|
2011-04-08 23:51:18 +08:00
|
|
|
|
2014-03-22 17:15:17 +08:00
|
|
|
/* If this is a write we don't need to check anymore */
|
2015-03-16 19:33:52 +08:00
|
|
|
if (iov_iter_rw(iter) == WRITE)
|
2014-03-22 17:15:17 +08:00
|
|
|
return 0;
|
|
|
|
/*
|
|
|
|
* Check to make sure we don't have duplicate iov_base's in this
|
|
|
|
* iovec, if so return EINVAL, otherwise we'll get csum errors
|
|
|
|
* when reading back.
|
|
|
|
*/
|
|
|
|
for (seg = 0; seg < iter->nr_segs; seg++) {
|
|
|
|
for (i = seg + 1; i < iter->nr_segs; i++) {
|
|
|
|
if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
|
2011-04-08 23:51:18 +08:00
|
|
|
goto out;
|
|
|
|
}
|
2010-05-27 09:33:37 +08:00
|
|
|
}
|
|
|
|
retval = 0;
|
|
|
|
out:
|
|
|
|
return retval;
|
|
|
|
}
|
2012-08-01 04:28:48 +08:00
|
|
|
|
2015-03-16 19:33:53 +08:00
|
|
|
static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
|
|
|
|
loff_t offset)
|
2008-04-10 22:23:21 +08:00
|
|
|
{
|
2010-05-23 23:00:55 +08:00
|
|
|
struct file *file = iocb->ki_filp;
|
|
|
|
struct inode *inode = file->f_mapping->host;
|
2015-08-28 23:40:13 +08:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_dio_data dio_data = { 0 };
|
2013-02-07 18:12:07 +08:00
|
|
|
size_t count = 0;
|
2013-02-08 15:01:08 +08:00
|
|
|
int flags = 0;
|
2013-02-08 15:04:11 +08:00
|
|
|
bool wakeup = true;
|
|
|
|
bool relock = false;
|
2013-02-07 18:12:07 +08:00
|
|
|
ssize_t ret;
|
2010-05-23 23:00:55 +08:00
|
|
|
|
2015-03-16 19:33:52 +08:00
|
|
|
if (check_direct_IO(BTRFS_I(inode)->root, iocb, iter, offset))
|
2010-05-27 09:33:37 +08:00
|
|
|
return 0;
|
2010-05-26 22:59:53 +08:00
|
|
|
|
direct-io: only inc/dec inode->i_dio_count for file systems
do_blockdev_direct_IO() increments and decrements the inode
->i_dio_count for each IO operation. It does this to protect against
truncate of a file. Block devices don't need this sort of protection.
For a capable multiqueue setup, this atomic int is the only shared
state between applications accessing the device for O_DIRECT, and it
presents a scaling wall for that. In my testing, as much as 30% of
system time is spent incrementing and decrementing this value. A mixed
read/write workload improved from ~2.5M IOPS to ~9.6M IOPS, with
better latencies too. Before:
clat percentiles (usec):
| 1.00th=[ 33], 5.00th=[ 34], 10.00th=[ 34], 20.00th=[ 34],
| 30.00th=[ 34], 40.00th=[ 34], 50.00th=[ 35], 60.00th=[ 35],
| 70.00th=[ 35], 80.00th=[ 35], 90.00th=[ 37], 95.00th=[ 80],
| 99.00th=[ 98], 99.50th=[ 151], 99.90th=[ 155], 99.95th=[ 155],
| 99.99th=[ 165]
After:
clat percentiles (usec):
| 1.00th=[ 95], 5.00th=[ 108], 10.00th=[ 129], 20.00th=[ 149],
| 30.00th=[ 155], 40.00th=[ 161], 50.00th=[ 167], 60.00th=[ 171],
| 70.00th=[ 177], 80.00th=[ 185], 90.00th=[ 201], 95.00th=[ 270],
| 99.00th=[ 390], 99.50th=[ 398], 99.90th=[ 418], 99.95th=[ 422],
| 99.99th=[ 438]
In other setups, Robert Elliott reported seeing good performance
improvements:
https://lkml.org/lkml/2015/4/3/557
The more applications accessing the device, the worse it gets.
Add a new direct-io flags, DIO_SKIP_DIO_COUNT, which tells
do_blockdev_direct_IO() that it need not worry about incrementing
or decrementing the inode i_dio_count for this caller.
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Elliott, Robert (Server Storage) <elliott@hp.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jens Axboe <axboe@fb.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2015-04-16 07:05:48 +08:00
|
|
|
inode_dio_begin(inode);
|
2014-03-18 01:06:10 +08:00
|
|
|
smp_mb__after_atomic();
|
2013-02-08 15:04:11 +08:00
|
|
|
|
2013-07-02 22:38:02 +08:00
|
|
|
/*
|
Btrfs: just do dirty page flush for the inode with compression before direct IO
As the comment in the btrfs_direct_IO says, only the compressed pages need be
flush again to make sure they are on the disk, but the common pages needn't,
so we add a if statement to check if the inode has compressed pages or not,
if no, skip the flush.
And in order to prevent the write ranges from intersecting, we need wait for
the running ordered extents. But the current code waits for them twice, one
is done before the direct IO starts (in btrfs_wait_ordered_range()), the other
is before we get the blocks, it is unnecessary. because we can do the direct
IO without holding i_mutex, it means that the intersected ordered extents may
happen during the direct IO, the first wait can not avoid this problem. So we
use filemap_fdatawrite_range() instead of btrfs_wait_ordered_range() to remove
the first wait.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-03-06 13:54:57 +08:00
|
|
|
* The generic stuff only does filemap_write_and_wait_range, which
|
|
|
|
* isn't enough if we've written compressed pages to this area, so
|
|
|
|
* we need to flush the dirty pages again to make absolutely sure
|
|
|
|
* that any outstanding dirty pages are on disk.
|
2013-07-02 22:38:02 +08:00
|
|
|
*/
|
2014-03-05 11:38:00 +08:00
|
|
|
count = iov_iter_count(iter);
|
Btrfs: just do dirty page flush for the inode with compression before direct IO
As the comment in the btrfs_direct_IO says, only the compressed pages need be
flush again to make sure they are on the disk, but the common pages needn't,
so we add a if statement to check if the inode has compressed pages or not,
if no, skip the flush.
And in order to prevent the write ranges from intersecting, we need wait for
the running ordered extents. But the current code waits for them twice, one
is done before the direct IO starts (in btrfs_wait_ordered_range()), the other
is before we get the blocks, it is unnecessary. because we can do the direct
IO without holding i_mutex, it means that the intersected ordered extents may
happen during the direct IO, the first wait can not avoid this problem. So we
use filemap_fdatawrite_range() instead of btrfs_wait_ordered_range() to remove
the first wait.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-03-06 13:54:57 +08:00
|
|
|
if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
|
|
|
|
&BTRFS_I(inode)->runtime_flags))
|
2014-07-17 11:44:13 +08:00
|
|
|
filemap_fdatawrite_range(inode->i_mapping, offset,
|
|
|
|
offset + count - 1);
|
2013-07-02 22:38:02 +08:00
|
|
|
|
2015-03-16 19:33:52 +08:00
|
|
|
if (iov_iter_rw(iter) == WRITE) {
|
2013-02-08 15:04:11 +08:00
|
|
|
/*
|
|
|
|
* If the write DIO is beyond the EOF, we need update
|
|
|
|
* the isize, but it is protected by i_mutex. So we can
|
|
|
|
* not unlock the i_mutex at this case.
|
|
|
|
*/
|
|
|
|
if (offset + count <= inode->i_size) {
|
|
|
|
mutex_unlock(&inode->i_mutex);
|
|
|
|
relock = true;
|
|
|
|
}
|
2015-09-08 17:25:55 +08:00
|
|
|
ret = btrfs_delalloc_reserve_space(inode, offset, count);
|
2013-02-07 18:12:07 +08:00
|
|
|
if (ret)
|
2013-02-08 15:04:11 +08:00
|
|
|
goto out;
|
2015-08-28 23:40:13 +08:00
|
|
|
dio_data.outstanding_extents = div64_u64(count +
|
2015-03-17 22:52:28 +08:00
|
|
|
BTRFS_MAX_EXTENT_SIZE - 1,
|
|
|
|
BTRFS_MAX_EXTENT_SIZE);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We need to know how many extents we reserved so that we can
|
|
|
|
* do the accounting properly if we go over the number we
|
|
|
|
* originally calculated. Abuse current->journal_info for this.
|
|
|
|
*/
|
2015-08-28 23:40:13 +08:00
|
|
|
dio_data.reserve = round_up(count, root->sectorsize);
|
2015-12-09 03:23:20 +08:00
|
|
|
dio_data.unsubmitted_oe_range_start = (u64)offset;
|
|
|
|
dio_data.unsubmitted_oe_range_end = (u64)offset;
|
2015-08-28 23:40:13 +08:00
|
|
|
current->journal_info = &dio_data;
|
2014-09-30 07:33:33 +08:00
|
|
|
} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
|
|
|
|
&BTRFS_I(inode)->runtime_flags)) {
|
direct-io: only inc/dec inode->i_dio_count for file systems
do_blockdev_direct_IO() increments and decrements the inode
->i_dio_count for each IO operation. It does this to protect against
truncate of a file. Block devices don't need this sort of protection.
For a capable multiqueue setup, this atomic int is the only shared
state between applications accessing the device for O_DIRECT, and it
presents a scaling wall for that. In my testing, as much as 30% of
system time is spent incrementing and decrementing this value. A mixed
read/write workload improved from ~2.5M IOPS to ~9.6M IOPS, with
better latencies too. Before:
clat percentiles (usec):
| 1.00th=[ 33], 5.00th=[ 34], 10.00th=[ 34], 20.00th=[ 34],
| 30.00th=[ 34], 40.00th=[ 34], 50.00th=[ 35], 60.00th=[ 35],
| 70.00th=[ 35], 80.00th=[ 35], 90.00th=[ 37], 95.00th=[ 80],
| 99.00th=[ 98], 99.50th=[ 151], 99.90th=[ 155], 99.95th=[ 155],
| 99.99th=[ 165]
After:
clat percentiles (usec):
| 1.00th=[ 95], 5.00th=[ 108], 10.00th=[ 129], 20.00th=[ 149],
| 30.00th=[ 155], 40.00th=[ 161], 50.00th=[ 167], 60.00th=[ 171],
| 70.00th=[ 177], 80.00th=[ 185], 90.00th=[ 201], 95.00th=[ 270],
| 99.00th=[ 390], 99.50th=[ 398], 99.90th=[ 418], 99.95th=[ 422],
| 99.99th=[ 438]
In other setups, Robert Elliott reported seeing good performance
improvements:
https://lkml.org/lkml/2015/4/3/557
The more applications accessing the device, the worse it gets.
Add a new direct-io flags, DIO_SKIP_DIO_COUNT, which tells
do_blockdev_direct_IO() that it need not worry about incrementing
or decrementing the inode i_dio_count for this caller.
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Elliott, Robert (Server Storage) <elliott@hp.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jens Axboe <axboe@fb.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2015-04-16 07:05:48 +08:00
|
|
|
inode_dio_end(inode);
|
2013-02-08 15:04:11 +08:00
|
|
|
flags = DIO_LOCKING | DIO_SKIP_HOLES;
|
|
|
|
wakeup = false;
|
2013-02-07 18:12:07 +08:00
|
|
|
}
|
|
|
|
|
2015-03-16 19:33:50 +08:00
|
|
|
ret = __blockdev_direct_IO(iocb, inode,
|
|
|
|
BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
|
|
|
|
iter, offset, btrfs_get_blocks_direct, NULL,
|
|
|
|
btrfs_submit_direct, flags);
|
2015-03-16 19:33:52 +08:00
|
|
|
if (iov_iter_rw(iter) == WRITE) {
|
2015-03-17 22:52:28 +08:00
|
|
|
current->journal_info = NULL;
|
2015-06-17 16:59:58 +08:00
|
|
|
if (ret < 0 && ret != -EIOCBQUEUED) {
|
2015-08-28 23:40:13 +08:00
|
|
|
if (dio_data.reserve)
|
2015-09-08 17:25:55 +08:00
|
|
|
btrfs_delalloc_release_space(inode, offset,
|
|
|
|
dio_data.reserve);
|
2015-12-09 03:23:20 +08:00
|
|
|
/*
|
|
|
|
* On error we might have left some ordered extents
|
|
|
|
* without submitting corresponding bios for them, so
|
|
|
|
* cleanup them up to avoid other tasks getting them
|
|
|
|
* and waiting for them to complete forever.
|
|
|
|
*/
|
|
|
|
if (dio_data.unsubmitted_oe_range_start <
|
|
|
|
dio_data.unsubmitted_oe_range_end)
|
|
|
|
btrfs_endio_direct_write_update_ordered(inode,
|
|
|
|
dio_data.unsubmitted_oe_range_start,
|
|
|
|
dio_data.unsubmitted_oe_range_end -
|
|
|
|
dio_data.unsubmitted_oe_range_start,
|
|
|
|
0);
|
2015-06-17 16:59:58 +08:00
|
|
|
} else if (ret >= 0 && (size_t)ret < count)
|
2015-09-08 17:25:55 +08:00
|
|
|
btrfs_delalloc_release_space(inode, offset,
|
|
|
|
count - (size_t)ret);
|
2013-02-07 18:12:07 +08:00
|
|
|
}
|
2013-02-08 15:04:11 +08:00
|
|
|
out:
|
2013-02-08 15:01:08 +08:00
|
|
|
if (wakeup)
|
direct-io: only inc/dec inode->i_dio_count for file systems
do_blockdev_direct_IO() increments and decrements the inode
->i_dio_count for each IO operation. It does this to protect against
truncate of a file. Block devices don't need this sort of protection.
For a capable multiqueue setup, this atomic int is the only shared
state between applications accessing the device for O_DIRECT, and it
presents a scaling wall for that. In my testing, as much as 30% of
system time is spent incrementing and decrementing this value. A mixed
read/write workload improved from ~2.5M IOPS to ~9.6M IOPS, with
better latencies too. Before:
clat percentiles (usec):
| 1.00th=[ 33], 5.00th=[ 34], 10.00th=[ 34], 20.00th=[ 34],
| 30.00th=[ 34], 40.00th=[ 34], 50.00th=[ 35], 60.00th=[ 35],
| 70.00th=[ 35], 80.00th=[ 35], 90.00th=[ 37], 95.00th=[ 80],
| 99.00th=[ 98], 99.50th=[ 151], 99.90th=[ 155], 99.95th=[ 155],
| 99.99th=[ 165]
After:
clat percentiles (usec):
| 1.00th=[ 95], 5.00th=[ 108], 10.00th=[ 129], 20.00th=[ 149],
| 30.00th=[ 155], 40.00th=[ 161], 50.00th=[ 167], 60.00th=[ 171],
| 70.00th=[ 177], 80.00th=[ 185], 90.00th=[ 201], 95.00th=[ 270],
| 99.00th=[ 390], 99.50th=[ 398], 99.90th=[ 418], 99.95th=[ 422],
| 99.99th=[ 438]
In other setups, Robert Elliott reported seeing good performance
improvements:
https://lkml.org/lkml/2015/4/3/557
The more applications accessing the device, the worse it gets.
Add a new direct-io flags, DIO_SKIP_DIO_COUNT, which tells
do_blockdev_direct_IO() that it need not worry about incrementing
or decrementing the inode i_dio_count for this caller.
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Elliott, Robert (Server Storage) <elliott@hp.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jens Axboe <axboe@fb.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2015-04-16 07:05:48 +08:00
|
|
|
inode_dio_end(inode);
|
2013-02-08 15:04:11 +08:00
|
|
|
if (relock)
|
|
|
|
mutex_lock(&inode->i_mutex);
|
2013-02-07 18:12:07 +08:00
|
|
|
|
|
|
|
return ret;
|
2008-04-10 22:23:21 +08:00
|
|
|
}
|
|
|
|
|
2012-11-29 13:08:26 +08:00
|
|
|
#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
|
|
|
|
|
2009-01-22 03:39:14 +08:00
|
|
|
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
|
|
|
|
__u64 start, __u64 len)
|
|
|
|
{
|
2012-11-29 13:08:26 +08:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2011-02-24 05:23:20 +08:00
|
|
|
return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
|
2009-01-22 03:39:14 +08:00
|
|
|
}
|
|
|
|
|
2007-08-28 04:49:44 +08:00
|
|
|
int btrfs_readpage(struct file *file, struct page *page)
|
2007-06-16 01:50:00 +08:00
|
|
|
{
|
2008-01-25 05:13:08 +08:00
|
|
|
struct extent_io_tree *tree;
|
|
|
|
tree = &BTRFS_I(page->mapping->host)->io_tree;
|
2011-06-14 02:02:58 +08:00
|
|
|
return extent_read_full_page(tree, page, btrfs_get_extent, 0);
|
2007-06-16 01:50:00 +08:00
|
|
|
}
|
2007-12-22 05:27:21 +08:00
|
|
|
|
2007-08-28 04:49:44 +08:00
|
|
|
static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
|
2007-06-12 18:35:45 +08:00
|
|
|
{
|
2008-01-25 05:13:08 +08:00
|
|
|
struct extent_io_tree *tree;
|
2015-10-23 03:05:09 +08:00
|
|
|
struct inode *inode = page->mapping->host;
|
|
|
|
int ret;
|
2007-08-28 04:49:44 +08:00
|
|
|
|
|
|
|
if (current->flags & PF_MEMALLOC) {
|
|
|
|
redirty_page_for_writepage(wbc, page);
|
|
|
|
unlock_page(page);
|
|
|
|
return 0;
|
|
|
|
}
|
2015-10-23 03:05:09 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we are under memory pressure we will call this directly from the
|
|
|
|
* VM, we need to make sure we have the inode referenced for the ordered
|
|
|
|
* extent. If not just return like we didn't do anything.
|
|
|
|
*/
|
|
|
|
if (!igrab(inode)) {
|
|
|
|
redirty_page_for_writepage(wbc, page);
|
|
|
|
return AOP_WRITEPAGE_ACTIVATE;
|
|
|
|
}
|
2008-01-25 05:13:08 +08:00
|
|
|
tree = &BTRFS_I(page->mapping->host)->io_tree;
|
2015-10-23 03:05:09 +08:00
|
|
|
ret = extent_write_full_page(tree, page, btrfs_get_extent, wbc);
|
|
|
|
btrfs_add_delayed_iput(inode);
|
|
|
|
return ret;
|
2007-06-16 01:50:00 +08:00
|
|
|
}
|
|
|
|
|
2013-04-26 04:41:01 +08:00
|
|
|
static int btrfs_writepages(struct address_space *mapping,
|
|
|
|
struct writeback_control *wbc)
|
2007-11-02 07:45:34 +08:00
|
|
|
{
|
2008-01-25 05:13:08 +08:00
|
|
|
struct extent_io_tree *tree;
|
2008-11-07 11:02:51 +08:00
|
|
|
|
2008-01-25 05:13:08 +08:00
|
|
|
tree = &BTRFS_I(mapping->host)->io_tree;
|
2007-11-02 07:45:34 +08:00
|
|
|
return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
|
|
|
|
}
|
|
|
|
|
2007-11-08 23:59:22 +08:00
|
|
|
static int
|
|
|
|
btrfs_readpages(struct file *file, struct address_space *mapping,
|
|
|
|
struct list_head *pages, unsigned nr_pages)
|
|
|
|
{
|
2008-01-25 05:13:08 +08:00
|
|
|
struct extent_io_tree *tree;
|
|
|
|
tree = &BTRFS_I(mapping->host)->io_tree;
|
2007-11-08 23:59:22 +08:00
|
|
|
return extent_readpages(tree, mapping, pages, nr_pages,
|
|
|
|
btrfs_get_extent);
|
|
|
|
}
|
2008-07-18 00:53:50 +08:00
|
|
|
static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
|
2007-06-16 01:50:00 +08:00
|
|
|
{
|
2008-01-25 05:13:08 +08:00
|
|
|
struct extent_io_tree *tree;
|
|
|
|
struct extent_map_tree *map;
|
2007-08-28 04:49:44 +08:00
|
|
|
int ret;
|
2007-06-18 21:57:58 +08:00
|
|
|
|
2008-01-25 05:13:08 +08:00
|
|
|
tree = &BTRFS_I(page->mapping->host)->io_tree;
|
|
|
|
map = &BTRFS_I(page->mapping->host)->extent_tree;
|
2008-01-29 22:59:12 +08:00
|
|
|
ret = try_release_extent_mapping(map, tree, page, gfp_flags);
|
2007-08-28 04:49:44 +08:00
|
|
|
if (ret == 1) {
|
|
|
|
ClearPagePrivate(page);
|
|
|
|
set_page_private(page, 0);
|
|
|
|
page_cache_release(page);
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
2007-08-28 04:49:44 +08:00
|
|
|
return ret;
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
|
|
|
|
2008-07-18 00:53:50 +08:00
|
|
|
static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
|
|
|
|
{
|
2008-09-12 03:51:43 +08:00
|
|
|
if (PageWriteback(page) || PageDirty(page))
|
|
|
|
return 0;
|
2009-02-12 23:06:04 +08:00
|
|
|
return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
|
2008-07-18 00:53:50 +08:00
|
|
|
}
|
|
|
|
|
2013-05-22 11:17:23 +08:00
|
|
|
static void btrfs_invalidatepage(struct page *page, unsigned int offset,
|
|
|
|
unsigned int length)
|
2007-06-12 18:35:45 +08:00
|
|
|
{
|
2012-05-03 02:00:54 +08:00
|
|
|
struct inode *inode = page->mapping->host;
|
2008-01-25 05:13:08 +08:00
|
|
|
struct extent_io_tree *tree;
|
2008-07-18 00:53:50 +08:00
|
|
|
struct btrfs_ordered_extent *ordered;
|
2010-02-04 03:33:23 +08:00
|
|
|
struct extent_state *cached_state = NULL;
|
2008-07-18 00:53:50 +08:00
|
|
|
u64 page_start = page_offset(page);
|
|
|
|
u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
|
2013-11-20 06:29:35 +08:00
|
|
|
int inode_evicting = inode->i_state & I_FREEING;
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2009-09-03 04:53:46 +08:00
|
|
|
/*
|
|
|
|
* we have the page locked, so new writeback can't start,
|
|
|
|
* and the dirty bit won't be cleared while we are here.
|
|
|
|
*
|
|
|
|
* Wait for IO on this page so that we can safely clear
|
|
|
|
* the PagePrivate2 bit and do ordered accounting
|
|
|
|
*/
|
2008-07-18 00:53:50 +08:00
|
|
|
wait_on_page_writeback(page);
|
2009-09-03 04:53:46 +08:00
|
|
|
|
2012-05-03 02:00:54 +08:00
|
|
|
tree = &BTRFS_I(inode)->io_tree;
|
2008-07-18 00:53:50 +08:00
|
|
|
if (offset) {
|
|
|
|
btrfs_releasepage(page, GFP_NOFS);
|
|
|
|
return;
|
|
|
|
}
|
2013-11-20 06:29:35 +08:00
|
|
|
|
|
|
|
if (!inode_evicting)
|
2015-12-03 21:30:40 +08:00
|
|
|
lock_extent_bits(tree, page_start, page_end, &cached_state);
|
2013-11-20 06:29:35 +08:00
|
|
|
ordered = btrfs_lookup_ordered_extent(inode, page_start);
|
2008-07-18 00:53:50 +08:00
|
|
|
if (ordered) {
|
2008-07-18 01:53:27 +08:00
|
|
|
/*
|
|
|
|
* IO on this page will never be started, so we need
|
|
|
|
* to account for any ordered extents now
|
|
|
|
*/
|
2013-11-20 06:29:35 +08:00
|
|
|
if (!inode_evicting)
|
|
|
|
clear_extent_bit(tree, page_start, page_end,
|
|
|
|
EXTENT_DIRTY | EXTENT_DELALLOC |
|
|
|
|
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
|
|
|
|
EXTENT_DEFRAG, 1, 0, &cached_state,
|
|
|
|
GFP_NOFS);
|
2009-09-03 04:53:46 +08:00
|
|
|
/*
|
|
|
|
* whoever cleared the private bit is responsible
|
|
|
|
* for the finish_ordered_io
|
|
|
|
*/
|
2013-08-30 01:57:21 +08:00
|
|
|
if (TestClearPagePrivate2(page)) {
|
|
|
|
struct btrfs_ordered_inode_tree *tree;
|
|
|
|
u64 new_len;
|
|
|
|
|
|
|
|
tree = &BTRFS_I(inode)->ordered_tree;
|
|
|
|
|
|
|
|
spin_lock_irq(&tree->lock);
|
|
|
|
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
|
|
|
|
new_len = page_start - ordered->file_offset;
|
|
|
|
if (new_len < ordered->truncated_len)
|
|
|
|
ordered->truncated_len = new_len;
|
|
|
|
spin_unlock_irq(&tree->lock);
|
|
|
|
|
|
|
|
if (btrfs_dec_test_ordered_pending(inode, &ordered,
|
|
|
|
page_start,
|
|
|
|
PAGE_CACHE_SIZE, 1))
|
|
|
|
btrfs_finish_ordered_io(ordered);
|
2009-09-03 04:53:46 +08:00
|
|
|
}
|
2008-07-18 00:53:50 +08:00
|
|
|
btrfs_put_ordered_extent(ordered);
|
2013-11-20 06:29:35 +08:00
|
|
|
if (!inode_evicting) {
|
|
|
|
cached_state = NULL;
|
2015-12-03 21:30:40 +08:00
|
|
|
lock_extent_bits(tree, page_start, page_end,
|
2013-11-20 06:29:35 +08:00
|
|
|
&cached_state);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-09-29 10:35:16 +08:00
|
|
|
/*
|
|
|
|
* Qgroup reserved space handler
|
|
|
|
* Page here will be either
|
|
|
|
* 1) Already written to disk
|
|
|
|
* In this case, its reserved space is released from data rsv map
|
|
|
|
* and will be freed by delayed_ref handler finally.
|
|
|
|
* So even we call qgroup_free_data(), it won't decrease reserved
|
|
|
|
* space.
|
|
|
|
* 2) Not written to disk
|
|
|
|
* This means the reserved space should be freed here.
|
|
|
|
*/
|
|
|
|
btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE);
|
2013-11-20 06:29:35 +08:00
|
|
|
if (!inode_evicting) {
|
|
|
|
clear_extent_bit(tree, page_start, page_end,
|
|
|
|
EXTENT_LOCKED | EXTENT_DIRTY |
|
|
|
|
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
|
|
|
|
EXTENT_DEFRAG, 1, 1,
|
|
|
|
&cached_state, GFP_NOFS);
|
|
|
|
|
|
|
|
__btrfs_releasepage(page, GFP_NOFS);
|
2008-07-18 00:53:50 +08:00
|
|
|
}
|
|
|
|
|
2008-07-21 22:29:44 +08:00
|
|
|
ClearPageChecked(page);
|
2008-04-19 04:11:30 +08:00
|
|
|
if (PagePrivate(page)) {
|
|
|
|
ClearPagePrivate(page);
|
|
|
|
set_page_private(page, 0);
|
|
|
|
page_cache_release(page);
|
|
|
|
}
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
|
|
|
|
2007-06-16 01:50:00 +08:00
|
|
|
/*
|
|
|
|
* btrfs_page_mkwrite() is not allowed to change the file size as it gets
|
|
|
|
* called from a page fault handler when a page is first dirtied. Hence we must
|
|
|
|
* be careful to check for EOF conditions here. We set the page up correctly
|
|
|
|
* for a written page which means we get ENOSPC checking when writing into
|
|
|
|
* holes and correct delalloc and unwritten extent mapping on filesystems that
|
|
|
|
* support these features.
|
|
|
|
*
|
|
|
|
* We are not allowed to take the i_mutex here so we have to play games to
|
|
|
|
* protect against truncate races as the page could now be beyond EOF. Because
|
|
|
|
* vmtruncate() writes the inode size before removing pages, once we have the
|
|
|
|
* page lock we can determine safely if the page is beyond EOF. If it is not
|
|
|
|
* beyond EOF, then the page is guaranteed safe against truncation until we
|
|
|
|
* unlock the page.
|
|
|
|
*/
|
2009-04-01 06:23:21 +08:00
|
|
|
int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
|
2007-06-16 01:50:00 +08:00
|
|
|
{
|
2009-04-01 06:23:21 +08:00
|
|
|
struct page *page = vmf->page;
|
2013-01-24 06:07:38 +08:00
|
|
|
struct inode *inode = file_inode(vma->vm_file);
|
2007-12-22 05:27:21 +08:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2008-07-18 00:53:50 +08:00
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
|
|
|
struct btrfs_ordered_extent *ordered;
|
2010-02-04 03:33:23 +08:00
|
|
|
struct extent_state *cached_state = NULL;
|
2008-07-18 00:53:50 +08:00
|
|
|
char *kaddr;
|
|
|
|
unsigned long zero_start;
|
2007-06-16 01:50:00 +08:00
|
|
|
loff_t size;
|
2007-12-22 05:27:21 +08:00
|
|
|
int ret;
|
2012-01-26 02:47:40 +08:00
|
|
|
int reserved = 0;
|
2007-08-28 04:49:44 +08:00
|
|
|
u64 page_start;
|
2008-07-18 00:53:50 +08:00
|
|
|
u64 page_end;
|
2007-06-16 01:50:00 +08:00
|
|
|
|
2012-06-12 22:20:45 +08:00
|
|
|
sb_start_pagefault(inode->i_sb);
|
2015-09-08 17:25:54 +08:00
|
|
|
page_start = page_offset(page);
|
|
|
|
page_end = page_start + PAGE_CACHE_SIZE - 1;
|
|
|
|
|
2015-09-08 17:25:55 +08:00
|
|
|
ret = btrfs_delalloc_reserve_space(inode, page_start,
|
|
|
|
PAGE_CACHE_SIZE);
|
2012-01-26 02:47:40 +08:00
|
|
|
if (!ret) {
|
2012-03-26 21:46:47 +08:00
|
|
|
ret = file_update_time(vma->vm_file);
|
2012-01-26 02:47:40 +08:00
|
|
|
reserved = 1;
|
|
|
|
}
|
2009-04-01 06:23:23 +08:00
|
|
|
if (ret) {
|
|
|
|
if (ret == -ENOMEM)
|
|
|
|
ret = VM_FAULT_OOM;
|
|
|
|
else /* -ENOSPC, -EIO, etc */
|
|
|
|
ret = VM_FAULT_SIGBUS;
|
2012-01-26 02:47:40 +08:00
|
|
|
if (reserved)
|
|
|
|
goto out;
|
|
|
|
goto out_noreserve;
|
2009-04-01 06:23:23 +08:00
|
|
|
}
|
2007-12-22 05:27:21 +08:00
|
|
|
|
2009-04-01 06:23:23 +08:00
|
|
|
ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
|
2008-07-18 00:53:50 +08:00
|
|
|
again:
|
2007-06-16 01:50:00 +08:00
|
|
|
lock_page(page);
|
|
|
|
size = i_size_read(inode);
|
2007-08-28 04:49:44 +08:00
|
|
|
|
2007-06-16 01:50:00 +08:00
|
|
|
if ((page->mapping != inode->i_mapping) ||
|
2008-07-18 00:53:50 +08:00
|
|
|
(page_start >= size)) {
|
2007-06-16 01:50:00 +08:00
|
|
|
/* page got truncated out from underneath us */
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
2008-07-18 00:53:50 +08:00
|
|
|
wait_on_page_writeback(page);
|
|
|
|
|
2015-12-03 21:30:40 +08:00
|
|
|
lock_extent_bits(io_tree, page_start, page_end, &cached_state);
|
2008-07-18 00:53:50 +08:00
|
|
|
set_page_extent_mapped(page);
|
|
|
|
|
2008-07-18 01:53:27 +08:00
|
|
|
/*
|
|
|
|
* we can't set the delalloc bits if there are pending ordered
|
|
|
|
* extents. Drop our locks and wait for them to finish
|
|
|
|
*/
|
2008-07-18 00:53:50 +08:00
|
|
|
ordered = btrfs_lookup_ordered_extent(inode, page_start);
|
|
|
|
if (ordered) {
|
2010-02-04 03:33:23 +08:00
|
|
|
unlock_extent_cached(io_tree, page_start, page_end,
|
|
|
|
&cached_state, GFP_NOFS);
|
2008-07-18 00:53:50 +08:00
|
|
|
unlock_page(page);
|
2008-07-18 01:53:27 +08:00
|
|
|
btrfs_start_ordered_extent(inode, ordered, 1);
|
2008-07-18 00:53:50 +08:00
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
|
2009-10-02 05:10:23 +08:00
|
|
|
/*
|
|
|
|
* XXX - page_mkwrite gets called every time the page is dirtied, even
|
|
|
|
* if it was already dirty, so for space accounting reasons we need to
|
|
|
|
* clear any delalloc bits for the range we are fixing to save. There
|
|
|
|
* is probably a better way to do this, but for now keep consistent with
|
|
|
|
* prepare_pages in the normal write path.
|
|
|
|
*/
|
2010-02-04 03:33:23 +08:00
|
|
|
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
|
2012-09-06 09:10:51 +08:00
|
|
|
EXTENT_DIRTY | EXTENT_DELALLOC |
|
|
|
|
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
|
2010-02-04 03:33:23 +08:00
|
|
|
0, 0, &cached_state, GFP_NOFS);
|
2009-10-02 05:10:23 +08:00
|
|
|
|
2010-02-04 03:33:23 +08:00
|
|
|
ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
|
|
|
|
&cached_state);
|
2009-09-12 04:12:44 +08:00
|
|
|
if (ret) {
|
2010-02-04 03:33:23 +08:00
|
|
|
unlock_extent_cached(io_tree, page_start, page_end,
|
|
|
|
&cached_state, GFP_NOFS);
|
2009-09-12 04:12:44 +08:00
|
|
|
ret = VM_FAULT_SIGBUS;
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
2008-07-18 00:53:50 +08:00
|
|
|
ret = 0;
|
2007-06-16 01:50:00 +08:00
|
|
|
|
|
|
|
/* page is wholly or partially inside EOF */
|
2007-08-28 04:49:44 +08:00
|
|
|
if (page_start + PAGE_CACHE_SIZE > size)
|
2008-07-18 00:53:50 +08:00
|
|
|
zero_start = size & ~PAGE_CACHE_MASK;
|
2007-06-16 01:50:00 +08:00
|
|
|
else
|
2008-07-18 00:53:50 +08:00
|
|
|
zero_start = PAGE_CACHE_SIZE;
|
2007-06-16 01:50:00 +08:00
|
|
|
|
2008-07-18 00:53:50 +08:00
|
|
|
if (zero_start != PAGE_CACHE_SIZE) {
|
|
|
|
kaddr = kmap(page);
|
|
|
|
memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
|
|
|
|
flush_dcache_page(page);
|
|
|
|
kunmap(page);
|
|
|
|
}
|
2008-07-18 00:53:51 +08:00
|
|
|
ClearPageChecked(page);
|
2008-07-18 00:53:50 +08:00
|
|
|
set_page_dirty(page);
|
2009-09-12 00:33:12 +08:00
|
|
|
SetPageUptodate(page);
|
2009-04-01 01:27:11 +08:00
|
|
|
|
2009-10-14 01:21:08 +08:00
|
|
|
BTRFS_I(inode)->last_trans = root->fs_info->generation;
|
|
|
|
BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
|
2012-08-29 15:07:55 +08:00
|
|
|
BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
|
2009-10-14 01:21:08 +08:00
|
|
|
|
2010-02-04 03:33:23 +08:00
|
|
|
unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
|
2007-06-16 01:50:00 +08:00
|
|
|
|
|
|
|
out_unlock:
|
2012-06-12 22:20:45 +08:00
|
|
|
if (!ret) {
|
|
|
|
sb_end_pagefault(inode->i_sb);
|
2009-09-12 00:33:12 +08:00
|
|
|
return VM_FAULT_LOCKED;
|
2012-06-12 22:20:45 +08:00
|
|
|
}
|
2007-06-16 01:50:00 +08:00
|
|
|
unlock_page(page);
|
2007-12-22 05:27:21 +08:00
|
|
|
out:
|
2015-09-08 17:25:55 +08:00
|
|
|
btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE);
|
2012-01-26 02:47:40 +08:00
|
|
|
out_noreserve:
|
2012-06-12 22:20:45 +08:00
|
|
|
sb_end_pagefault(inode->i_sb);
|
2007-06-16 01:50:00 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2011-02-01 04:30:16 +08:00
|
|
|
static int btrfs_truncate(struct inode *inode)
|
2007-06-12 18:35:45 +08:00
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2011-05-03 22:40:22 +08:00
|
|
|
struct btrfs_block_rsv *rsv;
|
2013-06-18 05:14:39 +08:00
|
|
|
int ret = 0;
|
2011-02-01 05:03:11 +08:00
|
|
|
int err = 0;
|
2007-06-12 18:35:45 +08:00
|
|
|
struct btrfs_trans_handle *trans;
|
2008-07-18 00:54:05 +08:00
|
|
|
u64 mask = root->sectorsize - 1;
|
2011-08-19 22:29:59 +08:00
|
|
|
u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2013-10-26 04:13:35 +08:00
|
|
|
ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
|
|
|
|
(u64)-1);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2011-05-03 22:40:22 +08:00
|
|
|
/*
|
|
|
|
* Yes ladies and gentelment, this is indeed ugly. The fact is we have
|
|
|
|
* 3 things going on here
|
|
|
|
*
|
|
|
|
* 1) We need to reserve space for our orphan item and the space to
|
|
|
|
* delete our orphan item. Lord knows we don't want to have a dangling
|
|
|
|
* orphan item because we didn't reserve space to remove it.
|
|
|
|
*
|
|
|
|
* 2) We need to reserve space to update our inode.
|
|
|
|
*
|
|
|
|
* 3) We need to have something to cache all the space that is going to
|
|
|
|
* be free'd up by the truncate operation, but also have some slack
|
|
|
|
* space reserved in case it uses space during the truncate (thank you
|
|
|
|
* very much snapshotting).
|
|
|
|
*
|
|
|
|
* And we need these to all be seperate. The fact is we can use alot of
|
|
|
|
* space doing the truncate, and we have no earthly idea how much space
|
|
|
|
* we will use, so we need the truncate reservation to be seperate so it
|
|
|
|
* doesn't end up using space reserved for updating the inode or
|
|
|
|
* removing the orphan item. We also need to be able to stop the
|
|
|
|
* transaction and start a new one, which means we need to be able to
|
|
|
|
* update the inode several times, and we have no idea of knowing how
|
|
|
|
* many times that will be, so we can't just reserve 1 item for the
|
|
|
|
* entirety of the opration, so that has to be done seperately as well.
|
|
|
|
* Then there is the orphan item, which does indeed need to be held on
|
|
|
|
* to for the whole operation, and we need nobody to touch this reserved
|
|
|
|
* space except the orphan code.
|
|
|
|
*
|
|
|
|
* So that leaves us with
|
|
|
|
*
|
|
|
|
* 1) root->orphan_block_rsv - for the orphan deletion.
|
|
|
|
* 2) rsv - for the truncate reservation, which we will steal from the
|
|
|
|
* transaction reservation.
|
|
|
|
* 3) fs_info->trans_block_rsv - this will have 1 items worth left for
|
|
|
|
* updating the inode.
|
|
|
|
*/
|
2012-09-06 18:02:28 +08:00
|
|
|
rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
|
2011-05-03 22:40:22 +08:00
|
|
|
if (!rsv)
|
|
|
|
return -ENOMEM;
|
2011-08-29 23:01:31 +08:00
|
|
|
rsv->size = min_size;
|
2012-08-28 05:48:15 +08:00
|
|
|
rsv->failfast = 1;
|
2011-03-05 03:37:08 +08:00
|
|
|
|
2011-08-09 01:46:15 +08:00
|
|
|
/*
|
2011-08-19 22:29:59 +08:00
|
|
|
* 1 for the truncate slack space
|
2011-08-09 01:46:15 +08:00
|
|
|
* 1 for updating the inode.
|
|
|
|
*/
|
2013-01-08 06:03:21 +08:00
|
|
|
trans = btrfs_start_transaction(root, 2);
|
2011-05-03 22:40:22 +08:00
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
err = PTR_ERR(trans);
|
|
|
|
goto out;
|
|
|
|
}
|
2011-03-05 03:37:08 +08:00
|
|
|
|
2011-08-09 01:46:15 +08:00
|
|
|
/* Migrate the slack space for the truncate to our reserve */
|
|
|
|
ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
|
|
|
|
min_size);
|
2011-05-03 22:40:22 +08:00
|
|
|
BUG_ON(ret);
|
2011-03-05 03:37:08 +08:00
|
|
|
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-18 01:14:17 +08:00
|
|
|
/*
|
|
|
|
* So if we truncate and then write and fsync we normally would just
|
|
|
|
* write the extents that changed, which is a problem if we need to
|
|
|
|
* first truncate that entire inode. So set this flag so we write out
|
|
|
|
* all of the extents in the inode to the sync log so we're completely
|
|
|
|
* safe.
|
|
|
|
*/
|
|
|
|
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
|
2012-08-28 05:48:15 +08:00
|
|
|
trans->block_rsv = rsv;
|
2011-08-09 01:46:15 +08:00
|
|
|
|
2009-11-12 17:35:36 +08:00
|
|
|
while (1) {
|
|
|
|
ret = btrfs_truncate_inode_items(trans, root, inode,
|
|
|
|
inode->i_size,
|
|
|
|
BTRFS_EXTENT_DATA_KEY);
|
2014-12-18 01:41:04 +08:00
|
|
|
if (ret != -ENOSPC && ret != -EAGAIN) {
|
2011-02-01 05:03:11 +08:00
|
|
|
err = ret;
|
2009-11-12 17:35:36 +08:00
|
|
|
break;
|
2011-02-01 05:03:11 +08:00
|
|
|
}
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2011-05-03 22:40:22 +08:00
|
|
|
trans->block_rsv = &root->fs_info->trans_block_rsv;
|
2009-11-12 17:35:36 +08:00
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
2011-02-01 05:03:11 +08:00
|
|
|
if (ret) {
|
|
|
|
err = ret;
|
|
|
|
break;
|
|
|
|
}
|
2012-08-28 05:48:15 +08:00
|
|
|
|
2009-11-12 17:35:36 +08:00
|
|
|
btrfs_end_transaction(trans, root);
|
2012-11-14 22:34:34 +08:00
|
|
|
btrfs_btree_balance_dirty(root);
|
2012-08-28 05:48:15 +08:00
|
|
|
|
|
|
|
trans = btrfs_start_transaction(root, 2);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = err = PTR_ERR(trans);
|
|
|
|
trans = NULL;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
|
|
|
|
rsv, min_size);
|
|
|
|
BUG_ON(ret); /* shouldn't happen */
|
|
|
|
trans->block_rsv = rsv;
|
2009-11-12 17:35:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (ret == 0 && inode->i_nlink > 0) {
|
2011-05-03 22:40:22 +08:00
|
|
|
trans->block_rsv = root->orphan_block_rsv;
|
2009-11-12 17:35:36 +08:00
|
|
|
ret = btrfs_orphan_del(trans, inode);
|
2011-02-01 05:03:11 +08:00
|
|
|
if (ret)
|
|
|
|
err = ret;
|
2009-11-12 17:35:36 +08:00
|
|
|
}
|
|
|
|
|
2011-11-09 03:49:59 +08:00
|
|
|
if (trans) {
|
|
|
|
trans->block_rsv = &root->fs_info->trans_block_rsv;
|
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
|
|
|
if (ret && !err)
|
|
|
|
err = ret;
|
2008-07-25 00:17:14 +08:00
|
|
|
|
2012-01-13 08:10:12 +08:00
|
|
|
ret = btrfs_end_transaction(trans, root);
|
2012-11-14 22:34:34 +08:00
|
|
|
btrfs_btree_balance_dirty(root);
|
2011-11-09 03:49:59 +08:00
|
|
|
}
|
2011-05-03 22:40:22 +08:00
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_block_rsv(root, rsv);
|
|
|
|
|
2011-02-01 05:03:11 +08:00
|
|
|
if (ret && !err)
|
|
|
|
err = ret;
|
2011-02-01 04:30:16 +08:00
|
|
|
|
2011-02-01 05:03:11 +08:00
|
|
|
return err;
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
|
|
|
|
2008-09-30 03:18:18 +08:00
|
|
|
/*
|
|
|
|
* create a new subvolume directory/inode (helper for the ioctl).
|
|
|
|
*/
|
2008-12-12 05:30:39 +08:00
|
|
|
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
The unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essencialy because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir intex items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 19:47:46 +08:00
|
|
|
struct btrfs_root *new_root,
|
|
|
|
struct btrfs_root *parent_root,
|
|
|
|
u64 new_dirid)
|
2007-06-12 18:35:45 +08:00
|
|
|
{
|
|
|
|
struct inode *inode;
|
2009-09-22 04:00:26 +08:00
|
|
|
int err;
|
2008-08-05 23:18:09 +08:00
|
|
|
u64 index = 0;
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2012-02-11 05:15:54 +08:00
|
|
|
inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
|
|
|
|
new_dirid, new_dirid,
|
|
|
|
S_IFDIR | (~current_umask() & S_IRWXUGO),
|
|
|
|
&index);
|
2007-06-23 02:16:25 +08:00
|
|
|
if (IS_ERR(inode))
|
2008-06-12 09:53:53 +08:00
|
|
|
return PTR_ERR(inode);
|
2007-06-12 18:35:45 +08:00
|
|
|
inode->i_op = &btrfs_dir_inode_operations;
|
|
|
|
inode->i_fop = &btrfs_dir_file_operations;
|
|
|
|
|
2011-10-28 20:13:29 +08:00
|
|
|
set_nlink(inode, 1);
|
2008-07-18 00:54:05 +08:00
|
|
|
btrfs_i_size_write(inode, 0);
|
2014-09-09 04:08:51 +08:00
|
|
|
unlock_new_inode(inode);
|
2008-06-10 09:57:42 +08:00
|
|
|
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
The unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essencialy because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir intex items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 19:47:46 +08:00
|
|
|
err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
|
|
|
|
if (err)
|
|
|
|
btrfs_err(new_root->fs_info,
|
2014-05-15 22:48:20 +08:00
|
|
|
"error inheriting subvolume %llu properties: %d",
|
Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for
inodes. Properties are name/value pairs that can be associated with
inodes for different purposes. They are stored as xattrs with the
prefix "btrfs."
Properties can be inherited - this means when a directory inode has
inheritable properties set, these are added to new inodes created
under that directory. Further, subvolumes can also have properties
associated with them, and they can be inherited from their parent
subvolume. Naturally, directory properties have priority over subvolume
properties (in practice a subvolume property is just a regular
property associated with the root inode, objectid 256, of the
subvolume's fs tree).
This change also adds one specific property implementation, named
"compression", whose values can be "lzo" or "zlib" and it's an
inheritable property.
The corresponding changes to btrfs-progs were also implemented.
A patch with xfstests for this feature will follow once there's
agreement on this change/feature.
Further, the script at the bottom of this commit message was used to
do some benchmarks to measure any performance penalties of this feature.
Basically the tests correspond to:
Test 1 - create a filesystem and mount it with compress-force=lzo,
then sequentially create N files of 64Kb each, measure how long it took
to create the files, unmount the filesystem, mount the filesystem and
perform an 'ls -lha' against the test directory holding the N files, and
report the time the command took.
Test 2 - create a filesystem and don't use any compression option when
mounting it - instead set the compression property of the subvolume's
root to 'lzo'. Then create N files of 64Kb, and report the time it took.
The unmount the filesystem, mount it again and perform an 'ls -lha' like
in the former test. This means every single file ends up with a property
(xattr) associated to it.
Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the
compression property, have no real effect other than adding more work
when inheriting properties and taking more btree leaf space.
Test 4 - same as test 3 but with 10 properties per file.
Results (in seconds, and averages of 5 runs each), for different N
numbers of files follow.
* Without properties (test 1)
file creation time ls -lha time
10 000 files 3.49 0.76
100 000 files 47.19 8.37
1 000 000 files 518.51 107.06
* With 1 property (compression property set to lzo - test 2)
file creation time ls -lha time
10 000 files 3.63 0.93
100 000 files 48.56 9.74
1 000 000 files 537.72 125.11
* With 4 properties (test 3)
file creation time ls -lha time
10 000 files 3.94 1.20
100 000 files 52.14 11.48
1 000 000 files 572.70 142.13
* With 10 properties (test 4)
file creation time ls -lha time
10 000 files 4.61 1.35
100 000 files 58.86 13.83
1 000 000 files 656.01 177.61
The increased latencies with properties are essencialy because of:
*) When creating an inode, we now synchronously write 1 more item
(an xattr item) for each property inherited from the parent dir
(or subvolume). This could be done in an asynchronous way such
as we do for dir intex items (delayed-inode.c), which could help
reduce the file creation latency;
*) With properties, we now have larger fs trees. For this particular
test each xattr item uses 75 bytes of leaf space in the fs tree.
This could be less by using a new item for xattr items, instead of
the current btrfs_dir_item, since we could cut the 'location' and
'type' fields (saving 18 bytes) and maybe 'transid' too (saving a
total of 26 bytes per xattr item) from the btrfs_dir_item type.
Also tried batching the xattr insertions (ignoring proper hash
collision handling, since it didn't exist) when creating files that
inherit properties from their parent inode/subvolume, but the end
results were (surprisingly) essentially the same.
Test script:
$ cat test.pl
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
use constant NUM_FILES => 10_000;
use constant FILE_SIZES => (64 * 1024);
use constant DEV => '/dev/sdb4';
use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev';
use constant TEST_DIR => (MNT_POINT . '/testdir');
system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!";
# following line for testing without properties
#system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!";
# following 2 lines for testing with properties
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!";
system("mkdir", TEST_DIR) == 0 or die "mkdir failed!";
my ($t1, $t2);
$t1 = time();
for (my $i = 1; $i <= NUM_FILES; $i++) {
my $p = TEST_DIR . '/file_' . $i;
open(my $f, '>', $p) or die "Error opening file!";
$f->autoflush(1);
for (my $j = 0; $j < FILE_SIZES; $j += 4096) {
print $f ('A' x 4096) or die "Error writing to file!";
}
close($f);
}
$t2 = time();
print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
system("mount", DEV, MNT_POINT) == 0 or die "mount failed!";
$t1 = time();
system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!";
$t2 = time();
print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n";
system("umount", DEV) == 0 or die "umount failed!";
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 19:47:46 +08:00
|
|
|
new_root->root_key.objectid, err);
|
|
|
|
|
2009-09-22 04:00:26 +08:00
|
|
|
err = btrfs_update_inode(trans, new_root, inode);
|
2008-10-10 01:39:39 +08:00
|
|
|
|
2009-09-22 04:00:26 +08:00
|
|
|
iput(inode);
|
2011-07-27 02:32:23 +08:00
|
|
|
return err;
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
struct inode *btrfs_alloc_inode(struct super_block *sb)
|
|
|
|
{
|
|
|
|
struct btrfs_inode *ei;
|
2010-05-16 22:46:25 +08:00
|
|
|
struct inode *inode;
|
2007-06-12 18:35:45 +08:00
|
|
|
|
|
|
|
ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
|
|
|
|
if (!ei)
|
|
|
|
return NULL;
|
2010-05-16 22:46:25 +08:00
|
|
|
|
|
|
|
ei->root = NULL;
|
|
|
|
ei->generation = 0;
|
2007-08-11 04:22:09 +08:00
|
|
|
ei->last_trans = 0;
|
2009-10-14 01:21:08 +08:00
|
|
|
ei->last_sub_trans = 0;
|
2008-09-06 04:13:11 +08:00
|
|
|
ei->logged_trans = 0;
|
2010-05-16 22:46:25 +08:00
|
|
|
ei->delalloc_bytes = 0;
|
2014-07-03 18:22:07 +08:00
|
|
|
ei->defrag_bytes = 0;
|
2010-05-16 22:46:25 +08:00
|
|
|
ei->disk_i_size = 0;
|
|
|
|
ei->flags = 0;
|
2011-08-04 22:25:02 +08:00
|
|
|
ei->csum_bytes = 0;
|
2010-05-16 22:46:25 +08:00
|
|
|
ei->index_cnt = (u64)-1;
|
2013-12-26 13:07:06 +08:00
|
|
|
ei->dir_index = 0;
|
2010-05-16 22:46:25 +08:00
|
|
|
ei->last_unlink_trans = 0;
|
2012-08-29 15:07:55 +08:00
|
|
|
ei->last_log_commit = 0;
|
2015-11-19 21:15:51 +08:00
|
|
|
ei->delayed_iput_count = 0;
|
2010-05-16 22:46:25 +08:00
|
|
|
|
2011-07-15 23:16:44 +08:00
|
|
|
spin_lock_init(&ei->lock);
|
|
|
|
ei->outstanding_extents = 0;
|
|
|
|
ei->reserved_extents = 0;
|
2010-05-16 22:46:25 +08:00
|
|
|
|
2012-05-24 02:13:11 +08:00
|
|
|
ei->runtime_flags = 0;
|
2010-12-17 14:21:50 +08:00
|
|
|
ei->force_compress = BTRFS_COMPRESS_NONE;
|
2010-05-16 22:46:25 +08:00
|
|
|
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 18:12:22 +08:00
|
|
|
ei->delayed_node = NULL;
|
|
|
|
|
2012-07-04 15:18:07 +08:00
|
|
|
ei->i_otime.tv_sec = 0;
|
|
|
|
ei->i_otime.tv_nsec = 0;
|
|
|
|
|
2010-05-16 22:46:25 +08:00
|
|
|
inode = &ei->vfs_inode;
|
2011-04-21 06:34:43 +08:00
|
|
|
extent_map_tree_init(&ei->extent_tree);
|
2011-04-21 05:35:57 +08:00
|
|
|
extent_io_tree_init(&ei->io_tree, &inode->i_data);
|
|
|
|
extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
|
2012-03-13 21:38:00 +08:00
|
|
|
ei->io_tree.track_uptodate = 1;
|
|
|
|
ei->io_failure_tree.track_uptodate = 1;
|
2012-11-17 02:56:32 +08:00
|
|
|
atomic_set(&ei->sync_writers, 0);
|
2010-05-16 22:46:25 +08:00
|
|
|
mutex_init(&ei->log_mutex);
|
2012-01-14 01:09:22 +08:00
|
|
|
mutex_init(&ei->delalloc_mutex);
|
2008-07-18 00:53:50 +08:00
|
|
|
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
|
2010-05-16 22:46:25 +08:00
|
|
|
INIT_LIST_HEAD(&ei->delalloc_inodes);
|
2015-11-19 21:15:51 +08:00
|
|
|
INIT_LIST_HEAD(&ei->delayed_iput);
|
2010-05-16 22:46:25 +08:00
|
|
|
RB_CLEAR_NODE(&ei->rb_node);
|
|
|
|
|
|
|
|
return inode;
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
|
|
|
|
2013-10-12 02:44:09 +08:00
|
|
|
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
|
|
|
|
void btrfs_test_destroy_inode(struct inode *inode)
|
|
|
|
{
|
|
|
|
btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
|
|
|
|
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2011-01-07 14:49:49 +08:00
|
|
|
static void btrfs_i_callback(struct rcu_head *head)
|
|
|
|
{
|
|
|
|
struct inode *inode = container_of(head, struct inode, i_rcu);
|
|
|
|
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
|
|
|
|
}
|
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
void btrfs_destroy_inode(struct inode *inode)
|
|
|
|
{
|
2008-07-18 00:53:50 +08:00
|
|
|
struct btrfs_ordered_extent *ordered;
|
2009-04-01 01:27:11 +08:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
|
2012-06-10 01:51:19 +08:00
|
|
|
WARN_ON(!hlist_empty(&inode->i_dentry));
|
2007-06-12 18:35:45 +08:00
|
|
|
WARN_ON(inode->i_data.nrpages);
|
2011-07-15 23:16:44 +08:00
|
|
|
WARN_ON(BTRFS_I(inode)->outstanding_extents);
|
|
|
|
WARN_ON(BTRFS_I(inode)->reserved_extents);
|
2011-08-04 22:25:02 +08:00
|
|
|
WARN_ON(BTRFS_I(inode)->delalloc_bytes);
|
|
|
|
WARN_ON(BTRFS_I(inode)->csum_bytes);
|
2014-07-03 18:22:07 +08:00
|
|
|
WARN_ON(BTRFS_I(inode)->defrag_bytes);
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2009-11-12 04:53:34 +08:00
|
|
|
/*
|
|
|
|
* This can happen where we create an inode, but somebody else also
|
|
|
|
* created the same inode and we need to destroy the one we already
|
|
|
|
* created.
|
|
|
|
*/
|
|
|
|
if (!root)
|
|
|
|
goto free;
|
|
|
|
|
2012-05-24 02:26:42 +08:00
|
|
|
if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
|
|
|
|
&BTRFS_I(inode)->runtime_flags)) {
|
2013-03-20 06:41:23 +08:00
|
|
|
btrfs_info(root->fs_info, "inode %llu still on the orphan list",
|
2013-08-20 19:20:07 +08:00
|
|
|
btrfs_ino(inode));
|
2012-05-24 02:26:42 +08:00
|
|
|
atomic_dec(&root->orphan_inodes);
|
2008-07-25 00:17:14 +08:00
|
|
|
}
|
|
|
|
|
2009-01-06 10:25:51 +08:00
|
|
|
while (1) {
|
2008-07-18 00:53:50 +08:00
|
|
|
ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
|
|
|
|
if (!ordered)
|
|
|
|
break;
|
|
|
|
else {
|
2013-03-20 06:41:23 +08:00
|
|
|
btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup",
|
2013-08-20 19:20:07 +08:00
|
|
|
ordered->file_offset, ordered->len);
|
2008-07-18 00:53:50 +08:00
|
|
|
btrfs_remove_ordered_extent(inode, ordered);
|
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
}
|
|
|
|
}
|
2015-10-13 09:53:10 +08:00
|
|
|
btrfs_qgroup_check_reserved_leak(inode);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
|
|
|
inode_tree_del(inode);
|
2008-09-26 22:05:38 +08:00
|
|
|
btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
|
2009-11-12 04:53:34 +08:00
|
|
|
free:
|
2011-01-07 14:49:49 +08:00
|
|
|
call_rcu(&inode->i_rcu, btrfs_i_callback);
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
|
|
|
|
2010-06-08 01:43:19 +08:00
|
|
|
int btrfs_drop_inode(struct inode *inode)
|
2009-09-22 04:00:26 +08:00
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2010-06-08 01:43:19 +08:00
|
|
|
|
2013-06-06 17:56:34 +08:00
|
|
|
if (root == NULL)
|
|
|
|
return 1;
|
|
|
|
|
Btrfs: fix cleaner thread not working with inode cache option
Right now inode cache inode is treated as the same as space cache
inode, ie. keep inode in memory till putting super.
But this leads to an awkward situation.
If we're going to delete a snapshot/subvolume, btrfs will not
actually delete it and return free space, but will add it to dead
roots list until the last inode on this snap/subvol being destroyed.
Then we'll fetch deleted roots and cleanup them via cleaner thread.
So here is the problem, if we enable inode cache option, each
snap/subvol has a cached inode which is used to store inode allcation
information. And this cache inode will be kept in memory, as the above
said. So with inode cache, snap/subvol can only be added into
dead roots list during freeing roots stage in umount, so that we can
ONLY get space back after another remount(we cleanup dead roots on mount).
But the real thing is we'll no more use the snap/subvol if we mark it
deleted, so we can safely iput its cache inode when we delete snap/subvol.
Another thing is that we need to change the rules of droping inode, we
don't keep snap/subvol's cache inode in memory till end so that we can
add snap/subvol into dead roots list in time.
Reported-by: Mitch Harder <mitch.harder@sabayonlinux.org>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-02-20 22:10:23 +08:00
|
|
|
/* the snap/subvol tree is on deleting */
|
2013-09-05 22:58:43 +08:00
|
|
|
if (btrfs_root_refs(&root->root_item) == 0)
|
2010-06-08 01:43:19 +08:00
|
|
|
return 1;
|
2009-09-22 04:00:26 +08:00
|
|
|
else
|
2010-06-08 01:43:19 +08:00
|
|
|
return generic_drop_inode(inode);
|
2009-09-22 04:00:26 +08:00
|
|
|
}
|
|
|
|
|
2008-07-31 04:54:26 +08:00
|
|
|
static void init_once(void *foo)
|
2007-06-12 18:35:45 +08:00
|
|
|
{
|
|
|
|
struct btrfs_inode *ei = (struct btrfs_inode *) foo;
|
|
|
|
|
|
|
|
inode_init_once(&ei->vfs_inode);
|
|
|
|
}
|
|
|
|
|
|
|
|
void btrfs_destroy_cachep(void)
|
|
|
|
{
|
2012-09-26 09:33:07 +08:00
|
|
|
/*
|
|
|
|
* Make sure all delayed rcu free inodes are flushed before we
|
|
|
|
* destroy cache.
|
|
|
|
*/
|
|
|
|
rcu_barrier();
|
2007-06-12 18:35:45 +08:00
|
|
|
if (btrfs_inode_cachep)
|
|
|
|
kmem_cache_destroy(btrfs_inode_cachep);
|
|
|
|
if (btrfs_trans_handle_cachep)
|
|
|
|
kmem_cache_destroy(btrfs_trans_handle_cachep);
|
|
|
|
if (btrfs_transaction_cachep)
|
|
|
|
kmem_cache_destroy(btrfs_transaction_cachep);
|
|
|
|
if (btrfs_path_cachep)
|
|
|
|
kmem_cache_destroy(btrfs_path_cachep);
|
2011-01-29 06:05:48 +08:00
|
|
|
if (btrfs_free_space_cachep)
|
|
|
|
kmem_cache_destroy(btrfs_free_space_cachep);
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
int btrfs_init_cachep(void)
|
|
|
|
{
|
2012-09-07 17:00:48 +08:00
|
|
|
btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
|
2009-04-13 21:33:09 +08:00
|
|
|
sizeof(struct btrfs_inode), 0,
|
2016-01-15 07:18:21 +08:00
|
|
|
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
|
|
|
|
init_once);
|
2007-06-12 18:35:45 +08:00
|
|
|
if (!btrfs_inode_cachep)
|
|
|
|
goto fail;
|
2009-04-13 21:33:09 +08:00
|
|
|
|
2012-09-07 17:00:48 +08:00
|
|
|
btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
|
2009-04-13 21:33:09 +08:00
|
|
|
sizeof(struct btrfs_trans_handle), 0,
|
|
|
|
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
|
2007-06-12 18:35:45 +08:00
|
|
|
if (!btrfs_trans_handle_cachep)
|
|
|
|
goto fail;
|
2009-04-13 21:33:09 +08:00
|
|
|
|
2012-09-07 17:00:48 +08:00
|
|
|
btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
|
2009-04-13 21:33:09 +08:00
|
|
|
sizeof(struct btrfs_transaction), 0,
|
|
|
|
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
|
2007-06-12 18:35:45 +08:00
|
|
|
if (!btrfs_transaction_cachep)
|
|
|
|
goto fail;
|
2009-04-13 21:33:09 +08:00
|
|
|
|
2012-09-07 17:00:48 +08:00
|
|
|
btrfs_path_cachep = kmem_cache_create("btrfs_path",
|
2009-04-13 21:33:09 +08:00
|
|
|
sizeof(struct btrfs_path), 0,
|
|
|
|
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
|
2007-06-12 18:35:45 +08:00
|
|
|
if (!btrfs_path_cachep)
|
|
|
|
goto fail;
|
2009-04-13 21:33:09 +08:00
|
|
|
|
2012-09-07 17:00:48 +08:00
|
|
|
btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
|
2011-01-29 06:05:48 +08:00
|
|
|
sizeof(struct btrfs_free_space), 0,
|
|
|
|
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
|
|
|
|
if (!btrfs_free_space_cachep)
|
|
|
|
goto fail;
|
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
return 0;
|
|
|
|
fail:
|
|
|
|
btrfs_destroy_cachep();
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int btrfs_getattr(struct vfsmount *mnt,
|
|
|
|
struct dentry *dentry, struct kstat *stat)
|
|
|
|
{
|
2013-01-29 18:11:59 +08:00
|
|
|
u64 delalloc_bytes;
|
2015-03-18 06:25:59 +08:00
|
|
|
struct inode *inode = d_inode(dentry);
|
2011-11-20 20:33:38 +08:00
|
|
|
u32 blocksize = inode->i_sb->s_blocksize;
|
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
generic_fillattr(inode, stat);
|
2011-07-08 03:44:25 +08:00
|
|
|
stat->dev = BTRFS_I(inode)->root->anon_dev;
|
2008-01-04 03:51:00 +08:00
|
|
|
stat->blksize = PAGE_CACHE_SIZE;
|
2013-01-29 18:11:59 +08:00
|
|
|
|
|
|
|
spin_lock(&BTRFS_I(inode)->lock);
|
|
|
|
delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
|
|
|
|
spin_unlock(&BTRFS_I(inode)->lock);
|
2011-11-20 20:33:38 +08:00
|
|
|
stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
|
2013-01-29 18:11:59 +08:00
|
|
|
ALIGN(delalloc_bytes, blocksize)) >> 9;
|
2007-06-12 18:35:45 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-01-06 10:25:51 +08:00
|
|
|
static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
|
|
|
|
struct inode *new_dir, struct dentry *new_dentry)
|
2007-06-12 18:35:45 +08:00
|
|
|
{
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct btrfs_root *root = BTRFS_I(old_dir)->root;
|
2009-09-22 03:56:00 +08:00
|
|
|
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
|
2015-03-18 06:25:59 +08:00
|
|
|
struct inode *new_inode = d_inode(new_dentry);
|
|
|
|
struct inode *old_inode = d_inode(old_dentry);
|
2007-06-12 18:35:45 +08:00
|
|
|
struct timespec ctime = CURRENT_TIME;
|
2008-08-05 23:18:09 +08:00
|
|
|
u64 index = 0;
|
2009-09-22 03:56:00 +08:00
|
|
|
u64 root_objectid;
|
2007-06-12 18:35:45 +08:00
|
|
|
int ret;
|
2011-04-20 10:31:50 +08:00
|
|
|
u64 old_ino = btrfs_ino(old_inode);
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2011-04-20 10:31:50 +08:00
|
|
|
if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
|
2009-09-24 21:17:31 +08:00
|
|
|
return -EPERM;
|
|
|
|
|
2009-09-22 03:56:00 +08:00
|
|
|
/* we only allow rename subvolume link between subvolumes */
|
2011-04-20 10:31:50 +08:00
|
|
|
if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
|
2008-11-18 09:42:26 +08:00
|
|
|
return -EXDEV;
|
|
|
|
|
2011-04-20 10:31:50 +08:00
|
|
|
if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
|
|
|
|
(new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID))
|
2007-06-12 18:35:45 +08:00
|
|
|
return -ENOTEMPTY;
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2009-09-22 03:56:00 +08:00
|
|
|
if (S_ISDIR(old_inode->i_mode) && new_inode &&
|
|
|
|
new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
|
|
|
|
return -ENOTEMPTY;
|
2012-12-18 03:26:57 +08:00
|
|
|
|
|
|
|
|
|
|
|
/* check for collisions, even if the name isn't there */
|
2013-10-10 00:24:04 +08:00
|
|
|
ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
|
2012-12-18 03:26:57 +08:00
|
|
|
new_dentry->d_name.name,
|
|
|
|
new_dentry->d_name.len);
|
|
|
|
|
|
|
|
if (ret) {
|
|
|
|
if (ret == -EEXIST) {
|
|
|
|
/* we shouldn't get
|
|
|
|
* eexist without a new_inode */
|
2013-10-31 13:00:08 +08:00
|
|
|
if (WARN_ON(!new_inode)) {
|
2012-12-18 03:26:57 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
/* maybe -EOVERFLOW */
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ret = 0;
|
|
|
|
|
2009-04-01 01:27:11 +08:00
|
|
|
/*
|
2014-08-13 01:47:42 +08:00
|
|
|
* we're using rename to replace one file with another. Start IO on it
|
|
|
|
* now so we don't add too much work to the end of the transaction
|
2009-04-01 01:27:11 +08:00
|
|
|
*/
|
2014-08-13 01:47:42 +08:00
|
|
|
if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
|
2009-04-01 01:27:11 +08:00
|
|
|
filemap_flush(old_inode->i_mapping);
|
|
|
|
|
2009-09-22 04:00:26 +08:00
|
|
|
/* close the racy window with snapshot create/destroy ioctl */
|
2011-04-20 10:31:50 +08:00
|
|
|
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
|
2009-09-22 04:00:26 +08:00
|
|
|
down_read(&root->fs_info->subvol_sem);
|
2010-05-16 22:48:46 +08:00
|
|
|
/*
|
|
|
|
* We want to reserve the absolute worst case amount of items. So if
|
|
|
|
* both inodes are subvols and we need to unlink them then that would
|
|
|
|
* require 4 item modifications, but if they are both normal inodes it
|
|
|
|
* would require 5 item modifications, so we'll assume their normal
|
|
|
|
* inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
|
|
|
|
* should cover the worst case number of items we'll modify.
|
|
|
|
*/
|
2013-03-27 03:26:55 +08:00
|
|
|
trans = btrfs_start_transaction(root, 11);
|
2011-03-31 21:23:47 +08:00
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
goto out_notrans;
|
|
|
|
}
|
2009-09-22 04:00:26 +08:00
|
|
|
|
2009-09-22 03:56:00 +08:00
|
|
|
if (dest != root)
|
|
|
|
btrfs_record_root_in_trans(trans, dest);
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2009-09-24 21:17:31 +08:00
|
|
|
ret = btrfs_set_inode_index(new_dir, &index);
|
|
|
|
if (ret)
|
|
|
|
goto out_fail;
|
2009-04-01 01:27:11 +08:00
|
|
|
|
2013-12-26 13:07:06 +08:00
|
|
|
BTRFS_I(old_inode)->dir_index = 0ULL;
|
2011-04-20 10:31:50 +08:00
|
|
|
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
|
2009-09-22 03:56:00 +08:00
|
|
|
/* force full log commit if subvolume involved. */
|
2014-04-02 19:51:06 +08:00
|
|
|
btrfs_set_log_full_commit(root->fs_info, trans);
|
2009-09-22 03:56:00 +08:00
|
|
|
} else {
|
2009-09-24 21:17:31 +08:00
|
|
|
ret = btrfs_insert_inode_ref(trans, dest,
|
|
|
|
new_dentry->d_name.name,
|
|
|
|
new_dentry->d_name.len,
|
2011-04-20 10:31:50 +08:00
|
|
|
old_ino,
|
|
|
|
btrfs_ino(new_dir), index);
|
2009-09-24 21:17:31 +08:00
|
|
|
if (ret)
|
|
|
|
goto out_fail;
|
2009-09-22 03:56:00 +08:00
|
|
|
/*
|
|
|
|
* this is an ugly little race, but the rename is required
|
|
|
|
* to make sure that if we crash, the inode is either at the
|
|
|
|
* old name or the new one. pinning the log transaction lets
|
|
|
|
* us make sure we don't allow a log commit to come in after
|
|
|
|
* we unlink the name but before we add the new name back in.
|
|
|
|
*/
|
|
|
|
btrfs_pin_log_trans(root);
|
|
|
|
}
|
2009-04-01 01:27:11 +08:00
|
|
|
|
2012-04-06 03:03:02 +08:00
|
|
|
inode_inc_iversion(old_dir);
|
|
|
|
inode_inc_iversion(new_dir);
|
|
|
|
inode_inc_iversion(old_inode);
|
2007-06-12 18:35:45 +08:00
|
|
|
old_dir->i_ctime = old_dir->i_mtime = ctime;
|
|
|
|
new_dir->i_ctime = new_dir->i_mtime = ctime;
|
|
|
|
old_inode->i_ctime = ctime;
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2009-03-24 22:24:20 +08:00
|
|
|
if (old_dentry->d_parent != new_dentry->d_parent)
|
|
|
|
btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
|
|
|
|
|
2011-04-20 10:31:50 +08:00
|
|
|
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
|
2009-09-22 03:56:00 +08:00
|
|
|
root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
|
|
|
|
ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
|
|
|
|
old_dentry->d_name.name,
|
|
|
|
old_dentry->d_name.len);
|
|
|
|
} else {
|
2011-03-05 01:14:37 +08:00
|
|
|
ret = __btrfs_unlink_inode(trans, root, old_dir,
|
2015-03-18 06:25:59 +08:00
|
|
|
d_inode(old_dentry),
|
2011-03-05 01:14:37 +08:00
|
|
|
old_dentry->d_name.name,
|
|
|
|
old_dentry->d_name.len);
|
|
|
|
if (!ret)
|
|
|
|
ret = btrfs_update_inode(trans, root, old_inode);
|
2009-09-22 03:56:00 +08:00
|
|
|
}
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret) {
|
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
|
|
|
goto out_fail;
|
|
|
|
}
|
2007-06-12 18:35:45 +08:00
|
|
|
|
|
|
|
if (new_inode) {
|
2012-04-06 03:03:02 +08:00
|
|
|
inode_inc_iversion(new_inode);
|
2007-06-12 18:35:45 +08:00
|
|
|
new_inode->i_ctime = CURRENT_TIME;
|
2011-04-20 10:31:50 +08:00
|
|
|
if (unlikely(btrfs_ino(new_inode) ==
|
2009-09-22 03:56:00 +08:00
|
|
|
BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
|
|
|
|
root_objectid = BTRFS_I(new_inode)->location.objectid;
|
|
|
|
ret = btrfs_unlink_subvol(trans, dest, new_dir,
|
|
|
|
root_objectid,
|
|
|
|
new_dentry->d_name.name,
|
|
|
|
new_dentry->d_name.len);
|
|
|
|
BUG_ON(new_inode->i_nlink == 0);
|
|
|
|
} else {
|
|
|
|
ret = btrfs_unlink_inode(trans, dest, new_dir,
|
2015-03-18 06:25:59 +08:00
|
|
|
d_inode(new_dentry),
|
2009-09-22 03:56:00 +08:00
|
|
|
new_dentry->d_name.name,
|
|
|
|
new_dentry->d_name.len);
|
|
|
|
}
|
2013-08-14 02:10:08 +08:00
|
|
|
if (!ret && new_inode->i_nlink == 0)
|
2015-03-18 06:25:59 +08:00
|
|
|
ret = btrfs_orphan_add(trans, d_inode(new_dentry));
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret) {
|
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
|
|
|
goto out_fail;
|
|
|
|
}
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
2008-07-25 00:12:38 +08:00
|
|
|
|
2009-09-22 03:56:00 +08:00
|
|
|
ret = btrfs_add_link(trans, new_dir, old_inode,
|
|
|
|
new_dentry->d_name.name,
|
2009-09-24 21:17:31 +08:00
|
|
|
new_dentry->d_name.len, 0, index);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret) {
|
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
|
|
|
goto out_fail;
|
|
|
|
}
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2013-12-26 13:07:06 +08:00
|
|
|
if (old_inode->i_nlink == 1)
|
|
|
|
BTRFS_I(old_inode)->dir_index = index;
|
|
|
|
|
2011-04-20 10:31:50 +08:00
|
|
|
if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
|
2011-07-17 11:09:10 +08:00
|
|
|
struct dentry *parent = new_dentry->d_parent;
|
2010-11-20 17:48:00 +08:00
|
|
|
btrfs_log_new_name(trans, old_inode, old_dir, parent);
|
2009-09-22 03:56:00 +08:00
|
|
|
btrfs_end_log_trans(root);
|
|
|
|
}
|
2007-06-12 18:35:45 +08:00
|
|
|
out_fail:
|
2012-01-13 08:10:12 +08:00
|
|
|
btrfs_end_transaction(trans, root);
|
2011-03-31 21:23:47 +08:00
|
|
|
out_notrans:
|
2011-04-20 10:31:50 +08:00
|
|
|
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
|
2009-09-22 04:00:26 +08:00
|
|
|
up_read(&root->fs_info->subvol_sem);
|
2009-09-12 04:12:44 +08:00
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2014-07-23 21:15:32 +08:00
|
|
|
static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
|
|
|
|
struct inode *new_dir, struct dentry *new_dentry,
|
|
|
|
unsigned int flags)
|
|
|
|
{
|
|
|
|
if (flags & ~RENAME_NOREPLACE)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
|
|
|
|
}
|
|
|
|
|
2012-10-25 17:28:04 +08:00
|
|
|
static void btrfs_run_delalloc_work(struct btrfs_work *work)
|
|
|
|
{
|
|
|
|
struct btrfs_delalloc_work *delalloc_work;
|
2013-10-29 03:03:41 +08:00
|
|
|
struct inode *inode;
|
2012-10-25 17:28:04 +08:00
|
|
|
|
|
|
|
delalloc_work = container_of(work, struct btrfs_delalloc_work,
|
|
|
|
work);
|
2013-10-29 03:03:41 +08:00
|
|
|
inode = delalloc_work->inode;
|
2015-11-28 02:27:11 +08:00
|
|
|
filemap_flush(inode->i_mapping);
|
|
|
|
if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
|
|
|
|
&BTRFS_I(inode)->runtime_flags))
|
2013-10-29 03:03:41 +08:00
|
|
|
filemap_flush(inode->i_mapping);
|
2012-10-25 17:28:04 +08:00
|
|
|
|
|
|
|
if (delalloc_work->delay_iput)
|
2013-10-29 03:03:41 +08:00
|
|
|
btrfs_add_delayed_iput(inode);
|
2012-10-25 17:28:04 +08:00
|
|
|
else
|
2013-10-29 03:03:41 +08:00
|
|
|
iput(inode);
|
2012-10-25 17:28:04 +08:00
|
|
|
complete(&delalloc_work->completion);
|
|
|
|
}
|
|
|
|
|
|
|
|
struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
|
2015-11-28 02:24:16 +08:00
|
|
|
int delay_iput)
|
2012-10-25 17:28:04 +08:00
|
|
|
{
|
|
|
|
struct btrfs_delalloc_work *work;
|
|
|
|
|
2015-12-08 21:39:32 +08:00
|
|
|
work = kmalloc(sizeof(*work), GFP_NOFS);
|
2012-10-25 17:28:04 +08:00
|
|
|
if (!work)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
init_completion(&work->completion);
|
|
|
|
INIT_LIST_HEAD(&work->list);
|
|
|
|
work->inode = inode;
|
|
|
|
work->delay_iput = delay_iput;
|
Btrfs: fix task hang under heavy compressed write
This has been reported and discussed for a long time, and this hang occurs in
both 3.15 and 3.16.
Btrfs now migrates to use kernel workqueue, but it introduces this hang problem.
Btrfs has a kind of work queued as an ordered way, which means that its
ordered_func() must be processed in the way of FIFO, so it usually looks like --
normal_work_helper(arg)
work = container_of(arg, struct btrfs_work, normal_work);
work->func() <---- (we name it work X)
for ordered_work in wq->ordered_list
ordered_work->ordered_func()
ordered_work->ordered_free()
The hang is a rare case, first when we find free space, we get an uncached block
group, then we go to read its free space cache inode for free space information,
so it will
file a readahead request
btrfs_readpages()
for page that is not in page cache
__do_readpage()
submit_extent_page()
btrfs_submit_bio_hook()
btrfs_bio_wq_end_io()
submit_bio()
end_workqueue_bio() <--(ret by the 1st endio)
queue a work(named work Y) for the 2nd
also the real endio()
So the hang occurs when work Y's work_struct and work X's work_struct happens
to share the same address.
A bit more explanation,
A,B,C -- struct btrfs_work
arg -- struct work_struct
kthread:
worker_thread()
pick up a work_struct from @worklist
process_one_work(arg)
worker->current_work = arg; <-- arg is A->normal_work
worker->current_func(arg)
normal_work_helper(arg)
A = container_of(arg, struct btrfs_work, normal_work);
A->func()
A->ordered_func()
A->ordered_free() <-- A gets freed
B->ordered_func()
submit_compressed_extents()
find_free_extent()
load_free_space_inode()
... <-- (the above readhead stack)
end_workqueue_bio()
btrfs_queue_work(work C)
B->ordered_free()
As if work A has a high priority in wq->ordered_list and there are more ordered
works queued after it, such as B->ordered_func(), its memory could have been
freed before normal_work_helper() returns, which means that kernel workqueue
code worker_thread() still has worker->current_work pointer to be work
A->normal_work's, ie. arg's address.
Meanwhile, work C is allocated after work A is freed, work C->normal_work
and work A->normal_work are likely to share the same address(I confirmed this
with ftrace output, so I'm not just guessing, it's rare though).
When another kthread picks up work C->normal_work to process, and finds our
kthread is processing it(see find_worker_executing_work()), it'll think
work C as a collision and skip then, which ends up nobody processing work C.
So the situation is that our kthread is waiting forever on work C.
Besides, there're other cases that can lead to deadlock, but the real problem
is that all btrfs workqueue shares one work->func, -- normal_work_helper,
so this makes each workqueue to have its own helper function, but only a
wraper pf normal_work_helper.
With this patch, I no long hit the above hang.
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-15 23:36:53 +08:00
|
|
|
WARN_ON_ONCE(!inode);
|
|
|
|
btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
|
|
|
|
btrfs_run_delalloc_work, NULL, NULL);
|
2012-10-25 17:28:04 +08:00
|
|
|
|
|
|
|
return work;
|
|
|
|
}
|
|
|
|
|
|
|
|
void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
|
|
|
|
{
|
|
|
|
wait_for_completion(&work->completion);
|
2015-12-08 21:39:32 +08:00
|
|
|
kfree(work);
|
2012-10-25 17:28:04 +08:00
|
|
|
}
|
|
|
|
|
2008-09-30 03:18:18 +08:00
|
|
|
/*
|
|
|
|
* some fairly slow code that needs optimization. This walks the list
|
|
|
|
* of all the inodes with pending delalloc and forces them to disk.
|
|
|
|
*/
|
2014-03-06 13:55:01 +08:00
|
|
|
static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
|
|
|
|
int nr)
|
2008-08-05 11:17:27 +08:00
|
|
|
{
|
|
|
|
struct btrfs_inode *binode;
|
2008-09-26 22:05:38 +08:00
|
|
|
struct inode *inode;
|
2012-10-25 17:28:04 +08:00
|
|
|
struct btrfs_delalloc_work *work, *next;
|
|
|
|
struct list_head works;
|
2013-01-22 18:49:00 +08:00
|
|
|
struct list_head splice;
|
2012-10-25 17:28:04 +08:00
|
|
|
int ret = 0;
|
2008-08-05 11:17:27 +08:00
|
|
|
|
2012-10-25 17:28:04 +08:00
|
|
|
INIT_LIST_HEAD(&works);
|
2013-01-22 18:49:00 +08:00
|
|
|
INIT_LIST_HEAD(&splice);
|
2013-01-22 18:50:35 +08:00
|
|
|
|
2014-03-06 13:55:03 +08:00
|
|
|
mutex_lock(&root->delalloc_mutex);
|
2013-05-15 15:48:22 +08:00
|
|
|
spin_lock(&root->delalloc_lock);
|
|
|
|
list_splice_init(&root->delalloc_inodes, &splice);
|
2013-01-22 18:49:00 +08:00
|
|
|
while (!list_empty(&splice)) {
|
|
|
|
binode = list_entry(splice.next, struct btrfs_inode,
|
2008-08-05 11:17:27 +08:00
|
|
|
delalloc_inodes);
|
2013-01-22 18:49:00 +08:00
|
|
|
|
2013-05-15 15:48:22 +08:00
|
|
|
list_move_tail(&binode->delalloc_inodes,
|
|
|
|
&root->delalloc_inodes);
|
2008-09-26 22:05:38 +08:00
|
|
|
inode = igrab(&binode->vfs_inode);
|
2013-01-29 18:11:59 +08:00
|
|
|
if (!inode) {
|
2013-05-15 15:48:22 +08:00
|
|
|
cond_resched_lock(&root->delalloc_lock);
|
2013-01-22 18:49:00 +08:00
|
|
|
continue;
|
2013-01-29 18:11:59 +08:00
|
|
|
}
|
2013-05-15 15:48:22 +08:00
|
|
|
spin_unlock(&root->delalloc_lock);
|
2013-01-22 18:49:00 +08:00
|
|
|
|
2015-11-28 02:24:16 +08:00
|
|
|
work = btrfs_alloc_delalloc_work(inode, delay_iput);
|
2014-09-30 01:20:37 +08:00
|
|
|
if (!work) {
|
2013-09-17 23:25:44 +08:00
|
|
|
if (delay_iput)
|
|
|
|
btrfs_add_delayed_iput(inode);
|
|
|
|
else
|
|
|
|
iput(inode);
|
2013-01-22 18:49:00 +08:00
|
|
|
ret = -ENOMEM;
|
2014-04-02 19:53:32 +08:00
|
|
|
goto out;
|
2008-09-26 22:05:38 +08:00
|
|
|
}
|
2013-01-22 18:49:00 +08:00
|
|
|
list_add_tail(&work->list, &works);
|
2014-02-28 10:46:09 +08:00
|
|
|
btrfs_queue_work(root->fs_info->flush_workers,
|
|
|
|
&work->work);
|
2014-03-06 13:55:01 +08:00
|
|
|
ret++;
|
|
|
|
if (nr != -1 && ret >= nr)
|
2014-04-02 19:53:32 +08:00
|
|
|
goto out;
|
2008-09-26 22:05:38 +08:00
|
|
|
cond_resched();
|
2013-05-15 15:48:22 +08:00
|
|
|
spin_lock(&root->delalloc_lock);
|
2008-08-05 11:17:27 +08:00
|
|
|
}
|
2013-05-15 15:48:22 +08:00
|
|
|
spin_unlock(&root->delalloc_lock);
|
2008-09-29 23:19:10 +08:00
|
|
|
|
2014-04-02 19:53:32 +08:00
|
|
|
out:
|
2013-05-15 15:48:22 +08:00
|
|
|
list_for_each_entry_safe(work, next, &works, list) {
|
|
|
|
list_del_init(&work->list);
|
|
|
|
btrfs_wait_and_free_delalloc_work(work);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!list_empty_careful(&splice)) {
|
|
|
|
spin_lock(&root->delalloc_lock);
|
|
|
|
list_splice_tail(&splice, &root->delalloc_inodes);
|
|
|
|
spin_unlock(&root->delalloc_lock);
|
|
|
|
}
|
2014-03-06 13:55:03 +08:00
|
|
|
mutex_unlock(&root->delalloc_mutex);
|
2013-05-15 15:48:22 +08:00
|
|
|
return ret;
|
|
|
|
}
|
2013-01-22 18:49:00 +08:00
|
|
|
|
2013-05-15 15:48:22 +08:00
|
|
|
int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
|
|
|
|
{
|
|
|
|
int ret;
|
2013-01-22 18:49:00 +08:00
|
|
|
|
2014-01-14 19:42:20 +08:00
|
|
|
if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
|
2013-05-15 15:48:22 +08:00
|
|
|
return -EROFS;
|
|
|
|
|
2014-03-06 13:55:01 +08:00
|
|
|
ret = __start_delalloc_inodes(root, delay_iput, -1);
|
|
|
|
if (ret > 0)
|
|
|
|
ret = 0;
|
2013-05-15 15:48:22 +08:00
|
|
|
/*
|
|
|
|
* the filemap_flush will queue IO into the worker threads, but
|
2008-09-29 23:19:10 +08:00
|
|
|
* we have to make sure the IO is actually started and that
|
|
|
|
* ordered extents get created before we return
|
|
|
|
*/
|
|
|
|
atomic_inc(&root->fs_info->async_submit_draining);
|
2009-01-06 10:25:51 +08:00
|
|
|
while (atomic_read(&root->fs_info->nr_async_submits) ||
|
2008-11-07 11:02:51 +08:00
|
|
|
atomic_read(&root->fs_info->async_delalloc_pages)) {
|
2008-09-29 23:19:10 +08:00
|
|
|
wait_event(root->fs_info->async_submit_wait,
|
2008-11-07 11:02:51 +08:00
|
|
|
(atomic_read(&root->fs_info->nr_async_submits) == 0 &&
|
|
|
|
atomic_read(&root->fs_info->async_delalloc_pages) == 0));
|
2008-09-29 23:19:10 +08:00
|
|
|
}
|
|
|
|
atomic_dec(&root->fs_info->async_submit_draining);
|
2013-05-15 15:48:22 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2014-03-06 13:55:01 +08:00
|
|
|
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
|
|
|
|
int nr)
|
2013-05-15 15:48:22 +08:00
|
|
|
{
|
|
|
|
struct btrfs_root *root;
|
|
|
|
struct list_head splice;
|
|
|
|
int ret;
|
|
|
|
|
2014-01-14 19:42:20 +08:00
|
|
|
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
|
2013-05-15 15:48:22 +08:00
|
|
|
return -EROFS;
|
|
|
|
|
|
|
|
INIT_LIST_HEAD(&splice);
|
|
|
|
|
2014-03-06 13:55:03 +08:00
|
|
|
mutex_lock(&fs_info->delalloc_root_mutex);
|
2013-05-15 15:48:22 +08:00
|
|
|
spin_lock(&fs_info->delalloc_root_lock);
|
|
|
|
list_splice_init(&fs_info->delalloc_roots, &splice);
|
2014-03-06 13:55:01 +08:00
|
|
|
while (!list_empty(&splice) && nr) {
|
2013-05-15 15:48:22 +08:00
|
|
|
root = list_first_entry(&splice, struct btrfs_root,
|
|
|
|
delalloc_root);
|
|
|
|
root = btrfs_grab_fs_root(root);
|
|
|
|
BUG_ON(!root);
|
|
|
|
list_move_tail(&root->delalloc_root,
|
|
|
|
&fs_info->delalloc_roots);
|
|
|
|
spin_unlock(&fs_info->delalloc_root_lock);
|
|
|
|
|
2014-03-06 13:55:01 +08:00
|
|
|
ret = __start_delalloc_inodes(root, delay_iput, nr);
|
2013-05-15 15:48:22 +08:00
|
|
|
btrfs_put_fs_root(root);
|
2014-03-06 13:55:01 +08:00
|
|
|
if (ret < 0)
|
2013-05-15 15:48:22 +08:00
|
|
|
goto out;
|
|
|
|
|
2014-03-06 13:55:01 +08:00
|
|
|
if (nr != -1) {
|
|
|
|
nr -= ret;
|
|
|
|
WARN_ON(nr < 0);
|
|
|
|
}
|
2013-05-15 15:48:22 +08:00
|
|
|
spin_lock(&fs_info->delalloc_root_lock);
|
2012-10-25 17:28:04 +08:00
|
|
|
}
|
2013-05-15 15:48:22 +08:00
|
|
|
spin_unlock(&fs_info->delalloc_root_lock);
|
2013-01-22 18:49:00 +08:00
|
|
|
|
2014-03-06 13:55:01 +08:00
|
|
|
ret = 0;
|
2013-05-15 15:48:22 +08:00
|
|
|
atomic_inc(&fs_info->async_submit_draining);
|
|
|
|
while (atomic_read(&fs_info->nr_async_submits) ||
|
|
|
|
atomic_read(&fs_info->async_delalloc_pages)) {
|
|
|
|
wait_event(fs_info->async_submit_wait,
|
|
|
|
(atomic_read(&fs_info->nr_async_submits) == 0 &&
|
|
|
|
atomic_read(&fs_info->async_delalloc_pages) == 0));
|
|
|
|
}
|
|
|
|
atomic_dec(&fs_info->async_submit_draining);
|
|
|
|
out:
|
2013-01-22 18:49:00 +08:00
|
|
|
if (!list_empty_careful(&splice)) {
|
2013-05-15 15:48:22 +08:00
|
|
|
spin_lock(&fs_info->delalloc_root_lock);
|
|
|
|
list_splice_tail(&splice, &fs_info->delalloc_roots);
|
|
|
|
spin_unlock(&fs_info->delalloc_root_lock);
|
2013-01-22 18:49:00 +08:00
|
|
|
}
|
2014-03-06 13:55:03 +08:00
|
|
|
mutex_unlock(&fs_info->delalloc_root_mutex);
|
2012-10-25 17:28:04 +08:00
|
|
|
return ret;
|
2008-08-05 11:17:27 +08:00
|
|
|
}
|
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
|
|
|
|
const char *symname)
|
|
|
|
{
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
2007-12-22 05:27:21 +08:00
|
|
|
struct inode *inode = NULL;
|
2007-06-12 18:35:45 +08:00
|
|
|
int err;
|
|
|
|
int drop_inode = 0;
|
|
|
|
u64 objectid;
|
2013-10-31 13:03:04 +08:00
|
|
|
u64 index = 0;
|
2007-06-12 18:35:45 +08:00
|
|
|
int name_len;
|
|
|
|
int datasize;
|
2007-10-16 04:14:19 +08:00
|
|
|
unsigned long ptr;
|
2007-06-12 18:35:45 +08:00
|
|
|
struct btrfs_file_extent_item *ei;
|
2007-10-16 04:14:19 +08:00
|
|
|
struct extent_buffer *leaf;
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2013-09-16 16:53:28 +08:00
|
|
|
name_len = strlen(symname);
|
2007-06-12 18:35:45 +08:00
|
|
|
if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
|
|
|
|
return -ENAMETOOLONG;
|
2007-12-22 05:27:21 +08:00
|
|
|
|
2009-09-12 04:12:44 +08:00
|
|
|
/*
|
|
|
|
* 2 items for inode item and ref
|
|
|
|
* 2 items for dir items
|
2016-01-01 02:16:29 +08:00
|
|
|
* 1 item for updating parent inode item
|
|
|
|
* 1 item for the inline extent item
|
2009-09-12 04:12:44 +08:00
|
|
|
* 1 item for xattr if selinux is on
|
|
|
|
*/
|
2016-01-01 02:16:29 +08:00
|
|
|
trans = btrfs_start_transaction(root, 7);
|
2010-05-16 22:48:46 +08:00
|
|
|
if (IS_ERR(trans))
|
|
|
|
return PTR_ERR(trans);
|
2007-12-22 05:27:21 +08:00
|
|
|
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-20 10:06:11 +08:00
|
|
|
err = btrfs_find_free_ino(root, &objectid);
|
|
|
|
if (err)
|
|
|
|
goto out_unlock;
|
|
|
|
|
2008-07-25 00:12:38 +08:00
|
|
|
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
|
2011-04-20 10:31:50 +08:00
|
|
|
dentry->d_name.len, btrfs_ino(dir), objectid,
|
2011-05-12 03:26:06 +08:00
|
|
|
S_IFLNK|S_IRWXUGO, &index);
|
2011-04-26 07:43:53 +08:00
|
|
|
if (IS_ERR(inode)) {
|
|
|
|
err = PTR_ERR(inode);
|
2007-06-12 18:35:45 +08:00
|
|
|
goto out_unlock;
|
2011-04-26 07:43:53 +08:00
|
|
|
}
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2011-12-15 23:09:07 +08:00
|
|
|
/*
|
|
|
|
* If the active LSM wants to access the inode during
|
|
|
|
* d_instantiate it needs these. Smack checks to see
|
|
|
|
* if the filesystem supports xattrs by looking at the
|
|
|
|
* ops vector.
|
|
|
|
*/
|
|
|
|
inode->i_fop = &btrfs_file_operations;
|
|
|
|
inode->i_op = &btrfs_file_inode_operations;
|
2014-09-09 04:08:51 +08:00
|
|
|
inode->i_mapping->a_ops = &btrfs_aops;
|
|
|
|
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
|
|
|
|
|
|
|
|
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
|
|
|
|
if (err)
|
|
|
|
goto out_unlock_inode;
|
2011-12-15 23:09:07 +08:00
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
path = btrfs_alloc_path();
|
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-14 01:38:47 +08:00
|
|
|
if (!path) {
|
|
|
|
err = -ENOMEM;
|
2014-09-09 04:08:51 +08:00
|
|
|
goto out_unlock_inode;
|
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-14 01:38:47 +08:00
|
|
|
}
|
2011-04-20 10:31:50 +08:00
|
|
|
key.objectid = btrfs_ino(inode);
|
2007-06-12 18:35:45 +08:00
|
|
|
key.offset = 0;
|
2014-06-05 00:41:45 +08:00
|
|
|
key.type = BTRFS_EXTENT_DATA_KEY;
|
2007-06-12 18:35:45 +08:00
|
|
|
datasize = btrfs_file_extent_calc_inline_size(name_len);
|
|
|
|
err = btrfs_insert_empty_item(trans, root, path, &key,
|
|
|
|
datasize);
|
2007-06-23 02:16:25 +08:00
|
|
|
if (err) {
|
2011-05-14 15:10:51 +08:00
|
|
|
btrfs_free_path(path);
|
2014-09-09 04:08:51 +08:00
|
|
|
goto out_unlock_inode;
|
2007-06-23 02:16:25 +08:00
|
|
|
}
|
2007-10-16 04:14:19 +08:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
ei = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
btrfs_set_file_extent_generation(leaf, ei, trans->transid);
|
|
|
|
btrfs_set_file_extent_type(leaf, ei,
|
2007-06-12 18:35:45 +08:00
|
|
|
BTRFS_FILE_EXTENT_INLINE);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-30 02:49:59 +08:00
|
|
|
btrfs_set_file_extent_encryption(leaf, ei, 0);
|
|
|
|
btrfs_set_file_extent_compression(leaf, ei, 0);
|
|
|
|
btrfs_set_file_extent_other_encoding(leaf, ei, 0);
|
|
|
|
btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
|
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
ptr = btrfs_file_extent_inline_start(ei);
|
2007-10-16 04:14:19 +08:00
|
|
|
write_extent_buffer(leaf, symname, ptr, name_len);
|
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
2007-06-12 18:35:45 +08:00
|
|
|
btrfs_free_path(path);
|
2007-10-16 04:14:19 +08:00
|
|
|
|
2007-06-12 18:35:45 +08:00
|
|
|
inode->i_op = &btrfs_symlink_inode_operations;
|
2015-11-17 14:07:57 +08:00
|
|
|
inode_nohighmem(inode);
|
2007-06-12 18:35:45 +08:00
|
|
|
inode->i_mapping->a_ops = &btrfs_symlink_aops;
|
2008-10-31 02:25:28 +08:00
|
|
|
inode_set_bytes(inode, name_len);
|
2013-09-16 16:53:28 +08:00
|
|
|
btrfs_i_size_write(inode, name_len);
|
2007-06-23 02:16:25 +08:00
|
|
|
err = btrfs_update_inode(trans, root, inode);
|
2016-01-01 02:08:24 +08:00
|
|
|
/*
|
|
|
|
* Last step, add directory indexes for our symlink inode. This is the
|
|
|
|
* last step to avoid extra cleanup of these indexes if an error happens
|
|
|
|
* elsewhere above.
|
|
|
|
*/
|
|
|
|
if (!err)
|
|
|
|
err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
|
2014-09-09 04:08:51 +08:00
|
|
|
if (err) {
|
2007-06-23 02:16:25 +08:00
|
|
|
drop_inode = 1;
|
2014-09-09 04:08:51 +08:00
|
|
|
goto out_unlock_inode;
|
|
|
|
}
|
|
|
|
|
|
|
|
unlock_new_inode(inode);
|
|
|
|
d_instantiate(dentry, inode);
|
2007-06-12 18:35:45 +08:00
|
|
|
|
|
|
|
out_unlock:
|
2012-01-13 08:10:12 +08:00
|
|
|
btrfs_end_transaction(trans, root);
|
2007-06-12 18:35:45 +08:00
|
|
|
if (drop_inode) {
|
|
|
|
inode_dec_link_count(inode);
|
|
|
|
iput(inode);
|
|
|
|
}
|
2012-11-14 22:34:34 +08:00
|
|
|
btrfs_btree_balance_dirty(root);
|
2007-06-12 18:35:45 +08:00
|
|
|
return err;
|
2014-09-09 04:08:51 +08:00
|
|
|
|
|
|
|
out_unlock_inode:
|
|
|
|
drop_inode = 1;
|
|
|
|
unlock_new_inode(inode);
|
|
|
|
goto out_unlock;
|
2007-06-12 18:35:45 +08:00
|
|
|
}
|
2008-04-10 22:23:21 +08:00
|
|
|
|
2010-06-22 02:48:16 +08:00
|
|
|
static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
|
|
|
|
u64 start, u64 num_bytes, u64 min_size,
|
|
|
|
loff_t actual_len, u64 *alloc_hint,
|
|
|
|
struct btrfs_trans_handle *trans)
|
2008-10-31 02:25:28 +08:00
|
|
|
{
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-18 01:14:17 +08:00
|
|
|
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
|
|
|
|
struct extent_map *em;
|
2008-10-31 02:25:28 +08:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_key ins;
|
|
|
|
u64 cur_offset = start;
|
2010-11-23 02:50:32 +08:00
|
|
|
u64 i_size;
|
2013-03-06 00:11:26 +08:00
|
|
|
u64 cur_bytes;
|
2015-09-24 05:11:16 +08:00
|
|
|
u64 last_alloc = (u64)-1;
|
2008-10-31 02:25:28 +08:00
|
|
|
int ret = 0;
|
2010-06-22 02:48:16 +08:00
|
|
|
bool own_trans = true;
|
2008-10-31 02:25:28 +08:00
|
|
|
|
2010-06-22 02:48:16 +08:00
|
|
|
if (trans)
|
|
|
|
own_trans = false;
|
2008-10-31 02:25:28 +08:00
|
|
|
while (num_bytes > 0) {
|
2010-06-22 02:48:16 +08:00
|
|
|
if (own_trans) {
|
|
|
|
trans = btrfs_start_transaction(root, 3);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
break;
|
|
|
|
}
|
2009-11-12 17:34:52 +08:00
|
|
|
}
|
|
|
|
|
2015-12-15 00:42:10 +08:00
|
|
|
cur_bytes = min_t(u64, num_bytes, SZ_256M);
|
2013-03-06 00:11:26 +08:00
|
|
|
cur_bytes = max(cur_bytes, min_size);
|
2015-09-24 05:11:16 +08:00
|
|
|
/*
|
|
|
|
* If we are severely fragmented we could end up with really
|
|
|
|
* small allocations, so if the allocator is returning small
|
|
|
|
* chunks lets make its job easier by only searching for those
|
|
|
|
* sized chunks.
|
|
|
|
*/
|
|
|
|
cur_bytes = min(cur_bytes, last_alloc);
|
2013-08-15 02:02:47 +08:00
|
|
|
ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
*alloc_hint, &ins, 1, 0);
|
2009-11-12 17:34:52 +08:00
|
|
|
if (ret) {
|
2010-06-22 02:48:16 +08:00
|
|
|
if (own_trans)
|
|
|
|
btrfs_end_transaction(trans, root);
|
2010-05-16 22:48:46 +08:00
|
|
|
break;
|
2008-10-31 02:25:28 +08:00
|
|
|
}
|
2009-11-12 17:34:52 +08:00
|
|
|
|
2015-09-24 05:11:16 +08:00
|
|
|
last_alloc = ins.offset;
|
2008-10-31 02:25:28 +08:00
|
|
|
ret = insert_reserved_file_extent(trans, inode,
|
|
|
|
cur_offset, ins.objectid,
|
|
|
|
ins.offset, ins.offset,
|
2009-11-12 17:34:08 +08:00
|
|
|
ins.offset, 0, 0, 0,
|
2008-10-31 02:25:28 +08:00
|
|
|
BTRFS_FILE_EXTENT_PREALLOC);
|
2012-03-12 23:03:00 +08:00
|
|
|
if (ret) {
|
2013-10-08 03:21:08 +08:00
|
|
|
btrfs_free_reserved_extent(root, ins.objectid,
|
Btrfs: fix broken free space cache after the system crashed
When we mounted the filesystem after the crash, we got the following
message:
BTRFS error (device xxx): block group xxxx has wrong amount of free space
BTRFS error (device xxx): failed to load free space cache for block group xxx
It is because we didn't update the metadata of the allocated space (in extent
tree) until the file data was written into the disk. During this time, there was
no information about the allocated spaces in either the extent tree nor the
free space cache. when we wrote out the free space cache at this time (commit
transaction), those spaces were lost. In fact, only the free space that is
used to store the file data had this problem, the others didn't because
the metadata of them is updated in the same transaction context.
There are many methods which can fix the above problem
- track the allocated space, and write it out when we write out the free
space cache
- account the size of the allocated space that is used to store the file
data, if the size is not zero, don't write out the free space cache.
The first one is complex and may make the performance drop down.
This patch chose the second method, we use a per-block-group variant to
account the size of that allocated space. Besides that, we also introduce
a per-block-group read-write semaphore to avoid the race between
the allocation and the free space cache write out.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-06-19 10:42:50 +08:00
|
|
|
ins.offset, 0);
|
2012-03-12 23:03:00 +08:00
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
|
|
|
if (own_trans)
|
|
|
|
btrfs_end_transaction(trans, root);
|
|
|
|
break;
|
|
|
|
}
|
2014-12-12 16:44:35 +08:00
|
|
|
|
2009-09-12 00:27:37 +08:00
|
|
|
btrfs_drop_extent_cache(inode, cur_offset,
|
|
|
|
cur_offset + ins.offset -1, 0);
|
2009-11-12 17:34:52 +08:00
|
|
|
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-18 01:14:17 +08:00
|
|
|
em = alloc_extent_map();
|
|
|
|
if (!em) {
|
|
|
|
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
|
|
|
|
&BTRFS_I(inode)->runtime_flags);
|
|
|
|
goto next;
|
|
|
|
}
|
|
|
|
|
|
|
|
em->start = cur_offset;
|
|
|
|
em->orig_start = cur_offset;
|
|
|
|
em->len = ins.offset;
|
|
|
|
em->block_start = ins.objectid;
|
|
|
|
em->block_len = ins.offset;
|
2012-12-03 23:31:19 +08:00
|
|
|
em->orig_block_len = ins.offset;
|
2013-04-05 02:31:27 +08:00
|
|
|
em->ram_bytes = ins.offset;
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-18 01:14:17 +08:00
|
|
|
em->bdev = root->fs_info->fs_devices->latest_bdev;
|
|
|
|
set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
|
|
|
|
em->generation = trans->transid;
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
write_lock(&em_tree->lock);
|
2013-04-06 04:51:15 +08:00
|
|
|
ret = add_extent_mapping(em_tree, em, 1);
|
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-18 01:14:17 +08:00
|
|
|
write_unlock(&em_tree->lock);
|
|
|
|
if (ret != -EEXIST)
|
|
|
|
break;
|
|
|
|
btrfs_drop_extent_cache(inode, cur_offset,
|
|
|
|
cur_offset + ins.offset - 1,
|
|
|
|
0);
|
|
|
|
}
|
|
|
|
free_extent_map(em);
|
|
|
|
next:
|
2008-10-31 02:25:28 +08:00
|
|
|
num_bytes -= ins.offset;
|
|
|
|
cur_offset += ins.offset;
|
2010-05-16 22:49:59 +08:00
|
|
|
*alloc_hint = ins.objectid + ins.offset;
|
2009-11-12 17:34:52 +08:00
|
|
|
|
2012-04-06 03:03:02 +08:00
|
|
|
inode_inc_iversion(inode);
|
2008-10-31 02:25:28 +08:00
|
|
|
inode->i_ctime = CURRENT_TIME;
|
2009-04-17 16:37:41 +08:00
|
|
|
BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
|
2008-10-31 02:25:28 +08:00
|
|
|
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
|
2010-05-16 22:49:59 +08:00
|
|
|
(actual_len > inode->i_size) &&
|
|
|
|
(cur_offset > inode->i_size)) {
|
2010-01-20 15:28:54 +08:00
|
|
|
if (cur_offset > actual_len)
|
2010-11-23 02:50:32 +08:00
|
|
|
i_size = actual_len;
|
2010-01-20 15:28:54 +08:00
|
|
|
else
|
2010-11-23 02:50:32 +08:00
|
|
|
i_size = cur_offset;
|
|
|
|
i_size_write(inode, i_size);
|
|
|
|
btrfs_ordered_update_i_size(inode, i_size, NULL);
|
2009-11-12 17:34:52 +08:00
|
|
|
}
|
|
|
|
|
2008-10-31 02:25:28 +08:00
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
2012-03-12 23:03:00 +08:00
|
|
|
|
|
|
|
if (ret) {
|
|
|
|
btrfs_abort_transaction(trans, root, ret);
|
|
|
|
if (own_trans)
|
|
|
|
btrfs_end_transaction(trans, root);
|
|
|
|
break;
|
|
|
|
}
|
2008-10-31 02:25:28 +08:00
|
|
|
|
2010-06-22 02:48:16 +08:00
|
|
|
if (own_trans)
|
|
|
|
btrfs_end_transaction(trans, root);
|
2009-11-12 17:34:52 +08:00
|
|
|
}
|
2008-10-31 02:25:28 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2010-06-22 02:48:16 +08:00
|
|
|
int btrfs_prealloc_file_range(struct inode *inode, int mode,
|
|
|
|
u64 start, u64 num_bytes, u64 min_size,
|
|
|
|
loff_t actual_len, u64 *alloc_hint)
|
|
|
|
{
|
|
|
|
return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
|
|
|
|
min_size, actual_len, alloc_hint,
|
|
|
|
NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
int btrfs_prealloc_file_range_trans(struct inode *inode,
|
|
|
|
struct btrfs_trans_handle *trans, int mode,
|
|
|
|
u64 start, u64 num_bytes, u64 min_size,
|
|
|
|
loff_t actual_len, u64 *alloc_hint)
|
|
|
|
{
|
|
|
|
return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
|
|
|
|
min_size, actual_len, alloc_hint, trans);
|
|
|
|
}
|
|
|
|
|
2008-07-18 00:53:50 +08:00
|
|
|
static int btrfs_set_page_dirty(struct page *page)
|
|
|
|
{
|
|
|
|
return __set_page_dirty_nobuffers(page);
|
|
|
|
}
|
|
|
|
|
2011-06-21 07:28:19 +08:00
|
|
|
static int btrfs_permission(struct inode *inode, int mask)
|
2008-01-15 02:26:08 +08:00
|
|
|
{
|
2010-12-20 16:04:08 +08:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2011-08-16 01:27:21 +08:00
|
|
|
umode_t mode = inode->i_mode;
|
2010-12-20 16:04:08 +08:00
|
|
|
|
2011-08-16 01:27:21 +08:00
|
|
|
if (mask & MAY_WRITE &&
|
|
|
|
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
|
|
|
|
if (btrfs_root_readonly(root))
|
|
|
|
return -EROFS;
|
|
|
|
if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
|
|
|
|
return -EACCES;
|
|
|
|
}
|
2011-06-21 07:16:29 +08:00
|
|
|
return generic_permission(inode, mask);
|
2008-01-15 02:26:08 +08:00
|
|
|
}
|
2007-06-12 18:35:45 +08:00
|
|
|
|
2014-04-28 03:40:45 +08:00
|
|
|
static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
|
|
|
|
{
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
|
|
|
struct inode *inode = NULL;
|
|
|
|
u64 objectid;
|
|
|
|
u64 index;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* 5 units required for adding orphan entry
|
|
|
|
*/
|
|
|
|
trans = btrfs_start_transaction(root, 5);
|
|
|
|
if (IS_ERR(trans))
|
|
|
|
return PTR_ERR(trans);
|
|
|
|
|
|
|
|
ret = btrfs_find_free_ino(root, &objectid);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
inode = btrfs_new_inode(trans, root, dir, NULL, 0,
|
|
|
|
btrfs_ino(dir), objectid, mode, &index);
|
|
|
|
if (IS_ERR(inode)) {
|
|
|
|
ret = PTR_ERR(inode);
|
|
|
|
inode = NULL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
inode->i_fop = &btrfs_file_operations;
|
|
|
|
inode->i_op = &btrfs_file_inode_operations;
|
|
|
|
|
|
|
|
inode->i_mapping->a_ops = &btrfs_aops;
|
|
|
|
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
|
|
|
|
|
2014-09-09 04:08:51 +08:00
|
|
|
ret = btrfs_init_inode_security(trans, inode, dir, NULL);
|
|
|
|
if (ret)
|
|
|
|
goto out_inode;
|
|
|
|
|
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
|
|
|
if (ret)
|
|
|
|
goto out_inode;
|
2014-04-28 03:40:45 +08:00
|
|
|
ret = btrfs_orphan_add(trans, inode);
|
|
|
|
if (ret)
|
2014-09-09 04:08:51 +08:00
|
|
|
goto out_inode;
|
2014-04-28 03:40:45 +08:00
|
|
|
|
2014-08-01 07:10:32 +08:00
|
|
|
/*
|
|
|
|
* We set number of links to 0 in btrfs_new_inode(), and here we set
|
|
|
|
* it to 1 because d_tmpfile() will issue a warning if the count is 0,
|
|
|
|
* through:
|
|
|
|
*
|
|
|
|
* d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
|
|
|
|
*/
|
|
|
|
set_nlink(inode, 1);
|
2014-09-09 04:08:51 +08:00
|
|
|
unlock_new_inode(inode);
|
2014-04-28 03:40:45 +08:00
|
|
|
d_tmpfile(dentry, inode);
|
|
|
|
mark_inode_dirty(inode);
|
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_end_transaction(trans, root);
|
|
|
|
if (ret)
|
|
|
|
iput(inode);
|
|
|
|
btrfs_balance_delayed_items(root);
|
|
|
|
btrfs_btree_balance_dirty(root);
|
|
|
|
return ret;
|
2014-09-09 04:08:51 +08:00
|
|
|
|
|
|
|
out_inode:
|
|
|
|
unlock_new_inode(inode);
|
|
|
|
goto out;
|
|
|
|
|
2014-04-28 03:40:45 +08:00
|
|
|
}
|
|
|
|
|
Btrfs: ensure ordered extent errors aren't missed on fsync
When doing a fsync with a fast path we have a time window where we can miss
the fact that writeback of some file data failed, and therefore we endup
returning success (0) from fsync when we should return an error.
The steps that lead to this are the following:
1) We start all ordered extents by calling filemap_fdatawrite_range();
2) We do some other work like locking the inode's i_mutex, start a transaction,
start a log transaction, etc;
3) We enter btrfs_log_inode(), acquire the inode's log_mutex and collect all the
ordered extents from inode's ordered tree into a list;
4) But by the time we do ordered extent collection, some ordered extents we started
at step 1) might have already completed with an error, and therefore we didn't
found them in the ordered tree and had no idea they finished with an error. This
makes our fsync return success (0) to userspace, but has no bad effects on the log
like for example insertion of file extent items into the log that point to unwritten
extents, because the invalid extent maps were removed before the ordered extent
completed (in inode.c:btrfs_finish_ordered_io).
So after collecting the ordered extents just check if the inode's i_mapping has any
error flags set (AS_EIO or AS_ENOSPC) and leave with an error if it does. Whenever
writeback fails for a page of an ordered extent, we call mapping_set_error (done in
extent_io.c:end_extent_writepage, called by extent_io.c:end_bio_extent_writepage)
that sets one of those error flags in the inode's i_mapping flags.
This change also has the side effect of fixing the issue where for fast fsyncs we
never checked/cleared the error flags from the inode's i_mapping flags, which means
that a full fsync performed after a fast fsync could get such errors that belonged
to the fast fsync - because the full fsync calls btrfs_wait_ordered_range() which
calls filemap_fdatawait_range(), and the later checks for and clears those flags,
while for fast fsyncs we never call filemap_fdatawait_range() or anything else
that checks for and clears the error flags from the inode's i_mapping.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-11-14 01:01:45 +08:00
|
|
|
/* Inspired by filemap_check_errors() */
|
|
|
|
int btrfs_inode_check_errors(struct inode *inode)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
if (test_bit(AS_ENOSPC, &inode->i_mapping->flags) &&
|
|
|
|
test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags))
|
|
|
|
ret = -ENOSPC;
|
|
|
|
if (test_bit(AS_EIO, &inode->i_mapping->flags) &&
|
|
|
|
test_and_clear_bit(AS_EIO, &inode->i_mapping->flags))
|
|
|
|
ret = -EIO;
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2009-09-22 08:01:11 +08:00
|
|
|
static const struct inode_operations btrfs_dir_inode_operations = {
|
2008-11-18 09:42:26 +08:00
|
|
|
.getattr = btrfs_getattr,
|
2007-06-12 18:35:45 +08:00
|
|
|
.lookup = btrfs_lookup,
|
|
|
|
.create = btrfs_create,
|
|
|
|
.unlink = btrfs_unlink,
|
|
|
|
.link = btrfs_link,
|
|
|
|
.mkdir = btrfs_mkdir,
|
|
|
|
.rmdir = btrfs_rmdir,
|
2014-07-23 21:15:32 +08:00
|
|
|
.rename2 = btrfs_rename2,
|
2007-06-12 18:35:45 +08:00
|
|
|
.symlink = btrfs_symlink,
|
|
|
|
.setattr = btrfs_setattr,
|
2007-07-11 22:18:17 +08:00
|
|
|
.mknod = btrfs_mknod,
|
2008-08-28 18:21:17 +08:00
|
|
|
.setxattr = btrfs_setxattr,
|
2015-12-02 21:44:37 +08:00
|
|
|
.getxattr = generic_getxattr,
|
2007-11-17 00:45:54 +08:00
|
|
|
.listxattr = btrfs_listxattr,
|
2008-08-28 18:21:17 +08:00
|
|
|
.removexattr = btrfs_removexattr,
|
2008-01-15 02:26:08 +08:00
|
|
|
.permission = btrfs_permission,
|
2011-07-23 23:37:31 +08:00
|
|
|
.get_acl = btrfs_get_acl,
|
2013-12-20 21:16:43 +08:00
|
|
|
.set_acl = btrfs_set_acl,
|
2013-09-17 01:42:03 +08:00
|
|
|
.update_time = btrfs_update_time,
|
2014-04-28 03:40:45 +08:00
|
|
|
.tmpfile = btrfs_tmpfile,
|
2007-06-12 18:35:45 +08:00
|
|
|
};
|
2009-09-22 08:01:11 +08:00
|
|
|
static const struct inode_operations btrfs_dir_ro_inode_operations = {
|
2007-06-12 18:35:45 +08:00
|
|
|
.lookup = btrfs_lookup,
|
2008-01-15 02:26:08 +08:00
|
|
|
.permission = btrfs_permission,
|
2011-07-23 23:37:31 +08:00
|
|
|
.get_acl = btrfs_get_acl,
|
2013-12-20 21:16:43 +08:00
|
|
|
.set_acl = btrfs_set_acl,
|
2013-09-17 01:42:03 +08:00
|
|
|
.update_time = btrfs_update_time,
|
2007-06-12 18:35:45 +08:00
|
|
|
};
|
2009-09-22 04:00:26 +08:00
|
|
|
|
2009-10-02 06:43:56 +08:00
|
|
|
static const struct file_operations btrfs_dir_file_operations = {
|
2007-06-12 18:35:45 +08:00
|
|
|
.llseek = generic_file_llseek,
|
|
|
|
.read = generic_read_dir,
|
2013-05-23 04:48:09 +08:00
|
|
|
.iterate = btrfs_real_readdir,
|
2007-09-14 22:22:47 +08:00
|
|
|
.unlocked_ioctl = btrfs_ioctl,
|
2007-06-12 18:35:45 +08:00
|
|
|
#ifdef CONFIG_COMPAT
|
2007-09-14 22:22:47 +08:00
|
|
|
.compat_ioctl = btrfs_ioctl,
|
2007-06-12 18:35:45 +08:00
|
|
|
#endif
|
2008-06-10 22:07:39 +08:00
|
|
|
.release = btrfs_release_file,
|
2008-09-06 04:13:11 +08:00
|
|
|
.fsync = btrfs_sync_file,
|
2007-06-12 18:35:45 +08:00
|
|
|
};
|
|
|
|
|
2015-11-19 18:42:28 +08:00
|
|
|
static const struct extent_io_ops btrfs_extent_io_ops = {
|
2007-08-30 20:50:51 +08:00
|
|
|
.fill_delalloc = run_delalloc_range,
|
2008-02-21 01:07:25 +08:00
|
|
|
.submit_bio_hook = btrfs_submit_bio_hook,
|
2008-03-25 03:02:07 +08:00
|
|
|
.merge_bio_hook = btrfs_merge_bio_hook,
|
2007-08-30 20:50:51 +08:00
|
|
|
.readpage_end_io_hook = btrfs_readpage_end_io_hook,
|
2008-07-18 00:53:50 +08:00
|
|
|
.writepage_end_io_hook = btrfs_writepage_end_io_hook,
|
2008-07-18 00:53:51 +08:00
|
|
|
.writepage_start_hook = btrfs_writepage_start_hook,
|
2008-02-01 00:05:37 +08:00
|
|
|
.set_bit_hook = btrfs_set_bit_hook,
|
|
|
|
.clear_bit_hook = btrfs_clear_bit_hook,
|
2009-09-12 04:12:44 +08:00
|
|
|
.merge_extent_hook = btrfs_merge_extent_hook,
|
|
|
|
.split_extent_hook = btrfs_split_extent_hook,
|
2007-08-30 20:50:51 +08:00
|
|
|
};
|
|
|
|
|
2009-01-22 02:11:13 +08:00
|
|
|
/*
|
|
|
|
* btrfs doesn't support the bmap operation because swapfiles
|
|
|
|
* use bmap to make a mapping of extents in the file. They assume
|
|
|
|
* these extents won't change over the life of the file and they
|
|
|
|
* use the bmap result to do IO directly to the drive.
|
|
|
|
*
|
|
|
|
* the btrfs bmap call would return logical addresses that aren't
|
|
|
|
* suitable for IO and they also will change frequently as COW
|
|
|
|
* operations happen. So, swapfile + btrfs == corruption.
|
|
|
|
*
|
|
|
|
* For now we're avoiding this by dropping bmap.
|
|
|
|
*/
|
2009-09-22 08:01:10 +08:00
|
|
|
static const struct address_space_operations btrfs_aops = {
|
2007-06-12 18:35:45 +08:00
|
|
|
.readpage = btrfs_readpage,
|
|
|
|
.writepage = btrfs_writepage,
|
2007-11-02 07:45:34 +08:00
|
|
|
.writepages = btrfs_writepages,
|
2007-11-08 23:59:22 +08:00
|
|
|
.readpages = btrfs_readpages,
|
2008-04-10 22:23:21 +08:00
|
|
|
.direct_IO = btrfs_direct_IO,
|
2007-08-28 04:49:44 +08:00
|
|
|
.invalidatepage = btrfs_invalidatepage,
|
|
|
|
.releasepage = btrfs_releasepage,
|
2008-07-18 00:53:50 +08:00
|
|
|
.set_page_dirty = btrfs_set_page_dirty,
|
2009-09-16 17:50:18 +08:00
|
|
|
.error_remove_page = generic_error_remove_page,
|
2007-06-12 18:35:45 +08:00
|
|
|
};
|
|
|
|
|
2009-09-22 08:01:10 +08:00
|
|
|
static const struct address_space_operations btrfs_symlink_aops = {
|
2007-06-12 18:35:45 +08:00
|
|
|
.readpage = btrfs_readpage,
|
|
|
|
.writepage = btrfs_writepage,
|
2007-08-30 23:54:02 +08:00
|
|
|
.invalidatepage = btrfs_invalidatepage,
|
|
|
|
.releasepage = btrfs_releasepage,
|
2007-06-12 18:35:45 +08:00
|
|
|
};
|
|
|
|
|
2009-09-22 08:01:11 +08:00
|
|
|
static const struct inode_operations btrfs_file_inode_operations = {
|
2007-06-12 18:35:45 +08:00
|
|
|
.getattr = btrfs_getattr,
|
|
|
|
.setattr = btrfs_setattr,
|
2008-08-28 18:21:17 +08:00
|
|
|
.setxattr = btrfs_setxattr,
|
2015-12-02 21:44:37 +08:00
|
|
|
.getxattr = generic_getxattr,
|
2007-11-17 00:45:54 +08:00
|
|
|
.listxattr = btrfs_listxattr,
|
2008-08-28 18:21:17 +08:00
|
|
|
.removexattr = btrfs_removexattr,
|
2008-01-15 02:26:08 +08:00
|
|
|
.permission = btrfs_permission,
|
2009-01-22 03:39:14 +08:00
|
|
|
.fiemap = btrfs_fiemap,
|
2011-07-23 23:37:31 +08:00
|
|
|
.get_acl = btrfs_get_acl,
|
2013-12-20 21:16:43 +08:00
|
|
|
.set_acl = btrfs_set_acl,
|
2012-03-26 21:46:47 +08:00
|
|
|
.update_time = btrfs_update_time,
|
2007-06-12 18:35:45 +08:00
|
|
|
};
|
2009-09-22 08:01:11 +08:00
|
|
|
static const struct inode_operations btrfs_special_inode_operations = {
|
2007-07-11 22:18:17 +08:00
|
|
|
.getattr = btrfs_getattr,
|
|
|
|
.setattr = btrfs_setattr,
|
2008-01-15 02:26:08 +08:00
|
|
|
.permission = btrfs_permission,
|
2008-08-28 18:21:17 +08:00
|
|
|
.setxattr = btrfs_setxattr,
|
2015-12-02 21:44:37 +08:00
|
|
|
.getxattr = generic_getxattr,
|
2008-07-25 00:16:36 +08:00
|
|
|
.listxattr = btrfs_listxattr,
|
2008-08-28 18:21:17 +08:00
|
|
|
.removexattr = btrfs_removexattr,
|
2011-07-23 23:37:31 +08:00
|
|
|
.get_acl = btrfs_get_acl,
|
2013-12-20 21:16:43 +08:00
|
|
|
.set_acl = btrfs_set_acl,
|
2012-03-26 21:46:47 +08:00
|
|
|
.update_time = btrfs_update_time,
|
2007-07-11 22:18:17 +08:00
|
|
|
};
|
2009-09-22 08:01:11 +08:00
|
|
|
static const struct inode_operations btrfs_symlink_inode_operations = {
|
2007-06-12 18:35:45 +08:00
|
|
|
.readlink = generic_readlink,
|
2015-11-17 23:20:54 +08:00
|
|
|
.get_link = page_get_link,
|
2010-11-19 10:05:24 +08:00
|
|
|
.getattr = btrfs_getattr,
|
2011-11-30 23:45:38 +08:00
|
|
|
.setattr = btrfs_setattr,
|
2008-01-15 02:26:08 +08:00
|
|
|
.permission = btrfs_permission,
|
2009-02-04 22:29:13 +08:00
|
|
|
.setxattr = btrfs_setxattr,
|
2015-12-02 21:44:37 +08:00
|
|
|
.getxattr = generic_getxattr,
|
2009-02-04 22:29:13 +08:00
|
|
|
.listxattr = btrfs_listxattr,
|
|
|
|
.removexattr = btrfs_removexattr,
|
2012-03-26 21:46:47 +08:00
|
|
|
.update_time = btrfs_update_time,
|
2007-06-12 18:35:45 +08:00
|
|
|
};
|
2009-09-22 04:00:26 +08:00
|
|
|
|
2009-10-09 21:54:36 +08:00
|
|
|
const struct dentry_operations btrfs_dentry_operations = {
|
2009-09-22 04:00:26 +08:00
|
|
|
.d_delete = btrfs_dentry_delete,
|
2011-06-29 04:18:59 +08:00
|
|
|
.d_release = btrfs_dentry_release,
|
2009-09-22 04:00:26 +08:00
|
|
|
};
|