From 69e9c6c6dc87587d8846e447febedefe037c14f8 Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Thu, 5 Sep 2013 16:58:43 +0200
Subject: [PATCH 01/98] Btrfs: eliminate the exceptional root_tree refs=0

The fact that btrfs_root_refs() returned 0 for the tree_root caused
bugs in the past, therefore it is set to 1 with this patch and
(hopefully) all affected code is adapted to this change.

I verified this change by temporarily adding WARN_ON() checks
everywhere where btrfs_root_refs() is used, checking whether the
logic of the code is changed by btrfs_root_refs() returning 1
instead of 0 for root->root_key.objectid == BTRFS_ROOT_TREE_OBJECTID.
With these added checks, I ran the xfstests './check -g auto'.

The two roots chunk_root and log_root_tree that are only referenced
by the superblock and the log_roots below the log_root_tree still
have btrfs_root_refs() == 0, only the tree_root is changed.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/disk-io.c   |  1 +
 fs/btrfs/inode-map.c |  3 +--
 fs/btrfs/inode.c     | 21 ++++++++-------------
 3 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 62176ad89846..f724397b396b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2670,6 +2670,7 @@ retry_root_backup:
 
 	btrfs_set_root_node(&tree_root->root_item, tree_root->node);
 	tree_root->commit_root = btrfs_root_node(tree_root);
+	btrfs_set_root_refs(&tree_root->root_item, 1);
 
 	location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
 	location.type = BTRFS_ROOT_ITEM_KEY;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 2c66ddbbe670..d11e1c6610b6 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -412,8 +412,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
 		return 0;
 
 	/* Don't save inode cache if we are deleting this root */
-	if (btrfs_root_refs(&root->root_item) == 0 &&
-	    root != root->fs_info->tree_root)
+	if (btrfs_root_refs(&root->root_item) == 0)
 		return 0;
 
 	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 51e3afa78354..dfc60aa40332 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4472,8 +4472,10 @@ void btrfs_evict_inode(struct inode *inode)
 	trace_btrfs_inode_evict(inode);
 
 	truncate_inode_pages(&inode->i_data, 0);
-	if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
-			       btrfs_is_free_space_inode(inode)))
+	if (inode->i_nlink &&
+	    ((btrfs_root_refs(&root->root_item) != 0 &&
+	      root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
+	     btrfs_is_free_space_inode(inode)))
 		goto no_delete;
 
 	if (is_bad_inode(inode)) {
@@ -4490,7 +4492,8 @@ void btrfs_evict_inode(struct inode *inode)
 	}
 
 	if (inode->i_nlink > 0) {
-		BUG_ON(btrfs_root_refs(&root->root_item) != 0);
+		BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
+		       root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
 		goto no_delete;
 	}
 
@@ -4731,14 +4734,7 @@ static void inode_tree_del(struct inode *inode)
 	}
 	spin_unlock(&root->inode_lock);
 
-	/*
-	 * Free space cache has inodes in the tree root, but the tree root has a
-	 * root_refs of 0, so this could end up dropping the tree root as a
-	 * snapshot, so we need the extra !root->fs_info->tree_root check to
-	 * make sure we don't drop it.
-	 */
-	if (empty && btrfs_root_refs(&root->root_item) == 0 &&
-	    root != root->fs_info->tree_root) {
+	if (empty && btrfs_root_refs(&root->root_item) == 0) {
 		synchronize_srcu(&root->fs_info->subvol_srcu);
 		spin_lock(&root->inode_lock);
 		empty = RB_EMPTY_ROOT(&root->inode_tree);
@@ -7857,8 +7853,7 @@ int btrfs_drop_inode(struct inode *inode)
 		return 1;
 
 	/* the snap/subvol tree is on deleting */
-	if (btrfs_root_refs(&root->root_item) == 0 &&
-	    root != root->fs_info->tree_root)
+	if (btrfs_root_refs(&root->root_item) == 0)
 		return 1;
 	else
 		return generic_drop_inode(inode);

From f06becc4119856c984e3beef54aa75538e656c6d Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Mon, 16 Sep 2013 09:53:28 +0100
Subject: [PATCH 02/98] Btrfs: don't store NULL byte in symlink extents

It is not necessary to store the NULL byte in a symlink inline file
extent. There's currently no code that requires the NULL byte to be
present in the extent. This change also doesn't break file format
compatibility nor the send/receive feature.

The VFS also doesn't need the NULL byte to be present in the extent,
as it reads up to inode->i_size bytes (which already excluded the NULL
byte) and sets the NULL byte for us (in fs/namei.c:page_getlink()).

So with this change we save 1 byte per symlink file extent (which is
always inlined in the btree leaf) without losing backward and forward
compatibility.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index dfc60aa40332..59b4e2e495ac 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8339,7 +8339,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	struct btrfs_file_extent_item *ei;
 	struct extent_buffer *leaf;
 
-	name_len = strlen(symname) + 1;
+	name_len = strlen(symname);
 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
 		return -ENAMETOOLONG;
 
@@ -8427,7 +8427,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	inode->i_mapping->a_ops = &btrfs_symlink_aops;
 	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
 	inode_set_bytes(inode, name_len);
-	btrfs_i_size_write(inode, name_len - 1);
+	btrfs_i_size_write(inode, name_len);
 	err = btrfs_update_inode(trans, root, inode);
 	if (err)
 		drop_inode = 1;

From dd3cc16b8750251ea9b1a843ce7806e82b015d5e Mon Sep 17 00:00:00 2001
From: Ross Kirk <ross.kirk@gmail.com>
Date: Mon, 16 Sep 2013 15:58:09 +0100
Subject: [PATCH 03/98] btrfs: drop unused parameter from btrfs_item_nr

Remove unused eb parameter from btrfs_item_nr

Signed-off-by: Ross Kirk <ross.kirk@gmail.com>
Reviewed-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/backref.c    |  2 +-
 fs/btrfs/ctree.c      | 34 +++++++++++++++++-----------------
 fs/btrfs/ctree.h      | 13 ++++++-------
 fs/btrfs/dir-item.c   |  2 +-
 fs/btrfs/inode-item.c |  2 +-
 fs/btrfs/inode.c      |  4 ++--
 fs/btrfs/print-tree.c |  2 +-
 fs/btrfs/send.c       |  4 ++--
 8 files changed, 31 insertions(+), 32 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 0552a599b28f..721936a5b2ca 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1619,7 +1619,7 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
 		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
 		btrfs_release_path(path);
 
-		item = btrfs_item_nr(eb, slot);
+		item = btrfs_item_nr(slot);
 		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
 
 		for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 61b5bcd57b7e..c274a752c93d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -3337,8 +3337,8 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr)
 	if (!nr)
 		return 0;
 	btrfs_init_map_token(&token);
-	start_item = btrfs_item_nr(l, start);
-	end_item = btrfs_item_nr(l, end);
+	start_item = btrfs_item_nr(start);
+	end_item = btrfs_item_nr(end);
 	data_len = btrfs_token_item_offset(l, start_item, &token) +
 		btrfs_token_item_size(l, start_item, &token);
 	data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
@@ -3406,7 +3406,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 	slot = path->slots[1];
 	i = left_nritems - 1;
 	while (i >= nr) {
-		item = btrfs_item_nr(left, i);
+		item = btrfs_item_nr(i);
 
 		if (!empty && push_items > 0) {
 			if (path->slots[0] > i)
@@ -3470,7 +3470,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 	btrfs_set_header_nritems(right, right_nritems);
 	push_space = BTRFS_LEAF_DATA_SIZE(root);
 	for (i = 0; i < right_nritems; i++) {
-		item = btrfs_item_nr(right, i);
+		item = btrfs_item_nr(i);
 		push_space -= btrfs_token_item_size(right, item, &token);
 		btrfs_set_token_item_offset(right, item, push_space, &token);
 	}
@@ -3612,7 +3612,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 		nr = min(right_nritems - 1, max_slot);
 
 	for (i = 0; i < nr; i++) {
-		item = btrfs_item_nr(right, i);
+		item = btrfs_item_nr(i);
 
 		if (!empty && push_items > 0) {
 			if (path->slots[0] < i)
@@ -3663,7 +3663,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
 		u32 ioff;
 
-		item = btrfs_item_nr(left, i);
+		item = btrfs_item_nr(i);
 
 		ioff = btrfs_token_item_offset(left, item, &token);
 		btrfs_set_token_item_offset(left, item,
@@ -3694,7 +3694,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	btrfs_set_header_nritems(right, right_nritems);
 	push_space = BTRFS_LEAF_DATA_SIZE(root);
 	for (i = 0; i < right_nritems; i++) {
-		item = btrfs_item_nr(right, i);
+		item = btrfs_item_nr(i);
 
 		push_space = push_space - btrfs_token_item_size(right,
 								item, &token);
@@ -3835,7 +3835,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
 		      btrfs_item_end_nr(l, mid);
 
 	for (i = 0; i < nritems; i++) {
-		struct btrfs_item *item = btrfs_item_nr(right, i);
+		struct btrfs_item *item = btrfs_item_nr(i);
 		u32 ioff;
 
 		ioff = btrfs_token_item_offset(right, item, &token);
@@ -4177,7 +4177,7 @@ static noinline int split_item(struct btrfs_trans_handle *trans,
 
 	btrfs_set_path_blocking(path);
 
-	item = btrfs_item_nr(leaf, path->slots[0]);
+	item = btrfs_item_nr(path->slots[0]);
 	orig_offset = btrfs_item_offset(leaf, item);
 	item_size = btrfs_item_size(leaf, item);
 
@@ -4200,7 +4200,7 @@ static noinline int split_item(struct btrfs_trans_handle *trans,
 	btrfs_cpu_key_to_disk(&disk_key, new_key);
 	btrfs_set_item_key(leaf, &disk_key, slot);
 
-	new_item = btrfs_item_nr(leaf, slot);
+	new_item = btrfs_item_nr(slot);
 
 	btrfs_set_item_offset(leaf, new_item, orig_offset);
 	btrfs_set_item_size(leaf, new_item, item_size - split_offset);
@@ -4339,7 +4339,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
 	/* first correct the data pointers */
 	for (i = slot; i < nritems; i++) {
 		u32 ioff;
-		item = btrfs_item_nr(leaf, i);
+		item = btrfs_item_nr(i);
 
 		ioff = btrfs_token_item_offset(leaf, item, &token);
 		btrfs_set_token_item_offset(leaf, item,
@@ -4387,7 +4387,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
 			fixup_low_keys(root, path, &disk_key, 1);
 	}
 
-	item = btrfs_item_nr(leaf, slot);
+	item = btrfs_item_nr(slot);
 	btrfs_set_item_size(leaf, item, new_size);
 	btrfs_mark_buffer_dirty(leaf);
 
@@ -4441,7 +4441,7 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
 	/* first correct the data pointers */
 	for (i = slot; i < nritems; i++) {
 		u32 ioff;
-		item = btrfs_item_nr(leaf, i);
+		item = btrfs_item_nr(i);
 
 		ioff = btrfs_token_item_offset(leaf, item, &token);
 		btrfs_set_token_item_offset(leaf, item,
@@ -4455,7 +4455,7 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
 
 	data_end = old_data;
 	old_size = btrfs_item_size_nr(leaf, slot);
-	item = btrfs_item_nr(leaf, slot);
+	item = btrfs_item_nr(slot);
 	btrfs_set_item_size(leaf, item, old_size + data_size);
 	btrfs_mark_buffer_dirty(leaf);
 
@@ -4514,7 +4514,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
 		for (i = slot; i < nritems; i++) {
 			u32 ioff;
 
-			item = btrfs_item_nr(leaf, i);
+			item = btrfs_item_nr( i);
 			ioff = btrfs_token_item_offset(leaf, item, &token);
 			btrfs_set_token_item_offset(leaf, item,
 						    ioff - total_data, &token);
@@ -4535,7 +4535,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
 	for (i = 0; i < nr; i++) {
 		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
 		btrfs_set_item_key(leaf, &disk_key, slot + i);
-		item = btrfs_item_nr(leaf, slot + i);
+		item = btrfs_item_nr(slot + i);
 		btrfs_set_token_item_offset(leaf, item,
 					    data_end - data_size[i], &token);
 		data_end -= data_size[i];
@@ -4730,7 +4730,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		for (i = slot + nr; i < nritems; i++) {
 			u32 ioff;
 
-			item = btrfs_item_nr(leaf, i);
+			item = btrfs_item_nr(i);
 			ioff = btrfs_token_item_offset(leaf, item, &token);
 			btrfs_set_token_item_offset(leaf, item,
 						    ioff + dsize, &token);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0506f40ede83..fa117f7d5a8f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2461,8 +2461,7 @@ static inline unsigned long btrfs_item_nr_offset(int nr)
 		sizeof(struct btrfs_item) * nr;
 }
 
-static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb,
-					       int nr)
+static inline struct btrfs_item *btrfs_item_nr(int nr)
 {
 	return (struct btrfs_item *)btrfs_item_nr_offset(nr);
 }
@@ -2475,30 +2474,30 @@ static inline u32 btrfs_item_end(struct extent_buffer *eb,
 
 static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr)
 {
-	return btrfs_item_end(eb, btrfs_item_nr(eb, nr));
+	return btrfs_item_end(eb, btrfs_item_nr(nr));
 }
 
 static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr)
 {
-	return btrfs_item_offset(eb, btrfs_item_nr(eb, nr));
+	return btrfs_item_offset(eb, btrfs_item_nr(nr));
 }
 
 static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr)
 {
-	return btrfs_item_size(eb, btrfs_item_nr(eb, nr));
+	return btrfs_item_size(eb, btrfs_item_nr(nr));
 }
 
 static inline void btrfs_item_key(struct extent_buffer *eb,
 			   struct btrfs_disk_key *disk_key, int nr)
 {
-	struct btrfs_item *item = btrfs_item_nr(eb, nr);
+	struct btrfs_item *item = btrfs_item_nr(nr);
 	read_eb_member(eb, item, struct btrfs_item, key, disk_key);
 }
 
 static inline void btrfs_set_item_key(struct extent_buffer *eb,
 			       struct btrfs_disk_key *disk_key, int nr)
 {
-	struct btrfs_item *item = btrfs_item_nr(eb, nr);
+	struct btrfs_item *item = btrfs_item_nr(nr);
 	write_eb_member(eb, item, struct btrfs_item, key, disk_key);
 }
 
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 79e594e341c7..1c529db1697f 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -58,7 +58,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
 		return ERR_PTR(ret);
 	WARN_ON(ret > 0);
 	leaf = path->nodes[0];
-	item = btrfs_item_nr(leaf, path->slots[0]);
+	item = btrfs_item_nr(path->slots[0]);
 	ptr = btrfs_item_ptr(leaf, path->slots[0], char);
 	BUG_ON(data_size > btrfs_item_size(leaf, item));
 	ptr += btrfs_item_size(leaf, item) - data_size;
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index e0b7034d6343..ec82fae07097 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -369,7 +369,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
 		goto out;
 
 	leaf = path->nodes[0];
-	item = btrfs_item_nr(leaf, path->slots[0]);
+	item = btrfs_item_nr(path->slots[0]);
 	ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
 	ptr += btrfs_item_size(leaf, item) - ins_len;
 	extref = (struct btrfs_inode_extref *)ptr;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 59b4e2e495ac..57b27012d43a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5044,7 +5044,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 			continue;
 		}
 
-		item = btrfs_item_nr(leaf, slot);
+		item = btrfs_item_nr(slot);
 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
 		if (found_key.objectid != key.objectid)
@@ -5856,7 +5856,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
 	compress_type = btrfs_file_extent_compression(leaf, item);
 	max_size = btrfs_file_extent_ram_bytes(leaf, item);
 	inline_size = btrfs_file_extent_inline_item_len(leaf,
-					btrfs_item_nr(leaf, path->slots[0]));
+					btrfs_item_nr(path->slots[0]));
 	tmp = kmalloc(inline_size, GFP_NOFS);
 	if (!tmp)
 		return -ENOMEM;
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 0088bedc8631..417053b17181 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -193,7 +193,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 	btrfs_info(root->fs_info, "leaf %llu total ptrs %d free space %d",
 		   btrfs_header_bytenr(l), nr, btrfs_leaf_free_space(root, l));
 	for (i = 0 ; i < nr ; i++) {
-		item = btrfs_item_nr(l, i);
+		item = btrfs_item_nr(i);
 		btrfs_item_key_to_cpu(l, &key, i);
 		type = btrfs_key_type(&key);
 		printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d "
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index e46e0ed74925..0a894399be16 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -791,7 +791,7 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
 	if (found_key->type == BTRFS_INODE_REF_KEY) {
 		ptr = (unsigned long)btrfs_item_ptr(eb, slot,
 						    struct btrfs_inode_ref);
-		item = btrfs_item_nr(eb, slot);
+		item = btrfs_item_nr(slot);
 		total = btrfs_item_size(eb, item);
 		elem_size = sizeof(*iref);
 	} else {
@@ -905,7 +905,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
 
 	eb = path->nodes[0];
 	slot = path->slots[0];
-	item = btrfs_item_nr(eb, slot);
+	item = btrfs_item_nr(slot);
 	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
 	cur = 0;
 	len = 0;

From 06ea65a398a2501e94beee3a425d07e1846ff25a Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Thu, 19 Sep 2013 16:07:01 -0400
Subject: [PATCH 04/98] Btrfs: add a sanity test for btrfs_split_item

While looking at somebodys corruption I became completely convinced that
btrfs_split_item was broken, so I wrote this test to verify that it was working
as it was supposed to.  Thankfully it appears to be working as intended, so just
add this test to make sure nobody breaks it in the future.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/Makefile                    |   3 +-
 fs/btrfs/ctree.h                     |   4 +-
 fs/btrfs/disk-io.c                   |  38 ++++-
 fs/btrfs/disk-io.h                   |   4 +
 fs/btrfs/super.c                     |   7 +-
 fs/btrfs/tests/btrfs-tests.h         |   5 +
 fs/btrfs/tests/extent-buffer-tests.c | 229 +++++++++++++++++++++++++++
 7 files changed, 283 insertions(+), 7 deletions(-)
 create mode 100644 fs/btrfs/tests/extent-buffer-tests.c

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index a91a6a355cc5..4c7dfbfaa3b3 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -14,4 +14,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
 
-btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o
+btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
+	tests/extent-buffer-tests.o
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fa117f7d5a8f..ee0b8aa82828 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1724,7 +1724,9 @@ struct btrfs_root {
 	int ref_cows;
 	int track_dirty;
 	int in_radix;
-
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+	int dummy_root;
+#endif
 	u64 defrag_trans_start;
 	struct btrfs_key defrag_progress;
 	struct btrfs_key defrag_max;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f724397b396b..db2095486a4f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1229,14 +1229,18 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	atomic_set(&root->refs, 1);
 	root->log_transid = 0;
 	root->last_log_commit = 0;
-	extent_io_tree_init(&root->dirty_log_pages,
-			     fs_info->btree_inode->i_mapping);
+	if (fs_info)
+		extent_io_tree_init(&root->dirty_log_pages,
+				     fs_info->btree_inode->i_mapping);
 
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
 	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
 	memset(&root->root_kobj, 0, sizeof(root->root_kobj));
-	root->defrag_trans_start = fs_info->generation;
+	if (fs_info)
+		root->defrag_trans_start = fs_info->generation;
+	else
+		root->defrag_trans_start = 0;
 	init_completion(&root->kobj_unregister);
 	root->defrag_running = 0;
 	root->root_key.objectid = objectid;
@@ -1253,6 +1257,22 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
 	return root;
 }
 
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+/* Should only be used by the testing infrastructure */
+struct btrfs_root *btrfs_alloc_dummy_root(void)
+{
+	struct btrfs_root *root;
+
+	root = btrfs_alloc_root(NULL);
+	if (!root)
+		return ERR_PTR(-ENOMEM);
+	__setup_root(4096, 4096, 4096, 4096, root, NULL, 1);
+	root->dummy_root = 1;
+
+	return root;
+}
+#endif
+
 struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 				     struct btrfs_fs_info *fs_info,
 				     u64 objectid)
@@ -3670,10 +3690,20 @@ int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
 
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 {
-	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+	struct btrfs_root *root;
 	u64 transid = btrfs_header_generation(buf);
 	int was_dirty;
 
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+	/*
+	 * This is a fast path so only do this check if we have sanity tests
+	 * enabled.  Normal people shouldn't be marking dummy buffers as dirty
+	 * outside of the sanity tests.
+	 */
+	if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &buf->bflags)))
+		return;
+#endif
+	root = BTRFS_I(buf->pages[0]->mapping->host)->root;
 	btrfs_assert_tree_locked(buf);
 	if (transid != root->fs_info->generation)
 		WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 5ce2a7da8b11..53059df350f8 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -86,6 +86,10 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
 				 struct btrfs_root *root);
 void btrfs_free_fs_root(struct btrfs_root *root);
 
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+struct btrfs_root *btrfs_alloc_dummy_root(void);
+#endif
+
 /*
  * This function is used to grab the root, and avoid it is freed when we
  * access it. But it doesn't ensure that the tree is not dropped.
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index e913328d0f2a..0e398657d759 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1789,7 +1789,12 @@ static void btrfs_print_info(void)
 
 static int btrfs_run_sanity_tests(void)
 {
-	return btrfs_test_free_space_cache();
+	int ret;
+
+	ret = btrfs_test_free_space_cache();
+	if (ret)
+		return ret;
+	return btrfs_test_extent_buffer_operations();
 }
 
 static int __init init_btrfs_fs(void)
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index 580877625776..04f2cd2ca568 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -24,11 +24,16 @@
 #define test_msg(fmt, ...) pr_info("btrfs: selftest: " fmt, ##__VA_ARGS__)
 
 int btrfs_test_free_space_cache(void);
+int btrfs_test_extent_buffer_operations(void);
 #else
 static inline int btrfs_test_free_space_cache(void)
 {
 	return 0;
 }
+static inline int btrfs_test_extent_buffer_operations(void)
+{
+	return 0;
+}
 #endif
 
 #endif
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
new file mode 100644
index 000000000000..cc286ce97d1e
--- /dev/null
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -0,0 +1,229 @@
+/*
+ * Copyright (C) 2013 Fusion IO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/slab.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../extent_io.h"
+#include "../disk-io.h"
+
+static int test_btrfs_split_item(void)
+{
+	struct btrfs_path *path;
+	struct btrfs_root *root;
+	struct extent_buffer *eb;
+	struct btrfs_item *item;
+	char *value = "mary had a little lamb";
+	char *split1 = "mary had a little";
+	char *split2 = " lamb";
+	char *split3 = "mary";
+	char *split4 = " had a little";
+	char buf[32];
+	struct btrfs_key key;
+	u32 value_len = strlen(value);
+	int ret = 0;
+
+	test_msg("Running btrfs_split_item tests\n");
+
+	root = btrfs_alloc_dummy_root();
+	if (IS_ERR(root)) {
+		test_msg("Could not allocate root\n");
+		return PTR_ERR(root);
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		test_msg("Could not allocate path\n");
+		kfree(root);
+		return -ENOMEM;
+	}
+
+	path->nodes[0] = eb = alloc_dummy_extent_buffer(0, 4096);
+	if (!eb) {
+		test_msg("Could not allocate dummy buffer\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+	path->slots[0] = 0;
+
+	key.objectid = 0;
+	key.type = BTRFS_EXTENT_CSUM_KEY;
+	key.offset = 0;
+
+	setup_items_for_insert(root, path, &key, &value_len, value_len,
+			       value_len + sizeof(struct btrfs_item), 1);
+	item = btrfs_item_nr(0);
+	write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
+			    value_len);
+
+	key.offset = 3;
+
+	/*
+	 * Passing NULL trans here should be safe because we have plenty of
+	 * space in this leaf to split the item without having to split the
+	 * leaf.
+	 */
+	ret = btrfs_split_item(NULL, root, path, &key, 17);
+	if (ret) {
+		test_msg("Split item failed %d\n", ret);
+		goto out;
+	}
+
+	/*
+	 * Read the first slot, it should have the original key and contain only
+	 * 'mary had a little'
+	 */
+	btrfs_item_key_to_cpu(eb, &key, 0);
+	if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+	    key.offset != 0) {
+		test_msg("Invalid key at slot 0\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	item = btrfs_item_nr(0);
+	if (btrfs_item_size(eb, item) != strlen(split1)) {
+		test_msg("Invalid len in the first split\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0),
+			   strlen(split1));
+	if (memcmp(buf, split1, strlen(split1))) {
+		test_msg("Data in the buffer doesn't match what it should "
+			 "in the first split have='%.*s' want '%s'\n",
+			 (int)strlen(split1), buf, split1);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	btrfs_item_key_to_cpu(eb, &key, 1);
+	if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+	    key.offset != 3) {
+		test_msg("Invalid key at slot 1\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	item = btrfs_item_nr(1);
+	if (btrfs_item_size(eb, item) != strlen(split2)) {
+		test_msg("Invalid len in the second split\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1),
+			   strlen(split2));
+	if (memcmp(buf, split2, strlen(split2))) {
+		test_msg("Data in the buffer doesn't match what it should "
+			 "in the second split\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	key.offset = 1;
+	/* Do it again so we test memmoving the other items in the leaf */
+	ret = btrfs_split_item(NULL, root, path, &key, 4);
+	if (ret) {
+		test_msg("Second split item failed %d\n", ret);
+		goto out;
+	}
+
+	btrfs_item_key_to_cpu(eb, &key, 0);
+	if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+	    key.offset != 0) {
+		test_msg("Invalid key at slot 0\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	item = btrfs_item_nr(0);
+	if (btrfs_item_size(eb, item) != strlen(split3)) {
+		test_msg("Invalid len in the first split\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0),
+			   strlen(split3));
+	if (memcmp(buf, split3, strlen(split3))) {
+		test_msg("Data in the buffer doesn't match what it should "
+			 "in the third split");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	btrfs_item_key_to_cpu(eb, &key, 1);
+	if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+	    key.offset != 1) {
+		test_msg("Invalid key at slot 1\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	item = btrfs_item_nr(1);
+	if (btrfs_item_size(eb, item) != strlen(split4)) {
+		test_msg("Invalid len in the second split\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1),
+			   strlen(split4));
+	if (memcmp(buf, split4, strlen(split4))) {
+		test_msg("Data in the buffer doesn't match what it should "
+			 "in the fourth split\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	btrfs_item_key_to_cpu(eb, &key, 2);
+	if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+	    key.offset != 3) {
+		test_msg("Invalid key at slot 2\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	item = btrfs_item_nr(2);
+	if (btrfs_item_size(eb, item) != strlen(split2)) {
+		test_msg("Invalid len in the second split\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 2),
+			   strlen(split2));
+	if (memcmp(buf, split2, strlen(split2))) {
+		test_msg("Data in the buffer doesn't match what it should "
+			 "in the last chunk\n");
+		ret = -EINVAL;
+		goto out;
+	}
+out:
+	btrfs_free_path(path);
+	kfree(root);
+	return ret;
+}
+
+int btrfs_test_extent_buffer_operations(void)
+{
+	test_msg("Running extent buffer operation tests");
+	return test_btrfs_split_item();
+}

From d4b4087c43cc00a196c5be57fac41f41309f1d56 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Tue, 24 Sep 2013 14:09:34 -0400
Subject: [PATCH 05/98] Btrfs: do a full search everytime in
 btrfs_search_old_slot

While running some snashot aware defrag tests I noticed I was panicing every
once and a while in key_search.  This is because of the optimization that says
if we find a key at slot 0 it will be at slot 0 all the way down the rest of the
tree.  This isn't the case for btrfs_search_old_slot since it will likely replay
changes to a buffer if something has changed since we took our sequence number.
So short circuit this optimization by setting prev_cmp to -1 every time we call
key_search so we will do our normal binary search.  With this patch I am no
longer seeing the panics I was seeing before.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ctree.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c274a752c93d..6932686b61cf 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2758,7 +2758,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
 	int level;
 	int lowest_unlock = 1;
 	u8 lowest_level = 0;
-	int prev_cmp;
+	int prev_cmp = -1;
 
 	lowest_level = p->lowest_level;
 	WARN_ON(p->nodes[0] != NULL);
@@ -2769,7 +2769,6 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
 	}
 
 again:
-	prev_cmp = -1;
 	b = get_old_root(root, time_seq);
 	level = btrfs_header_level(b);
 	p->locks[level] = BTRFS_READ_LOCK;
@@ -2787,6 +2786,11 @@ again:
 		 */
 		btrfs_unlock_up_safe(p, level + 1);
 
+		/*
+		 * Since we can unwind eb's we want to do a real search every
+		 * time.
+		 */
+		prev_cmp = -1;
 		ret = key_search(b, key, level, &prev_cmp, &slot);
 
 		if (level != 0) {

From 53645a91f4baed059ec5dfb22340cf3e6b8fdd2c Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Fri, 20 Sep 2013 14:43:28 +0100
Subject: [PATCH 06/98] Btrfs: remove duplicated ino cache's inode lookup

We're doing a unnecessary extra lookup of the ino cache's
inode when we already have it (and holding a reference)
during the process of saving the ino cache contents to disk.
Therefore remove this extra lookup.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/free-space-cache.c | 9 ++-------
 fs/btrfs/free-space-cache.h | 3 ++-
 fs/btrfs/inode-map.c        | 2 +-
 3 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index b4f9904c4c6b..9a1e371a6eba 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2967,19 +2967,15 @@ out:
 
 int btrfs_write_out_ino_cache(struct btrfs_root *root,
 			      struct btrfs_trans_handle *trans,
-			      struct btrfs_path *path)
+			      struct btrfs_path *path,
+			      struct inode *inode)
 {
 	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
-	struct inode *inode;
 	int ret;
 
 	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
 		return 0;
 
-	inode = lookup_free_ino_inode(root, path);
-	if (IS_ERR(inode))
-		return 0;
-
 	ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
 	if (ret) {
 		btrfs_delalloc_release_metadata(inode, inode->i_size);
@@ -2990,7 +2986,6 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
 #endif
 	}
 
-	iput(inode);
 	return ret;
 }
 
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index e737f92cf6d0..ba8d5a1b64b5 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -76,7 +76,8 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info,
 			struct btrfs_root *root);
 int btrfs_write_out_ino_cache(struct btrfs_root *root,
 			      struct btrfs_trans_handle *trans,
-			      struct btrfs_path *path);
+			      struct btrfs_path *path,
+			      struct inode *inode);
 
 void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group);
 int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index d11e1c6610b6..b863e45d373b 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -503,7 +503,7 @@ again:
 	}
 	btrfs_free_reserved_data_space(inode, prealloc);
 
-	ret = btrfs_write_out_ino_cache(root, trans, path);
+	ret = btrfs_write_out_ino_cache(root, trans, path, inode);
 out_put:
 	iput(inode);
 out_release:

From 74514323947ef27347564bfd7a663fdb3429cb20 Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Fri, 20 Sep 2013 14:46:51 +0100
Subject: [PATCH 07/98] Btrfs: remove path arg from
 btrfs_truncate_free_space_cache

Not used for anything, and removing it avoids caller's need to
allocate a path structure.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent-tree.c      |  3 +--
 fs/btrfs/free-space-cache.c |  1 -
 fs/btrfs/free-space-cache.h |  1 -
 fs/btrfs/inode-map.c        |  2 +-
 fs/btrfs/relocation.c       | 11 +----------
 5 files changed, 3 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d58bef130a41..55b537fcb81a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3197,8 +3197,7 @@ again:
 		if (ret)
 			goto out_put;
 
-		ret = btrfs_truncate_free_space_cache(root, trans, path,
-						      inode);
+		ret = btrfs_truncate_free_space_cache(root, trans, inode);
 		if (ret)
 			goto out_put;
 	}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 9a1e371a6eba..4772f3a8e7e5 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -218,7 +218,6 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
 
 int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 				    struct btrfs_trans_handle *trans,
-				    struct btrfs_path *path,
 				    struct inode *inode)
 {
 	int ret = 0;
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index ba8d5a1b64b5..0cf4977ef70d 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -58,7 +58,6 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
 				       struct btrfs_block_rsv *rsv);
 int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 				    struct btrfs_trans_handle *trans,
-				    struct btrfs_path *path,
 				    struct inode *inode);
 int load_free_space_cache(struct btrfs_fs_info *fs_info,
 			  struct btrfs_block_group_cache *block_group);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b863e45d373b..014de497e888 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -466,7 +466,7 @@ again:
 	}
 
 	if (i_size_read(inode) > 0) {
-		ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
+		ret = btrfs_truncate_free_space_cache(root, trans, inode);
 		if (ret) {
 			if (ret != -ENOSPC)
 				btrfs_abort_transaction(trans, root, ret);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 4a355726151e..dec4f5aead0f 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3407,7 +3407,6 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
 				    struct inode *inode, u64 ino)
 {
 	struct btrfs_key key;
-	struct btrfs_path *path;
 	struct btrfs_root *root = fs_info->tree_root;
 	struct btrfs_trans_handle *trans;
 	int ret = 0;
@@ -3432,22 +3431,14 @@ truncate:
 	if (ret)
 		goto out;
 
-	path = btrfs_alloc_path();
-	if (!path) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
 	trans = btrfs_join_transaction(root);
 	if (IS_ERR(trans)) {
-		btrfs_free_path(path);
 		ret = PTR_ERR(trans);
 		goto out;
 	}
 
-	ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
+	ret = btrfs_truncate_free_space_cache(root, trans, inode);
 
-	btrfs_free_path(path);
 	btrfs_end_transaction(trans, root);
 	btrfs_btree_balance_dirty(root);
 out:

From fe09e16cc8d444ecc52f6f9a651946f16fffa4e1 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Sun, 22 Sep 2013 12:54:23 +0800
Subject: [PATCH 08/98] Btrfs: export btrfs space shared info to userspace

Similar to ocfs2, btrfs also supports that extents can be shared by
different inodes, and there are some userspace tools requesting
for this kind of 'space shared infomation'.[1]

ocfs2 uses flag FIEMAP_EXTENT_SHARED, so does btrfs.

[1]: http://thr3ads.net/ocfs2-devel/2010/09/489052-PATCH-3-3-shared-du-using-fiemap-to-figure-up-the-shared-extents-per-file-and-the-footprint-in

Reviewed-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent_io.c | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 51731b76900d..2bf6f46fae44 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -20,6 +20,7 @@
 #include "check-integrity.h"
 #include "locking.h"
 #include "rcu-string.h"
+#include "backref.h"
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
@@ -4062,6 +4063,19 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
 	return NULL;
 }
 
+static noinline int count_ext_ref(u64 inum, u64 offset, u64 root_id, void *ctx)
+{
+	unsigned long cnt = *((unsigned long *)ctx);
+
+	cnt++;
+	*((unsigned long *)ctx) = cnt;
+
+	/* Now we're sure that the extent is shared. */
+	if (cnt > 1)
+		return 1;
+	return 0;
+}
+
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		__u64 start, __u64 len, get_extent_t *get_extent)
 {
@@ -4128,7 +4142,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		last = found_key.offset;
 		last_for_get_extent = last + 1;
 	}
-	btrfs_free_path(path);
+	btrfs_release_path(path);
 
 	/*
 	 * we might have some extents allocated but more delalloc past those
@@ -4198,7 +4212,24 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			flags |= (FIEMAP_EXTENT_DELALLOC |
 				  FIEMAP_EXTENT_UNKNOWN);
 		} else {
+			unsigned long ref_cnt = 0;
+
 			disko = em->block_start + offset_in_extent;
+
+			/*
+			 * As btrfs supports shared space, this information
+			 * can be exported to userspace tools via
+			 * flag FIEMAP_EXTENT_SHARED.
+			 */
+			ret = iterate_inodes_from_logical(
+					em->block_start,
+					BTRFS_I(inode)->root->fs_info,
+					path, count_ext_ref, &ref_cnt);
+			if (ret < 0 && ret != -ENOENT)
+				goto out_free;
+
+			if (ref_cnt > 1)
+				flags |= FIEMAP_EXTENT_SHARED;
 		}
 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
 			flags |= FIEMAP_EXTENT_ENCODED;
@@ -4230,6 +4261,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 out_free:
 	free_extent_map(em);
 out:
+	btrfs_free_path(path);
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
 			     &cached_state, GFP_NOFS);
 	return ret;

From 703c88e035242202e3ab48fcbbbe0a7bc62fb7bb Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Sun, 22 Sep 2013 21:54:55 +0100
Subject: [PATCH 09/98] Btrfs: fix tracking of orphan inode count

In inode.c:btrfs_orphan_add() if we failed to insert the orphan
item, we would return without decrementing the orphan count that
we just incremented before attempting the insertion, leaving the
orphan inode count wrong.

In inode.c:btrfs_orphan_del(), we were decrementing the inode
orphan count if the bit BTRFS_INODE_ORPHAN_META_RESERVED was set,
which is logically wrong because it should be decremented if the
bit BTRFS_INODE_HAS_ORPHAN_ITEM was set - after all we increment
the count when we set the bit BTRFS_INODE_HAS_ORPHAN_ITEM elsewhere.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/inode.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 57b27012d43a..939dd1ff34a7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2969,6 +2969,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 	if (insert >= 1) {
 		ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
 		if (ret) {
+			atomic_dec(&root->orphan_inodes);
 			if (reserve) {
 				clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
 					  &BTRFS_I(inode)->runtime_flags);
@@ -3018,14 +3019,16 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
 		release_rsv = 1;
 	spin_unlock(&root->orphan_lock);
 
-	if (trans && delete_item)
-		ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
-
-	if (release_rsv) {
-		btrfs_orphan_release_metadata(inode);
+	if (delete_item) {
 		atomic_dec(&root->orphan_inodes);
+		if (trans)
+			ret = btrfs_del_orphan_item(trans, root,
+						    btrfs_ino(inode));
 	}
 
+	if (release_rsv)
+		btrfs_orphan_release_metadata(inode);
+
 	return ret;
 }
 

From 9b1998598625fb5b798e8291cafda1a8ec17c1bd Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Mon, 23 Sep 2013 11:35:11 +0100
Subject: [PATCH 10/98] Btrfs: fix sync fs to actually wait for all data to be
 persisted

Currently the fs sync function (super.c:btrfs_sync_fs()) doesn't
wait for delayed work to finish before returning success to the
caller. This change fixes this, ensuring that there's no data loss
if a power failure happens right after fs sync returns success to
the caller and before the next commit happens.

Steps to reproduce the data loss issue:

$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ perl -e '$d = ("\x41" x 6001); open($f,">","/mnt/btrfs/foobar"); print $f $d; close($f);' && btrfs fi sync /mnt/btrfs

Right after the btrfs fi sync command (a second or 2 for example), power
off the machine and reboot it. The file will be empty, as it can be verified
after mounting the filesystem and through btrfs-debug-tree:

$ btrfs-debug-tree /dev/sdb3 | egrep '\(257 INODE_ITEM 0\) itemoff' -B 3 -A 8
        item 3 key (256 DIR_INDEX 2) itemoff 3751 itemsize 36
                location key (257 INODE_ITEM 0) type FILE
                namelen 6 datalen 0 name: foobar
        item 4 key (257 INODE_ITEM 0) itemoff 3591 itemsize 160
                inode generation 7 transid 7 size 0 block group 0 mode 100644 links 1
        item 5 key (257 INODE_REF 256) itemoff 3575 itemsize 16
                inode ref index 2 namelen 6 name: foobar
checksum tree key (CSUM_TREE ROOT_ITEM 0)
leaf 29429760 items 0 free space 3995 generation 7 owner 7
fs uuid 6192815c-af2a-4b75-b3db-a959ffb6166e
chunk uuid b529c44b-938c-4d3d-910a-013b4700bcae
uuid tree key (UUID_TREE ROOT_ITEM 0)

After this patch, the data loss no longer happens after a power failure and
btrfs-debug-tree shows:

$ btrfs-debug-tree /dev/sdb3 | egrep '\(257 INODE_ITEM 0\) itemoff' -B 3 -A 8
	item 3 key (256 DIR_INDEX 2) itemoff 3751 itemsize 36
		location key (257 INODE_ITEM 0) type FILE
		namelen 6 datalen 0 name: foobar
	item 4 key (257 INODE_ITEM 0) itemoff 3591 itemsize 160
		inode generation 6 transid 6 size 6001 block group 0 mode 100644 links 1
	item 5 key (257 INODE_REF 256) itemoff 3575 itemsize 16
		inode ref index 2 namelen 6 name: foobar
	item 6 key (257 EXTENT_DATA 0) itemoff 3522 itemsize 53
		extent data disk byte 12845056 nr 8192
		extent data offset 0 nr 8192 ram 8192
		extent compression 0
checksum tree key (CSUM_TREE ROOT_ITEM 0)

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Reviewed-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ioctl.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9d46f60cb943..385c58f2bba1 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4557,9 +4557,15 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_logical_to_ino(root, argp);
 	case BTRFS_IOC_SPACE_INFO:
 		return btrfs_ioctl_space_info(root, argp);
-	case BTRFS_IOC_SYNC:
-		btrfs_sync_fs(file->f_dentry->d_sb, 1);
-		return 0;
+	case BTRFS_IOC_SYNC: {
+		int ret;
+
+		ret = btrfs_start_all_delalloc_inodes(root->fs_info, 0);
+		if (ret)
+			return ret;
+		ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
+		return ret;
+	}
 	case BTRFS_IOC_START_SYNC:
 		return btrfs_ioctl_start_sync(root, argp);
 	case BTRFS_IOC_WAIT_SYNC:

From e84cc14213e2c81ae5a2da341a9da0d58a1dbfad Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Mon, 9 Sep 2013 19:49:43 +0100
Subject: [PATCH 11/98] Btrfs: don't leak block group on error

In extent-tree.c:btrfs_write_dirty_block_groups(), if the call to
write_one_cache_group() failed, we would return without putting
the block group first.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent-tree.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 55b537fcb81a..54ee542c705e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3317,10 +3317,9 @@ again:
 		last = cache->key.objectid + cache->key.offset;
 
 		err = write_one_cache_group(trans, root, path, cache);
+		btrfs_put_block_group(cache);
 		if (err) /* File system offline */
 			goto out;
-
-		btrfs_put_block_group(cache);
 	}
 
 	while (1) {

From 4577b014d1bc3db386da3246f625888fc48083a9 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 27 Sep 2013 09:33:09 -0400
Subject: [PATCH 12/98] Btrfs: relocate csums properly with prealloc extents

A user reported a problem where they were getting csum errors when running a
balance and running systemd's journal.  This is because systemd is awesome and
fallocate()'s its log space and writes into it.  Unfortunately we assume that
when we read in all the csums for an extent that they are sequential starting at
the bytenr we care about.  This obviously isn't the case for prealloc extents,
where we could have written to the middle of the prealloc extent only, which
means the csum would be for the bytenr in the middle of our range and not the
front of our range.  Fix this by offsetting the new bytenr we are logging to
based on the original bytenr the csum was for.  With this patch I no longer see
the csum errors I was seeing.  Thanks,

Cc: stable@vger.kernel.org
Reported-by: Chris Murphy <lists@colorremedies.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/relocation.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index dec4f5aead0f..0359eecac95d 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4472,6 +4472,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
 	u64 disk_bytenr;
+	u64 new_bytenr;
 	LIST_HEAD(list);
 
 	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
@@ -4483,13 +4484,24 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
 	if (ret)
 		goto out;
 
-	disk_bytenr = ordered->start;
 	while (!list_empty(&list)) {
 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
 		list_del_init(&sums->list);
 
-		sums->bytenr = disk_bytenr;
-		disk_bytenr += sums->len;
+		/*
+		 * We need to offset the new_bytenr based on where the csum is.
+		 * We need to do this because we will read in entire prealloc
+		 * extents but we may have written to say the middle of the
+		 * prealloc extent, so we need to make sure the csum goes with
+		 * the right disk offset.
+		 *
+		 * We can do this because the data reloc inode refers strictly
+		 * to the on disk bytes, so we don't have to worry about
+		 * disk_len vs real len like with real inodes since it's all
+		 * disk length.
+		 */
+		new_bytenr = ordered->start + (sums->bytenr - disk_bytenr);
+		sums->bytenr = new_bytenr;
 
 		btrfs_add_ordered_sum(inode, ordered, sums);
 	}

From e0228285a8cad70e4b7b4833cc650e36ecd8de89 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 20 Sep 2013 22:26:29 -0400
Subject: [PATCH 13/98] Btrfs: reset intwrite on transaction abort

If we abort a transaction in the middle of a commit we weren't undoing the
intwrite locking.  This patch fixes that problem.

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/transaction.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8c81bdc1ef9b..7138d6a3726e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1552,6 +1552,8 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
 		root->fs_info->running_transaction = NULL;
 	spin_unlock(&root->fs_info->trans_lock);
 
+	if (trans->type & __TRANS_FREEZABLE)
+		sb_end_intwrite(root->fs_info->sb);
 	put_transaction(cur_trans);
 	put_transaction(cur_trans);
 

From b6d08f0630d51ec09d67f16f6d7839699bbc0402 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 27 Sep 2013 14:57:43 -0400
Subject: [PATCH 14/98] Btrfs: do not release metadata for space cache inodes

I've been testing our error paths and I was tripping the BUG_ON() in
drop_outstanding_extent because our outstanding_extents is 0 for space cache
inodes.  This is because we don't reserve metadata space for these inodes since
we depend on the global block reserve for our space.  To fix this we need to
make sure the DO_ACCOUNTING stuff doesn't actually call release_metadata for
space cache inodes.  With this patch I'm no longer panicing.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/inode.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 939dd1ff34a7..c0dcb4c3d72d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1551,7 +1551,13 @@ static void btrfs_clear_bit_hook(struct inode *inode,
 			spin_unlock(&BTRFS_I(inode)->lock);
 		}
 
-		if (*bits & EXTENT_DO_ACCOUNTING)
+		/*
+		 * We don't reserve metadata space for space cache inodes so we
+		 * don't need to call dellalloc_release_metadata if there is an
+		 * error.
+		 */
+		if (*bits & EXTENT_DO_ACCOUNTING &&
+		    root != root->fs_info->tree_root)
 			btrfs_delalloc_release_metadata(inode, len);
 
 		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID

From 4e121c06adf53aae478ebce3035116595d063413 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 27 Sep 2013 16:32:39 -0400
Subject: [PATCH 15/98] Btrfs: cleanup transaction on abort

If we abort not during a transaction commit we won't clean up anything until we
unmount.  Unfortunately if we abort in the middle of writing out an ordered
extent we won't clean it up and if somebody is waiting on that ordered extent
they will wait forever.  To fix this just make the transaction kthread call the
cleanup transaction stuff if it notices theres an error, and make
btrfs_end_transaction wake up the transaction kthread if there is an error.
Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/disk-io.c     | 3 +++
 fs/btrfs/transaction.c | 4 +++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index db2095486a4f..113cd43530ba 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1800,6 +1800,9 @@ sleep:
 		wake_up_process(root->fs_info->cleaner_kthread);
 		mutex_unlock(&root->fs_info->transaction_kthread_mutex);
 
+		if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
+				      &root->fs_info->fs_state)))
+			btrfs_cleanup_transaction(root);
 		if (!try_to_freeze()) {
 			set_current_state(TASK_INTERRUPTIBLE);
 			if (!kthread_should_stop() &&
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 7138d6a3726e..a1343e81c59d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -744,8 +744,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 		btrfs_run_delayed_iputs(root);
 
 	if (trans->aborted ||
-	    test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
+	    test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
+		wake_up_process(info->transaction_kthread);
 		err = -EIO;
+	}
 	assert_qgroups_uptodate(trans);
 
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);

From 1de2cfde93c20a0357ff1dffed901598470facf3 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 27 Sep 2013 16:36:02 -0400
Subject: [PATCH 16/98] Btrfs: don't delete ordered roots from list during
 cleanup

During transaction cleanup after an abort we are just removing roots from the
ordered roots list which is incorrect.  We have a BUG_ON() to make sure that the
root is still part of the ordered roots list when we put our ordered extent
which we were tripping in this case.  So do like we do everywhere else and just
move it to the tail of the ordered roots list and allow the normal cleanup to
take care of stuff.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/disk-io.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 113cd43530ba..01a26e2eb310 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3836,7 +3836,8 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
 	while (!list_empty(&splice)) {
 		root = list_first_entry(&splice, struct btrfs_root,
 					ordered_root);
-		list_del_init(&root->ordered_root);
+		list_move_tail(&root->ordered_root,
+			       &fs_info->ordered_roots);
 
 		btrfs_destroy_ordered_extents(root);
 

From c16ce1901431629fbe5b9387cc966d62a089e4df Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 27 Sep 2013 16:38:20 -0400
Subject: [PATCH 17/98] Btrfs: remove all BUG_ON()'s from commit_cowonly_roots

Noticed this when forcing errors to happen during delayed ref running.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/transaction.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a1343e81c59d..f08e22885c21 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -950,16 +950,19 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
 		return ret;
 
 	ret = btrfs_run_dev_stats(trans, root->fs_info);
-	WARN_ON(ret);
+	if (ret)
+		return ret;
 	ret = btrfs_run_dev_replace(trans, root->fs_info);
-	WARN_ON(ret);
-
+	if (ret)
+		return ret;
 	ret = btrfs_run_qgroups(trans, root->fs_info);
-	BUG_ON(ret);
+	if (ret)
+		return ret;
 
 	/* run_qgroups might have added some more refs */
 	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
-	BUG_ON(ret);
+	if (ret)
+		return ret;
 
 	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
 		next = fs_info->dirty_cowonly_roots.next;

From 724e2315db3d59a8201d4a87c7c7a873e60e1ce0 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 30 Sep 2013 11:36:38 -0400
Subject: [PATCH 18/98] Btrfs: fix two use-after-free bugs with transaction
 cleanup

I was noticing the slab redzone stuff going off every once and a while during
transaction aborts.  This was caused by two things

1) We would walk the pending snapshots and set their error to -ECANCELED.  We
don't need to do this, the snapshot stuff waits for a transaction commit and if
there is a problem we just free our pending snapshot object and exit.  Doing
this was causing us to touch the pending snapshot object after the thing had
already been freed.

2) We were freeing the transaction manually with wanton disregard for it's
use_count reference counter.  To fix this I cleaned up the transaction freeing
loop to either wait for the transaction commit to finish if it was in the middle
of that (since it will be cleaned and freed up there) or to do the cleanup
oursevles.

I also moved the global "kill all things dirty everywhere" stuff outside of the
transaction cleanup loop since that only needs to be done once.  With this patch
I'm no longer seeing slab corruption because of use after frees.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/disk-io.c     | 111 +++++++++++++++--------------------------
 fs/btrfs/transaction.c |  22 ++++----
 fs/btrfs/transaction.h |   1 +
 3 files changed, 52 insertions(+), 82 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 01a26e2eb310..b131d716d5a5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -64,7 +64,6 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
 static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
 static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 				      struct btrfs_root *root);
-static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t);
 static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
 static int btrfs_destroy_marked_extents(struct btrfs_root *root,
 					struct extent_io_tree *dirty_pages,
@@ -3915,24 +3914,6 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 	return ret;
 }
 
-static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t)
-{
-	struct btrfs_pending_snapshot *snapshot;
-	struct list_head splice;
-
-	INIT_LIST_HEAD(&splice);
-
-	list_splice_init(&t->pending_snapshots, &splice);
-
-	while (!list_empty(&splice)) {
-		snapshot = list_entry(splice.next,
-				      struct btrfs_pending_snapshot,
-				      list);
-		snapshot->error = -ECANCELED;
-		list_del_init(&snapshot->list);
-	}
-}
-
 static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
 {
 	struct btrfs_inode *btrfs_inode;
@@ -4062,6 +4043,8 @@ again:
 void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
 				   struct btrfs_root *root)
 {
+	btrfs_destroy_ordered_operations(cur_trans, root);
+
 	btrfs_destroy_delayed_refs(cur_trans, root);
 	btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
 				cur_trans->dirty_pages.dirty_bytes);
@@ -4069,8 +4052,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
 	cur_trans->state = TRANS_STATE_COMMIT_START;
 	wake_up(&root->fs_info->transaction_blocked_wait);
 
-	btrfs_evict_pending_snapshots(cur_trans);
-
 	cur_trans->state = TRANS_STATE_UNBLOCKED;
 	wake_up(&root->fs_info->transaction_wait);
 
@@ -4094,63 +4075,51 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
 static int btrfs_cleanup_transaction(struct btrfs_root *root)
 {
 	struct btrfs_transaction *t;
-	LIST_HEAD(list);
 
 	mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
 	spin_lock(&root->fs_info->trans_lock);
-	list_splice_init(&root->fs_info->trans_list, &list);
-	root->fs_info->running_transaction = NULL;
-	spin_unlock(&root->fs_info->trans_lock);
+	while (!list_empty(&root->fs_info->trans_list)) {
+		t = list_first_entry(&root->fs_info->trans_list,
+				     struct btrfs_transaction, list);
+		if (t->state >= TRANS_STATE_COMMIT_START) {
+			atomic_inc(&t->use_count);
+			spin_unlock(&root->fs_info->trans_lock);
+			btrfs_wait_for_commit(root, t->transid);
+			btrfs_put_transaction(t);
+			spin_lock(&root->fs_info->trans_lock);
+			continue;
+		}
+		if (t == root->fs_info->running_transaction) {
+			t->state = TRANS_STATE_COMMIT_DOING;
+			spin_unlock(&root->fs_info->trans_lock);
+			/*
+			 * We wait for 0 num_writers since we don't hold a trans
+			 * handle open currently for this transaction.
+			 */
+			wait_event(t->writer_wait,
+				   atomic_read(&t->num_writers) == 0);
+		} else {
+			spin_unlock(&root->fs_info->trans_lock);
+		}
+		btrfs_cleanup_one_transaction(t, root);
 
-	while (!list_empty(&list)) {
-		t = list_entry(list.next, struct btrfs_transaction, list);
-
-		btrfs_destroy_ordered_operations(t, root);
-
-		btrfs_destroy_all_ordered_extents(root->fs_info);
-
-		btrfs_destroy_delayed_refs(t, root);
-
-		/*
-		 *  FIXME: cleanup wait for commit
-		 *  We needn't acquire the lock here, because we are during
-		 *  the umount, there is no other task which will change it.
-		 */
-		t->state = TRANS_STATE_COMMIT_START;
-		smp_mb();
-		if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
-			wake_up(&root->fs_info->transaction_blocked_wait);
-
-		btrfs_evict_pending_snapshots(t);
-
-		t->state = TRANS_STATE_UNBLOCKED;
-		smp_mb();
-		if (waitqueue_active(&root->fs_info->transaction_wait))
-			wake_up(&root->fs_info->transaction_wait);
-
-		btrfs_destroy_delayed_inodes(root);
-		btrfs_assert_delayed_root_empty(root);
-
-		btrfs_destroy_all_delalloc_inodes(root->fs_info);
-
-		btrfs_destroy_marked_extents(root, &t->dirty_pages,
-					     EXTENT_DIRTY);
-
-		btrfs_destroy_pinned_extent(root,
-					    root->fs_info->pinned_extents);
-
-		t->state = TRANS_STATE_COMPLETED;
-		smp_mb();
-		if (waitqueue_active(&t->commit_wait))
-			wake_up(&t->commit_wait);
-
-		atomic_set(&t->use_count, 0);
+		spin_lock(&root->fs_info->trans_lock);
+		if (t == root->fs_info->running_transaction)
+			root->fs_info->running_transaction = NULL;
 		list_del_init(&t->list);
-		memset(t, 0, sizeof(*t));
-		kmem_cache_free(btrfs_transaction_cachep, t);
-	}
+		spin_unlock(&root->fs_info->trans_lock);
 
+		btrfs_put_transaction(t);
+		trace_btrfs_transaction_commit(root);
+		spin_lock(&root->fs_info->trans_lock);
+	}
+	spin_unlock(&root->fs_info->trans_lock);
+	btrfs_destroy_all_ordered_extents(root->fs_info);
+	btrfs_destroy_delayed_inodes(root);
+	btrfs_assert_delayed_root_empty(root);
+	btrfs_destroy_pinned_extent(root, root->fs_info->pinned_extents);
+	btrfs_destroy_all_delalloc_inodes(root->fs_info);
 	mutex_unlock(&root->fs_info->transaction_kthread_mutex);
 
 	return 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f08e22885c21..134039fd59bb 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -57,7 +57,7 @@ static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
 					   __TRANS_JOIN_NOLOCK),
 };
 
-static void put_transaction(struct btrfs_transaction *transaction)
+void btrfs_put_transaction(struct btrfs_transaction *transaction)
 {
 	WARN_ON(atomic_read(&transaction->use_count) == 0);
 	if (atomic_dec_and_test(&transaction->use_count)) {
@@ -332,7 +332,7 @@ static void wait_current_trans(struct btrfs_root *root)
 		wait_event(root->fs_info->transaction_wait,
 			   cur_trans->state >= TRANS_STATE_UNBLOCKED ||
 			   cur_trans->aborted);
-		put_transaction(cur_trans);
+		btrfs_put_transaction(cur_trans);
 	} else {
 		spin_unlock(&root->fs_info->trans_lock);
 	}
@@ -610,7 +610,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
 	}
 
 	wait_for_commit(root, cur_trans);
-	put_transaction(cur_trans);
+	btrfs_put_transaction(cur_trans);
 out:
 	return ret;
 }
@@ -735,7 +735,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	smp_mb();
 	if (waitqueue_active(&cur_trans->writer_wait))
 		wake_up(&cur_trans->writer_wait);
-	put_transaction(cur_trans);
+	btrfs_put_transaction(cur_trans);
 
 	if (current->journal_info == trans)
 		current->journal_info = NULL;
@@ -1515,7 +1515,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
 	if (current->journal_info == trans)
 		current->journal_info = NULL;
 
-	put_transaction(cur_trans);
+	btrfs_put_transaction(cur_trans);
 	return 0;
 }
 
@@ -1559,8 +1559,8 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
 
 	if (trans->type & __TRANS_FREEZABLE)
 		sb_end_intwrite(root->fs_info->sb);
-	put_transaction(cur_trans);
-	put_transaction(cur_trans);
+	btrfs_put_transaction(cur_trans);
+	btrfs_put_transaction(cur_trans);
 
 	trace_btrfs_transaction_commit(root);
 
@@ -1676,7 +1676,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 		wait_for_commit(root, cur_trans);
 
-		put_transaction(cur_trans);
+		btrfs_put_transaction(cur_trans);
 
 		return ret;
 	}
@@ -1693,7 +1693,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 			wait_for_commit(root, prev_trans);
 
-			put_transaction(prev_trans);
+			btrfs_put_transaction(prev_trans);
 		} else {
 			spin_unlock(&root->fs_info->trans_lock);
 		}
@@ -1892,8 +1892,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	list_del_init(&cur_trans->list);
 	spin_unlock(&root->fs_info->trans_lock);
 
-	put_transaction(cur_trans);
-	put_transaction(cur_trans);
+	btrfs_put_transaction(cur_trans);
+	btrfs_put_transaction(cur_trans);
 
 	if (trans->type & __TRANS_FREEZABLE)
 		sb_end_intwrite(root->fs_info->sb);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 5c2af8491621..306f88ae1de3 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -166,4 +166,5 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
 				struct extent_io_tree *dirty_pages, int mark);
 int btrfs_transaction_blocked(struct btrfs_fs_info *info);
 int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
+void btrfs_put_transaction(struct btrfs_transaction *transaction);
 #endif

From 0a4e558609dd4df30a58a07d9eb14c5ddc2c1241 Mon Sep 17 00:00:00 2001
From: Ross Kirk <ross.kirk@gmail.com>
Date: Tue, 24 Sep 2013 10:12:38 +0100
Subject: [PATCH 19/98] btrfs: remove unused parameter from btrfs_header_fsid

Remove unused parameter, 'eb'. Unused since introduction in
5f39d397dfbe140a14edecd4e73c34ce23c4f9ee

Updated to be rebased against current upstream and correct diff supplied this time!

Signed-off-by: Ross Kirk <ross.kirk@gmail.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ctree.c   | 10 +++++-----
 fs/btrfs/ctree.h   |  2 +-
 fs/btrfs/disk-io.c |  6 +++---
 fs/btrfs/ioctl.c   |  2 +-
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6932686b61cf..6c5239b55495 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -274,7 +274,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	else
 		btrfs_set_header_owner(cow, new_root_objectid);
 
-	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(cow),
+	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
 			    BTRFS_FSID_SIZE);
 
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
@@ -996,7 +996,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	else
 		btrfs_set_header_owner(cow, root->root_key.objectid);
 
-	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(cow),
+	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
 			    BTRFS_FSID_SIZE);
 
 	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
@@ -3152,7 +3152,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
 	btrfs_set_header_owner(c, root->root_key.objectid);
 
-	write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(c),
+	write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(),
 			    BTRFS_FSID_SIZE);
 
 	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
@@ -3291,7 +3291,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
 	btrfs_set_header_owner(split, root->root_key.objectid);
 	write_extent_buffer(split, root->fs_info->fsid,
-			    btrfs_header_fsid(split), BTRFS_FSID_SIZE);
+			    btrfs_header_fsid(), BTRFS_FSID_SIZE);
 	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
 			    btrfs_header_chunk_tree_uuid(split),
 			    BTRFS_UUID_SIZE);
@@ -4046,7 +4046,7 @@ again:
 	btrfs_set_header_owner(right, root->root_key.objectid);
 	btrfs_set_header_level(right, 0);
 	write_extent_buffer(right, root->fs_info->fsid,
-			    btrfs_header_fsid(right), BTRFS_FSID_SIZE);
+			    btrfs_header_fsid(), BTRFS_FSID_SIZE);
 
 	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
 			    btrfs_header_chunk_tree_uuid(right),
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ee0b8aa82828..ee8972b9f1ab 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2667,7 +2667,7 @@ static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb,
 	btrfs_set_header_flags(eb, flags);
 }
 
-static inline unsigned long btrfs_header_fsid(struct extent_buffer *eb)
+static inline unsigned long btrfs_header_fsid(void)
 {
 	return offsetof(struct btrfs_header, fsid);
 }
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b131d716d5a5..ade6c0e79616 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -495,7 +495,7 @@ static int check_tree_block_fsid(struct btrfs_root *root,
 	u8 fsid[BTRFS_UUID_SIZE];
 	int ret = 1;
 
-	read_extent_buffer(eb, fsid, btrfs_header_fsid(eb), BTRFS_FSID_SIZE);
+	read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE);
 	while (fs_devices) {
 		if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
 			ret = 0;
@@ -1311,7 +1311,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 	btrfs_set_header_owner(leaf, objectid);
 	root->node = leaf;
 
-	write_extent_buffer(leaf, fs_info->fsid, btrfs_header_fsid(leaf),
+	write_extent_buffer(leaf, fs_info->fsid, btrfs_header_fsid(),
 			    BTRFS_FSID_SIZE);
 	write_extent_buffer(leaf, fs_info->chunk_tree_uuid,
 			    btrfs_header_chunk_tree_uuid(leaf),
@@ -1398,7 +1398,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 	root->node = leaf;
 
 	write_extent_buffer(root->node, root->fs_info->fsid,
-			    btrfs_header_fsid(root->node), BTRFS_FSID_SIZE);
+			    btrfs_header_fsid(), BTRFS_FSID_SIZE);
 	btrfs_mark_buffer_dirty(root->node);
 	btrfs_tree_unlock(root->node);
 	return root;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 385c58f2bba1..2c47e1a400a9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -436,7 +436,7 @@ static noinline int create_subvol(struct inode *dir,
 	btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
 	btrfs_set_header_owner(leaf, objectid);
 
-	write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(leaf),
+	write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(),
 			    BTRFS_FSID_SIZE);
 	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
 			    btrfs_header_chunk_tree_uuid(leaf),

From 20dd2cbf01888a91fdd921403040a710b275a1ff Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Wed, 25 Sep 2013 21:47:45 +0800
Subject: [PATCH 20/98] Btrfs: fix BUG_ON() casued by the reserved space
 migration

When we did space balance and snapshot creation at the same time, we might
meet the following oops:
 kernel BUG at fs/btrfs/inode.c:3038!
 [SNIP]
 Call Trace:
 [<ffffffffa0411ec7>] btrfs_orphan_cleanup+0x293/0x407 [btrfs]
 [<ffffffffa042dc45>] btrfs_mksubvol.isra.28+0x259/0x373 [btrfs]
 [<ffffffffa042de85>] btrfs_ioctl_snap_create_transid+0x126/0x156 [btrfs]
 [<ffffffffa042dff1>] btrfs_ioctl_snap_create_v2+0xd0/0x121 [btrfs]
 [<ffffffffa0430b2c>] btrfs_ioctl+0x414/0x1854 [btrfs]
 [<ffffffff813b60b7>] ? __do_page_fault+0x305/0x379
 [<ffffffff811215a9>] vfs_ioctl+0x1d/0x39
 [<ffffffff81121d7c>] do_vfs_ioctl+0x32d/0x3e2
 [<ffffffff81057fe7>] ? finish_task_switch+0x80/0xb8
 [<ffffffff81121e88>] SyS_ioctl+0x57/0x83
 [<ffffffff813b39ff>] ? do_device_not_available+0x12/0x14
 [<ffffffff813b99c2>] system_call_fastpath+0x16/0x1b
 [SNIP]
 RIP  [<ffffffffa040da40>] btrfs_orphan_add+0xc3/0x126 [btrfs]

The reason of the problem is that the relocation root creation stole
the reserved space, which was reserved for orphan item deletion.

There are several ways to fix this problem, one is to increasing
the reserved space size of the space balace, and then we can use
that space to create the relocation tree for each fs/file trees.
But it is hard to calculate the suitable size because we doesn't
know how many fs/file trees we need relocate.

We fixed this problem by reserving the space for relocation root creation
actively since the space it need is very small (one tree block, used for
root node copy), then we use that reserved space to create the
relocation tree. If we don't reserve space for relocation tree creation,
we will use the reserved space of the balance.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/relocation.c  |  6 ++++--
 fs/btrfs/transaction.c | 24 +++++++++++++++++++++++-
 fs/btrfs/transaction.h |  1 +
 3 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 0359eecac95d..a945374993f7 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1383,6 +1383,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_root *reloc_root;
 	struct reloc_control *rc = root->fs_info->reloc_ctl;
+	struct btrfs_block_rsv *rsv;
 	int clear_rsv = 0;
 	int ret;
 
@@ -1396,13 +1397,14 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
 	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
 		return 0;
 
-	if (!trans->block_rsv) {
+	if (!trans->reloc_reserved) {
+		rsv = trans->block_rsv;
 		trans->block_rsv = rc->block_rsv;
 		clear_rsv = 1;
 	}
 	reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
 	if (clear_rsv)
-		trans->block_rsv = NULL;
+		trans->block_rsv = rsv;
 
 	ret = __add_reloc_root(reloc_root);
 	BUG_ON(ret < 0);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 134039fd59bb..a213bafe68ec 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -353,6 +353,17 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
 	return 0;
 }
 
+static inline bool need_reserve_reloc_root(struct btrfs_root *root)
+{
+	if (!root->fs_info->reloc_ctl ||
+	    !root->ref_cows ||
+	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+	    root->reloc_root)
+		return false;
+
+	return true;
+}
+
 static struct btrfs_trans_handle *
 start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
 		  enum btrfs_reserve_flush_enum flush)
@@ -360,8 +371,9 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
 	struct btrfs_trans_handle *h;
 	struct btrfs_transaction *cur_trans;
 	u64 num_bytes = 0;
-	int ret;
 	u64 qgroup_reserved = 0;
+	bool reloc_reserved = false;
+	int ret;
 
 	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
 		return ERR_PTR(-EROFS);
@@ -390,6 +402,14 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
 		}
 
 		num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
+		/*
+		 * Do the reservation for the relocation root creation
+		 */
+		if (unlikely(need_reserve_reloc_root(root))) {
+			num_bytes += root->nodesize;
+			reloc_reserved = true;
+		}
+
 		ret = btrfs_block_rsv_add(root,
 					  &root->fs_info->trans_block_rsv,
 					  num_bytes, flush);
@@ -451,6 +471,7 @@ again:
 	h->delayed_ref_elem.seq = 0;
 	h->type = type;
 	h->allocating_chunk = false;
+	h->reloc_reserved = false;
 	INIT_LIST_HEAD(&h->qgroup_ref_list);
 	INIT_LIST_HEAD(&h->new_bgs);
 
@@ -466,6 +487,7 @@ again:
 					      h->transid, num_bytes, 1);
 		h->block_rsv = &root->fs_info->trans_block_rsv;
 		h->bytes_reserved = num_bytes;
+		h->reloc_reserved = reloc_reserved;
 	}
 	h->qgroup_reserved = qgroup_reserved;
 
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 306f88ae1de3..7657d115067d 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -92,6 +92,7 @@ struct btrfs_trans_handle {
 	short aborted;
 	short adding_csums;
 	bool allocating_chunk;
+	bool reloc_reserved;
 	unsigned int type;
 	/*
 	 * this root is only needed to validate that the root passed to

From fa7c14947abea7b30f661d902c9a515056be6d90 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 26 Sep 2013 13:15:27 +0800
Subject: [PATCH 21/98] Btrfs: improve jitter performance of the sequential
 buffered write

The performance was slowed down sometimes when we ran sysbench to measure
the performance of the sequential buffered write by 2 or more threads.

It was because the write order of the test threads might be confused
by the task scheduler, and the coming write would be beyond the end of
the file, in this case, we need insert dummy file extents and create
a hole for the area we skip. But in order to avoid the ongoing ordered
extents which are in the area, we need wait for them. Unfortunately,
the current code doesn't check if there are ordered extents in the area
or not, try to find and flush the dirty pages directly, but in fact,
there is no dirty page in that area, this step of the current code is
unnecessary, and just wastes time. Sometimes, it would increase
the contention of some locks, and makes the performance slow down suddenly.

So we remove the ordered extent flush function before the check, and flush
the dirty pages and wait for the ordered extents only when we find them.

According to my test, we got 1-2 times of the performance regression when
we ran the test by 10 times before applying this patch. After applying
this patch, the regression went away.

Test Environment:
 CPU:		1CPU * 4Cores
 Memory:	6GB
 Partition:	20GB

Test Command:
 # sysbench --test=fileio --file-total-size=16G --file-test-mode=seqwr \
 > --num-threads=512 --file-block-size=16384 --max-time=60 --max-requests=0 run

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/inode.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c0dcb4c3d72d..1ca49eaba3bb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4239,15 +4239,16 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 
 	while (1) {
 		struct btrfs_ordered_extent *ordered;
-		btrfs_wait_ordered_range(inode, hole_start,
-					 block_end - hole_start);
+
 		lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
 				 &cached_state);
-		ordered = btrfs_lookup_ordered_extent(inode, hole_start);
+		ordered = btrfs_lookup_ordered_range(inode, hole_start,
+						     block_end - hole_start);
 		if (!ordered)
 			break;
 		unlock_extent_cached(io_tree, hole_start, block_end - 1,
 				     &cached_state, GFP_NOFS);
+		btrfs_start_ordered_extent(inode, ordered, 1);
 		btrfs_put_ordered_extent(ordered);
 	}
 

From 7d3d1744f8a7d62e4875bd69cc2192a939813880 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Sun, 29 Sep 2013 10:33:16 +0800
Subject: [PATCH 22/98] Btrfs: fix memory leak of chunks' extent map

As we're hold a ref on looking up the extent map, we need to drop the ref
before returning to callers.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/volumes.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 043b215769c2..b691f375d837 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4488,6 +4488,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 		btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got "
 			    "%Lu-%Lu\n", logical, logical+len, em->start,
 			    em->start + em->len);
+		free_extent_map(em);
 		return 1;
 	}
 
@@ -4668,6 +4669,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, "
 			   "found %Lu-%Lu\n", logical, em->start,
 			   em->start + em->len);
+		free_extent_map(em);
 		return -EINVAL;
 	}
 

From 6174d3cb43aa974d0c8590a3e628ac35ab0bbc13 Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Tue, 1 Oct 2013 16:13:42 +0100
Subject: [PATCH 23/98] Btrfs: remove unused max_key arg from
 btrfs_search_forward

It is not used for anything.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ctree.c       |  1 -
 fs/btrfs/ctree.h       |  1 -
 fs/btrfs/ioctl.c       | 16 ++--------------
 fs/btrfs/tree-defrag.c |  2 +-
 fs/btrfs/tree-log.c    |  9 ++-------
 fs/btrfs/uuid-tree.c   |  6 +-----
 fs/btrfs/volumes.c     |  2 +-
 7 files changed, 7 insertions(+), 30 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6c5239b55495..33e9dbdd043d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -4870,7 +4870,6 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
  * was nothing in the tree that matched the search criteria.
  */
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
-			 struct btrfs_key *max_key,
 			 struct btrfs_path *path,
 			 u64 min_trans)
 {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ee8972b9f1ab..9ca15a84cc5a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3309,7 +3309,6 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
 			struct btrfs_key *key, int lowest_level,
 			u64 min_trans);
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
-			 struct btrfs_key *max_key,
 			 struct btrfs_path *path,
 			 u64 min_trans);
 enum btrfs_compare_tree_result {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2c47e1a400a9..98d4ffef961e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -842,7 +842,6 @@ static int find_new_extents(struct btrfs_root *root,
 {
 	struct btrfs_path *path;
 	struct btrfs_key min_key;
-	struct btrfs_key max_key;
 	struct extent_buffer *leaf;
 	struct btrfs_file_extent_item *extent;
 	int type;
@@ -857,15 +856,10 @@ static int find_new_extents(struct btrfs_root *root,
 	min_key.type = BTRFS_EXTENT_DATA_KEY;
 	min_key.offset = *off;
 
-	max_key.objectid = ino;
-	max_key.type = (u8)-1;
-	max_key.offset = (u64)-1;
-
 	path->keep_locks = 1;
 
 	while(1) {
-		ret = btrfs_search_forward(root, &min_key, &max_key,
-					   path, newer_than);
+		ret = btrfs_search_forward(root, &min_key, path, newer_than);
 		if (ret != 0)
 			goto none;
 		if (min_key.objectid != ino)
@@ -1893,7 +1887,6 @@ static noinline int search_ioctl(struct inode *inode,
 {
 	struct btrfs_root *root;
 	struct btrfs_key key;
-	struct btrfs_key max_key;
 	struct btrfs_path *path;
 	struct btrfs_ioctl_search_key *sk = &args->key;
 	struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
@@ -1925,15 +1918,10 @@ static noinline int search_ioctl(struct inode *inode,
 	key.type = sk->min_type;
 	key.offset = sk->min_offset;
 
-	max_key.objectid = sk->max_objectid;
-	max_key.type = sk->max_type;
-	max_key.offset = sk->max_offset;
-
 	path->keep_locks = 1;
 
 	while(1) {
-		ret = btrfs_search_forward(root, &key, &max_key, path,
-					   sk->min_transid);
+		ret = btrfs_search_forward(root, &key, path, sk->min_transid);
 		if (ret != 0) {
 			if (ret > 0)
 				ret = 0;
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 94e05c1f118a..cbaa73c871a9 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -85,7 +85,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 
 	path->keep_locks = 1;
 
-	ret = btrfs_search_forward(root, &key, NULL, path, min_trans);
+	ret = btrfs_search_forward(root, &key, path, min_trans);
 	if (ret < 0)
 		goto out;
 	if (ret > 0) {
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 79f057c0619a..8ac96c261a11 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2867,7 +2867,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 			  u64 min_offset, u64 *last_offset_ret)
 {
 	struct btrfs_key min_key;
-	struct btrfs_key max_key;
 	struct btrfs_root *log = root->log_root;
 	struct extent_buffer *src;
 	int err = 0;
@@ -2879,9 +2878,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 	u64 ino = btrfs_ino(inode);
 
 	log = root->log_root;
-	max_key.objectid = ino;
-	max_key.offset = (u64)-1;
-	max_key.type = key_type;
 
 	min_key.objectid = ino;
 	min_key.type = key_type;
@@ -2889,8 +2885,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 
 	path->keep_locks = 1;
 
-	ret = btrfs_search_forward(root, &min_key, &max_key,
-				   path, trans->transid);
+	ret = btrfs_search_forward(root, &min_key, path, trans->transid);
 
 	/*
 	 * we didn't find anything from this transaction, see if there
@@ -3719,7 +3714,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
 	while (1) {
 		ins_nr = 0;
-		ret = btrfs_search_forward(root, &min_key, &max_key,
+		ret = btrfs_search_forward(root, &min_key,
 					   path, trans->transid);
 		if (ret != 0)
 			break;
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index dd0dea3766f7..fbda90004fe9 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -260,7 +260,6 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
 {
 	struct btrfs_root *root = fs_info->uuid_root;
 	struct btrfs_key key;
-	struct btrfs_key max_key;
 	struct btrfs_path *path;
 	int ret = 0;
 	struct extent_buffer *leaf;
@@ -277,13 +276,10 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
 	key.objectid = 0;
 	key.type = 0;
 	key.offset = 0;
-	max_key.objectid = (u64)-1;
-	max_key.type = (u8)-1;
-	max_key.offset = (u64)-1;
 
 again_search_slot:
 	path->keep_locks = 1;
-	ret = btrfs_search_forward(root, &key, &max_key, path, 0);
+	ret = btrfs_search_forward(root, &key, path, 0);
 	if (ret) {
 		if (ret > 0)
 			ret = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b691f375d837..c846cc8eb32c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3488,7 +3488,7 @@ static int btrfs_uuid_scan_kthread(void *data)
 	path->keep_locks = 1;
 
 	while (1) {
-		ret = btrfs_search_forward(root, &key, &max_key, path, 0);
+		ret = btrfs_search_forward(root, &key, path, 0);
 		if (ret) {
 			if (ret > 0)
 				ret = 0;

From 3d41d70252234db153ea1b037052278ff5786ad5 Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Tue, 1 Oct 2013 17:06:53 +0100
Subject: [PATCH 24/98] Btrfs: remove unnecessary tree search when logging
 inode

In tree-log.c:btrfs_log_inode(), we keep calling btrfs_search_forward()
until it returns a key whose objectid is higher than our inode or until
the key's type is higher than our maximum allowed type.

At the end of the loop, we increment our mininum search key's objectid
and type regardless of our desired target objectid and maximum desired
type, which causes another loop iteration that will call again
btrfs_search_forward() just to figure out we've gone beyond our maximum
key and exit the loop. Therefore while incrementing our minimum key,
don't do it blindly and exit the loop immiediately if the next search
key's objectid or type is beyond what we seek.

Also after incrementing the type, set the key's offset to 0, which was
missing and could make us loose some of the inode's items.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/tree-log.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 8ac96c261a11..964c583ea900 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3764,14 +3764,14 @@ next_slot:
 		}
 		btrfs_release_path(path);
 
-		if (min_key.offset < (u64)-1)
+		if (min_key.offset < (u64)-1) {
 			min_key.offset++;
-		else if (min_key.type < (u8)-1)
+		} else if (min_key.type < max_key.type) {
 			min_key.type++;
-		else if (min_key.objectid < (u64)-1)
-			min_key.objectid++;
-		else
+			min_key.offset = 0;
+		} else {
 			break;
+		}
 	}
 	if (ins_nr) {
 		ret = copy_items(trans, inode, dst_path, src, ins_start_slot,

From 778ba82b1796e75e719a52679ae431371ca73988 Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Sun, 6 Oct 2013 22:22:33 +0100
Subject: [PATCH 25/98] Btrfs: improve inode hash function/inode lookup

Currently the hash value used for adding an inode to the VFS's inode
hash table consists of the plain inode number, which is a 64 bits
integer. This results in hash table buckets (hlist_head lists) with
too many elements for at least 2 important scenarios:

1) When we have many subvolumes. Each subvolume has its own btree
   where its files and directories are added to, and each has its
   own objectid (inode number) namespace. This means that if we have
   N subvolumes, and all have inode number X associated to a file or
   directory, the corresponding inodes all map to the same hash table
   entry, resulting in a bucket (hlist_head list) with N elements;

2) On 32 bits machines. Th VFS hash values are unsigned longs, which
   are 32 bits wide on 32 bits machines, and the inode (objectid)
   numbers are 64 bits unsigned integers. We simply cast the inode
   numbers to hash values, which means that for all inodes with the
   same 32 bits lower half, the same hash bucket is used for all of
   them. For example, all inodes with a number (objectid) between
   0x0000_0000_ffff_ffff and 0xffff_ffff_ffff_ffff will end up in
   the same hash table bucket.

This change ensures the inode's hash value depends both on the
objectid (inode number) and its subvolume's (btree root) objectid.
For 32 bits machines, this change gives better entropy by making
the hash value depend on both the upper and lower 32 bits of the
64 bits hash previously computed.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/btrfs_inode.h | 20 ++++++++++++++++++++
 fs/btrfs/disk-io.c     |  2 +-
 fs/btrfs/inode.c       |  6 ++++--
 3 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 71f074e1870b..ac0b39db27d1 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -19,6 +19,7 @@
 #ifndef __BTRFS_I__
 #define __BTRFS_I__
 
+#include <linux/hash.h>
 #include "extent_map.h"
 #include "extent_io.h"
 #include "ordered-data.h"
@@ -179,6 +180,25 @@ static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
 	return container_of(inode, struct btrfs_inode, vfs_inode);
 }
 
+static inline unsigned long btrfs_inode_hash(u64 objectid,
+					     const struct btrfs_root *root)
+{
+	u64 h = objectid ^ (root->objectid * GOLDEN_RATIO_PRIME);
+
+#if BITS_PER_LONG == 32
+	h = (h >> 32) ^ (h & 0xffffffff);
+#endif
+
+	return (unsigned long)h;
+}
+
+static inline void btrfs_insert_inode_hash(struct inode *inode)
+{
+	unsigned long h = btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root);
+
+	__insert_inode_hash(inode, h);
+}
+
 static inline u64 btrfs_ino(struct inode *inode)
 {
 	u64 ino = BTRFS_I(inode)->location.objectid;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ade6c0e79616..d205bddc7776 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2294,7 +2294,7 @@ int open_ctree(struct super_block *sb,
 	       sizeof(struct btrfs_key));
 	set_bit(BTRFS_INODE_DUMMY,
 		&BTRFS_I(fs_info->btree_inode)->runtime_flags);
-	insert_inode_hash(fs_info->btree_inode);
+	btrfs_insert_inode_hash(fs_info->btree_inode);
 
 	spin_lock_init(&fs_info->block_group_cache_lock);
 	fs_info->block_group_cache_tree = RB_ROOT;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1ca49eaba3bb..bb242f2fb51e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4837,10 +4837,12 @@ static struct inode *btrfs_iget_locked(struct super_block *s,
 {
 	struct inode *inode;
 	struct btrfs_iget_args args;
+	unsigned long hashval = btrfs_inode_hash(objectid, root);
+
 	args.ino = objectid;
 	args.root = root;
 
-	inode = iget5_locked(s, objectid, btrfs_find_actor,
+	inode = iget5_locked(s, hashval, btrfs_find_actor,
 			     btrfs_init_locked_inode,
 			     (void *)&args);
 	return inode;
@@ -5460,7 +5462,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 				BTRFS_INODE_NODATASUM;
 	}
 
-	insert_inode_hash(inode);
+	btrfs_insert_inode_hash(inode);
 	inode_tree_add(inode);
 
 	trace_btrfs_inode_new(inode);

From 539f358a30d5113bad81c41a2e7ba8770d6c9f6e Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 7 Oct 2013 13:42:57 +0300
Subject: [PATCH 26/98] Btrfs: fix the dev-replace suspend sequence

Replace progresses strictly from lower to higher offsets, and the
progress is tracked in chunks, by storing the physical offset of the
dev_extent which is being copied in the cursor_left field of
btrfs_dev_replace_item.  When we are done copying the chunk,
left_cursor is updated to point one byte past the dev_extent, so that
on resume we can skip the dev_extents that have already been copied.

There is a major bug (which goes all the way back to the inception of
dev-replace in 3.8) in the way left_cursor is bumped: the bump is done
unconditionally, without any regard to the scrub_chunk return value.
On suspend (and also on any kind of error) scrub_chunk returns early,
i.e. without completing the copy.  This leads to us skipping the chunk
that hasn't been fully copied yet when resuming.

Fix this by doing the cursor_left update only if scrub_chunk ret is 0.
(On suspend scrub_chunk returns with -ECANCELED, so this fix covers
both suspend and error cases.)

Cc: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/scrub.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a18e0e23f6a6..f21e2df89bc2 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2717,8 +2717,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 		mutex_unlock(&fs_info->scrub_lock);
 		wake_up(&fs_info->scrub_pause_wait);
 
-		dev_replace->cursor_left = dev_replace->cursor_right;
-		dev_replace->item_needs_writeback = 1;
 		btrfs_put_block_group(cache);
 		if (ret)
 			break;
@@ -2732,6 +2730,9 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 			break;
 		}
 
+		dev_replace->cursor_left = dev_replace->cursor_right;
+		dev_replace->item_needs_writeback = 1;
+
 		key.offset = found_key.offset + length;
 		btrfs_release_path(path);
 	}

From 80d94fb3df2bf24a500f13181921736fa23b6c3d Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Mon, 7 Oct 2013 12:04:28 +0100
Subject: [PATCH 27/98] Btrfs: fix memory leaks on transaction commit failure

Structures of the types tree_mod_elem and qgroup_update are allocated
during transaction commit but were not being released if the call to
btrfs_run_delayed_items() returned an error.

Stack trace reported by kmemleak:

unreferenced object 0xffff880679f0b398 (size 128):
  comm "umount", pid 21508, jiffies 4295967793 (age 36718.112s)
  hex dump (first 32 bytes):
    60 b5 f0 79 06 88 ff ff 00 00 00 00 00 00 00 00  `..y............
    00 00 00 00 00 00 00 00 50 1c 00 00 00 00 00 00  ........P.......
  backtrace:
    [<ffffffff81742d26>] kmemleak_alloc+0x26/0x50
    [<ffffffff811889c2>] kmem_cache_alloc_trace+0x112/0x200
    [<ffffffffa046f2d3>] tree_mod_log_insert_key.constprop.45+0x93/0x150 [btrfs]
    [<ffffffffa04720f9>] __btrfs_cow_block+0x299/0x4f0 [btrfs]
    [<ffffffffa0472510>] btrfs_cow_block+0x120/0x1f0 [btrfs]
    [<ffffffffa0476679>] btrfs_search_slot+0x449/0x930 [btrfs]
    [<ffffffffa048eecf>] btrfs_lookup_inode+0x2f/0xa0 [btrfs]
    [<ffffffffa04eb49c>] __btrfs_update_delayed_inode+0x1c/0x1d0 [btrfs]
    [<ffffffffa04eb9e2>] __btrfs_run_delayed_items+0x162/0x1e0 [btrfs]
    [<ffffffffa04eba63>] btrfs_delayed_inode_exit+0x3/0x20 [btrfs]
    [<ffffffffa0499c63>] btrfs_commit_transaction+0x203/0xa50 [btrfs]
    [<ffffffffa046b519>] btrfs_sync_fs+0x69/0x110 [btrfs]
    [<ffffffff811cb210>] __sync_filesystem+0x30/0x60
    [<ffffffff811cb2bb>] sync_filesystem+0x4b/0x70
    [<ffffffff8119ce7b>] generic_shutdown_super+0x3b/0xf0
    [<ffffffff8119cfc6>] kill_anon_super+0x16/0x30
unreferenced object 0xffff880677e0dd88 (size 32):
  comm "umount", pid 21508, jiffies 4295967793 (age 36718.112s)
  hex dump (first 32 bytes):
    78 75 11 a9 06 88 ff ff 00 c0 e0 77 06 88 ff ff  xu.........w....
    40 c3 a2 70 06 88 ff ff 00 00 00 00 00 00 00 00  @..p............
  backtrace:
    [<ffffffff81742d26>] kmemleak_alloc+0x26/0x50
    [<ffffffff811889c2>] kmem_cache_alloc_trace+0x112/0x200
    [<ffffffffa04fa54f>] btrfs_qgroup_record_ref+0xf/0x90 [btrfs]
    [<ffffffffa04e1914>] btrfs_add_delayed_tree_ref+0xf4/0x170 [btrfs]
    [<ffffffffa048518a>] btrfs_free_tree_block+0x9a/0x220 [btrfs]
    [<ffffffffa0472163>] __btrfs_cow_block+0x303/0x4f0 [btrfs]
    [<ffffffffa0472510>] btrfs_cow_block+0x120/0x1f0 [btrfs]
    [<ffffffffa0476679>] btrfs_search_slot+0x449/0x930 [btrfs]
    [<ffffffffa048eecf>] btrfs_lookup_inode+0x2f/0xa0 [btrfs]
    [<ffffffffa04eb49c>] __btrfs_update_delayed_inode+0x1c/0x1d0 [btrfs]
    [<ffffffffa04eb9e2>] __btrfs_run_delayed_items+0x162/0x1e0 [btrfs]
    [<ffffffffa04eba63>] btrfs_delayed_inode_exit+0x3/0x20 [btrfs]
    [<ffffffffa0499c63>] btrfs_commit_transaction+0x203/0xa50 [btrfs]
    [<ffffffffa046b519>] btrfs_sync_fs+0x69/0x110 [btrfs]
    [<ffffffff811cb210>] __sync_filesystem+0x30/0x60
    [<ffffffff811cb2bb>] sync_filesystem+0x4b/0x70

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/transaction.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a213bafe68ec..277fe812d047 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1600,15 +1600,19 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
 	int ret;
 
 	ret = btrfs_run_delayed_items(trans, root);
-	if (ret)
-		return ret;
-
 	/*
 	 * running the delayed items may have added new refs. account
 	 * them now so that they hinder processing of more delayed refs
 	 * as little as possible.
 	 */
-	btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
+	if (ret) {
+		btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
+		return ret;
+	}
+
+	ret = btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
+	if (ret)
+		return ret;
 
 	/*
 	 * rename don't use btrfs_join_transaction, so, once we

From eb58bb371a04d3bbab44ec0c5672ce69487bac1e Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 7 Oct 2013 10:45:07 -0400
Subject: [PATCH 28/98] Btrfs: do not free the dirty bytes from the trans block
 rsv on cleanup

The transactions should be cleaning up their reservations on failure, this just
causes us to have warnings on unmount because we go negative by free'ing
reservations that have already been free'ed.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/disk-io.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d205bddc7776..fdc75ab09483 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4046,8 +4046,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
 	btrfs_destroy_ordered_operations(cur_trans, root);
 
 	btrfs_destroy_delayed_refs(cur_trans, root);
-	btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
-				cur_trans->dirty_pages.dirty_bytes);
 
 	cur_trans->state = TRANS_STATE_COMMIT_START;
 	wake_up(&root->fs_info->transaction_blocked_wait);

From 681ae50917df9fd1fae2571fb36a579d1e872b12 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 7 Oct 2013 15:11:00 -0400
Subject: [PATCH 29/98] Btrfs: cleanup reserved space when freeing tree log on
 error

On error we will wait and free the tree log at unmount without a transaction.
This means that the actual freeing of the blocks doesn't happen which means we
complain about space leaks on unmount.  So to fix this just skip the transaction
specific cleanup part of the tree log free'ing if we don't have a transaction
and that way we can free up our reserved space and our counters stay happy.
Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/tree-log.c | 47 ++++++++++++++++++++++++---------------------
 1 file changed, 25 insertions(+), 22 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 964c583ea900..849b72985c74 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2151,11 +2151,13 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 					return ret;
 				}
 
-				btrfs_tree_lock(next);
-				btrfs_set_lock_blocking(next);
-				clean_tree_block(trans, root, next);
-				btrfs_wait_tree_block_writeback(next);
-				btrfs_tree_unlock(next);
+				if (trans) {
+					btrfs_tree_lock(next);
+					btrfs_set_lock_blocking(next);
+					clean_tree_block(trans, root, next);
+					btrfs_wait_tree_block_writeback(next);
+					btrfs_tree_unlock(next);
+				}
 
 				WARN_ON(root_owner !=
 					BTRFS_TREE_LOG_OBJECTID);
@@ -2227,11 +2229,13 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 
 				next = path->nodes[*level];
 
-				btrfs_tree_lock(next);
-				btrfs_set_lock_blocking(next);
-				clean_tree_block(trans, root, next);
-				btrfs_wait_tree_block_writeback(next);
-				btrfs_tree_unlock(next);
+				if (trans) {
+					btrfs_tree_lock(next);
+					btrfs_set_lock_blocking(next);
+					clean_tree_block(trans, root, next);
+					btrfs_wait_tree_block_writeback(next);
+					btrfs_tree_unlock(next);
+				}
 
 				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
 				ret = btrfs_free_and_pin_reserved_extent(root,
@@ -2301,11 +2305,13 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 
 			next = path->nodes[orig_level];
 
-			btrfs_tree_lock(next);
-			btrfs_set_lock_blocking(next);
-			clean_tree_block(trans, log, next);
-			btrfs_wait_tree_block_writeback(next);
-			btrfs_tree_unlock(next);
+			if (trans) {
+				btrfs_tree_lock(next);
+				btrfs_set_lock_blocking(next);
+				clean_tree_block(trans, log, next);
+				btrfs_wait_tree_block_writeback(next);
+				btrfs_tree_unlock(next);
+			}
 
 			WARN_ON(log->root_key.objectid !=
 				BTRFS_TREE_LOG_OBJECTID);
@@ -2608,13 +2614,10 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
 		.process_func = process_one_buffer
 	};
 
-	if (trans) {
-		ret = walk_log_tree(trans, log, &wc);
-
-		/* I don't think this can happen but just in case */
-		if (ret)
-			btrfs_abort_transaction(trans, log, ret);
-	}
+	ret = walk_log_tree(trans, log, &wc);
+	/* I don't think this can happen but just in case */
+	if (ret)
+		btrfs_abort_transaction(trans, log, ret);
 
 	while (1) {
 		ret = find_first_extent_bit(&log->dirty_log_pages,

From 2b1360da35877cd969ed83884d8989ba778254d0 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 7 Oct 2013 15:14:44 -0400
Subject: [PATCH 30/98] Btrfs: free up block groups after everything

If we abort a transaction we will do the tree log cleanup at unmount, but this
happens after we free up the block groups.  This makes all the leak detection
warnings go off because we think we've leaked space but in reality we just
haven't cleaned it up yet.  So instead do the block group cleanup stuff after
free'ing the fs roots so we don't get these warnings.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/disk-io.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fdc75ab09483..419968e3eaa0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3637,12 +3637,12 @@ int close_ctree(struct btrfs_root *root)
 		       percpu_counter_sum(&fs_info->delalloc_bytes));
 	}
 
-	btrfs_free_block_groups(fs_info);
-
 	btrfs_stop_all_workers(fs_info);
 
 	del_fs_roots(fs_info);
 
+	btrfs_free_block_groups(fs_info);
+
 	free_root_pointers(fs_info, 1);
 
 	iput(fs_info->btree_inode);

From 0be5dc67c445230b0889dba63defba9a9e5561b4 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 7 Oct 2013 15:18:52 -0400
Subject: [PATCH 31/98] Btrfs: fixup reserved trace points

In trying to track down where we were leaking reserved space I noticed our
reserve extent tracepoints are a little off.  First we were saying that the
reserved space had been alloced in btrfs_reserve_extent, which isn't the case,
this needs to be triggered when we actually allocate the space when we run the
delayed ref.  We were also missing a few places where we should have been
tracing the btrfs_reserve_extent_free tracepoint.  With these in place I was
able to put together where we were leaking reserved space.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent-tree.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 54ee542c705e..65401d7ef663 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5262,6 +5262,8 @@ static int pin_down_extent(struct btrfs_root *root,
 
 	set_extent_dirty(root->fs_info->pinned_extents, bytenr,
 			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
+	if (reserved)
+		trace_btrfs_reserved_extent_free(root, bytenr, num_bytes);
 	return 0;
 }
 
@@ -5965,6 +5967,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 
 		btrfs_add_free_space(cache, buf->start, buf->len);
 		btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
+		trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
 		pin = 0;
 	}
 out:
@@ -6592,8 +6595,6 @@ again:
 		}
 	}
 
-	trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
-
 	return ret;
 }
 
@@ -6705,6 +6706,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 			ins->objectid, ins->offset);
 		BUG();
 	}
+	trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
 	return ret;
 }
 
@@ -6777,6 +6779,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 			ins->objectid, ins->offset);
 		BUG();
 	}
+
+	trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize);
 	return ret;
 }
 

From 857cc2fc29cfaf4ee98fe9967bbf6a3942191136 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 7 Oct 2013 15:21:08 -0400
Subject: [PATCH 32/98] Btrfs: free reserved space on error in a few places

While trying to track down a reserved space leak I noticed a few places where we
won't properly clean up reserved space if we have an error, this patch fixes
those up.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent-tree.c | 21 +++++++++++++++++++--
 fs/btrfs/inode.c       |  2 ++
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 65401d7ef663..054b11dc8edf 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2234,8 +2234,12 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 {
 	int ret = 0;
 
-	if (trans->aborted)
+	if (trans->aborted) {
+		if (insert_reserved)
+			btrfs_pin_extent(root, node->bytenr,
+					 node->num_bytes, 1);
 		return 0;
+	}
 
 	if (btrfs_delayed_ref_is_head(node)) {
 		struct btrfs_delayed_ref_head *head;
@@ -2411,6 +2415,14 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 				btrfs_free_delayed_extent_op(extent_op);
 
 				if (ret) {
+					/*
+					 * Need to reset must_insert_reserved if
+					 * there was an error so the abort stuff
+					 * can cleanup the reserved space
+					 * properly.
+					 */
+					if (must_insert_reserved)
+						locked_ref->must_insert_reserved = 1;
 					btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
 					spin_lock(&delayed_refs->lock);
 					btrfs_delayed_ref_unlock(locked_ref);
@@ -6731,13 +6743,18 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 		size += sizeof(*block_info);
 
 	path = btrfs_alloc_path();
-	if (!path)
+	if (!path) {
+		btrfs_free_and_pin_reserved_extent(root, ins->objectid,
+						   root->leafsize);
 		return -ENOMEM;
+	}
 
 	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
 				      ins, size);
 	if (ret) {
+		btrfs_free_and_pin_reserved_extent(root, ins->objectid,
+						   root->leafsize);
 		btrfs_free_path(path);
 		return ret;
 	}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bb242f2fb51e..bba7f1a5e9f3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8498,6 +8498,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 						  ins.offset, 0, 0, 0,
 						  BTRFS_FILE_EXTENT_PREALLOC);
 		if (ret) {
+			btrfs_free_reserved_extent(root, ins.objectid,
+						   ins.offset);
 			btrfs_abort_transaction(trans, root, ret);
 			if (own_trans)
 				btrfs_end_transaction(trans, root);

From 294e30fee35d3151d100cfe59e839c2dbc16a374 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Wed, 9 Oct 2013 12:00:56 -0400
Subject: [PATCH 33/98] Btrfs: add tests for find_lock_delalloc_range

So both Liu and I made huge messes of find_lock_delalloc_range trying to fix
stuff, me first by fixing extent size, then him by fixing something I broke and
then me again telling him to fix it a different way.  So this is obviously a
candidate for some testing.  This patch adds a pseudo fs so we can allocate fake
inodes for tests that need an inode or pages.  Then it addes a bunch of tests to
make sure find_lock_delalloc_range is acting the way it is supposed to.  With
this patch and all of our previous patches to find_lock_delalloc_range I am sure
it is working as expected now.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/Makefile                |   2 +-
 fs/btrfs/ctree.h                 |   6 +
 fs/btrfs/extent_io.c             |   9 +-
 fs/btrfs/extent_io.h             |   6 +
 fs/btrfs/super.c                 |  14 +-
 fs/btrfs/tests/btrfs-tests.c     |  68 ++++++++
 fs/btrfs/tests/btrfs-tests.h     |  15 ++
 fs/btrfs/tests/extent-io-tests.c | 276 +++++++++++++++++++++++++++++++
 include/uapi/linux/magic.h       |   2 +-
 9 files changed, 389 insertions(+), 9 deletions(-)
 create mode 100644 fs/btrfs/tests/btrfs-tests.c
 create mode 100644 fs/btrfs/tests/extent-io-tests.c

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 4c7dfbfaa3b3..cac4f2d001a3 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -15,4 +15,4 @@ btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
 
 btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
-	tests/extent-buffer-tests.o
+	tests/extent-buffer-tests.o tests/btrfs-tests.o tests/extent-io-tests.o
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9ca15a84cc5a..2f398062b942 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -47,6 +47,12 @@ extern struct kmem_cache *btrfs_path_cachep;
 extern struct kmem_cache *btrfs_free_space_cachep;
 struct btrfs_ordered_sum;
 
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+#define STATIC noinline
+#else
+#define STATIC static noinline
+#endif
+
 #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
 
 #define BTRFS_MAX_MIRRORS 3
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2bf6f46fae44..c10291cc4fd1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1598,11 +1598,10 @@ done:
  *
  * 1 is returned if we find something, 0 if nothing was in the tree
  */
-static noinline u64 find_lock_delalloc_range(struct inode *inode,
-					     struct extent_io_tree *tree,
-					     struct page *locked_page,
-					     u64 *start, u64 *end,
-					     u64 max_bytes)
+STATIC u64 find_lock_delalloc_range(struct inode *inode,
+				    struct extent_io_tree *tree,
+				    struct page *locked_page, u64 *start,
+				    u64 *end, u64 max_bytes)
 {
 	u64 delalloc_start;
 	u64 delalloc_end;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 6dbc645f1f3d..f98602eac808 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -345,4 +345,10 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
 int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
 			 int mirror_num);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+noinline u64 find_lock_delalloc_range(struct inode *inode,
+				      struct extent_io_tree *tree,
+				      struct page *locked_page, u64 *start,
+				      u64 *end, u64 max_bytes);
+#endif
 #endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0e398657d759..78041e34d15a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1791,10 +1791,20 @@ static int btrfs_run_sanity_tests(void)
 {
 	int ret;
 
-	ret = btrfs_test_free_space_cache();
+	ret = btrfs_init_test_fs();
 	if (ret)
 		return ret;
-	return btrfs_test_extent_buffer_operations();
+
+	ret = btrfs_test_free_space_cache();
+	if (ret)
+		goto out;
+	ret = btrfs_test_extent_buffer_operations();
+	if (ret)
+		goto out;
+	ret = btrfs_test_extent_io();
+out:
+	btrfs_destroy_test_fs();
+	return ret;
 }
 
 static int __init init_btrfs_fs(void)
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
new file mode 100644
index 000000000000..697d527377c1
--- /dev/null
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2013 Fusion IO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/magic.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+
+static struct vfsmount *test_mnt = NULL;
+
+static struct dentry *btrfs_test_mount(struct file_system_type *fs_type,
+				       int flags, const char *dev_name,
+				       void *data)
+{
+	return mount_pseudo(fs_type, "btrfs_test:", NULL, NULL, BTRFS_TEST_MAGIC);
+}
+
+static struct file_system_type test_type = {
+	.name		= "btrfs_test_fs",
+	.mount		= btrfs_test_mount,
+	.kill_sb	= kill_anon_super,
+};
+
+struct inode *btrfs_new_test_inode(void)
+{
+	return new_inode(test_mnt->mnt_sb);
+}
+
+int btrfs_init_test_fs(void)
+{
+	int ret;
+
+	ret = register_filesystem(&test_type);
+	if (ret) {
+		printk(KERN_ERR "btrfs: cannot register test file system\n");
+		return ret;
+	}
+
+	test_mnt = kern_mount(&test_type);
+	if (IS_ERR(test_mnt)) {
+		printk(KERN_ERR "btrfs: cannot mount test file system\n");
+		unregister_filesystem(&test_type);
+		return ret;
+	}
+	return 0;
+}
+
+void btrfs_destroy_test_fs(void)
+{
+	kern_unmount(test_mnt);
+	unregister_filesystem(&test_type);
+}
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index 04f2cd2ca568..e935fe5291e7 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -25,6 +25,10 @@
 
 int btrfs_test_free_space_cache(void);
 int btrfs_test_extent_buffer_operations(void);
+int btrfs_test_extent_io(void);
+int btrfs_init_test_fs(void);
+void btrfs_destroy_test_fs(void);
+struct inode *btrfs_new_test_inode(void);
 #else
 static inline int btrfs_test_free_space_cache(void)
 {
@@ -34,6 +38,17 @@ static inline int btrfs_test_extent_buffer_operations(void)
 {
 	return 0;
 }
+static inline int btrfs_init_test_fs(void)
+{
+	return 0;
+}
+static inline void btrfs_destroy_test_fs(void)
+{
+}
+static inline int btrfs_test_extent_io(void)
+{
+	return 0;
+}
 #endif
 
 #endif
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
new file mode 100644
index 000000000000..7e99c2f98dd0
--- /dev/null
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -0,0 +1,276 @@
+/*
+ * Copyright (C) 2013 Fusion IO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+#include "btrfs-tests.h"
+#include "../extent_io.h"
+
+#define PROCESS_UNLOCK		(1 << 0)
+#define PROCESS_RELEASE		(1 << 1)
+#define PROCESS_TEST_LOCKED	(1 << 2)
+
+static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
+				       unsigned long flags)
+{
+	int ret;
+	struct page *pages[16];
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	unsigned long nr_pages = end_index - index + 1;
+	int i;
+	int count = 0;
+	int loops = 0;
+
+	while (nr_pages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min_t(unsigned long, nr_pages,
+				     ARRAY_SIZE(pages)), pages);
+		for (i = 0; i < ret; i++) {
+			if (flags & PROCESS_TEST_LOCKED &&
+			    !PageLocked(pages[i]))
+				count++;
+			if (flags & PROCESS_UNLOCK && PageLocked(pages[i]))
+				unlock_page(pages[i]);
+			page_cache_release(pages[i]);
+			if (flags & PROCESS_RELEASE)
+				page_cache_release(pages[i]);
+		}
+		nr_pages -= ret;
+		index += ret;
+		cond_resched();
+		loops++;
+		if (loops > 100000) {
+			printk(KERN_ERR "stuck in a loop, start %Lu, end %Lu, nr_pages %lu, ret %d\n", start, end, nr_pages, ret);
+			break;
+		}
+	}
+	return count;
+}
+
+static int test_find_delalloc(void)
+{
+	struct inode *inode;
+	struct extent_io_tree tmp;
+	struct page *page;
+	struct page *locked_page = NULL;
+	unsigned long index = 0;
+	u64 total_dirty = 256 * 1024 * 1024;
+	u64 max_bytes = 128 * 1024 * 1024;
+	u64 start, end, test_start;
+	u64 found;
+	int ret = -EINVAL;
+
+	inode = btrfs_new_test_inode();
+	if (!inode) {
+		test_msg("Failed to allocate test inode\n");
+		return -ENOMEM;
+	}
+
+	extent_io_tree_init(&tmp, &inode->i_data);
+
+	/*
+	 * First go through and create and mark all of our pages dirty, we pin
+	 * everything to make sure our pages don't get evicted and screw up our
+	 * test.
+	 */
+	for (index = 0; index < (total_dirty >> PAGE_CACHE_SHIFT); index++) {
+		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+		if (!page) {
+			test_msg("Failed to allocate test page\n");
+			ret = -ENOMEM;
+			goto out;
+		}
+		SetPageDirty(page);
+		if (index) {
+			unlock_page(page);
+		} else {
+			page_cache_get(page);
+			locked_page = page;
+		}
+	}
+
+	/* Test this scenario
+	 * |--- delalloc ---|
+	 * |---  search  ---|
+	 */
+	set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_NOFS);
+	start = 0;
+	end = 0;
+	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+					 &end, max_bytes);
+	if (!found) {
+		test_msg("Should have found at least one delalloc\n");
+		goto out_bits;
+	}
+	if (start != 0 || end != 4095) {
+		test_msg("Expected start 0 end 4095, got start %Lu end %Lu\n",
+			 start, end);
+		goto out_bits;
+	}
+	unlock_extent(&tmp, start, end);
+	unlock_page(locked_page);
+	page_cache_release(locked_page);
+
+	/*
+	 * Test this scenario
+	 *
+	 * |--- delalloc ---|
+	 *           |--- search ---|
+	 */
+	test_start = 64 * 1024 * 1024;
+	locked_page = find_lock_page(inode->i_mapping,
+				     test_start >> PAGE_CACHE_SHIFT);
+	if (!locked_page) {
+		test_msg("Couldn't find the locked page\n");
+		goto out_bits;
+	}
+	set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_NOFS);
+	start = test_start;
+	end = 0;
+	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+					 &end, max_bytes);
+	if (!found) {
+		test_msg("Couldn't find delalloc in our range\n");
+		goto out_bits;
+	}
+	if (start != test_start || end != max_bytes - 1) {
+		test_msg("Expected start %Lu end %Lu, got start %Lu, end "
+			 "%Lu\n", test_start, max_bytes - 1, start, end);
+		goto out_bits;
+	}
+	if (process_page_range(inode, start, end,
+			       PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) {
+		test_msg("There were unlocked pages in the range\n");
+		goto out_bits;
+	}
+	unlock_extent(&tmp, start, end);
+	/* locked_page was unlocked above */
+	page_cache_release(locked_page);
+
+	/*
+	 * Test this scenario
+	 * |--- delalloc ---|
+	 *                    |--- search ---|
+	 */
+	test_start = max_bytes + 4096;
+	locked_page = find_lock_page(inode->i_mapping, test_start >>
+				     PAGE_CACHE_SHIFT);
+	if (!locked_page) {
+		test_msg("Could'nt find the locked page\n");
+		goto out_bits;
+	}
+	start = test_start;
+	end = 0;
+	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+					 &end, max_bytes);
+	if (found) {
+		test_msg("Found range when we shouldn't have\n");
+		goto out_bits;
+	}
+	if (end != (u64)-1) {
+		test_msg("Did not return the proper end offset\n");
+		goto out_bits;
+	}
+
+	/*
+	 * Test this scenario
+	 * [------- delalloc -------|
+	 * [max_bytes]|-- search--|
+	 *
+	 * We are re-using our test_start from above since it works out well.
+	 */
+	set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_NOFS);
+	start = test_start;
+	end = 0;
+	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+					 &end, max_bytes);
+	if (!found) {
+		test_msg("Didn't find our range\n");
+		goto out_bits;
+	}
+	if (start != test_start || end != total_dirty - 1) {
+		test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n",
+			 test_start, total_dirty - 1, start, end);
+		goto out_bits;
+	}
+	if (process_page_range(inode, start, end,
+			       PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) {
+		test_msg("Pages in range were not all locked\n");
+		goto out_bits;
+	}
+	unlock_extent(&tmp, start, end);
+
+	/*
+	 * Now to test where we run into a page that is no longer dirty in the
+	 * range we want to find.
+	 */
+	page = find_get_page(inode->i_mapping, (max_bytes + (1 * 1024 * 1024))
+			     >> PAGE_CACHE_SHIFT);
+	if (!page) {
+		test_msg("Couldn't find our page\n");
+		goto out_bits;
+	}
+	ClearPageDirty(page);
+	page_cache_release(page);
+
+	/* We unlocked it in the previous test */
+	lock_page(locked_page);
+	start = test_start;
+	end = 0;
+	/*
+	 * Currently if we fail to find dirty pages in the delalloc range we
+	 * will adjust max_bytes down to PAGE_CACHE_SIZE and then re-search.  If
+	 * this changes at any point in the future we will need to fix this
+	 * tests expected behavior.
+	 */
+	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+					 &end, max_bytes);
+	if (!found) {
+		test_msg("Didn't find our range\n");
+		goto out_bits;
+	}
+	if (start != test_start && end != test_start + PAGE_CACHE_SIZE - 1) {
+		test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n",
+			 test_start, test_start + PAGE_CACHE_SIZE - 1, start,
+			 end);
+		goto out_bits;
+	}
+	if (process_page_range(inode, start, end, PROCESS_TEST_LOCKED |
+			       PROCESS_UNLOCK)) {
+		test_msg("Pages in range were not all locked\n");
+		goto out_bits;
+	}
+	ret = 0;
+out_bits:
+	clear_extent_bits(&tmp, 0, total_dirty - 1,
+			  (unsigned long)-1, GFP_NOFS);
+out:
+	if (locked_page)
+		page_cache_release(locked_page);
+	process_page_range(inode, 0, total_dirty - 1,
+			   PROCESS_UNLOCK | PROCESS_RELEASE);
+	iput(inode);
+	return ret;
+}
+
+int btrfs_test_extent_io(void)
+{
+	test_msg("Running find delalloc tests\n");
+	return test_find_delalloc();
+}
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 2944278a8ba7..77c60311a6c6 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -71,6 +71,6 @@
 #define USBDEVICE_SUPER_MAGIC	0x9fa2
 #define MTD_INODE_FS_MAGIC      0x11307854
 #define ANON_INODE_FS_MAGIC	0x09041934
-
+#define BTRFS_TEST_MAGIC	0x73727279
 
 #endif /* __LINUX_MAGIC_H__ */

From aaedb55bc08f384b7f57dbb3222a511baed4decf Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 11 Oct 2013 14:44:09 -0400
Subject: [PATCH 34/98] Btrfs: add tests for btrfs_get_extent

I'm going to be removing hole extents in the near future so I wanted to make a
sanity test for btrfs_get_extent to make sure I don't break anything in the
meantime.  This patch just puts btrfs_get_extent through its paces by giving it
a completely unreasonable mapping to look at and make sure it is giving us back
maps that make sense.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/Makefile            |   3 +-
 fs/btrfs/ctree.h             |   4 +
 fs/btrfs/inode.c             |   8 +
 fs/btrfs/super.c             |   3 +
 fs/btrfs/tests/btrfs-tests.c |   8 +-
 fs/btrfs/tests/btrfs-tests.h |   5 +
 fs/btrfs/tests/inode-tests.c | 832 +++++++++++++++++++++++++++++++++++
 7 files changed, 861 insertions(+), 2 deletions(-)
 create mode 100644 fs/btrfs/tests/inode-tests.c

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index cac4f2d001a3..1a44e42d602a 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -15,4 +15,5 @@ btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
 
 btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
-	tests/extent-buffer-tests.o tests/btrfs-tests.o tests/extent-io-tests.o
+	tests/extent-buffer-tests.o tests/btrfs-tests.o \
+	tests/extent-io-tests.o tests/inode-tests.o
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2f398062b942..9f5e1cfb0209 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -4034,5 +4034,9 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
 	return signal_pending(current);
 }
 
+/* Sanity test specific functions */
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_destroy_inode(struct inode *inode);
+#endif
 
 #endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bba7f1a5e9f3..b5c2ad8fe2ca 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7795,6 +7795,14 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	return inode;
 }
 
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_destroy_inode(struct inode *inode)
+{
+	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+}
+#endif
+
 static void btrfs_i_callback(struct rcu_head *head)
 {
 	struct inode *inode = container_of(head, struct inode, i_rcu);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 78041e34d15a..1f62c1ccab1f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1802,6 +1802,9 @@ static int btrfs_run_sanity_tests(void)
 	if (ret)
 		goto out;
 	ret = btrfs_test_extent_io();
+	if (ret)
+		goto out;
+	ret = btrfs_test_inodes();
 out:
 	btrfs_destroy_test_fs();
 	return ret;
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 697d527377c1..757ef00a75a4 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -24,11 +24,17 @@
 
 static struct vfsmount *test_mnt = NULL;
 
+static const struct super_operations btrfs_test_super_ops = {
+	.alloc_inode	= btrfs_alloc_inode,
+	.destroy_inode	= btrfs_test_destroy_inode,
+};
+
 static struct dentry *btrfs_test_mount(struct file_system_type *fs_type,
 				       int flags, const char *dev_name,
 				       void *data)
 {
-	return mount_pseudo(fs_type, "btrfs_test:", NULL, NULL, BTRFS_TEST_MAGIC);
+	return mount_pseudo(fs_type, "btrfs_test:", &btrfs_test_super_ops,
+			    NULL, BTRFS_TEST_MAGIC);
 }
 
 static struct file_system_type test_type = {
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index e935fe5291e7..b353bc806ca0 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -26,6 +26,7 @@
 int btrfs_test_free_space_cache(void);
 int btrfs_test_extent_buffer_operations(void);
 int btrfs_test_extent_io(void);
+int btrfs_test_inodes(void);
 int btrfs_init_test_fs(void);
 void btrfs_destroy_test_fs(void);
 struct inode *btrfs_new_test_inode(void);
@@ -49,6 +50,10 @@ static inline int btrfs_test_extent_io(void)
 {
 	return 0;
 }
+static inline int btrfs_test_inodes(void)
+{
+	return 0;
+}
 #endif
 
 #endif
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
new file mode 100644
index 000000000000..b0fc3bab2a96
--- /dev/null
+++ b/fs/btrfs/tests/inode-tests.c
@@ -0,0 +1,832 @@
+/*
+ * Copyright (C) 2013 Fusion IO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../btrfs_inode.h"
+#include "../disk-io.h"
+#include "../extent_io.h"
+#include "../volumes.h"
+
+static struct btrfs_fs_info *alloc_dummy_fs_info(void)
+{
+	struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info),
+						GFP_NOFS);
+	if (!fs_info)
+		return fs_info;
+	fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices),
+				      GFP_NOFS);
+	if (!fs_info->fs_devices) {
+		kfree(fs_info);
+		return NULL;
+	}
+	return fs_info;
+}
+static void free_dummy_root(struct btrfs_root *root)
+{
+	if (!root)
+		return;
+	if (root->fs_info) {
+		kfree(root->fs_info->fs_devices);
+		kfree(root->fs_info);
+	}
+	if (root->node)
+		free_extent_buffer(root->node);
+	kfree(root);
+}
+
+static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
+			  u64 ram_bytes, u64 offset, u64 disk_bytenr,
+			  u64 disk_len, u32 type, u8 compression, int slot)
+{
+	struct btrfs_path path;
+	struct btrfs_file_extent_item *fi;
+	struct extent_buffer *leaf = root->node;
+	struct btrfs_key key;
+	u32 value_len = sizeof(struct btrfs_file_extent_item);
+
+	if (type == BTRFS_FILE_EXTENT_INLINE)
+		value_len += len;
+	memset(&path, 0, sizeof(path));
+
+	path.nodes[0] = leaf;
+	path.slots[0] = slot;
+
+	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = start;
+
+	setup_items_for_insert(root, &path, &key, &value_len, value_len,
+			       value_len + sizeof(struct btrfs_item), 1);
+	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+	btrfs_set_file_extent_generation(leaf, fi, 1);
+	btrfs_set_file_extent_type(leaf, fi, type);
+	btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
+	btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_len);
+	btrfs_set_file_extent_offset(leaf, fi, offset);
+	btrfs_set_file_extent_num_bytes(leaf, fi, len);
+	btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
+	btrfs_set_file_extent_compression(leaf, fi, compression);
+	btrfs_set_file_extent_encryption(leaf, fi, 0);
+	btrfs_set_file_extent_other_encoding(leaf, fi, 0);
+}
+
+/*
+ * Build the most complicated map of extents the earth has ever seen.  We want
+ * this so we can test all of the corner cases of btrfs_get_extent.  Here is a
+ * diagram of how the extents will look though this may not be possible we still
+ * want to make sure everything acts normally (the last number is not inclusive)
+ *
+ * [0 - 5][5 -  6][6 - 10][10 - 4096][  4096 - 8192 ][8192 - 12288]
+ * [hole ][inline][ hole ][ regular ][regular1 split][    hole    ]
+ *
+ * [ 12288 - 20480][20480 - 24576][  24576 - 28672  ][28672 - 36864][36864 - 45056]
+ * [regular1 split][   prealloc1 ][prealloc1 written][   prealloc1 ][ compressed  ]
+ *
+ * [45056 - 49152][49152-53248][53248-61440][61440-65536][     65536+81920   ]
+ * [ compressed1 ][  regular  ][compressed1][  regular  ][ hole but no extent]
+ *
+ * [81920-86016]
+ * [  regular  ]
+ */
+static void setup_file_extents(struct btrfs_root *root)
+{
+	int slot = 0;
+	u64 disk_bytenr = 1 * 1024 * 1024;
+	u64 offset = 0;
+
+	/* First we want a hole */
+	insert_extent(root, offset, 5, 5, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0,
+		      slot);
+	slot++;
+	offset += 5;
+
+	/*
+	 * Now we want an inline extent, I don't think this is possible but hey
+	 * why not?  Also keep in mind if we have an inline extent it counts as
+	 * the whole first page.  If we were to expand it we would have to cow
+	 * and we wouldn't have an inline extent anymore.
+	 */
+	insert_extent(root, offset, 1, 1, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0,
+		      slot);
+	slot++;
+	offset = 4096;
+
+	/* Now another hole */
+	insert_extent(root, offset, 4, 4, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0,
+		      slot);
+	slot++;
+	offset += 4;
+
+	/* Now for a regular extent */
+	insert_extent(root, offset, 4095, 4095, 0, disk_bytenr, 4096,
+		      BTRFS_FILE_EXTENT_REG, 0, slot);
+	slot++;
+	disk_bytenr += 4096;
+	offset += 4095;
+
+	/*
+	 * Now for 3 extents that were split from a hole punch so we test
+	 * offsets properly.
+	 */
+	insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384,
+		      BTRFS_FILE_EXTENT_REG, 0, slot);
+	slot++;
+	offset += 4096;
+	insert_extent(root, offset, 4096, 4096, 0, 0, 0, BTRFS_FILE_EXTENT_REG,
+		      0, slot);
+	slot++;
+	offset += 4096;
+	insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384,
+		      BTRFS_FILE_EXTENT_REG, 0, slot);
+	slot++;
+	offset += 8192;
+	disk_bytenr += 16384;
+
+	/* Now for a unwritten prealloc extent */
+	insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096,
+		      BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+	slot++;
+	offset += 4096;
+
+	/*
+	 * We want to jack up disk_bytenr a little more so the em stuff doesn't
+	 * merge our records.
+	 */
+	disk_bytenr += 8192;
+
+	/*
+	 * Now for a partially written prealloc extent, basically the same as
+	 * the hole punch example above.  Ram_bytes never changes when you mark
+	 * extents written btw.
+	 */
+	insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384,
+		      BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+	slot++;
+	offset += 4096;
+	insert_extent(root, offset, 4096, 16384, 4096, disk_bytenr, 16384,
+		      BTRFS_FILE_EXTENT_REG, 0, slot);
+	slot++;
+	offset += 4096;
+	insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384,
+		      BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+	slot++;
+	offset += 8192;
+	disk_bytenr += 16384;
+
+	/* Now a normal compressed extent */
+	insert_extent(root, offset, 8192, 8192, 0, disk_bytenr, 4096,
+		      BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
+	slot++;
+	offset += 8192;
+	/* No merges */
+	disk_bytenr += 8192;
+
+	/* Now a split compressed extent */
+	insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 4096,
+		      BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
+	slot++;
+	offset += 4096;
+	insert_extent(root, offset, 4096, 4096, 0, disk_bytenr + 4096, 4096,
+		      BTRFS_FILE_EXTENT_REG, 0, slot);
+	slot++;
+	offset += 4096;
+	insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 4096,
+		      BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
+	slot++;
+	offset += 8192;
+	disk_bytenr += 8192;
+
+	/* Now extents that have a hole but no hole extent */
+	insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096,
+		      BTRFS_FILE_EXTENT_REG, 0, slot);
+	slot++;
+	offset += 16384;
+	disk_bytenr += 4096;
+	insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096,
+		      BTRFS_FILE_EXTENT_REG, 0, slot);
+}
+
+static unsigned long prealloc_only = 0;
+static unsigned long compressed_only = 0;
+static unsigned long vacancy_only = 0;
+
+static noinline int test_btrfs_get_extent(void)
+{
+	struct inode *inode = NULL;
+	struct btrfs_root *root = NULL;
+	struct extent_map *em = NULL;
+	u64 orig_start;
+	u64 disk_bytenr;
+	u64 offset;
+	int ret = -ENOMEM;
+
+	set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only);
+	set_bit(EXTENT_FLAG_VACANCY, &vacancy_only);
+	set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
+
+	inode = btrfs_new_test_inode();
+	if (!inode) {
+		test_msg("Couldn't allocate inode\n");
+		return ret;
+	}
+
+	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+	BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
+	BTRFS_I(inode)->location.offset = 0;
+
+	root = btrfs_alloc_dummy_root();
+	if (IS_ERR(root)) {
+		test_msg("Couldn't allocate root\n");
+		goto out;
+	}
+
+	/*
+	 * We do this since btrfs_get_extent wants to assign em->bdev to
+	 * root->fs_info->fs_devices->latest_bdev.
+	 */
+	root->fs_info = alloc_dummy_fs_info();
+	if (!root->fs_info) {
+		test_msg("Couldn't allocate dummy fs info\n");
+		goto out;
+	}
+
+	root->node = alloc_dummy_extent_buffer(0, 4096);
+	if (!root->node) {
+		test_msg("Couldn't allocate dummy buffer\n");
+		goto out;
+	}
+
+	/*
+	 * We will just free a dummy node if it's ref count is 2 so we need an
+	 * extra ref so our searches don't accidently release our page.
+	 */
+	extent_buffer_get(root->node);
+	btrfs_set_header_nritems(root->node, 0);
+	btrfs_set_header_level(root->node, 0);
+	ret = -EINVAL;
+
+	/* First with no extents */
+	BTRFS_I(inode)->root = root;
+	em = btrfs_get_extent(inode, NULL, 0, 0, 4096, 0);
+	if (IS_ERR(em)) {
+		em = NULL;
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start != EXTENT_MAP_HOLE) {
+		test_msg("Expected a hole, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
+		test_msg("Vacancy flag wasn't set properly\n");
+		goto out;
+	}
+	free_extent_map(em);
+	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+
+	/*
+	 * All of the magic numbers are based on the mapping setup in
+	 * setup_file_extents, so if you change anything there you need to
+	 * update the comment and update the expected values below.
+	 */
+	setup_file_extents(root);
+
+	em = btrfs_get_extent(inode, NULL, 0, 0, (u64)-1, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start != EXTENT_MAP_HOLE) {
+		test_msg("Expected a hole, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != 0 || em->len != 5) {
+		test_msg("Unexpected extent wanted start 0 len 5, got start "
+			 "%llu len %llu\n", em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start != EXTENT_MAP_INLINE) {
+		test_msg("Expected an inline, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4091) {
+		test_msg("Unexpected extent wanted start %llu len 1, got start "
+			 "%llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	/*
+	 * We don't test anything else for inline since it doesn't get set
+	 * unless we have a page for it to write into.  Maybe we should change
+	 * this?
+	 */
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start != EXTENT_MAP_HOLE) {
+		test_msg("Expected a hole, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4) {
+		test_msg("Unexpected extent wanted start %llu len 4, got start "
+			 "%llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	/* Regular extent */
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4095) {
+		test_msg("Unexpected extent wanted start %llu len 4095, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+			 em->orig_start);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	/* The next 3 are split extents */
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4096) {
+		test_msg("Unexpected extent wanted start %llu len 4096, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+			 em->orig_start);
+		goto out;
+	}
+	disk_bytenr = em->block_start;
+	orig_start = em->start;
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start != EXTENT_MAP_HOLE) {
+		test_msg("Expected a hole, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4096) {
+		test_msg("Unexpected extent wanted start %llu len 4096, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 8192) {
+		test_msg("Unexpected extent wanted start %llu len 8192, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	if (em->orig_start != orig_start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n",
+			 orig_start, em->orig_start);
+		goto out;
+	}
+	disk_bytenr += (em->start - orig_start);
+	if (em->block_start != disk_bytenr) {
+		test_msg("Wrong block start, want %llu, have %llu\n",
+			 disk_bytenr, em->block_start);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	/* Prealloc extent */
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4096) {
+		test_msg("Unexpected extent wanted start %llu len 4096, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != prealloc_only) {
+		test_msg("Unexpected flags set, want %lu have %lu\n",
+			 prealloc_only, em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+			 em->orig_start);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	/* The next 3 are a half written prealloc extent */
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4096) {
+		test_msg("Unexpected extent wanted start %llu len 4096, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != prealloc_only) {
+		test_msg("Unexpected flags set, want %lu have %lu\n",
+			 prealloc_only, em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+			 em->orig_start);
+		goto out;
+	}
+	disk_bytenr = em->block_start;
+	orig_start = em->start;
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_HOLE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4096) {
+		test_msg("Unexpected extent wanted start %llu len 4096, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	if (em->orig_start != orig_start) {
+		test_msg("Unexpected orig offset, wanted %llu, have %llu\n",
+			 orig_start, em->orig_start);
+		goto out;
+	}
+	if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
+		test_msg("Unexpected block start, wanted %llu, have %llu\n",
+			 disk_bytenr + (em->start - em->orig_start),
+			 em->block_start);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 8192) {
+		test_msg("Unexpected extent wanted start %llu len 8192, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != prealloc_only) {
+		test_msg("Unexpected flags set, want %lu have %lu\n",
+			 prealloc_only, em->flags);
+		goto out;
+	}
+	if (em->orig_start != orig_start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n", orig_start,
+			 em->orig_start);
+		goto out;
+	}
+	if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
+		test_msg("Unexpected block start, wanted %llu, have %llu\n",
+			 disk_bytenr + (em->start - em->orig_start),
+			 em->block_start);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	/* Now for the compressed extent */
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 8192) {
+		test_msg("Unexpected extent wanted start %llu len 8192, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != compressed_only) {
+		test_msg("Unexpected flags set, want %lu have %lu\n",
+			 compressed_only, em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n",
+			 em->start, em->orig_start);
+		goto out;
+	}
+	if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+		test_msg("Unexpected compress type, wanted %d, got %d\n",
+			 BTRFS_COMPRESS_ZLIB, em->compress_type);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	/* Split compressed extent */
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4096) {
+		test_msg("Unexpected extent wanted start %llu len 4096, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != compressed_only) {
+		test_msg("Unexpected flags set, want %lu have %lu\n",
+			 compressed_only, em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n",
+			 em->start, em->orig_start);
+		goto out;
+	}
+	if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+		test_msg("Unexpected compress type, wanted %d, got %d\n",
+			 BTRFS_COMPRESS_ZLIB, em->compress_type);
+		goto out;
+	}
+	disk_bytenr = em->block_start;
+	orig_start = em->start;
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4096) {
+		test_msg("Unexpected extent wanted start %llu len 4096, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+			 em->orig_start);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start != disk_bytenr) {
+		test_msg("Block start does not match, want %llu got %llu\n",
+			 disk_bytenr, em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 8192) {
+		test_msg("Unexpected extent wanted start %llu len 8192, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != compressed_only) {
+		test_msg("Unexpected flags set, want %lu have %lu\n",
+			 compressed_only, em->flags);
+		goto out;
+	}
+	if (em->orig_start != orig_start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n",
+			 em->start, orig_start);
+		goto out;
+	}
+	if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+		test_msg("Unexpected compress type, wanted %d, got %d\n",
+			 BTRFS_COMPRESS_ZLIB, em->compress_type);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	/* A hole between regular extents but no hole extent */
+	em = btrfs_get_extent(inode, NULL, 0, offset + 6, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4096) {
+		test_msg("Unexpected extent wanted start %llu len 4096, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+			 em->orig_start);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096 * 1024, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start != EXTENT_MAP_HOLE) {
+		test_msg("Expected a hole extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	/*
+	 * Currently we just return a length that we requested rather than the
+	 * length of the actual hole, if this changes we'll have to change this
+	 * test.
+	 */
+	if (em->start != offset || em->len != 12288) {
+		test_msg("Unexpected extent wanted start %llu len 12288, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != vacancy_only) {
+		test_msg("Unexpected flags set, want %lu have %lu\n",
+			 vacancy_only, em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+			 em->orig_start);
+		goto out;
+	}
+	offset = em->start + em->len;
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != offset || em->len != 4096) {
+		test_msg("Unexpected extent wanted start %llu len 4096, got "
+			 "start %llu len %llu\n", offset, em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+		goto out;
+	}
+	if (em->orig_start != em->start) {
+		test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+			 em->orig_start);
+		goto out;
+	}
+	ret = 0;
+out:
+	if (!IS_ERR(em))
+		free_extent_map(em);
+	iput(inode);
+	free_dummy_root(root);
+	return ret;
+}
+
+int btrfs_test_inodes(void)
+{
+	test_msg("Running btrfs_get_extent tests\n");
+	return test_btrfs_get_extent();
+}

From 96192499c27e8b58d71f4370f29ca86d4ca915d7 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Wed, 16 Oct 2013 13:53:28 -0400
Subject: [PATCH 35/98] Btrfs: stop all workers after we free block groups

Stefan was hitting a panic in the async worker stuff because we had outstanding
read bios while we were stopping the worker threads.  You could reproduce this
easily if you mount -o nospace_cache and ran generic/273.  This is because the
caching thread stuff is still going and we were stopping all the worker threads.
We need to stop the workers after this work is done, and the free block groups
code will wait for all the caching threads to stop first so we don't run into
this problem.  With this patch we no longer panic.  Thanks,

Cc: stable@vger.kernel.org
Reported-by: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/disk-io.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 419968e3eaa0..b0ea9f4bbc70 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3637,12 +3637,12 @@ int close_ctree(struct btrfs_root *root)
 		       percpu_counter_sum(&fs_info->delalloc_bytes));
 	}
 
-	btrfs_stop_all_workers(fs_info);
-
 	del_fs_roots(fs_info);
 
 	btrfs_free_block_groups(fs_info);
 
+	btrfs_stop_all_workers(fs_info);
+
 	free_root_pointers(fs_info, 1);
 
 	iput(fs_info->btree_inode);

From 25a50341b6269b0622434d9360f0c771d219681a Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 14 Oct 2013 12:08:38 -0400
Subject: [PATCH 36/98] Btrfs: handle a missing extent for the first file
 extent

While trying to kill our hole extents I noticed I was seeing problems where we
seek into a file and then start writing and then try to fiemap that file later.
This is because we search for offset 0, don't find anything and so back up one
slot, which puts us at the inode ref or something like that, which means we goto
not_found and create an extent map for our entire search area.  This isn't quite
what we want, we want to move forward one slot and see if there is an extent
there so we can limit our hole extent.  This patch fixes this problem, I will
add a testcase for this as well.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/inode.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b5c2ad8fe2ca..9d39d08a1555 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5982,7 +5982,14 @@ again:
 	found_type = btrfs_key_type(&found_key);
 	if (found_key.objectid != objectid ||
 	    found_type != BTRFS_EXTENT_DATA_KEY) {
-		goto not_found;
+		/*
+		 * If we backup past the first extent we want to move forward
+		 * and see if there is an extent in front of us, otherwise we'll
+		 * say there is a hole for our whole search range which can
+		 * cause problems.
+		 */
+		extent_end = start;
+		goto next;
 	}
 
 	found_type = btrfs_file_extent_type(leaf, item);
@@ -5997,7 +6004,7 @@ again:
 		size = btrfs_file_extent_inline_len(leaf, item);
 		extent_end = ALIGN(extent_start + size, root->sectorsize);
 	}
-
+next:
 	if (start >= extent_end) {
 		path->slots[0]++;
 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {

From 0e30db86a4524f9e655e8cb9335b4da1787db3c1 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 14 Oct 2013 13:15:02 -0400
Subject: [PATCH 37/98] Btrfs: add a sanity test for a vacant extent at the
 front of a file

Btrfs_get_extent was not handling this case properly, add a test to make sure we
don't regress.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/tests/inode-tests.c | 133 +++++++++++++++++++++++++++++++++--
 1 file changed, 128 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index b0fc3bab2a96..397d1f99a8eb 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -86,6 +86,26 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
 	btrfs_set_file_extent_other_encoding(leaf, fi, 0);
 }
 
+static void insert_inode_item_key(struct btrfs_root *root)
+{
+	struct btrfs_path path;
+	struct extent_buffer *leaf = root->node;
+	struct btrfs_key key;
+	u32 value_len = 0;
+
+	memset(&path, 0, sizeof(path));
+
+	path.nodes[0] = leaf;
+	path.slots[0] = 0;
+
+	key.objectid = BTRFS_INODE_ITEM_KEY;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+
+	setup_items_for_insert(root, &path, &key, &value_len, value_len,
+			       value_len + sizeof(struct btrfs_item), 1);
+}
+
 /*
  * Build the most complicated map of extents the earth has ever seen.  We want
  * this so we can test all of the corner cases of btrfs_get_extent.  Here is a
@@ -236,10 +256,6 @@ static noinline int test_btrfs_get_extent(void)
 	u64 offset;
 	int ret = -ENOMEM;
 
-	set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only);
-	set_bit(EXTENT_FLAG_VACANCY, &vacancy_only);
-	set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
-
 	inode = btrfs_new_test_inode();
 	if (!inode) {
 		test_msg("Couldn't allocate inode\n");
@@ -825,8 +841,115 @@ out:
 	return ret;
 }
 
+static int test_hole_first(void)
+{
+	struct inode *inode = NULL;
+	struct btrfs_root *root = NULL;
+	struct extent_map *em = NULL;
+	int ret = -ENOMEM;
+
+	inode = btrfs_new_test_inode();
+	if (!inode) {
+		test_msg("Couldn't allocate inode\n");
+		return ret;
+	}
+
+	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+	BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
+	BTRFS_I(inode)->location.offset = 0;
+
+	root = btrfs_alloc_dummy_root();
+	if (IS_ERR(root)) {
+		test_msg("Couldn't allocate root\n");
+		goto out;
+	}
+
+	root->fs_info = alloc_dummy_fs_info();
+	if (!root->fs_info) {
+		test_msg("Couldn't allocate dummy fs info\n");
+		goto out;
+	}
+
+	root->node = alloc_dummy_extent_buffer(0, 4096);
+	if (!root->node) {
+		test_msg("Couldn't allocate dummy buffer\n");
+		goto out;
+	}
+
+	extent_buffer_get(root->node);
+	btrfs_set_header_nritems(root->node, 0);
+	btrfs_set_header_level(root->node, 0);
+	BTRFS_I(inode)->root = root;
+	ret = -EINVAL;
+
+	/*
+	 * Need a blank inode item here just so we don't confuse
+	 * btrfs_get_extent.
+	 */
+	insert_inode_item_key(root);
+	insert_extent(root, 4096, 4096, 4096, 0, 4096, 4096,
+		      BTRFS_FILE_EXTENT_REG, 0, 1);
+	em = btrfs_get_extent(inode, NULL, 0, 0, 8192, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start != EXTENT_MAP_HOLE) {
+		test_msg("Expected a hole, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != 0 || em->len != 4096) {
+		test_msg("Unexpected extent wanted start 0 len 4096, got start "
+			 "%llu len %llu\n", em->start, em->len);
+		goto out;
+	}
+	if (em->flags != vacancy_only) {
+		test_msg("Wrong flags, wanted %lu, have %lu\n", vacancy_only,
+			 em->flags);
+		goto out;
+	}
+	free_extent_map(em);
+
+	em = btrfs_get_extent(inode, NULL, 0, 4096, 8192, 0);
+	if (IS_ERR(em)) {
+		test_msg("Got an error when we shouldn't have\n");
+		goto out;
+	}
+	if (em->block_start != 4096) {
+		test_msg("Expected a real extent, got %llu\n", em->block_start);
+		goto out;
+	}
+	if (em->start != 4096 || em->len != 4096) {
+		test_msg("Unexpected extent wanted start 4096 len 4096, got "
+			 "start %llu len %llu\n", em->start, em->len);
+		goto out;
+	}
+	if (em->flags != 0) {
+		test_msg("Unexpected flags set, wanted 0 got %lu\n",
+			 em->flags);
+		goto out;
+	}
+	ret = 0;
+out:
+	if (!IS_ERR(em))
+		free_extent_map(em);
+	iput(inode);
+	free_dummy_root(root);
+	return ret;
+}
+
 int btrfs_test_inodes(void)
 {
+	int ret;
+
+	set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only);
+	set_bit(EXTENT_FLAG_VACANCY, &vacancy_only);
+	set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
+
 	test_msg("Running btrfs_get_extent tests\n");
-	return test_btrfs_get_extent();
+	ret = test_btrfs_get_extent();
+	if (ret)
+		return ret;
+	test_msg("Running hole first btrfs_get_extent test\n");
+	return test_hole_first();
 }

From ed9e8af88e2551aaa6bf51d8063a2493e2d71597 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 14 Oct 2013 17:23:08 -0400
Subject: [PATCH 38/98] Btrfs: fix hole check in log_one_extent

I added an assert to make sure we were looking up aligned offsets for csums and
I tripped it when running xfstests.  This is because log_one_extent was checking
if block_start == 0 for a hole instead of EXTENT_MAP_HOLE.  This worked out fine
in practice it seems, but it adds a lot of extra work that is uneeded.  With
this fix I'm no longer tripping my assert.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/tree-log.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 849b72985c74..1134aa40b6d1 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3373,7 +3373,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 		btrfs_set_token_file_extent_type(leaf, fi,
 						 BTRFS_FILE_EXTENT_REG,
 						 &token);
-		if (em->block_start == 0)
+		if (em->block_start == EXTENT_MAP_HOLE)
 			skip_csum = true;
 	}
 

From 4277a9c3b3665f2830c55ece015163867b9414cc Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Tue, 15 Oct 2013 09:36:40 -0400
Subject: [PATCH 39/98] Btrfs: add an assert to btrfs_lookup_csums_range for
 alignment

I was hitting weird issues when trying to remove hole extents and it turned out
it was because I was sending non-aligned offsets down to
btrfs_lookup_csums_range.  So add an assert for this in case somebody trips over
this in the future.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/file-item.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 4f53159bdb9d..ae8a5139e0ee 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -329,6 +329,9 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 	u64 csum_end;
 	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 
+	ASSERT(start == ALIGN(start, root->sectorsize) &&
+	       (end + 1) == ALIGN(end + 1, root->sectorsize));
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;

From 7f4ca37c486733da008778a1f4058fbc194a4fdd Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 18 Oct 2013 11:44:46 -0400
Subject: [PATCH 40/98] Btrfs: fix up seek_hole/seek_data handling

Whoever wrote this was braindead.  Also it doesn't work right if you have
VACANCY's since we assumed you would only have that at the end of the file,
which won't be the case in the near future.  I tested this with generic/285 and
generic/286 as well as the btrfs tests that use fssum since it uses
seek_hole/seek_data to verify things are ok.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/file.c | 94 ++++++++++---------------------------------------
 1 file changed, 19 insertions(+), 75 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 72da4df53c9a..bf3465cf455a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2405,14 +2405,12 @@ out_reserve_fail:
 static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct extent_map *em;
+	struct extent_map *em = NULL;
 	struct extent_state *cached_state = NULL;
 	u64 lockstart = *offset;
 	u64 lockend = i_size_read(inode);
 	u64 start = *offset;
-	u64 orig_start = *offset;
 	u64 len = i_size_read(inode);
-	u64 last_end = 0;
 	int ret = 0;
 
 	lockend = max_t(u64, root->sectorsize, lockend);
@@ -2429,89 +2427,35 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
 	lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
 			 &cached_state);
 
-	/*
-	 * Delalloc is such a pain.  If we have a hole and we have pending
-	 * delalloc for a portion of the hole we will get back a hole that
-	 * exists for the entire range since it hasn't been actually written
-	 * yet.  So to take care of this case we need to look for an extent just
-	 * before the position we want in case there is outstanding delalloc
-	 * going on here.
-	 */
-	if (whence == SEEK_HOLE && start != 0) {
-		if (start <= root->sectorsize)
-			em = btrfs_get_extent_fiemap(inode, NULL, 0, 0,
-						     root->sectorsize, 0);
-		else
-			em = btrfs_get_extent_fiemap(inode, NULL, 0,
-						     start - root->sectorsize,
-						     root->sectorsize, 0);
-		if (IS_ERR(em)) {
-			ret = PTR_ERR(em);
-			goto out;
-		}
-		last_end = em->start + em->len;
-		if (em->block_start == EXTENT_MAP_DELALLOC)
-			last_end = min_t(u64, last_end, inode->i_size);
-		free_extent_map(em);
-	}
-
-	while (1) {
+	while (start < inode->i_size) {
 		em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0);
 		if (IS_ERR(em)) {
 			ret = PTR_ERR(em);
+			em = NULL;
 			break;
 		}
 
-		if (em->block_start == EXTENT_MAP_HOLE) {
-			if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
-				if (last_end <= orig_start) {
-					free_extent_map(em);
-					ret = -ENXIO;
-					break;
-				}
-			}
-
-			if (whence == SEEK_HOLE) {
-				*offset = start;
-				free_extent_map(em);
-				break;
-			}
-		} else {
-			if (whence == SEEK_DATA) {
-				if (em->block_start == EXTENT_MAP_DELALLOC) {
-					if (start >= inode->i_size) {
-						free_extent_map(em);
-						ret = -ENXIO;
-						break;
-					}
-				}
-
-				if (!test_bit(EXTENT_FLAG_PREALLOC,
-					      &em->flags)) {
-					*offset = start;
-					free_extent_map(em);
-					break;
-				}
-			}
-		}
+		if (whence == SEEK_HOLE &&
+		    (em->block_start == EXTENT_MAP_HOLE ||
+		     test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
+			break;
+		else if (whence == SEEK_DATA &&
+			   (em->block_start != EXTENT_MAP_HOLE &&
+			    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
+			break;
 
 		start = em->start + em->len;
-		last_end = em->start + em->len;
-
-		if (em->block_start == EXTENT_MAP_DELALLOC)
-			last_end = min_t(u64, last_end, inode->i_size);
-
-		if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
-			free_extent_map(em);
-			ret = -ENXIO;
-			break;
-		}
 		free_extent_map(em);
+		em = NULL;
 		cond_resched();
 	}
-	if (!ret)
-		*offset = min(*offset, inode->i_size);
-out:
+	free_extent_map(em);
+	if (!ret) {
+		if (whence == SEEK_DATA && start >= inode->i_size)
+			ret = -ENXIO;
+		else
+			*offset = min_t(loff_t, start, inode->i_size);
+	}
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
 			     &cached_state, GFP_NOFS);
 	return ret;

From 452c75c3d2187089f6e846710e6ea7883bf30f8a Mon Sep 17 00:00:00 2001
From: Chandra Seetharaman <sekharan@us.ibm.com>
Date: Mon, 7 Oct 2013 10:45:25 -0500
Subject: [PATCH 41/98] Btrfs: Simplify the logic in alloc_extent_buffer() for
 existing extent buffer case

alloc_extent_buffer() uses radix_tree_lookup() when radix_tree_insert()
fails with EEXIST. That part of the code is very similar to the code in
find_extent_buffer(). This patch replaces radix_tree_lookup() and
surrounding code in alloc_extent_buffer() with find_extent_buffer().

Note that radix_tree_lookup() does not need to be protected by
tree->buffer_lock. It is protected by eb->refs.

While at it, this patch
  - changes the other usage of radix_tree_lookup() in alloc_extent_buffer()
    with find_extent_buffer() to reduce redundancy.
  - removes the unused argument 'len' to find_extent_buffer().

Signed-Off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Reviewed-by: Zach Brown <zab@redhat.com>

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/disk-io.c   |  3 +--
 fs/btrfs/extent_io.c | 63 ++++++++++++++++++--------------------------
 fs/btrfs/extent_io.h |  2 +-
 3 files changed, 28 insertions(+), 40 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b0ea9f4bbc70..ebc784aba3f5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1104,8 +1104,7 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 {
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	struct extent_buffer *eb;
-	eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
-				bytenr, blocksize);
+	eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, bytenr);
 	return eb;
 }
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c10291cc4fd1..5439f24ab06b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4486,6 +4486,23 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb)
 	}
 }
 
+struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
+					 		u64 start)
+{
+	struct extent_buffer *eb;
+
+	rcu_read_lock();
+	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
+	if (eb && atomic_inc_not_zero(&eb->refs)) {
+		rcu_read_unlock();
+		mark_extent_buffer_accessed(eb);
+		return eb;
+	}
+	rcu_read_unlock();
+
+	return NULL;
+}
+
 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 					  u64 start, unsigned long len)
 {
@@ -4499,14 +4516,10 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 	int uptodate = 1;
 	int ret;
 
-	rcu_read_lock();
-	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
-	if (eb && atomic_inc_not_zero(&eb->refs)) {
-		rcu_read_unlock();
-		mark_extent_buffer_accessed(eb);
+
+	eb = find_extent_buffer(tree, start);
+	if (eb)
 		return eb;
-	}
-	rcu_read_unlock();
 
 	eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
 	if (!eb)
@@ -4565,24 +4578,17 @@ again:
 
 	spin_lock(&tree->buffer_lock);
 	ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
+	spin_unlock(&tree->buffer_lock);
+	radix_tree_preload_end();
 	if (ret == -EEXIST) {
-		exists = radix_tree_lookup(&tree->buffer,
-						start >> PAGE_CACHE_SHIFT);
-		if (!atomic_inc_not_zero(&exists->refs)) {
-			spin_unlock(&tree->buffer_lock);
-			radix_tree_preload_end();
-			exists = NULL;
+		exists = find_extent_buffer(tree, start);
+		if (exists)
+			goto free_eb;
+		else
 			goto again;
-		}
-		spin_unlock(&tree->buffer_lock);
-		radix_tree_preload_end();
-		mark_extent_buffer_accessed(exists);
-		goto free_eb;
 	}
 	/* add one reference for the tree */
 	check_buffer_tree_ref(eb);
-	spin_unlock(&tree->buffer_lock);
-	radix_tree_preload_end();
 
 	/*
 	 * there is a race where release page may have
@@ -4613,23 +4619,6 @@ free_eb:
 	return exists;
 }
 
-struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
-					 u64 start, unsigned long len)
-{
-	struct extent_buffer *eb;
-
-	rcu_read_lock();
-	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
-	if (eb && atomic_inc_not_zero(&eb->refs)) {
-		rcu_read_unlock();
-		mark_extent_buffer_accessed(eb);
-		return eb;
-	}
-	rcu_read_unlock();
-
-	return NULL;
-}
-
 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
 {
 	struct extent_buffer *eb =
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index f98602eac808..19620c58f096 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -271,7 +271,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len);
 struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
-					 u64 start, unsigned long len);
+					 u64 start);
 void free_extent_buffer(struct extent_buffer *eb);
 void free_extent_buffer_stale(struct extent_buffer *eb);
 #define WAIT_NONE	0

From efd0c4055a26e88689b3c0cf95492dfe147cba67 Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Mon, 7 Oct 2013 21:20:44 +0100
Subject: [PATCH 42/98] Btrfs: remove unnecessary key copy when logging inode

The btrfs_insert_empty_item() function doesn't modify its
key argument.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Reviewed-by: Zach Brown <zab@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/tree-log.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1134aa40b6d1..f2e0531dc719 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3170,11 +3170,10 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
 			  struct inode *inode)
 {
 	struct btrfs_inode_item *inode_item;
-	struct btrfs_key key;
 	int ret;
 
-	memcpy(&key, &BTRFS_I(inode)->location, sizeof(key));
-	ret = btrfs_insert_empty_item(trans, log, path, &key,
+	ret = btrfs_insert_empty_item(trans, log, path,
+				      &BTRFS_I(inode)->location,
 				      sizeof(*inode_item));
 	if (ret && ret != -EEXIST)
 		return ret;

From 8319bfe13642ea2d411c885400377d5fc6f32271 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Tue, 8 Oct 2013 18:19:14 +0800
Subject: [PATCH 43/98] Btrfs: cleanup dead code of defragment

@is_extent is no more needed since we don't defrag extent root.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/tree-defrag.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index cbaa73c871a9..76928ca97741 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -37,7 +37,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	int ret = 0;
 	int wret;
 	int level;
-	int is_extent = 0;
 	int next_key_ret = 0;
 	u64 last_ret = 0;
 	u64 min_trans = 0;
@@ -50,7 +49,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
-	if (root->ref_cows == 0 && !is_extent)
+	if (root->ref_cows == 0)
 		goto out;
 
 	if (btrfs_test_opt(root, SSD))

From 498456d33e2ee5150f045e604e4531b088083e7a Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Tue, 8 Oct 2013 18:19:43 +0800
Subject: [PATCH 44/98] Btrfs: kill unused code in btrfs_search_forward

After commit de78b51a2852bddccd6535e9e12de65f92787a1e
(btrfs: remove cache only arguments from defrag path), @blockptr is no more
used.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ctree.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 33e9dbdd043d..8f3d6f893585 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -4914,10 +4914,8 @@ again:
 		 * If it is too old, old, skip to the next one.
 		 */
 		while (slot < nritems) {
-			u64 blockptr;
 			u64 gen;
 
-			blockptr = btrfs_node_blockptr(cur, slot);
 			gen = btrfs_node_ptr_generation(cur, slot);
 			if (gen < min_trans) {
 				slot++;

From 03b2f08b5f96db5d07dba54ae008e9fdf1396e05 Mon Sep 17 00:00:00 2001
From: "Geyslan G. Bem" <geyslan@gmail.com>
Date: Fri, 11 Oct 2013 15:35:45 -0300
Subject: [PATCH 45/98] btrfs: Fix memory leakage in the tree-log.c

In add_inode_ref() function:

Initializes local pointers.

Reduces the logical condition with the __add_inode_ref() return
value by using only one 'goto out'.

Centralizes the exiting, ensuring the freeing of all used memory.

Signed-off-by: Geyslan G. Bem <geyslan@gmail.com>
Reviewed-by: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/tree-log.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f2e0531dc719..d12696db9ada 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1113,11 +1113,11 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 				  struct extent_buffer *eb, int slot,
 				  struct btrfs_key *key)
 {
-	struct inode *dir;
-	struct inode *inode;
+	struct inode *dir = NULL;
+	struct inode *inode = NULL;
 	unsigned long ref_ptr;
 	unsigned long ref_end;
-	char *name;
+	char *name = NULL;
 	int namelen;
 	int ret;
 	int search_done = 0;
@@ -1150,13 +1150,15 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 	 * care of the rest
 	 */
 	dir = read_one_inode(root, parent_objectid);
-	if (!dir)
-		return -ENOENT;
+	if (!dir) {
+		ret = -ENOENT;
+		goto out;
+	}
 
 	inode = read_one_inode(root, inode_objectid);
 	if (!inode) {
-		iput(dir);
-		return -EIO;
+		ret = -EIO;
+		goto out;
 	}
 
 	while (ref_ptr < ref_end) {
@@ -1169,14 +1171,16 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 			 */
 			if (!dir)
 				dir = read_one_inode(root, parent_objectid);
-			if (!dir)
-				return -ENOENT;
+			if (!dir) {
+				ret = -ENOENT;
+				goto out;
+			}
 		} else {
 			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
 					     &ref_index);
 		}
 		if (ret)
-			return ret;
+			goto out;
 
 		/* if we already have a perfect match, we're done */
 		if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
@@ -1196,12 +1200,11 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 						      parent_objectid,
 						      ref_index, name, namelen,
 						      &search_done);
-				if (ret == 1) {
-					ret = 0;
+				if (ret) {
+					if (ret == 1)
+						ret = 0;
 					goto out;
 				}
-				if (ret)
-					goto out;
 			}
 
 			/* insert our name */
@@ -1215,6 +1218,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 
 		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
 		kfree(name);
+		name = NULL;
 		if (log_ref_ver) {
 			iput(dir);
 			dir = NULL;
@@ -1225,6 +1229,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
 	ret = overwrite_item(trans, root, path, eb, slot, key);
 out:
 	btrfs_release_path(path);
+	kfree(name);
 	iput(dir);
 	iput(inode);
 	return ret;

From f747cab7b75ba298df3ac234de9f1655957786cc Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 10 Oct 2013 20:37:29 +0300
Subject: [PATCH 46/98] Btrfs: nuke a bogus rw_devices decrement in
 __btrfs_close_devices

On mount failures, __btrfs_close_devices can be called well before
dev-replace state is read and ->is_tgtdev_for_dev_replace is set.  This
leads to a bogus decrement of ->rw_devices and sets off a WARN_ON in
__btrfs_close_devices if replace target device happens to be on the
lists and we fail early in the mount sequence.  Fix this by checking
the devid instead of ->is_tgtdev_for_dev_replace before the decrement:
for replace targets devid is always equal to BTRFS_DEV_REPLACE_DEVID.

Cc: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/volumes.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c846cc8eb32c..9dfe038015f7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -666,7 +666,8 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 		if (device->bdev)
 			fs_devices->open_devices--;
 
-		if (device->writeable && !device->is_tgtdev_for_dev_replace) {
+		if (device->writeable &&
+		    device->devid != BTRFS_DEV_REPLACE_DEVID) {
 			list_del_init(&device->dev_alloc_list);
 			fs_devices->rw_devices--;
 		}

From adfa97cbdfe376b02bb3e1ea3166958fec35ca6f Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 10 Oct 2013 20:39:28 +0300
Subject: [PATCH 47/98] Btrfs: don't leak ioctl args in btrfs_ioctl_dev_replace

struct btrfs_ioctl_dev_replace_args memory is leaked if replace is
requested on a read-only filesystem.  Fix it.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ioctl.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 98d4ffef961e..08ac53c5f78d 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3667,9 +3667,10 @@ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
 
 	switch (p->cmd) {
 	case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
-		if (root->fs_info->sb->s_flags & MS_RDONLY)
-			return -EROFS;
-
+		if (root->fs_info->sb->s_flags & MS_RDONLY) {
+			ret = -EROFS;
+			goto out;
+		}
 		if (atomic_xchg(
 			&root->fs_info->mutually_exclusive_operation_running,
 			1)) {
@@ -3695,7 +3696,7 @@ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
 
 	if (copy_to_user(arg, p, sizeof(*p)))
 		ret = -EFAULT;
-
+out:
 	kfree(p);
 	return ret;
 }

From e649e587cbc66287b2a4bff8b2113ad679a2b8d8 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 10 Oct 2013 20:40:21 +0300
Subject: [PATCH 48/98] Btrfs: disallow 'btrfs {balance,replace} cancel' on ro
 mounts

For both balance and replace, cancelling involves changing the on-disk
state and committing a transaction, which is not a good thing to do on
read-only filesystems.

Cc: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/dev-replace.c | 3 +++
 fs/btrfs/volumes.c     | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 9efb94e95858..98df26181349 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -650,6 +650,9 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
 	u64 result;
 	int ret;
 
+	if (fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
+
 	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
 	btrfs_dev_replace_lock(dev_replace);
 	switch (dev_replace->replace_state) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9dfe038015f7..5d7ea267d8ff 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3424,6 +3424,9 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
 
 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
 {
+	if (fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
+
 	mutex_lock(&fs_info->balance_mutex);
 	if (!fs_info->balance_ctl) {
 		mutex_unlock(&fs_info->balance_mutex);

From 30d133fc221a0ec50030c33aa7bf6931503894c7 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Fri, 11 Oct 2013 16:30:23 +0800
Subject: [PATCH 49/98] Btrfs: fixup error path in __btrfs_inc_extent_ref

When we fail to add a reference after a non-inline insertion by some reasons,
eg. ENOSPC, we'll abort the transaction, but we don't return this error to
the caller who has to walk around again to find something wrong, that's
unnecessary.

Also fixup other error paths to keep it simple.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent-tree.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 054b11dc8edf..c4971108f126 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1979,7 +1979,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	struct btrfs_extent_item *item;
 	u64 refs;
 	int ret;
-	int err = 0;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -1992,14 +1991,9 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 					   path, bytenr, num_bytes, parent,
 					   root_objectid, owner, offset,
 					   refs_to_add, extent_op);
-	if (ret == 0)
+	if (ret != -EAGAIN)
 		goto out;
 
-	if (ret != -EAGAIN) {
-		err = ret;
-		goto out;
-	}
-
 	leaf = path->nodes[0];
 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 	refs = btrfs_extent_refs(leaf, item);
@@ -2021,7 +2015,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 		btrfs_abort_transaction(trans, root, ret);
 out:
 	btrfs_free_path(path);
-	return err;
+	return ret;
 }
 
 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,

From 27087f370174ebb298cdf6dcb36b1e4dcbe34e93 Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Fri, 11 Oct 2013 15:20:42 +0200
Subject: [PATCH 50/98] Btrfs: init device stats for new devices

Device stats are only initialized (read from tree items) on mount.
Trying to read device stats after adding or replacing new devices will
return errors.

btrfs_init_new_device() and btrfs_init_dev_replace_tgtdev() are the two
functions that allocate and initialize new btrfs_device structures after
a filesystem is mounted. They set the device stats to zero by using
kzalloc() which is correct for new devices. The only missing thing was
to declare these stats as being valid (device->dev_stats_valid = 1) and
this patch adds this missing code.

This is the reproducer:

TEST_DEV1=/dev/sdzzzzz1
TEST_DEV2=/dev/sdzzzzz2
TEST_DEV3=/dev/sdzzzzz3
TEST_MNT=/mnt
mkfs.btrfs $TEST_DEV1
mount $TEST_DEV1 $TEST_MNT
btrfs device add $TEST_DEV2 $TEST_MNT
btrfs device stat $TEST_MNT
btrfs replace start -B $TEST_DEV2 $TEST_DEV3 $TEST_MNT
btrfs device stat $TEST_MNT
umount $TEST_MNT

Reported-by: Ondrej Kunc <kunc88@gmail.com>
Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/volumes.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5d7ea267d8ff..f492f7eabf29 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2042,6 +2042,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	device->in_fs_metadata = 1;
 	device->is_tgtdev_for_dev_replace = 0;
 	device->mode = FMODE_EXCL;
+	device->dev_stats_valid = 1;
 	set_blocksize(device->bdev, 4096);
 
 	if (seeding_dev) {
@@ -2209,6 +2210,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
 	device->in_fs_metadata = 1;
 	device->is_tgtdev_for_dev_replace = 1;
 	device->mode = FMODE_EXCL;
+	device->dev_stats_valid = 1;
 	set_blocksize(device->bdev, 4096);
 	device->fs_devices = fs_info->fs_devices;
 	list_add(&device->dev_list, &fs_info->fs_devices->devices);

From 361c093d7f99c3f6cbb07d5c580ce778ab418c42 Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Fri, 11 Oct 2013 17:14:58 +0200
Subject: [PATCH 51/98] Btrfs: Wait for uuid-tree rebuild task on remount
 read-only

If the user remounts the filesystem read-only while the uuid-tree
scan and rebuild task is still running (this happens once after the
filesystem was mounted with an old kernel, or when forced with the
mount options), the remount should wait on the tasks completion
before setting the filesystem read-only. Otherwise the background
task continues to write to the filesystem which is apparently not
what users expect.

The reproducer:

TEST_DEV=/dev/sdzzzzz1
TEST_MNT=/mnt
mkfs.btrfs -f $TEST_DEV
mount $TEST_DEV $TEST_MNT
for i in `seq 50000`; do btrfs subvolume create ${TEST_MNT}/$i; done
umount $TEST_MNT
mount $TEST_DEV $TEST_MNT -o rescan_uuid_tree
sleep 1
ps -elf | fgrep '[btrfs-uuid]' | grep -v grep
mount $TEST_DEV $TEST_MNT -o ro,remount
ps -elf | fgrep '[btrfs-uuid]' | grep -v grep
sleep 1
umount $TEST_MNT

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/super.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1f62c1ccab1f..02f552f11829 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1330,6 +1330,12 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 		 * this also happens on 'umount -rf' or on shutdown, when
 		 * the filesystem is busy.
 		 */
+
+		/* wait for the uuid_scan task to finish */
+		down(&fs_info->uuid_tree_rescan_sem);
+		/* avoid complains from lockdep et al. */
+		up(&fs_info->uuid_tree_rescan_sem);
+
 		sb->s_flags |= MS_RDONLY;
 
 		btrfs_dev_replace_suspend_for_unmount(fs_info);

From 3c77bd94ecb4ad2653d1e3eb22295018533a1e21 Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Sat, 12 Oct 2013 20:32:59 +0100
Subject: [PATCH 52/98] Btrfs: don't leak delayed node on path allocation
 failure

If the path allocation failed, we would return without decrementing
the reference count in the delayed node we got before, resulting
in a leak.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/delayed-inode.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index cbd9523ad09c..df1a496bc8d4 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1174,8 +1174,10 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
 	mutex_unlock(&delayed_node->mutex);
 
 	path = btrfs_alloc_path();
-	if (!path)
+	if (!path) {
+		btrfs_release_delayed_node(delayed_node);
 		return -ENOMEM;
+	}
 	path->leave_spinning = 1;
 
 	block_rsv = trans->block_rsv;

From 5ede859b00af261c78b3848a362253c76652e035 Mon Sep 17 00:00:00 2001
From: chandan <chandan@linux.vnet.ibm.com>
Date: Mon, 14 Oct 2013 18:44:39 +0530
Subject: [PATCH 53/98] Btrfs: btrfs_add_ordered_operation: Fix last modified
 transaction comparison.

Comparison of an inode's last modified transaction with the last committed
transaction is incorrect. Fix it.

Signed-off-by: chandan <chandan@linux.vnet.ibm.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ordered-data.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index c702cb62f78a..1a36a0c3ee27 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -1076,7 +1076,7 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
 	 * if this file hasn't been changed since the last transaction
 	 * commit, we can safely return without doing anything
 	 */
-	if (last_mod < root->fs_info->last_trans_committed)
+	if (last_mod <= root->fs_info->last_trans_committed)
 		return;
 
 	spin_lock(&root->fs_info->ordered_root_lock);

From 229eed4348a482c11cf8d494392305a85ed478f9 Mon Sep 17 00:00:00 2001
From: "Geyslan G. Bem" <geyslan@gmail.com>
Date: Mon, 14 Oct 2013 12:18:25 -0300
Subject: [PATCH 54/98] btrfs: simplify kmalloc+copy_from_user to memdup_user

Use memdup_user rather than duplicating its implementation
This is a little bit restricted to reduce false positives

The semantic patch that makes this report is available
in scripts/coccinelle/api/memdup_user.cocci.

More information about semantic patching is available at
http://coccinelle.lip6.fr/

Signed-off-by: Geyslan G. Bem <geyslan@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ioctl.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 08ac53c5f78d..864aab4b1bfa 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2715,15 +2715,10 @@ static long btrfs_ioctl_file_extent_same(struct file *file,
 	size = sizeof(tmp) +
 		tmp.dest_count * sizeof(struct btrfs_ioctl_same_extent_info);
 
-	same = kmalloc(size, GFP_NOFS);
-	if (!same) {
-		ret = -EFAULT;
-		goto out;
-	}
+	same = memdup_user((struct btrfs_ioctl_same_args __user *)argp, size);
 
-	if (copy_from_user(same,
-			   (struct btrfs_ioctl_same_args __user *)argp, size)) {
-		ret = -EFAULT;
+	if (IS_ERR(same)) {
+		ret = PTR_ERR(same);
 		goto out;
 	}
 

From e93ae26fe1123e9077e79cc7af8a0d42adf4812f Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Mon, 14 Oct 2013 22:49:11 +0100
Subject: [PATCH 55/98] Btrfs: optimize tree-log.c:count_inode_refs()

Avoid repeated tree searches by processing all inode ref items in
a leaf at once instead of processing one at a time, followed by a
path release and a tree search for a key with a decremented offset.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/tree-log.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d12696db9ada..d45c04b30690 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1312,6 +1312,7 @@ static int count_inode_refs(struct btrfs_root *root,
 				break;
 			path->slots[0]--;
 		}
+process_slot:
 		btrfs_item_key_to_cpu(path->nodes[0], &key,
 				      path->slots[0]);
 		if (key.objectid != ino ||
@@ -1332,6 +1333,10 @@ static int count_inode_refs(struct btrfs_root *root,
 
 		if (key.offset == 0)
 			break;
+		if (path->slots[0] > 0) {
+			path->slots[0]--;
+			goto process_slot;
+		}
 		key.offset--;
 		btrfs_release_path(path);
 	}

From e8b0d724d596f2ac1264ad830a04ef8e415be956 Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Tue, 15 Oct 2013 00:12:27 +0100
Subject: [PATCH 56/98] Btrfs: fix btrfs_prev_leaf() previous key computation

If we decrement the key type, we must reset its offset to the largest
possible offset (u64)-1. If we decrement the key's objectid, then we
must reset the key's type and offset to their largest possible values,
(u8)-1 and (u64)-1 respectively. Not doing so can make us miss an
items in the tree.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ctree.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 8f3d6f893585..a749121203df 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -4827,14 +4827,18 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 
 	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
 
-	if (key.offset > 0)
+	if (key.offset > 0) {
 		key.offset--;
-	else if (key.type > 0)
+	} else if (key.type > 0) {
 		key.type--;
-	else if (key.objectid > 0)
+		key.offset = (u64)-1;
+	} else if (key.objectid > 0) {
 		key.objectid--;
-	else
+		key.type = (u8)-1;
+		key.offset = (u64)-1;
+	} else {
 		return 1;
+	}
 
 	btrfs_release_path(path);
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);

From ff76b0565523319d7c1c0b51d5a5a8915d33efab Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Tue, 15 Oct 2013 20:08:15 +0200
Subject: [PATCH 57/98] Btrfs: Don't allocate inode that is already in use

Due to an off-by-one error, it is possible to reproduce a bug
when the inode cache is used.

The same inode number is assigned twice, the second time this
leads to an EEXIST in btrfs_insert_empty_items().

The issue can happen when a file is removed right after a subvolume
is created and then a new inode number is created before the
inodes in free_inode_pinned are processed.
unlink() calls btrfs_return_ino() which calls start_caching() in this
case which adds [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID] by
searching for the highest inode (which already cannot find the
unlinked one anymore in btrfs_find_free_objectid()). So if this
unlinked inode's number is equal to the highest_ino + 1 (or >= this value
instead of > this value which was the off-by-one error), we mustn't add
the inode number to free_ino_pinned (caching_thread() does it right).
In this case we need to try directly to add the number to the inode_cache
which will fail in this case.

When this inode number is allocated while it is still in free_ino_pinned,
it is allocated and still added to the free inode cache when the
pinned inodes are processed, thus one of the following inode number
allocations will get an inode that is already in use and fail with EEXIST
in btrfs_insert_empty_items().

One example which was created with the reproducer below:
Create a snapshot, work in the newly created snapshot for the rest.
In unlink(inode 34284) call btrfs_return_ino() which calls start_caching().
start_caching() calls add_free_space [34284, 18446744073709517077].
In btrfs_return_ino(), call start_caching pinned [34284, 1] which is wrong.
mkdir() call btrfs_find_ino_for_alloc() which returns the number 34284.
btrfs_unpin_free_ino calls add_free_space [34284, 1].
mkdir() call btrfs_find_ino_for_alloc() which returns the number 34284.
EEXIST when the new inode is inserted.

One possible reproducer is this one:
 #!/bin/sh
 # preparation
TEST_DEV=/dev/sdc1
TEST_MNT=/mnt
umount ${TEST_MNT} 2>/dev/null || true
mkfs.btrfs -f ${TEST_DEV}
mount ${TEST_DEV} ${TEST_MNT} -o \
 rw,relatime,compress=lzo,space_cache,inode_cache
btrfs subv create ${TEST_MNT}/s1
for i in `seq 34027`; do touch ${TEST_MNT}/s1/${i}; done
btrfs subv snap ${TEST_MNT}/s1 ${TEST_MNT}/s2
FILENAME=`find ${TEST_MNT}/s1/ -inum 4085 | sed 's|^.*/\([^/]*\)$|\1|'`
rm ${TEST_MNT}/s2/$FILENAME
touch ${TEST_MNT}/s2/$FILENAME
 # the following steps can be repeated to reproduce the issue again and again
[ -e ${TEST_MNT}/s3 ] && btrfs subv del ${TEST_MNT}/s3
btrfs subv snap ${TEST_MNT}/s2 ${TEST_MNT}/s3
rm ${TEST_MNT}/s3/$FILENAME
touch ${TEST_MNT}/s3/$FILENAME
ls -alFi ${TEST_MNT}/s?/$FILENAME
touch ${TEST_MNT}/s3/_1 || logger FAILED
ls -alFi ${TEST_MNT}/s?/_1
touch ${TEST_MNT}/s3/_2 || logger FAILED
ls -alFi ${TEST_MNT}/s?/_2
touch ${TEST_MNT}/s3/__1 || logger FAILED
ls -alFi ${TEST_MNT}/s?/__1
touch ${TEST_MNT}/s3/__2 || logger FAILED
ls -alFi ${TEST_MNT}/s?/__2
 # if the above is not enough, add the following loop:
for i in `seq 3 9`; do touch ${TEST_MNT}/s3/__${i} || logger FAILED; done
 #for i in `seq 3 34027`; do touch ${TEST_MNT}/s3/__${i} || logger FAILED; done
 # one of the touch(1) calls in s3 fail due to EEXIST because the inode is
 # already in use that btrfs_find_ino_for_alloc() returns.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
Reviewed-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/inode-map.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 014de497e888..ec08004a6a20 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -237,7 +237,7 @@ again:
 		start_caching(root);
 
 		if (objectid <= root->cache_progress ||
-		    objectid > root->highest_objectid)
+		    objectid >= root->highest_objectid)
 			__btrfs_add_free_space(ctl, objectid, 1);
 		else
 			__btrfs_add_free_space(pinned, objectid, 1);

From 8185554d3eb09d23a805456b6fa98dcbb34aa518 Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Tue, 15 Oct 2013 18:44:00 +0100
Subject: [PATCH 58/98] Btrfs: fix incorrect inode acl reset

When a directory has a default ACL and a subdirectory is created
under that directory, btrfs_init_acl() is called when the
subdirectory's inode is created to initialize the inode's ACL
(inherited from the parent directory) but it was clearing the ACL
from the inode after setting it if posix_acl_create() returned
success, instead of clearing it only if it returned an error.

To reproduce this issue:

$ mkfs.btrfs -f /dev/loop0
$ mount /dev/loop0 /mnt
$ mkdir /mnt/acl
$ setfacl -d --set u::rwx,g::rwx,o::- /mnt/acl
$ getfacl /mnt/acl
user::rwx
group::rwx
other::r-x
default:user::rwx
default:group::rwx
default:other::---

$ mkdir /mnt/acl/dir1
$ getfacl /mnt/acl/dir1
user::rwx
group::rwx
other::---

After unmounting and mounting again the filesystem, fgetacl returned the
expected ACL:

$ umount /mnt/acl
$ mount /dev/loop0 /mnt
$ getfacl /mnt/acl/dir1
user::rwx
group::rwx
other::---
default:user::rwx
default:group::rwx
default:other::---

Meaning that the underlying xattr was persisted.

Reported-by: Giuseppe Fierro <giuseppe@fierro.org>
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/acl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index e15d2b0d8d3b..0890c83643e9 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -229,7 +229,7 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans,
 		if (ret > 0) {
 			/* we need an acl */
 			ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS);
-		} else {
+		} else if (ret < 0) {
 			cache_no_acl(inode);
 		}
 	} else {

From 4546bcaeba435c1d0b7f38c011cbb2367497ca8d Mon Sep 17 00:00:00 2001
From: Zach Brown <zab@redhat.com>
Date: Wed, 16 Oct 2013 12:10:32 -0700
Subject: [PATCH 59/98] btrfs: use get_seconds() instead of btrfs wrapper

Signed-off-by: Zach Brown <zab@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/dev-replace.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 98df26181349..a36343a2dd68 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -38,7 +38,6 @@
 #include "rcu-string.h"
 #include "dev-replace.h"
 
-static u64 btrfs_get_seconds_since_1970(void);
 static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 				       int scrub_ret);
 static void btrfs_dev_replace_update_device_in_mapping_tree(
@@ -296,13 +295,6 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
 		dev_replace->cursor_left_last_write_of_item;
 }
 
-static u64 btrfs_get_seconds_since_1970(void)
-{
-	struct timespec t = CURRENT_TIME_SEC;
-
-	return t.tv_sec;
-}
-
 int btrfs_dev_replace_start(struct btrfs_root *root,
 			    struct btrfs_ioctl_dev_replace_args *args)
 {
@@ -390,7 +382,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 	 * go to the tgtdev as well (refer to btrfs_map_block()).
 	 */
 	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
-	dev_replace->time_started = btrfs_get_seconds_since_1970();
+	dev_replace->time_started = get_seconds();
 	dev_replace->cursor_left = 0;
 	dev_replace->committed_cursor_left = 0;
 	dev_replace->cursor_left_last_write_of_item = 0;
@@ -493,7 +485,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 			  : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
 	dev_replace->tgtdev = NULL;
 	dev_replace->srcdev = NULL;
-	dev_replace->time_stopped = btrfs_get_seconds_since_1970();
+	dev_replace->time_stopped = get_seconds();
 	dev_replace->item_needs_writeback = 1;
 
 	if (scrub_ret) {
@@ -671,7 +663,7 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
 		break;
 	}
 	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
-	dev_replace->time_stopped = btrfs_get_seconds_since_1970();
+	dev_replace->time_stopped = get_seconds();
 	dev_replace->item_needs_writeback = 1;
 	btrfs_dev_replace_unlock(dev_replace);
 	btrfs_scrub_cancel(fs_info);
@@ -706,7 +698,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 		dev_replace->replace_state =
 			BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
-		dev_replace->time_stopped = btrfs_get_seconds_since_1970();
+		dev_replace->time_stopped = get_seconds();
 		dev_replace->item_needs_writeback = 1;
 		pr_info("btrfs: suspending dev_replace for unmount\n");
 		break;

From 1877e1a747db33d7264d8046e96373962da72a3d Mon Sep 17 00:00:00 2001
From: Zach Brown <zab@redhat.com>
Date: Wed, 16 Oct 2013 12:10:33 -0700
Subject: [PATCH 60/98] btrfs: remove move_pages()

move_pages() has an inefficient backwards byte copy of regions of two
different pages.  They're different pages so the regions won't overlap
and it could use memcpy().

At that point, though, move_pages() would be a slightly dimmer
re-implementation of copy_pages() that lacked the test for overlapping
page regions.

So remove move_pages() and just call copy_pages().

Signed-off-by: Zach Brown <zab@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent_io.c | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 5439f24ab06b..5bf98d2ee70f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -5082,23 +5082,6 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
 	}
 }
 
-static void move_pages(struct page *dst_page, struct page *src_page,
-		       unsigned long dst_off, unsigned long src_off,
-		       unsigned long len)
-{
-	char *dst_kaddr = page_address(dst_page);
-	if (dst_page == src_page) {
-		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
-	} else {
-		char *src_kaddr = page_address(src_page);
-		char *p = dst_kaddr + dst_off + len;
-		char *s = src_kaddr + src_off + len;
-
-		while (len--)
-			*--p = *--s;
-	}
-}
-
 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
 {
 	unsigned long distance = (src > dst) ? src - dst : dst - src;
@@ -5209,7 +5192,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 
 		cur = min_t(unsigned long, len, src_off_in_page + 1);
 		cur = min(cur, dst_off_in_page + 1);
-		move_pages(extent_buffer_page(dst, dst_i),
+		copy_pages(extent_buffer_page(dst, dst_i),
 			   extent_buffer_page(dst, src_i),
 			   dst_off_in_page - cur + 1,
 			   src_off_in_page - cur + 1, cur);

From 8b558c5f097b636209b654f4d7775ac96054d6e3 Mon Sep 17 00:00:00 2001
From: Zach Brown <zab@redhat.com>
Date: Wed, 16 Oct 2013 12:10:34 -0700
Subject: [PATCH 61/98] btrfs: remove fs/btrfs/compat.h

fs/btrfs/compat.h only contained trivial macro wrappers of drop_nlink()
and inc_nlink().  This doesn't belong in mainline.

Signed-off-by: Zach Brown <zab@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/compat.h      | 7 -------
 fs/btrfs/compression.c | 1 -
 fs/btrfs/dev-replace.c | 1 -
 fs/btrfs/disk-io.c     | 1 -
 fs/btrfs/export.c      | 1 -
 fs/btrfs/extent-tree.c | 1 -
 fs/btrfs/extent_io.c   | 1 -
 fs/btrfs/file.c        | 1 -
 fs/btrfs/inode.c       | 5 ++---
 fs/btrfs/ioctl.c       | 1 -
 fs/btrfs/raid56.c      | 1 -
 fs/btrfs/super.c       | 1 -
 fs/btrfs/tree-log.c    | 9 ++++-----
 fs/btrfs/volumes.c     | 1 -
 14 files changed, 6 insertions(+), 26 deletions(-)
 delete mode 100644 fs/btrfs/compat.h

diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
deleted file mode 100644
index 7c4503ef6efd..000000000000
--- a/fs/btrfs/compat.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _COMPAT_H_
-#define _COMPAT_H_
-
-#define btrfs_drop_nlink(inode) drop_nlink(inode)
-#define btrfs_inc_nlink(inode)	inc_nlink(inode)
-
-#endif /* _COMPAT_H_ */
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 6aad98cb343f..df019c7f58b2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -32,7 +32,6 @@
 #include <linux/writeback.h>
 #include <linux/bit_spinlock.h>
 #include <linux/slab.h>
-#include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index a36343a2dd68..cb94310db072 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -26,7 +26,6 @@
 #include <linux/kthread.h>
 #include <linux/math64.h>
 #include <asm/div64.h>
-#include "compat.h"
 #include "ctree.h"
 #include "extent_map.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ebc784aba3f5..62c4aba221bb 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -33,7 +33,6 @@
 #include <linux/uuid.h>
 #include <linux/semaphore.h>
 #include <asm/unaligned.h>
-#include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 4b8691607373..41422a3de8ed 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -5,7 +5,6 @@
 #include "btrfs_inode.h"
 #include "print-tree.h"
 #include "export.h"
-#include "compat.h"
 
 #define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \
 						 parent_objectid) / 4)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c4971108f126..4062a65f9fb7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -25,7 +25,6 @@
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
 #include <linux/percpu_counter.h>
-#include "compat.h"
 #include "hash.h"
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 5bf98d2ee70f..a93bab452183 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -13,7 +13,6 @@
 #include <linux/cleancache.h>
 #include "extent_io.h"
 #include "extent_map.h"
-#include "compat.h"
 #include "ctree.h"
 #include "btrfs_inode.h"
 #include "volumes.h"
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index bf3465cf455a..14b41d569a3e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -39,7 +39,6 @@
 #include "print-tree.h"
 #include "tree-log.h"
 #include "locking.h"
-#include "compat.h"
 #include "volumes.h"
 
 static struct kmem_cache *btrfs_inode_defrag_cachep;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9d39d08a1555..14c6ab74cf8c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -43,7 +43,6 @@
 #include <linux/btrfs.h>
 #include <linux/blkdev.h>
 #include <linux/posix_acl_xattr.h>
-#include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -3645,7 +3644,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 	int ret;
 	ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
 	if (!ret) {
-		btrfs_drop_nlink(inode);
+		drop_nlink(inode);
 		ret = btrfs_update_inode(trans, root, inode);
 	}
 	return ret;
@@ -5738,7 +5737,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 		goto fail;
 	}
 
-	btrfs_inc_nlink(inode);
+	inc_nlink(inode);
 	inode_inc_iversion(inode);
 	inode->i_ctime = CURRENT_TIME;
 	ihold(inode);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 864aab4b1bfa..9ea05506cb84 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -44,7 +44,6 @@
 #include <linux/uuid.h>
 #include <linux/btrfs.h>
 #include <linux/uaccess.h>
-#include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index d0ecfbd9cc9f..24ac21840a9a 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -33,7 +33,6 @@
 #include <linux/raid/xor.h>
 #include <linux/vmalloc.h>
 #include <asm/div64.h>
-#include "compat.h"
 #include "ctree.h"
 #include "extent_map.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 02f552f11829..c9079db7441e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -42,7 +42,6 @@
 #include <linux/cleancache.h>
 #include <linux/ratelimit.h>
 #include <linux/btrfs.h>
-#include "compat.h"
 #include "delayed-inode.h"
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d45c04b30690..f98002ed32b1 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -26,7 +26,6 @@
 #include "locking.h"
 #include "print-tree.h"
 #include "backref.h"
-#include "compat.h"
 #include "tree-log.h"
 #include "hash.h"
 
@@ -936,7 +935,7 @@ again:
 					    parent_objectid,
 					    victim_name,
 					    victim_name_len)) {
-				btrfs_inc_nlink(inode);
+				inc_nlink(inode);
 				btrfs_release_path(path);
 
 				ret = btrfs_unlink_inode(trans, root, dir,
@@ -1006,7 +1005,7 @@ again:
 				victim_parent = read_one_inode(root,
 							       parent_objectid);
 				if (victim_parent) {
-					btrfs_inc_nlink(inode);
+					inc_nlink(inode);
 					btrfs_release_path(path);
 
 					ret = btrfs_unlink_inode(trans, root,
@@ -1490,7 +1489,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 		if (!inode->i_nlink)
 			set_nlink(inode, 1);
 		else
-			btrfs_inc_nlink(inode);
+			inc_nlink(inode);
 		ret = btrfs_update_inode(trans, root, inode);
 	} else if (ret == -EEXIST) {
 		ret = 0;
@@ -1851,7 +1850,7 @@ again:
 				goto out;
 			}
 
-			btrfs_inc_nlink(inode);
+			inc_nlink(inode);
 			ret = btrfs_unlink_inode(trans, root, dir, inode,
 						 name, name_len);
 			if (!ret)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f492f7eabf29..716abee16a5b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -28,7 +28,6 @@
 #include <linux/raid/pq.h>
 #include <linux/semaphore.h>
 #include <asm/div64.h>
-#include "compat.h"
 #include "ctree.h"
 #include "extent_map.h"
 #include "disk-io.h"

From cab45e22da48b523665b3566f43527a2fead1d1e Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Wed, 16 Oct 2013 16:27:01 -0400
Subject: [PATCH 62/98] btrfs: add tracing for failed reservations

When debugging ENOSPC issues, it's nice to be able to see which
reservations failed as well as the ones which succeeded.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent-tree.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4062a65f9fb7..a807acc9a63e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3684,6 +3684,9 @@ commit_trans:
 			goto again;
 		}
 
+		trace_btrfs_space_reservation(root->fs_info,
+					      "space_info:enospc",
+					      data_sinfo->flags, bytes, 1);
 		return -ENOSPC;
 	}
 	data_sinfo->bytes_may_use += bytes;
@@ -4335,6 +4338,10 @@ out:
 		    !block_rsv_use_bytes(global_rsv, orig_bytes))
 			ret = 0;
 	}
+	if (ret == -ENOSPC)
+		trace_btrfs_space_reservation(root->fs_info,
+					      "space_info:enospc",
+					      space_info->flags, orig_bytes, 1);
 	if (flushing) {
 		spin_lock(&space_info->lock);
 		space_info->flush = 0;

From 55994887082d6e9b3d31b55fc5be158fd97eadbc Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Fri, 18 Oct 2013 15:42:56 +0100
Subject: [PATCH 63/98] Btrfs: optimize extent item search in
 run_delayed_extent_op

Instead of doing another extent tree search if the first search failed
to find a metadata item, check if the previous item in the leaf is an
extent item and use it if it is, otherwise do the second tree search
for an extent item.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent-tree.c | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a807acc9a63e..4595a65db729 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2130,15 +2130,28 @@ again:
 	}
 	if (ret > 0) {
 		if (metadata) {
-			btrfs_release_path(path);
-			metadata = 0;
+			if (path->slots[0] > 0) {
+				path->slots[0]--;
+				btrfs_item_key_to_cpu(path->nodes[0], &key,
+						      path->slots[0]);
+				if (key.objectid == node->bytenr &&
+				    key.type == BTRFS_EXTENT_ITEM_KEY &&
+				    key.offset == node->num_bytes)
+					ret = 0;
+			}
+			if (ret > 0) {
+				btrfs_release_path(path);
+				metadata = 0;
 
-			key.offset = node->num_bytes;
-			key.type = BTRFS_EXTENT_ITEM_KEY;
-			goto again;
+				key.objectid = node->bytenr;
+				key.offset = node->num_bytes;
+				key.type = BTRFS_EXTENT_ITEM_KEY;
+				goto again;
+			}
+		} else {
+			err = -EIO;
+			goto out;
 		}
-		err = -EIO;
-		goto out;
 	}
 
 	leaf = path->nodes[0];

From a5f519c91df8d81b3e6ed3489806ba66f6b82ca7 Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Mon, 21 Oct 2013 14:18:17 +0200
Subject: [PATCH 64/98] Btrfs: fix check_int 'leaf item out of bounce'
 regression

Yet another cleanup patch broke code for which no xfstest exists.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/check-integrity.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 1c47be187240..3281f7ebc0f7 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1038,7 +1038,7 @@ leaf_item_out_of_bounce_error:
 						     disk_item_offset,
 						     sizeof(struct btrfs_item));
 			item_offset = btrfs_stack_item_offset(&disk_item);
-			item_size = btrfs_stack_item_offset(&disk_item);
+			item_size = btrfs_stack_item_size(&disk_item);
 			disk_key = &disk_item.key;
 			type = btrfs_disk_key_type(disk_key);
 

From 301993a4a13b34c13f7f2a7ed3429d231a6d6166 Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Mon, 21 Oct 2013 18:46:58 +0200
Subject: [PATCH 65/98] Btrfs: check_int, remove warning for mixed-mode

In mixed-mode, when a data-block was later reused for metadata, a
warning was printed. This condition is now filtered out and the
warning is eliminated in this case.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/check-integrity.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 3281f7ebc0f7..656b07637e39 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1900,7 +1900,9 @@ again:
 							       dev_state,
 							       dev_bytenr);
 			}
-			if (block->logical_bytenr != bytenr) {
+			if (block->logical_bytenr != bytenr &&
+			    !(!block->is_metadata &&
+			      block->logical_bytenr == 0))
 				printk(KERN_INFO
 				       "Written block @%llu (%s/%llu/%d)"
 				       " found in hash table, %c,"
@@ -1910,15 +1912,14 @@ again:
 				       block->mirror_num,
 				       btrfsic_get_block_type(state, block),
 				       block->logical_bytenr);
-				block->logical_bytenr = bytenr;
-			} else if (state->print_mask &
-				   BTRFSIC_PRINT_MASK_VERBOSE)
+			else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
 				printk(KERN_INFO
 				       "Written block @%llu (%s/%llu/%d)"
 				       " found in hash table, %c.\n",
 				       bytenr, dev_state->name, dev_bytenr,
 				       block->mirror_num,
 				       btrfsic_get_block_type(state, block));
+			block->logical_bytenr = bytenr;
 		} else {
 			if (num_pages * PAGE_CACHE_SIZE <
 			    state->datablock_size) {

From ed2590953bd06b892f0411fc94e19175d32f197a Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 25 Oct 2013 11:36:01 -0400
Subject: [PATCH 66/98] Btrfs: stop using vfs_read in send

Apparently we don't actually close the files until we return to userspace, so
stop using vfs_read in send.  This is actually better for us since we can avoid
all the extra logic of holding the file we're sending open and making sure to
clean it up.  This will fix people who have been hitting too many files open
errors when trying to send.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/send.c | 175 ++++++++++++++++++++----------------------------
 1 file changed, 72 insertions(+), 103 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 0a894399be16..e26a3a62fd3f 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -121,7 +121,6 @@ struct send_ctx {
 	struct list_head name_cache_list;
 	int name_cache_size;
 
-	struct file *cur_inode_filp;
 	char *read_buf;
 };
 
@@ -2119,77 +2118,6 @@ out:
 	return ret;
 }
 
-/*
- * Called for regular files when sending extents data. Opens a struct file
- * to read from the file.
- */
-static int open_cur_inode_file(struct send_ctx *sctx)
-{
-	int ret = 0;
-	struct btrfs_key key;
-	struct path path;
-	struct inode *inode;
-	struct dentry *dentry;
-	struct file *filp;
-	int new = 0;
-
-	if (sctx->cur_inode_filp)
-		goto out;
-
-	key.objectid = sctx->cur_ino;
-	key.type = BTRFS_INODE_ITEM_KEY;
-	key.offset = 0;
-
-	inode = btrfs_iget(sctx->send_root->fs_info->sb, &key, sctx->send_root,
-			&new);
-	if (IS_ERR(inode)) {
-		ret = PTR_ERR(inode);
-		goto out;
-	}
-
-	dentry = d_obtain_alias(inode);
-	inode = NULL;
-	if (IS_ERR(dentry)) {
-		ret = PTR_ERR(dentry);
-		goto out;
-	}
-
-	path.mnt = sctx->mnt;
-	path.dentry = dentry;
-	filp = dentry_open(&path, O_RDONLY | O_LARGEFILE, current_cred());
-	dput(dentry);
-	dentry = NULL;
-	if (IS_ERR(filp)) {
-		ret = PTR_ERR(filp);
-		goto out;
-	}
-	sctx->cur_inode_filp = filp;
-
-out:
-	/*
-	 * no xxxput required here as every vfs op
-	 * does it by itself on failure
-	 */
-	return ret;
-}
-
-/*
- * Closes the struct file that was created in open_cur_inode_file
- */
-static int close_cur_inode_file(struct send_ctx *sctx)
-{
-	int ret = 0;
-
-	if (!sctx->cur_inode_filp)
-		goto out;
-
-	ret = filp_close(sctx->cur_inode_filp, NULL);
-	sctx->cur_inode_filp = NULL;
-
-out:
-	return ret;
-}
-
 /*
  * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace
  */
@@ -3622,6 +3550,72 @@ out:
 	return ret;
 }
 
+static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
+{
+	struct btrfs_root *root = sctx->send_root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct inode *inode;
+	struct page *page;
+	char *addr;
+	struct btrfs_key key;
+	pgoff_t index = offset >> PAGE_CACHE_SHIFT;
+	pgoff_t last_index;
+	unsigned pg_offset = offset & ~PAGE_CACHE_MASK;
+	ssize_t ret = 0;
+
+	key.objectid = sctx->cur_ino;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+
+	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	if (offset + len > i_size_read(inode)) {
+		if (offset > i_size_read(inode))
+			len = 0;
+		else
+			len = offset - i_size_read(inode);
+	}
+	if (len == 0)
+		goto out;
+
+	last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+	while (index <= last_index) {
+		unsigned cur_len = min_t(unsigned, len,
+					 PAGE_CACHE_SIZE - pg_offset);
+		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+		if (!page) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		if (!PageUptodate(page)) {
+			btrfs_readpage(NULL, page);
+			lock_page(page);
+			if (!PageUptodate(page)) {
+				unlock_page(page);
+				page_cache_release(page);
+				ret = -EIO;
+				break;
+			}
+		}
+
+		addr = kmap(page);
+		memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len);
+		kunmap(page);
+		unlock_page(page);
+		page_cache_release(page);
+		index++;
+		pg_offset = 0;
+		len -= cur_len;
+		ret += cur_len;
+	}
+out:
+	iput(inode);
+	return ret;
+}
+
 /*
  * Read some bytes from the current inode/file and send a write command to
  * user space.
@@ -3630,35 +3624,20 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
 {
 	int ret = 0;
 	struct fs_path *p;
-	loff_t pos = offset;
-	int num_read = 0;
-	mm_segment_t old_fs;
+	ssize_t num_read = 0;
 
 	p = fs_path_alloc();
 	if (!p)
 		return -ENOMEM;
 
-	/*
-	 * vfs normally only accepts user space buffers for security reasons.
-	 * we only read from the file and also only provide the read_buf buffer
-	 * to vfs. As this buffer does not come from a user space call, it's
-	 * ok to temporary allow kernel space buffers.
-	 */
-	old_fs = get_fs();
-	set_fs(KERNEL_DS);
-
 verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
 
-	ret = open_cur_inode_file(sctx);
-	if (ret < 0)
-		goto out;
-
-	ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos);
-	if (ret < 0)
-		goto out;
-	num_read = ret;
-	if (!num_read)
+	num_read = fill_read_buf(sctx, offset, len);
+	if (num_read <= 0) {
+		if (num_read < 0)
+			ret = num_read;
 		goto out;
+	}
 
 	ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
 	if (ret < 0)
@@ -3677,7 +3656,6 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
 tlv_put_failure:
 out:
 	fs_path_free(p);
-	set_fs(old_fs);
 	if (ret < 0)
 		return ret;
 	return num_read;
@@ -4222,10 +4200,6 @@ static int changed_inode(struct send_ctx *sctx,
 	u64 left_gen = 0;
 	u64 right_gen = 0;
 
-	ret = close_cur_inode_file(sctx);
-	if (ret < 0)
-		goto out;
-
 	sctx->cur_ino = key->objectid;
 	sctx->cur_inode_new_gen = 0;
 
@@ -4686,11 +4660,6 @@ static int send_subvol(struct send_ctx *sctx)
 	}
 
 out:
-	if (!ret)
-		ret = close_cur_inode_file(sctx);
-	else
-		close_cur_inode_file(sctx);
-
 	free_recorded_refs(sctx);
 	return ret;
 }

From 0ef8b726075aa6931ddf1c16f5bae043eef184f9 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 25 Oct 2013 16:13:35 -0400
Subject: [PATCH 67/98] Btrfs: return an error from btrfs_wait_ordered_range

I noticed that if the free space cache has an error writing out it's data it
won't actually error out, it will just carry on.  This is because it doesn't
check the return value of btrfs_wait_ordered_range, which didn't actually return
anything.  So fix this in order to keep us from making free space cache look
valid when it really isnt.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/file.c             | 66 ++++++++++++++++++++++---------------
 fs/btrfs/free-space-cache.c |  9 +++--
 fs/btrfs/inode.c            |  9 +++--
 fs/btrfs/ordered-data.c     | 26 ++++++++++-----
 fs/btrfs/ordered-data.h     |  2 +-
 fs/btrfs/relocation.c       |  7 +++-
 6 files changed, 78 insertions(+), 41 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 14b41d569a3e..3a20a12513b2 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1280,6 +1280,7 @@ again:
 		}
 		wait_on_page_writeback(pages[i]);
 	}
+	faili = num_pages - 1;
 	err = 0;
 	if (start_pos < inode->i_size) {
 		struct btrfs_ordered_extent *ordered;
@@ -1298,8 +1299,10 @@ again:
 				unlock_page(pages[i]);
 				page_cache_release(pages[i]);
 			}
-			btrfs_wait_ordered_range(inode, start_pos,
-						 last_pos - start_pos);
+			err = btrfs_wait_ordered_range(inode, start_pos,
+						       last_pos - start_pos);
+			if (err)
+				goto fail;
 			goto again;
 		}
 		if (ordered)
@@ -1808,8 +1811,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	atomic_inc(&root->log_batch);
 	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
 			     &BTRFS_I(inode)->runtime_flags);
-	if (full_sync)
-		btrfs_wait_ordered_range(inode, start, end - start + 1);
+	if (full_sync) {
+		ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
+		if (ret) {
+			mutex_unlock(&inode->i_mutex);
+			goto out;
+		}
+	}
 	atomic_inc(&root->log_batch);
 
 	/*
@@ -1875,27 +1883,20 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	mutex_unlock(&inode->i_mutex);
 
 	if (ret != BTRFS_NO_LOG_SYNC) {
-		if (ret > 0) {
-			/*
-			 * If we didn't already wait for ordered extents we need
-			 * to do that now.
-			 */
-			if (!full_sync)
-				btrfs_wait_ordered_range(inode, start,
-							 end - start + 1);
-			ret = btrfs_commit_transaction(trans, root);
-		} else {
+		if (!ret) {
 			ret = btrfs_sync_log(trans, root);
-			if (ret == 0) {
+			if (!ret) {
 				ret = btrfs_end_transaction(trans, root);
-			} else {
-				if (!full_sync)
-					btrfs_wait_ordered_range(inode, start,
-								 end -
-								 start + 1);
-				ret = btrfs_commit_transaction(trans, root);
+				goto out;
 			}
 		}
+		if (!full_sync) {
+			ret = btrfs_wait_ordered_range(inode, start,
+						       end - start + 1);
+			if (ret)
+				goto out;
+		}
+		ret = btrfs_commit_transaction(trans, root);
 	} else {
 		ret = btrfs_end_transaction(trans, root);
 	}
@@ -2066,7 +2067,9 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
 			  ((offset + len - 1) >> PAGE_CACHE_SHIFT));
 
-	btrfs_wait_ordered_range(inode, offset, len);
+	ret = btrfs_wait_ordered_range(inode, offset, len);
+	if (ret)
+		return ret;
 
 	mutex_lock(&inode->i_mutex);
 	/*
@@ -2135,8 +2138,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 			btrfs_put_ordered_extent(ordered);
 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
 				     lockend, &cached_state, GFP_NOFS);
-		btrfs_wait_ordered_range(inode, lockstart,
-					 lockend - lockstart + 1);
+		ret = btrfs_wait_ordered_range(inode, lockstart,
+					       lockend - lockstart + 1);
+		if (ret) {
+			mutex_unlock(&inode->i_mutex);
+			return ret;
+		}
 	}
 
 	path = btrfs_alloc_path();
@@ -2307,7 +2314,10 @@ static long btrfs_fallocate(struct file *file, int mode,
 	 * wait for ordered IO before we have any locks.  We'll loop again
 	 * below with the locks held.
 	 */
-	btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+	ret = btrfs_wait_ordered_range(inode, alloc_start,
+				       alloc_end - alloc_start);
+	if (ret)
+		goto out;
 
 	locked_end = alloc_end - 1;
 	while (1) {
@@ -2331,8 +2341,10 @@ static long btrfs_fallocate(struct file *file, int mode,
 			 * we can't wait on the range with the transaction
 			 * running or with the extent lock held
 			 */
-			btrfs_wait_ordered_range(inode, alloc_start,
-						 alloc_end - alloc_start);
+			ret = btrfs_wait_ordered_range(inode, alloc_start,
+						       alloc_end - alloc_start);
+			if (ret)
+				goto out;
 		} else {
 			if (ordered)
 				btrfs_put_ordered_extent(ordered);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 4772f3a8e7e5..d7c445c30a16 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1008,8 +1008,13 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	if (ret)
 		goto out;
 
-
-	btrfs_wait_ordered_range(inode, 0, (u64)-1);
+	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
+	if (ret) {
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
+				 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
+				 GFP_NOFS);
+		goto out;
+	}
 
 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
 	key.offset = offset;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 14c6ab74cf8c..f1fbf903bf9b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7236,7 +7236,9 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 	 * outstanding dirty pages are on disk.
 	 */
 	count = iov_length(iov, nr_segs);
-	btrfs_wait_ordered_range(inode, offset, count);
+	ret = btrfs_wait_ordered_range(inode, offset, count);
+	if (ret)
+		return ret;
 
 	if (rw & WRITE) {
 		/*
@@ -7577,7 +7579,10 @@ static int btrfs_truncate(struct inode *inode)
 	u64 mask = root->sectorsize - 1;
 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
 
-	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
+	ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
+				       (u64)-1);
+	if (ret)
+		return ret;
 
 	/*
 	 * Yes ladies and gentelment, this is indeed ugly.  The fact is we have
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 1a36a0c3ee27..bbb1a3864686 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -734,8 +734,9 @@ void btrfs_start_ordered_extent(struct inode *inode,
 /*
  * Used to wait on ordered extents across a large range of bytes.
  */
-void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
+int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 {
+	int ret = 0;
 	u64 end;
 	u64 orig_end;
 	struct btrfs_ordered_extent *ordered;
@@ -751,8 +752,9 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 	/* start IO across the range first to instantiate any delalloc
 	 * extents
 	 */
-	filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
-
+	ret = filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
+	if (ret)
+		return ret;
 	/*
 	 * So with compression we will find and lock a dirty page and clear the
 	 * first one as dirty, setup an async extent, and immediately return
@@ -768,10 +770,15 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 	 * right and you are wrong.
 	 */
 	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-		     &BTRFS_I(inode)->runtime_flags))
-		filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
-
-	filemap_fdatawait_range(inode->i_mapping, start, orig_end);
+		     &BTRFS_I(inode)->runtime_flags)) {
+		ret = filemap_fdatawrite_range(inode->i_mapping, start,
+					       orig_end);
+		if (ret)
+			return ret;
+	}
+	ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
+	if (ret)
+		return ret;
 
 	end = orig_end;
 	while (1) {
@@ -788,11 +795,14 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 		}
 		btrfs_start_ordered_extent(inode, ordered, 1);
 		end = ordered->file_offset;
+		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
+			ret = -EIO;
 		btrfs_put_ordered_extent(ordered);
-		if (end == 0 || end == start)
+		if (ret || end == 0 || end == start)
 			break;
 		end--;
 	}
+	return ret;
 }
 
 /*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 0c0b35612d7a..3982db1c1b36 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -180,7 +180,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
 							 u64 file_offset);
 void btrfs_start_ordered_extent(struct inode *inode,
 				struct btrfs_ordered_extent *entry, int wait);
-void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
+int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
 struct btrfs_ordered_extent *
 btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
 struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index a945374993f7..729c91e151a3 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4257,7 +4257,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 			rc->extents_found);
 
 		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
-			btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1);
+			ret = btrfs_wait_ordered_range(rc->data_inode, 0,
+						       (u64)-1);
+			if (ret) {
+				err = ret;
+				goto out;
+			}
 			invalidate_mapping_pages(rc->data_inode->i_mapping,
 						 0, -1);
 			rc->stage = UPDATE_DATA_PTRS;

From 02ecd2c278471a3a7fd255662df92861c6759ebf Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 25 Oct 2013 16:19:08 -0400
Subject: [PATCH 68/98] Btrfs: do not bug_on if we try to cow a free space
 cache inode

We can just return an error and we'll bail out properly.  We still want to catch
this case to make sure we don't have a bug somewhere, so just warn if this pops
up.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/inode.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f1fbf903bf9b..a44ca6a4feff 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -843,7 +843,10 @@ static noinline int cow_file_range(struct inode *inode,
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	int ret = 0;
 
-	BUG_ON(btrfs_is_free_space_inode(inode));
+	if (btrfs_is_free_space_inode(inode)) {
+		WARN_ON_ONCE(1);
+		return -EINVAL;
+	}
 
 	num_bytes = ALIGN(end - start + 1, blocksize);
 	num_bytes = max(blocksize,  num_bytes);

From d788a34929a90f70b945c100cfe9efd4d49fb446 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Fri, 25 Oct 2013 16:55:08 -0400
Subject: [PATCH 69/98] Btrfs: don't abort transaction in run_delalloc_nocow

This is just the write path, the only reason we start a transaction is so we can
check cross references, we don't make any actual changes, so there is no reason
to abort the transaction if we fail.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/inode.c | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a44ca6a4feff..5916ad8681cd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1180,10 +1180,8 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 	while (1) {
 		ret = btrfs_lookup_file_extent(trans, root, path, ino,
 					       cur_offset, 0);
-		if (ret < 0) {
-			btrfs_abort_transaction(trans, root, ret);
+		if (ret < 0)
 			goto error;
-		}
 		if (ret > 0 && path->slots[0] > 0 && check_prev) {
 			leaf = path->nodes[0];
 			btrfs_item_key_to_cpu(leaf, &found_key,
@@ -1197,10 +1195,8 @@ next_slot:
 		leaf = path->nodes[0];
 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 			ret = btrfs_next_leaf(root, path);
-			if (ret < 0) {
-				btrfs_abort_transaction(trans, root, ret);
+			if (ret < 0)
 				goto error;
-			}
 			if (ret > 0)
 				break;
 			leaf = path->nodes[0];
@@ -1291,10 +1287,8 @@ out_check:
 			ret = cow_file_range(inode, locked_page,
 					     cow_start, found_key.offset - 1,
 					     page_started, nr_written, 1);
-			if (ret) {
-				btrfs_abort_transaction(trans, root, ret);
+			if (ret)
 				goto error;
-			}
 			cow_start = (u64)-1;
 		}
 
@@ -1341,10 +1335,8 @@ out_check:
 		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
 			ret = btrfs_reloc_clone_csums(inode, cur_offset,
 						      num_bytes);
-			if (ret) {
-				btrfs_abort_transaction(trans, root, ret);
+			if (ret)
 				goto error;
-			}
 		}
 
 		extent_clear_unlock_delalloc(inode, cur_offset,
@@ -1366,10 +1358,8 @@ out_check:
 	if (cow_start != (u64)-1) {
 		ret = cow_file_range(inode, locked_page, cow_start, end,
 				     page_started, nr_written, 1);
-		if (ret) {
-			btrfs_abort_transaction(trans, root, ret);
+		if (ret)
 			goto error;
-		}
 	}
 
 error:

From 93858769172c4e3678917810e9d5de360eb991cc Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 28 Oct 2013 09:13:25 -0400
Subject: [PATCH 70/98] Btrfs: take ordered root lock when removing ordered
 operations inode

A user reported a list corruption warning from btrfs_remove_ordered_extent, it
is because we aren't taking the ordered_root_lock when we remove the inode from
the ordered operations list.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ordered-data.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index bbb1a3864686..8a5eff366596 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -537,7 +537,9 @@ void btrfs_remove_ordered_extent(struct inode *inode,
 	 */
 	if (RB_EMPTY_ROOT(&tree->tree) &&
 	    !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
+		spin_lock(&root->fs_info->ordered_root_lock);
 		list_del_init(&BTRFS_I(inode)->ordered_operations);
+		spin_unlock(&root->fs_info->ordered_root_lock);
 	}
 
 	if (!root->nr_ordered_extents) {

From 9f23e289edaf1e99f60de3978c50a1c7424f3e92 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 28 Oct 2013 15:03:41 -0400
Subject: [PATCH 71/98] Btrfs: make sure the delalloc workers actually flush
 compressed writes

When using delalloc workers in a non-waiting way (like for enospc handling) we
can end up not actually waiting for the dirty pages to be started if we have
compression.  We need to add an extra filemap flush to make sure any async
extents that have started are actually moved along before returning.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/inode.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5916ad8681cd..c84adde53023 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8163,18 +8163,24 @@ out_notrans:
 static void btrfs_run_delalloc_work(struct btrfs_work *work)
 {
 	struct btrfs_delalloc_work *delalloc_work;
+	struct inode *inode;
 
 	delalloc_work = container_of(work, struct btrfs_delalloc_work,
 				     work);
-	if (delalloc_work->wait)
-		btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1);
-	else
-		filemap_flush(delalloc_work->inode->i_mapping);
+	inode = delalloc_work->inode;
+	if (delalloc_work->wait) {
+		btrfs_wait_ordered_range(inode, 0, (u64)-1);
+	} else {
+		filemap_flush(inode->i_mapping);
+		if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+			     &BTRFS_I(inode)->runtime_flags))
+			filemap_flush(inode->i_mapping);
+	}
 
 	if (delalloc_work->delay_iput)
-		btrfs_add_delayed_iput(delalloc_work->inode);
+		btrfs_add_delayed_iput(inode);
 	else
-		iput(delalloc_work->inode);
+		iput(inode);
 	complete(&delalloc_work->completion);
 }
 

From 9e6a0c52b74b2d63a6cdb09cec5eaf66038b218f Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Thu, 31 Oct 2013 10:07:19 -0400
Subject: [PATCH 72/98] Btrfs: stop committing the transaction so much during
 relocate

I noticed with my horrible snapshot excercisor that we were taking forever to
relocate the larger the file system got.  This appeared to be because we were
committing the transaction _constantly_.  There were a few places where we do
braindead things with metadata reservation, like start a transaction and then
try to refill the block rsv, which not only keeps us from committing a
transaction during the enospc stuff, but keeps us from doing some of the harder
flushing work which will make us more likely to need to commit the transaction.
We also were checking the block rsv and committing the transaction if the block
rsv was below a certain threshold, but we were doing this in a place where we
don't actually keep anything in the block rsv so this was always ending up false
so we always committed the transaction in this case.  I tested this to make sure
it didn't break anything, but it takes about 10 hours to get the box to this
state so I don't know how much of an impact it will really make.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/relocation.c | 33 +++++++++++++--------------------
 1 file changed, 13 insertions(+), 20 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 729c91e151a3..92eb4832ff86 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2060,7 +2060,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 	LIST_HEAD(inode_list);
 	struct btrfs_key key;
 	struct btrfs_key next_key;
-	struct btrfs_trans_handle *trans;
+	struct btrfs_trans_handle *trans = NULL;
 	struct btrfs_root *reloc_root;
 	struct btrfs_root_item *root_item;
 	struct btrfs_path *path;
@@ -2109,18 +2109,19 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 	memset(&next_key, 0, sizeof(next_key));
 
 	while (1) {
-		trans = btrfs_start_transaction(root, 0);
-		BUG_ON(IS_ERR(trans));
-		trans->block_rsv = rc->block_rsv;
-
 		ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
 					     BTRFS_RESERVE_FLUSH_ALL);
 		if (ret) {
-			BUG_ON(ret != -EAGAIN);
-			ret = btrfs_commit_transaction(trans, root);
-			BUG_ON(ret);
-			continue;
+			err = ret;
+			goto out;
 		}
+		trans = btrfs_start_transaction(root, 0);
+		if (IS_ERR(trans)) {
+			err = PTR_ERR(trans);
+			trans = NULL;
+			goto out;
+		}
+		trans->block_rsv = rc->block_rsv;
 
 		replaced = 0;
 		max_level = level;
@@ -2166,6 +2167,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 		root_item->drop_level = level;
 
 		btrfs_end_transaction_throttle(trans, root);
+		trans = NULL;
 
 		btrfs_btree_balance_dirty(root);
 
@@ -2194,7 +2196,8 @@ out:
 		btrfs_update_reloc_root(trans, root);
 	}
 
-	btrfs_end_transaction_throttle(trans, root);
+	if (trans)
+		btrfs_end_transaction_throttle(trans, root);
 
 	btrfs_btree_balance_dirty(root);
 
@@ -3994,16 +3997,6 @@ restart:
 			}
 		}
 
-		ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5);
-		if (ret < 0) {
-			if (ret != -ENOSPC) {
-				err = ret;
-				WARN_ON(1);
-				break;
-			}
-			rc->commit_transaction = 1;
-		}
-
 		if (rc->commit_transaction) {
 			rc->commit_transaction = 0;
 			ret = btrfs_commit_transaction(trans, rc->extent_root);

From 9dced186f934648bb7796ad3301cfc2563e2ad6e Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Fri, 25 Oct 2013 17:33:36 +0800
Subject: [PATCH 73/98] Btrfs: fix the free space write out failure when there
 is no data space

After running space balance on a new fs, the fs check program outputed the
following warning message:
 free space inode generation (0) did not match free space cache generation (20)

Steps to reproduce:
 # mkfs.btrfs -f <dev>
 # mount <dev> <mnt>
 # btrfs balance start <mnt>
 # umount <mnt>
 # btrfs check <dev>

It was because there was no data space after the space balance, and the free
space write out task didn't try to allocate a new data chunk for the free space
inode when doing the reservation. So the data space reservation failed, and in
order to tell the free space loader that this free space inode could not be
trusted, the generation of the free space inode wasn't updated. Then the check
program found this problem and outputed the above message.

But in fact, it is safe that we try to allocate a new data chunk when we find
the data space is not enough. The patch fixes the above problem by this way.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent-tree.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4595a65db729..3aa52701d04a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3621,10 +3621,9 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
 	/* make sure bytes are sectorsize aligned */
 	bytes = ALIGN(bytes, root->sectorsize);
 
-	if (root == root->fs_info->tree_root ||
-	    BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
-		alloc_chunk = 0;
+	if (btrfs_is_free_space_inode(inode)) {
 		committed = 1;
+		ASSERT(current->journal_info);
 	}
 
 	data_sinfo = fs_info->data_sinfo;
@@ -3652,6 +3651,16 @@ again:
 			spin_unlock(&data_sinfo->lock);
 alloc:
 			alloc_target = btrfs_get_alloc_profile(root, 1);
+			/*
+			 * It is ugly that we don't call nolock join
+			 * transaction for the free space inode case here.
+			 * But it is safe because we only do the data space
+			 * reservation for the free space cache in the
+			 * transaction context, the common join transaction
+			 * just increase the counter of the current transaction
+			 * handler, doesn't try to acquire the trans_lock of
+			 * the fs.
+			 */
 			trans = btrfs_join_transaction(root);
 			if (IS_ERR(trans))
 				return PTR_ERR(trans);

From 7fdf4b608dda5eea114cb23623b52e34dd5972f5 Mon Sep 17 00:00:00 2001
From: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Date: Fri, 25 Oct 2013 18:52:08 +0800
Subject: [PATCH 74/98] Btrfs: use 'u64' rather than 'int' to get extent's
 generation

We define a 'int' to get extent's generation by mistake,fix it.

Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/relocation.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 92eb4832ff86..a5f6a8061b21 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3263,7 +3263,7 @@ static int add_tree_block(struct reloc_control *rc,
 	struct rb_node *rb_node;
 	u32 item_size;
 	int level = -1;
-	int generation;
+	u64 generation;
 
 	eb =  path->nodes[0];
 	item_size = btrfs_item_size_nr(eb, path->slots[0]);

From 9b011adfe14977fcda977234609d43ca52463a3d Mon Sep 17 00:00:00 2001
From: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Date: Fri, 25 Oct 2013 19:12:02 +0800
Subject: [PATCH 75/98] Btrfs: remove scrub_super_lock holding in
 btrfs_sync_log()

Originally, we introduced scrub_super_lock to synchronize
tree log code with scrubbing super.

However we can replace scrub_super_lock with device_list_mutex,
because writing super will hold this mutex, this will reduce an extra
lock holding when writing supers in sync log code.

Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ctree.h    |  3 ---
 fs/btrfs/disk-io.c  |  1 -
 fs/btrfs/scrub.c    | 18 +++++-------------
 fs/btrfs/tree-log.c |  2 --
 fs/btrfs/volumes.h  |  4 +++-
 5 files changed, 8 insertions(+), 20 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9f5e1cfb0209..34279665fd69 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1586,7 +1586,6 @@ struct btrfs_fs_info {
 	atomic_t scrubs_paused;
 	atomic_t scrub_cancel_req;
 	wait_queue_head_t scrub_pause_wait;
-	struct rw_semaphore scrub_super_lock;
 	int scrub_workers_refcnt;
 	struct btrfs_workers scrub_workers;
 	struct btrfs_workers scrub_wr_completion_workers;
@@ -3950,9 +3949,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		    u64 end, struct btrfs_scrub_progress *progress,
 		    int readonly, int is_dev_replace);
 void btrfs_scrub_pause(struct btrfs_root *root);
-void btrfs_scrub_pause_super(struct btrfs_root *root);
 void btrfs_scrub_continue(struct btrfs_root *root);
-void btrfs_scrub_continue_super(struct btrfs_root *root);
 int btrfs_scrub_cancel(struct btrfs_fs_info *info);
 int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
 			   struct btrfs_device *dev);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 62c4aba221bb..a3ad3a27944d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2250,7 +2250,6 @@ int open_ctree(struct super_block *sb,
 	atomic_set(&fs_info->scrubs_paused, 0);
 	atomic_set(&fs_info->scrub_cancel_req, 0);
 	init_waitqueue_head(&fs_info->scrub_pause_wait);
-	init_rwsem(&fs_info->scrub_super_lock);
 	fs_info->scrub_workers_refcnt = 0;
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	fs_info->check_integrity_print_mask = 0;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f21e2df89bc2..12009b4279ad 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2932,13 +2932,15 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 
 	atomic_inc(&fs_info->scrubs_running);
 	mutex_unlock(&fs_info->scrub_lock);
-	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 
 	if (!is_dev_replace) {
-		down_read(&fs_info->scrub_super_lock);
+		/*
+		 * by holding device list mutex, we can
+		 * kick off writing super in log tree sync.
+		 */
 		ret = scrub_supers(sctx, dev);
-		up_read(&fs_info->scrub_super_lock);
 	}
+	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 
 	if (!ret)
 		ret = scrub_enumerate_chunks(sctx, dev, start, end,
@@ -2988,16 +2990,6 @@ void btrfs_scrub_continue(struct btrfs_root *root)
 	wake_up(&fs_info->scrub_pause_wait);
 }
 
-void btrfs_scrub_pause_super(struct btrfs_root *root)
-{
-	down_write(&root->fs_info->scrub_super_lock);
-}
-
-void btrfs_scrub_continue_super(struct btrfs_root *root)
-{
-	up_write(&root->fs_info->scrub_super_lock);
-}
-
 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
 {
 	mutex_lock(&fs_info->scrub_lock);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f98002ed32b1..257cbae8c948 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2586,9 +2586,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 * the running transaction open, so a full commit can't hop
 	 * in and cause problems either.
 	 */
-	btrfs_scrub_pause_super(root);
 	ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
-	btrfs_scrub_continue_super(root);
 	if (ret) {
 		btrfs_abort_transaction(trans, root, ret);
 		goto out_wake_log_root;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index b72f540c8b29..49e0ff01143f 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -132,7 +132,9 @@ struct btrfs_fs_devices {
 
 	/* all of the devices in the FS, protected by a mutex
 	 * so we can safely walk it to write out the supers without
-	 * worrying about add/remove by the multi-device code
+	 * worrying about add/remove by the multi-device code.
+	 * Scrubbing super can kick off supers writing by holding
+	 * this mutex lock.
 	 */
 	struct mutex device_list_mutex;
 	struct list_head devices;

From e46f5388cdde0f402f2dded6f1cad614536c6429 Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Mon, 28 Oct 2013 16:28:30 +0000
Subject: [PATCH 76/98] Btrfs: fix verification of dir_item

We were ignoring the name component of the dir_item. Both the
name and data must fit within BTRFS_MAX_XATTR_SIZE(root).

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/dir-item.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 1c529db1697f..c031ea3fd70f 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -474,8 +474,10 @@ int verify_dir_item(struct btrfs_root *root,
 	}
 
 	/* BTRFS_MAX_XATTR_SIZE is the same for all dir items */
-	if (btrfs_dir_data_len(leaf, dir_item) > BTRFS_MAX_XATTR_SIZE(root)) {
-		printk(KERN_CRIT "btrfs: invalid dir item data len: %u\n",
+	if ((btrfs_dir_data_len(leaf, dir_item) +
+	     btrfs_dir_name_len(leaf, dir_item)) > BTRFS_MAX_XATTR_SIZE(root)) {
+		printk(KERN_CRIT "btrfs: invalid dir item name + data len: %u + %u\n",
+		       (unsigned)btrfs_dir_name_len(leaf, dir_item),
 		       (unsigned)btrfs_dir_data_len(leaf, dir_item));
 		return 1;
 	}

From 488111aa0e17102d42e0328299fd782dc6a4a051 Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Mon, 28 Oct 2013 16:30:29 +0000
Subject: [PATCH 77/98] Btrfs: fix csum search offset/length calculation in log
 tree

We were setting the csums search offset and length to the right values if
the extent is compressed, but later on right before doing the csums lookup
we were overriding these two parameters regardless of compression being
set or not for the extent.

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/tree-log.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 257cbae8c948..18891c009641 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3421,11 +3421,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 	if (skip_csum)
 		return 0;
 
-	if (em->compress_type) {
-		csum_offset = 0;
-		csum_len = block_len;
-	}
-
 	/*
 	 * First check and see if our csums are on our outstanding ordered
 	 * extents.
@@ -3509,8 +3504,13 @@ unlocked:
 	if (!mod_len || ret)
 		return ret;
 
-	csum_offset = mod_start - em->start;
-	csum_len = mod_len;
+	if (em->compress_type) {
+		csum_offset = 0;
+		csum_len = block_len;
+	} else {
+		csum_offset = mod_start - em->start;
+		csum_len = mod_len;
+	}
 
 	/* block start is already adjusted for the file extent offset. */
 	ret = btrfs_lookup_csums_range(log->fs_info->csum_root,

From 269d040ff2f340e1ed517dab8b66b2f133c83a77 Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Mon, 28 Oct 2013 17:39:21 +0000
Subject: [PATCH 78/98] Btrfs: log recovery, don't unlink inode always on error

If we get any error while doing a dir index/item lookup in the
log tree, we were always unlinking the corresponding inode in
the subvolume. It makes sense to unlink only if the lookup failed
to find the dir index/item, which corresponds to NULL or -ENOENT,
and not when other errors happen (like a transient -ENOMEM or -EIO).

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/tree-log.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 18891c009641..7927a5fa755c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1832,7 +1832,7 @@ again:
 						     dir_key->offset,
 						     name, name_len, 0);
 		}
-		if (IS_ERR_OR_NULL(log_di)) {
+		if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) {
 			btrfs_dir_item_key_to_cpu(eb, di, &location);
 			btrfs_release_path(path);
 			btrfs_release_path(log_path);
@@ -1869,6 +1869,9 @@ again:
 				goto again;
 			ret = 0;
 			goto out;
+		} else if (IS_ERR(log_di)) {
+			kfree(name);
+			return PTR_ERR(log_di);
 		}
 		btrfs_release_path(log_path);
 		kfree(name);

From 6f519564d7d978c00351d9ab6abac3deeac31621 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Tue, 29 Oct 2013 10:45:05 +0800
Subject: [PATCH 79/98] Btrfs: do not run snapshot-aware defragment on error

If something wrong happens in write endio, running snapshot-aware defragment
can end up with undefined results, maybe a crash, so we should avoid it.

In order to share similar code, this also adds a helper to free the struct for
snapshot-aware defrag.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/inode.c | 47 ++++++++++++++++++++++++++++-------------------
 1 file changed, 28 insertions(+), 19 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c84adde53023..17800221978f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2365,10 +2365,23 @@ out_unlock:
 	return ret;
 }
 
+static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
+{
+	struct old_sa_defrag_extent *old, *tmp;
+
+	if (!new)
+		return;
+
+	list_for_each_entry_safe(old, tmp, &new->head, list) {
+		list_del(&old->list);
+		kfree(old);
+	}
+	kfree(new);
+}
+
 static void relink_file_extents(struct new_sa_defrag_extent *new)
 {
 	struct btrfs_path *path;
-	struct old_sa_defrag_extent *old, *tmp;
 	struct sa_defrag_extent_backref *backref;
 	struct sa_defrag_extent_backref *prev = NULL;
 	struct inode *inode;
@@ -2411,16 +2424,11 @@ static void relink_file_extents(struct new_sa_defrag_extent *new)
 	kfree(prev);
 
 	btrfs_free_path(path);
-
-	list_for_each_entry_safe(old, tmp, &new->head, list) {
-		list_del(&old->list);
-		kfree(old);
-	}
 out:
+	free_sa_defrag_extent(new);
+
 	atomic_dec(&root->fs_info->defrag_running);
 	wake_up(&root->fs_info->transaction_wait);
-
-	kfree(new);
 }
 
 static struct new_sa_defrag_extent *
@@ -2430,7 +2438,7 @@ record_old_file_extents(struct inode *inode,
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_path *path;
 	struct btrfs_key key;
-	struct old_sa_defrag_extent *old, *tmp;
+	struct old_sa_defrag_extent *old;
 	struct new_sa_defrag_extent *new;
 	int ret;
 
@@ -2478,7 +2486,7 @@ record_old_file_extents(struct inode *inode,
 		if (slot >= btrfs_header_nritems(l)) {
 			ret = btrfs_next_leaf(root, path);
 			if (ret < 0)
-				goto out_free_list;
+				goto out_free_path;
 			else if (ret > 0)
 				break;
 			continue;
@@ -2507,7 +2515,7 @@ record_old_file_extents(struct inode *inode,
 
 		old = kmalloc(sizeof(*old), GFP_NOFS);
 		if (!old)
-			goto out_free_list;
+			goto out_free_path;
 
 		offset = max(new->file_pos, key.offset);
 		end = min(new->file_pos + new->len, key.offset + num_bytes);
@@ -2529,15 +2537,10 @@ next:
 
 	return new;
 
-out_free_list:
-	list_for_each_entry_safe(old, tmp, &new->head, list) {
-		list_del(&old->list);
-		kfree(old);
-	}
 out_free_path:
 	btrfs_free_path(path);
 out_kfree:
-	kfree(new);
+	free_sa_defrag_extent(new);
 	return NULL;
 }
 
@@ -2708,8 +2711,14 @@ out:
 	btrfs_remove_ordered_extent(inode, ordered_extent);
 
 	/* for snapshot-aware defrag */
-	if (new)
-		relink_file_extents(new);
+	if (new) {
+		if (ret) {
+			free_sa_defrag_extent(new);
+			atomic_dec(&root->fs_info->defrag_running);
+		} else {
+			relink_file_extents(new);
+		}
+	}
 
 	/* once for us */
 	btrfs_put_ordered_extent(ordered_extent);

From 48ec47364b6d493f0a9cdc116977bf3f34e5c3ec Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Wed, 30 Oct 2013 13:25:24 +0800
Subject: [PATCH 80/98] Btrfs: fix a crash when running balance and defrag
 concurrently

Running balance and defrag concurrently can end up with a crash:

kernel BUG at fs/btrfs/relocation.c:4528!
RIP: 0010:[<ffffffffa01ac33b>]  [<ffffffffa01ac33b>] btrfs_reloc_cow_block+ 0x1eb/0x230 [btrfs]
Call Trace:
  [<ffffffffa01398c1>] ? update_ref_for_cow+0x241/0x380 [btrfs]
  [<ffffffffa0180bad>] ? copy_extent_buffer+0xad/0x110 [btrfs]
  [<ffffffffa0139da1>] __btrfs_cow_block+0x3a1/0x520 [btrfs]
  [<ffffffffa013a0b6>] btrfs_cow_block+0x116/0x1b0 [btrfs]
  [<ffffffffa013ddad>] btrfs_search_slot+0x43d/0x970 [btrfs]
  [<ffffffffa0153c57>] btrfs_lookup_file_extent+0x37/0x40 [btrfs]
  [<ffffffffa0172a5e>] __btrfs_drop_extents+0x11e/0xae0 [btrfs]
  [<ffffffffa013b3fd>] ? generic_bin_search.constprop.39+0x8d/0x1a0 [btrfs]
  [<ffffffff8117d14a>] ? kmem_cache_alloc+0x1da/0x200
  [<ffffffffa0138e7a>] ? btrfs_alloc_path+0x1a/0x20 [btrfs]
  [<ffffffffa0173ef0>] btrfs_drop_extents+0x60/0x90 [btrfs]
  [<ffffffffa016b24d>] relink_extent_backref+0x2ed/0x780 [btrfs]
  [<ffffffffa0162fe0>] ? btrfs_submit_bio_hook+0x1e0/0x1e0 [btrfs]
  [<ffffffffa01b8ed7>] ? iterate_inodes_from_logical+0x87/0xa0 [btrfs]
  [<ffffffffa016b909>] btrfs_finish_ordered_io+0x229/0xac0 [btrfs]
  [<ffffffffa016c3b5>] finish_ordered_fn+0x15/0x20 [btrfs]
  [<ffffffffa018cbe5>] worker_loop+0x125/0x4e0 [btrfs]
  [<ffffffffa018cac0>] ? btrfs_queue_worker+0x300/0x300 [btrfs]
  [<ffffffff81075ea0>] kthread+0xc0/0xd0
  [<ffffffff81075de0>] ? insert_kthread_work+0x40/0x40
  [<ffffffff8164796c>] ret_from_fork+0x7c/0xb0
  [<ffffffff81075de0>] ? insert_kthread_work+0x40/0x40
----------------------------------------------------------------------

It turns out to be that balance operation will bump root's @last_snapshot,
which enables snapshot-aware defrag path, and backref walking stuff will
find data reloc tree as refs' parent, and hit the BUG_ON() during COW.

As data reloc tree's data is just for relocation purpose, and will be deleted right
after relocation is done, it's unnecessary to walk those refs belonged to data reloc
tree, it'd be better to skip them.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/backref.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 721936a5b2ca..30d24cfb45d4 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -185,6 +185,9 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
 {
 	struct __prelim_ref *ref;
 
+	if (root_id == BTRFS_DATA_RELOC_TREE_OBJECTID)
+		return 0;
+
 	ref = kmem_cache_alloc(btrfs_prelim_ref_cache, gfp_mask);
 	if (!ref)
 		return -ENOMEM;

From 2e9f5954978cd5b0c26e6eeb9fd4ccbdce0f5ecc Mon Sep 17 00:00:00 2001
From: Rashika <rashika.kheria@gmail.com>
Date: Thu, 31 Oct 2013 02:45:20 +0530
Subject: [PATCH 81/98] btrfs: Add helper function for free_root_pointers()

The function free_root_pointers() in disk-io.h contains redundant code.
Therefore, this patch adds a helper function free_root_extent_buffers()
to free_root_pointers() to eliminate redundancy.

Reviewed-by: Zach Brown <zab@redhat.com>
Signed-off-by: Rashika Kheria <rashika.kheria@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/disk-io.c | 58 ++++++++++++++--------------------------------
 1 file changed, 18 insertions(+), 40 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a3ad3a27944d..22443fac9408 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2033,50 +2033,28 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
 	btrfs_stop_workers(&fs_info->qgroup_rescan_workers);
 }
 
+static void free_root_extent_buffers(struct btrfs_root *root)
+{
+	if (root) {
+		free_extent_buffer(root->node);
+		free_extent_buffer(root->commit_root);
+		root->node = NULL;
+		root->commit_root = NULL;
+	}
+}
+
 /* helper to cleanup tree roots */
 static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
 {
-	free_extent_buffer(info->tree_root->node);
-	free_extent_buffer(info->tree_root->commit_root);
-	info->tree_root->node = NULL;
-	info->tree_root->commit_root = NULL;
+	free_root_extent_buffers(info->tree_root);
 
-	if (info->dev_root) {
-		free_extent_buffer(info->dev_root->node);
-		free_extent_buffer(info->dev_root->commit_root);
-		info->dev_root->node = NULL;
-		info->dev_root->commit_root = NULL;
-	}
-	if (info->extent_root) {
-		free_extent_buffer(info->extent_root->node);
-		free_extent_buffer(info->extent_root->commit_root);
-		info->extent_root->node = NULL;
-		info->extent_root->commit_root = NULL;
-	}
-	if (info->csum_root) {
-		free_extent_buffer(info->csum_root->node);
-		free_extent_buffer(info->csum_root->commit_root);
-		info->csum_root->node = NULL;
-		info->csum_root->commit_root = NULL;
-	}
-	if (info->quota_root) {
-		free_extent_buffer(info->quota_root->node);
-		free_extent_buffer(info->quota_root->commit_root);
-		info->quota_root->node = NULL;
-		info->quota_root->commit_root = NULL;
-	}
-	if (info->uuid_root) {
-		free_extent_buffer(info->uuid_root->node);
-		free_extent_buffer(info->uuid_root->commit_root);
-		info->uuid_root->node = NULL;
-		info->uuid_root->commit_root = NULL;
-	}
-	if (chunk_root) {
-		free_extent_buffer(info->chunk_root->node);
-		free_extent_buffer(info->chunk_root->commit_root);
-		info->chunk_root->node = NULL;
-		info->chunk_root->commit_root = NULL;
-	}
+	free_root_extent_buffers(info->dev_root);
+	free_root_extent_buffers(info->extent_root);
+	free_root_extent_buffers(info->csum_root);
+	free_root_extent_buffers(info->quota_root);
+	free_root_extent_buffers(info->uuid_root);
+	if (chunk_root)
+		free_root_extent_buffers(info->chunk_root);
 }
 
 static void del_fs_roots(struct btrfs_fs_info *fs_info)

From 95e94d14b42c1992493ce72492351d4601b274fe Mon Sep 17 00:00:00 2001
From: Rashika <rashika.kheria@gmail.com>
Date: Thu, 31 Oct 2013 03:12:42 +0530
Subject: [PATCH 82/98] btrfs: Replace multiple atomic_inc() with atomic_add()

This patch replaces multiple atomic_inc() with atomic_add() in
delayed-inode.c to reduce source code and have few instructions
for compilation.

Reviewed-by: Zach Brown <zab@redhat.com>
Signed-off-by: Rashika Kheria <rashika.kheria@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/delayed-inode.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index df1a496bc8d4..af2ecca8a109 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -108,8 +108,8 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode)
 			return node;
 		}
 		btrfs_inode->delayed_node = node;
-		atomic_inc(&node->refs);	/* can be accessed */
-		atomic_inc(&node->refs);	/* cached in the inode */
+		/* can be accessed and cached in the inode */
+		atomic_add(2, &node->refs);
 		spin_unlock(&root->inode_lock);
 		return node;
 	}
@@ -138,8 +138,8 @@ again:
 		return ERR_PTR(-ENOMEM);
 	btrfs_init_delayed_node(node, root, ino);
 
-	atomic_inc(&node->refs);	/* cached in the btrfs inode */
-	atomic_inc(&node->refs);	/* can be accessed */
+	/* cached in the btrfs inode and can be accessed */
+	atomic_add(2, &node->refs);
 
 	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
 	if (ret) {

From 3c45bfc1528ccb8b94c06d5740854bb75f8a92fb Mon Sep 17 00:00:00 2001
From: Dulshani Gunawardhana <dulshani.gunawardhana89@gmail.com>
Date: Thu, 31 Oct 2013 09:57:33 +0530
Subject: [PATCH 83/98] btrfs: Pack struct btrfs_device

Pack the structure btrfs_device in volumes.h to eliminate holes detected
by pahole, thus reducing binary memory footprint.

Signed-off-by: Dulshani Gunawardhana <dulshani.gunawardhana89@gmail.com>
Reviewed-by: Zach Brown <zab@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/volumes.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 49e0ff01143f..8b3cd142b373 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -43,9 +43,8 @@ struct btrfs_device {
 	/* WRITE_SYNC bios */
 	struct btrfs_pending_bios pending_sync_bios;
 
-	int running_pending;
 	u64 generation;
-
+	int running_pending;
 	int writeable;
 	int in_fs_metadata;
 	int missing;
@@ -53,11 +52,11 @@ struct btrfs_device {
 	int is_tgtdev_for_dev_replace;
 
 	spinlock_t io_lock;
+	/* the mode sent to blkdev_get */
+	fmode_t mode;
 
 	struct block_device *bdev;
 
-	/* the mode sent to blkdev_get */
-	fmode_t mode;
 
 	struct rcu_string *name;
 
@@ -78,16 +77,21 @@ struct btrfs_device {
 
 	/* optimal io width for this device */
 	u32 io_width;
+	/* type and info about this device */
+	u64 type;
 
 	/* minimal io size for this device */
 	u32 sector_size;
 
-	/* type and info about this device */
-	u64 type;
 
 	/* physical drive uuid (or lvm uuid) */
 	u8 uuid[BTRFS_UUID_SIZE];
 
+	/* for sending down flush barriers */
+	int nobarriers;
+	struct bio *flush_bio;
+	struct completion flush_wait;
+
 	/* per-device scrub information */
 	struct scrub_ctx *scrub_device;
 
@@ -103,10 +107,6 @@ struct btrfs_device {
 	struct radix_tree_root reada_zones;
 	struct radix_tree_root reada_extents;
 
-	/* for sending down flush barriers */
-	struct bio *flush_bio;
-	struct completion flush_wait;
-	int nobarriers;
 
 	/* disk I/O failure stats. For detailed description refer to
 	 * enum btrfs_dev_stat_values in ioctl.h */

From b19e684393752ec129a36924e6c270daf79676d6 Mon Sep 17 00:00:00 2001
From: Dulshani Gunawardhana <dulshani.gunawardhana89@gmail.com>
Date: Thu, 31 Oct 2013 10:29:38 +0530
Subject: [PATCH 84/98] btrfs: Remove redundant local zero structure

Remove redundant local zero structure, replacing it by the kernel's
global ZERO_PAGE.

Signed-off-by: Dulshani Gunawardhana <dulshani.gunawardhana89@gmail.com>
Reviewed-by: Zach Brown <zab@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ioctl.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9ea05506cb84..78d20482b72a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -368,9 +368,8 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
 
 int btrfs_is_empty_uuid(u8 *uuid)
 {
-	static char empty_uuid[BTRFS_UUID_SIZE] = {0};
-
-	return !memcmp(uuid, empty_uuid, BTRFS_UUID_SIZE);
+	BUILD_BUG_ON(BTRFS_UUID_SIZE > PAGE_SIZE);
+	return !memcmp(uuid, empty_zero_page, BTRFS_UUID_SIZE);
 }
 
 static noinline int create_subvol(struct inode *dir,

From fae7f21cece9a4c181a8d8131870c7247e153f65 Mon Sep 17 00:00:00 2001
From: Dulshani Gunawardhana <dulshani.gunawardhana89@gmail.com>
Date: Thu, 31 Oct 2013 10:30:08 +0530
Subject: [PATCH 85/98] btrfs: Use WARN_ON()'s return value in place of
 WARN_ON(1)

Use WARN_ON()'s return value in place of WARN_ON(1) for cleaner source
code that outputs a more descriptive warnings. Also fix the styling
warning of redundant braces that came up as a result of this fix.

Signed-off-by: Dulshani Gunawardhana <dulshani.gunawardhana89@gmail.com>
Reviewed-by: Zach Brown <zab@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/backref.c         |  3 +--
 fs/btrfs/check-integrity.c |  7 ++-----
 fs/btrfs/ctree.c           |  6 ++----
 fs/btrfs/delayed-inode.c   |  3 +--
 fs/btrfs/disk-io.c         |  8 +-------
 fs/btrfs/extent-tree.c     | 11 ++++-------
 fs/btrfs/extent_io.c       |  7 ++-----
 fs/btrfs/file-item.c       |  4 +---
 fs/btrfs/inode-map.c       |  4 +---
 fs/btrfs/inode.c           | 10 +++-------
 fs/btrfs/relocation.c      | 13 ++++---------
 fs/btrfs/send.c            |  4 +---
 fs/btrfs/tree-log.c        |  7 ++-----
 fs/btrfs/volumes.c         | 11 +++--------
 14 files changed, 28 insertions(+), 70 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 30d24cfb45d4..3775947429b2 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -326,8 +326,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 
 	eb = path->nodes[level];
 	while (!eb) {
-		if (!level) {
-			WARN_ON(1);
+		if (WARN_ON(!level)) {
 			ret = 1;
 			goto out;
 		}
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 656b07637e39..e0aab4456974 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -2464,10 +2464,8 @@ static int btrfsic_process_written_superblock(
 		}
 	}
 
-	if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) {
-		WARN_ON(1);
+	if (WARN_ON(-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)))
 		btrfsic_dump_tree(state);
-	}
 
 	return 0;
 }
@@ -2907,7 +2905,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
 		btrfsic_release_block_ctx(&block_ctx);
 	}
 
-	if (!match) {
+	if (WARN_ON(!match)) {
 		printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio,"
 		       " buffer->log_bytenr=%llu, submit_bio(bdev=%s,"
 		       " phys_bytenr=%llu)!\n",
@@ -2924,7 +2922,6 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
 			       bytenr, block_ctx.dev->name,
 			       block_ctx.dev_bytenr, mirror_num);
 		}
-		WARN_ON(1);
 	}
 }
 
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index a749121203df..03c606cefd50 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1285,11 +1285,10 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 		free_extent_buffer(eb_root);
 		blocksize = btrfs_level_size(root, old_root->level);
 		old = read_tree_block(root, logical, blocksize, 0);
-		if (!old || !extent_buffer_uptodate(old)) {
+		if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
 			free_extent_buffer(old);
 			pr_warn("btrfs: failed to read tree block %llu from get_old_root\n",
 				logical);
-			WARN_ON(1);
 		} else {
 			eb = btrfs_clone_extent_buffer(old);
 			free_extent_buffer(old);
@@ -3643,8 +3642,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 		ret = 1;
 		goto out;
 	}
-	if (!empty && push_items == btrfs_header_nritems(right))
-		WARN_ON(1);
+	WARN_ON(!empty && push_items == btrfs_header_nritems(right));
 
 	/* push data from right to left */
 	copy_extent_buffer(left, right,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index af2ecca8a109..2bb8e1a988a1 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -649,14 +649,13 @@ static int btrfs_delayed_inode_reserve_metadata(
 			goto out;
 
 		ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
-		if (!ret)
+		if (!WARN_ON(ret))
 			goto out;
 
 		/*
 		 * Ok this is a problem, let's just steal from the global rsv
 		 * since this really shouldn't happen that often.
 		 */
-		WARN_ON(1);
 		ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
 					      dst_rsv, num_bytes);
 		goto out;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 22443fac9408..77fa2d99252f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -475,14 +475,8 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 	if (page != eb->pages[0])
 		return 0;
 	found_start = btrfs_header_bytenr(eb);
-	if (found_start != start) {
-		WARN_ON(1);
+	if (WARN_ON(found_start != start || !PageUptodate(page)))
 		return 0;
-	}
-	if (!PageUptodate(page)) {
-		WARN_ON(1);
-		return 0;
-	}
 	csum_tree_block(root, eb, 0);
 	return 0;
 }
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3aa52701d04a..2d58461bd035 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1550,9 +1550,8 @@ again:
 	if (ret && !insert) {
 		err = -ENOENT;
 		goto out;
-	} else if (ret) {
+	} else if (WARN_ON(ret)) {
 		err = -EIO;
-		WARN_ON(1);
 		goto out;
 	}
 
@@ -5752,9 +5751,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			}
 			extent_slot = path->slots[0];
 		}
-	} else if (ret == -ENOENT) {
+	} else if (WARN_ON(ret == -ENOENT)) {
 		btrfs_print_leaf(extent_root, path->nodes[0]);
-		WARN_ON(1);
 		btrfs_err(info,
 			"unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
 			bytenr, parent, root_objectid, owner_objectid,
@@ -8317,10 +8315,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 					struct btrfs_space_info,
 					list);
 		if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
-			if (space_info->bytes_pinned > 0 ||
+			if (WARN_ON(space_info->bytes_pinned > 0 ||
 			    space_info->bytes_reserved > 0 ||
-			    space_info->bytes_may_use > 0) {
-				WARN_ON(1);
+			    space_info->bytes_may_use > 0)) {
 				dump_space_info(space_info, 0, 0);
 			}
 		}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a93bab452183..fb782ed62426 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1739,10 +1739,8 @@ u64 count_range_bits(struct extent_io_tree *tree,
 	u64 last = 0;
 	int found = 0;
 
-	if (search_end <= cur_start) {
-		WARN_ON(1);
+	if (WARN_ON(search_end <= cur_start))
 		return 0;
-	}
 
 	spin_lock(&tree->lock);
 	if (cur_start == 0 && bits == EXTENT_DIRTY) {
@@ -3568,9 +3566,8 @@ retry:
 			 * but no sense in crashing the users box for something
 			 * we can survive anyway.
 			 */
-			if (!eb) {
+			if (WARN_ON(!eb)) {
 				spin_unlock(&mapping->private_lock);
-				WARN_ON(1);
 				continue;
 			}
 
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index ae8a5139e0ee..6f3848860283 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -849,10 +849,8 @@ insert:
 	path->leave_spinning = 0;
 	if (ret < 0)
 		goto fail_unlock;
-	if (ret != 0) {
-		WARN_ON(1);
+	if (WARN_ON(ret != 0))
 		goto fail_unlock;
-	}
 	leaf = path->nodes[0];
 csum:
 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index ec08004a6a20..ab485e57b6fe 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -78,10 +78,8 @@ again:
 			    btrfs_transaction_in_commit(fs_info)) {
 				leaf = path->nodes[0];
 
-				if (btrfs_header_nritems(leaf) == 0) {
-					WARN_ON(1);
+				if (WARN_ON(btrfs_header_nritems(leaf) == 0))
 					break;
-				}
 
 				/*
 				 * Save the key so we can advances forward
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 17800221978f..ddecc9c332f4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2039,10 +2039,8 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
 		key.offset = offset;
 
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-	if (ret < 0) {
-		WARN_ON(1);
+	if (WARN_ON(ret < 0))
 		return ret;
-	}
 	ret = 0;
 
 	while (1) {
@@ -3182,8 +3180,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 
 		/* if we have links, this was a truncate, lets do that */
 		if (inode->i_nlink) {
-			if (!S_ISREG(inode->i_mode)) {
-				WARN_ON(1);
+			if (WARN_ON(!S_ISREG(inode->i_mode))) {
 				iput(inode);
 				continue;
 			}
@@ -8023,8 +8020,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		if (ret == -EEXIST) {
 			/* we shouldn't get
 			 * eexist without a new_inode */
-			if (!new_inode) {
-				WARN_ON(1);
+			if (WARN_ON(!new_inode)) {
 				return ret;
 			}
 		} else {
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index a5f6a8061b21..70eca79bae52 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1777,8 +1777,7 @@ again:
 			new_ptr_gen = 0;
 		}
 
-		if (new_bytenr > 0 && new_bytenr == old_bytenr) {
-			WARN_ON(1);
+		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
 			ret = level;
 			break;
 		}
@@ -3545,10 +3544,8 @@ static int find_data_references(struct reloc_control *rc,
 				err = ret;
 				goto out;
 			}
-			if (ret > 0) {
-				WARN_ON(1);
+			if (WARN_ON(ret > 0))
 				goto out;
-			}
 
 			leaf = path->nodes[0];
 			nritems = btrfs_header_nritems(leaf);
@@ -3568,11 +3565,9 @@ static int find_data_references(struct reloc_control *rc,
 		}
 
 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-		if (key.objectid != ref_objectid ||
-		    key.type != BTRFS_EXTENT_DATA_KEY) {
-			WARN_ON(1);
+		if (WARN_ON(key.objectid != ref_objectid ||
+		    key.type != BTRFS_EXTENT_DATA_KEY))
 			break;
-		}
 
 		fi = btrfs_item_ptr(leaf, path->slots[0],
 				    struct btrfs_file_extent_item);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index e26a3a62fd3f..ec2f02435287 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -564,10 +564,8 @@ static int begin_cmd(struct send_ctx *sctx, int cmd)
 {
 	struct btrfs_cmd_header *hdr;
 
-	if (!sctx->send_buf) {
-		WARN_ON(1);
+	if (WARN_ON(!sctx->send_buf))
 		return -EINVAL;
-	}
 
 	BUG_ON(sctx->send_size);
 
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 7927a5fa755c..744553c83fe2 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2130,8 +2130,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 		WARN_ON(*level >= BTRFS_MAX_LEVEL);
 		cur = path->nodes[*level];
 
-		if (btrfs_header_level(cur) != *level)
-			WARN_ON(1);
+		WARN_ON(btrfs_header_level(cur) != *level);
 
 		if (path->slots[*level] >=
 		    btrfs_header_nritems(cur))
@@ -2951,10 +2950,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 
 	/* find the first key from this transaction again */
 	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
-	if (ret != 0) {
-		WARN_ON(1);
+	if (WARN_ON(ret != 0))
 		goto done;
-	}
 
 	/*
 	 * we have a block from this transaction, log every item in it
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 716abee16a5b..3c16da51348f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2552,8 +2552,7 @@ again:
 		failed = 0;
 		retried = true;
 		goto again;
-	} else if (failed && retried) {
-		WARN_ON(1);
+	} else if (WARN_ON(failed && retried)) {
 		ret = -ENOSPC;
 	}
 error:
@@ -5402,10 +5401,8 @@ static int bio_size_ok(struct block_device *bdev, struct bio *bio,
 		.bi_rw = bio->bi_rw,
 	};
 
-	if (bio->bi_vcnt == 0) {
-		WARN_ON(1);
+	if (WARN_ON(bio->bi_vcnt == 0))
 		return 1;
-	}
 
 	prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
 	if (bio_sectors(bio) > max_sectors)
@@ -5638,10 +5635,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
 	struct btrfs_device *dev;
 	u64 tmp;
 
-	if (!devid && !fs_info) {
-		WARN_ON(1);
+	if (WARN_ON(!devid && !fs_info))
 		return ERR_PTR(-EINVAL);
-	}
 
 	dev = __alloc_device();
 	if (IS_ERR(dev))

From e248e04e7798d234ddbeab6ee6acdec2476fe1a8 Mon Sep 17 00:00:00 2001
From: Dulshani Gunawardhana <dulshani.gunawardhana89@gmail.com>
Date: Thu, 31 Oct 2013 10:31:13 +0530
Subject: [PATCH 86/98] btrfs: Enclose macros with complex values within
 parenthesis

Enclose macros with complex values within parenthesis in accordance to
checkpatch.pl.

Signed-off-by: Dulshani Gunawardhana <dulshani.gunawardhana89@gmail.com>
Reviewed-by: Zach Brown <zab@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent_map.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 61adc44b7805..93fba716d7f8 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -3,10 +3,10 @@
 
 #include <linux/rbtree.h>
 
-#define EXTENT_MAP_LAST_BYTE (u64)-4
-#define EXTENT_MAP_HOLE (u64)-3
-#define EXTENT_MAP_INLINE (u64)-2
-#define EXTENT_MAP_DELALLOC (u64)-1
+#define EXTENT_MAP_LAST_BYTE ((u64)-4)
+#define EXTENT_MAP_HOLE ((u64)-3)
+#define EXTENT_MAP_INLINE ((u64)-2)
+#define EXTENT_MAP_DELALLOC ((u64)-1)
 
 /* bits for the flags field */
 #define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */

From d9b0d9ba04cf99abff9125b688c03e154598a644 Mon Sep 17 00:00:00 2001
From: Dulshani Gunawardhana <dulshani.gunawardhana89@gmail.com>
Date: Thu, 31 Oct 2013 10:32:18 +0530
Subject: [PATCH 87/98] btrfs: Replace kmalloc with kmalloc_array

Replace kmalloc(size * nr, ) with kmalloc_array(nr, size), thus making
it easier to check is that the calculation doesn't wrap or return a smaller allocation

Signed-off-by: Dulshani Gunawardhana <dulshani.gunawardhana89@gmail.com>
Reviewed-by: Zach Brown <zab@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/delayed-inode.c | 4 ++--
 fs/btrfs/ioctl.c         | 2 +-
 fs/btrfs/super.c         | 2 +-
 fs/btrfs/volumes.c       | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 2bb8e1a988a1..8d292fbae659 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -770,13 +770,13 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
 	 */
 	btrfs_set_path_blocking(path);
 
-	keys = kmalloc(sizeof(struct btrfs_key) * nitems, GFP_NOFS);
+	keys = kmalloc_array(nitems, sizeof(struct btrfs_key), GFP_NOFS);
 	if (!keys) {
 		ret = -ENOMEM;
 		goto out;
 	}
 
-	data_size = kmalloc(sizeof(u32) * nitems, GFP_NOFS);
+	data_size = kmalloc_array(nitems, sizeof(u32), GFP_NOFS);
 	if (!data_size) {
 		ret = -ENOMEM;
 		goto error;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 78d20482b72a..3712ef86ca82 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1198,7 +1198,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		ra = &file->f_ra;
 	}
 
-	pages = kmalloc(sizeof(struct page *) * max_cluster,
+	pages = kmalloc_array(max_cluster, sizeof(struct page *),
 			GFP_NOFS);
 	if (!pages) {
 		ret = -ENOMEM;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c9079db7441e..1de6d4d67431 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1470,7 +1470,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
 	nr_devices = fs_info->fs_devices->open_devices;
 	BUG_ON(!nr_devices);
 
-	devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
+	devices_info = kmalloc_array(nr_devices, sizeof(*devices_info),
 			       GFP_NOFS);
 	if (!devices_info)
 		return -ENOMEM;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3c16da51348f..0db637097862 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4901,7 +4901,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 			num_stripes = map->num_stripes;
 			max_errors = nr_parity_stripes(map);
 
-			raid_map = kmalloc(sizeof(u64) * num_stripes,
+			raid_map = kmalloc_array(num_stripes, sizeof(u64),
 					   GFP_NOFS);
 			if (!raid_map) {
 				ret = -ENOMEM;

From 678712545b62715a6c867471320ff5f60a521f3a Mon Sep 17 00:00:00 2001
From: Dulshani Gunawardhana <dulshani.gunawardhana89@gmail.com>
Date: Thu, 31 Oct 2013 10:33:04 +0530
Subject: [PATCH 88/98] btrfs: Fix checkpatch.pl warning of spacing issues

Fix spacing issues detected via checkpatch.pl in accordance with the
kernel style guidelines.

Signed-off-by: Dulshani Gunawardhana <dulshani.gunawardhana89@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/async-thread.c     |  2 +-
 fs/btrfs/compression.c      |  2 +-
 fs/btrfs/ctree.c            |  2 +-
 fs/btrfs/extent-tree.c      |  6 +++---
 fs/btrfs/extent_io.c        |  2 +-
 fs/btrfs/file.c             |  2 +-
 fs/btrfs/free-space-cache.c |  2 +-
 fs/btrfs/inode.c            |  6 +++---
 fs/btrfs/ioctl.c            | 14 +++++++-------
 9 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 08cc08f037a6..8aec751fa464 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -262,7 +262,7 @@ static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker,
 	struct btrfs_work *work = NULL;
 	struct list_head *cur = NULL;
 
-	if(!list_empty(prio_head))
+	if (!list_empty(prio_head))
 		cur = prio_head->next;
 
 	smp_mb();
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index df019c7f58b2..1499b27b4186 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -359,7 +359,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 
 	bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
-	if(!bio) {
+	if (!bio) {
 		kfree(cb);
 		return -ENOMEM;
 	}
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 03c606cefd50..316136bd6dd7 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -4018,7 +4018,7 @@ again:
 				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
 					if (data_size && !tried_avoid_double)
 						goto push_for_double;
-					split = 2 ;
+					split = 2;
 				}
 			}
 		}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2d58461bd035..fb5c76795eda 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5017,7 +5017,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
 
 	if (to_reserve)
-		trace_btrfs_space_reservation(root->fs_info,"delalloc",
+		trace_btrfs_space_reservation(root->fs_info, "delalloc",
 					      btrfs_ino(inode), to_reserve, 1);
 	block_rsv_add_bytes(block_rsv, to_reserve, 1);
 
@@ -8022,7 +8022,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
 
 	spin_lock(&sinfo->lock);
 
-	for(i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
 		if (!list_empty(&sinfo->block_groups[i]))
 			free_bytes += __btrfs_get_ro_block_group_free_space(
 						&sinfo->block_groups[i]);
@@ -8310,7 +8310,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 
 	release_global_block_rsv(info);
 
-	while(!list_empty(&info->space_info)) {
+	while (!list_empty(&info->space_info)) {
 		space_info = list_entry(info->space_info.next,
 					struct btrfs_space_info,
 					list);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index fb782ed62426..856bc2b2192c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4034,7 +4034,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
 	if (offset >= last)
 		return NULL;
 
-	while(1) {
+	while (1) {
 		len = last - offset;
 		if (len == 0)
 			break;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 3a20a12513b2..82d0342763c5 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -369,7 +369,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
 	u64 root_objectid = 0;
 
 	atomic_inc(&fs_info->defrag_running);
-	while(1) {
+	while (1) {
 		/* Pause the auto defragger. */
 		if (test_bit(BTRFS_FS_STATE_REMOUNTING,
 			     &fs_info->fs_state))
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d7c445c30a16..057be95b1e1e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2280,7 +2280,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
 		goto out;
 
 	entry = rb_entry(node, struct btrfs_free_space, offset_index);
-	while(1) {
+	while (1) {
 		if (entry->bytes < bytes && entry->bytes > *max_extent_size)
 			*max_extent_size = entry->bytes;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ddecc9c332f4..57fa19d69e90 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6262,7 +6262,7 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
 	/* adjust the range_start to make sure it doesn't
 	 * go backwards from the start they passed in
 	 */
-	range_start = max(start,range_start);
+	range_start = max(start, range_start);
 	found = found_end - range_start;
 
 	if (found > 0) {
@@ -7066,7 +7066,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 			}
 		} else {
 			submit_len += bvec->bv_len;
-			nr_pages ++;
+			nr_pages++;
 			bvec++;
 		}
 	}
@@ -8367,7 +8367,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	int err;
 	int drop_inode = 0;
 	u64 objectid;
-	u64 index = 0 ;
+	u64 index = 0;
 	int name_len;
 	int datasize;
 	unsigned long ptr;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 3712ef86ca82..6523108d2984 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -686,7 +686,7 @@ static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode)
  *     nfs_async_unlink().
  */
 
-static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir)
+static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
 {
 	int error;
 
@@ -856,7 +856,7 @@ static int find_new_extents(struct btrfs_root *root,
 
 	path->keep_locks = 1;
 
-	while(1) {
+	while (1) {
 		ret = btrfs_search_forward(root, &min_key, path, newer_than);
 		if (ret != 0)
 			goto none;
@@ -1918,7 +1918,7 @@ static noinline int search_ioctl(struct inode *inode,
 
 	path->keep_locks = 1;
 
-	while(1) {
+	while (1) {
 		ret = btrfs_search_forward(root, &key, path, sk->min_transid);
 		if (ret != 0) {
 			if (ret > 0)
@@ -2004,7 +2004,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
 	key.type = BTRFS_INODE_REF_KEY;
 	key.offset = (u64)-1;
 
-	while(1) {
+	while (1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
 			goto out;
@@ -2033,7 +2033,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
 		}
 
 		*(ptr + len) = '/';
-		read_extent_buffer(l, ptr,(unsigned long)(iref + 1), len);
+		read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len);
 
 		if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
 			break;
@@ -2044,7 +2044,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
 		dirid = key.objectid;
 	}
 	memmove(name, ptr, total_len);
-	name[total_len]='\0';
+	name[total_len] = '\0';
 	ret = 0;
 out:
 	btrfs_free_path(path);
@@ -2130,7 +2130,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 
 	inode = dentry->d_inode;
 	dest = BTRFS_I(inode)->root;
-	if (!capable(CAP_SYS_ADMIN)){
+	if (!capable(CAP_SYS_ADMIN)) {
 		/*
 		 * Regular user.  Only allow this with a special mount
 		 * option, when the user has write+exec access to the

From f570e757b54ca6eceabc9b19a6342f14e836c196 Mon Sep 17 00:00:00 2001
From: Rashika <rashika.kheria@gmail.com>
Date: Thu, 31 Oct 2013 16:50:42 +0530
Subject: [PATCH 89/98] btrfs: Remove useless variable in write_ctree_super()

The function write_ctree_super() in disk-io.c uses variable ret to return
the result of function write_all_supers(). Since, this variable serves
no purpose, hence the patch removes it and returns the call of the
called function.

Reviewed-by: Zach Brown <zab@redhat.com>
Signed-off-by: Rashika Kheria <rashika.kheria@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/disk-io.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 77fa2d99252f..4c4ed0bb3da1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3440,10 +3440,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
 int write_ctree_super(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root, int max_mirrors)
 {
-	int ret;
-
-	ret = write_all_supers(root, max_mirrors);
-	return ret;
+	return write_all_supers(root, max_mirrors);
 }
 
 /* Drop a fs root from the radix tree and free it. */

From 007d31f755294b9db69c3d18e90d6edae9f1604d Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Thu, 31 Oct 2013 16:49:02 -0400
Subject: [PATCH 90/98] Btrfs: check file extent type before anything else

I hit this problem with my no holes patch and it made me realize what the
problem was for bz 60834.  If the first item in the leaf is an inline extent and
we try to read anything starting from disk_bytenr onward we will read off the
end of the leaf.  So we need to check to see what it's type is, and if it's not
REG we can just break out.  This should fix this problem.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/send.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index ec2f02435287..6837fe87f3a6 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -3902,16 +3902,16 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 	while (key.offset < ekey->offset + left_len) {
 		ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
 		right_type = btrfs_file_extent_type(eb, ei);
-		right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
-		right_len = btrfs_file_extent_num_bytes(eb, ei);
-		right_offset = btrfs_file_extent_offset(eb, ei);
-		right_gen = btrfs_file_extent_generation(eb, ei);
-
 		if (right_type != BTRFS_FILE_EXTENT_REG) {
 			ret = 0;
 			goto out;
 		}
 
+		right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
+		right_len = btrfs_file_extent_num_bytes(eb, ei);
+		right_offset = btrfs_file_extent_offset(eb, ei);
+		right_gen = btrfs_file_extent_generation(eb, ei);
+
 		/*
 		 * Are we at extent 8? If yes, we know the extent is changed.
 		 * This may only happen on the first iteration.

From 3b7a016f44d51ba8425c244f4c607f93fa213fd2 Mon Sep 17 00:00:00 2001
From: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Date: Sat, 12 Oct 2013 02:11:12 +0800
Subject: [PATCH 91/98] Btrfs: avoid unnecessary scrub workers allocation

We only allocate scrub workers if we pass all the necessary
checks, for example, there are no operation in progress.

Besides, move mutex lock protection outside of scrub_workers_get()
/scrub_workers_put().

Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/scrub.c | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 12009b4279ad..2544805544f0 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2784,7 +2784,6 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
 {
 	int ret = 0;
 
-	mutex_lock(&fs_info->scrub_lock);
 	if (fs_info->scrub_workers_refcnt == 0) {
 		if (is_dev_replace)
 			btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
@@ -2814,21 +2813,17 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
 	}
 	++fs_info->scrub_workers_refcnt;
 out:
-	mutex_unlock(&fs_info->scrub_lock);
-
 	return ret;
 }
 
 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
 {
-	mutex_lock(&fs_info->scrub_lock);
 	if (--fs_info->scrub_workers_refcnt == 0) {
 		btrfs_stop_workers(&fs_info->scrub_workers);
 		btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
 		btrfs_stop_workers(&fs_info->scrub_nocow_workers);
 	}
 	WARN_ON(fs_info->scrub_workers_refcnt < 0);
-	mutex_unlock(&fs_info->scrub_lock);
 }
 
 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
@@ -2889,23 +2884,18 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		return -EINVAL;
 	}
 
-	ret = scrub_workers_get(fs_info, is_dev_replace);
-	if (ret)
-		return ret;
 
 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
 	dev = btrfs_find_device(fs_info, devid, NULL, NULL);
 	if (!dev || (dev->missing && !is_dev_replace)) {
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		scrub_workers_put(fs_info);
 		return -ENODEV;
 	}
-	mutex_lock(&fs_info->scrub_lock);
 
+	mutex_lock(&fs_info->scrub_lock);
 	if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
 		mutex_unlock(&fs_info->scrub_lock);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		scrub_workers_put(fs_info);
 		return -EIO;
 	}
 
@@ -2916,10 +2906,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		btrfs_dev_replace_unlock(&fs_info->dev_replace);
 		mutex_unlock(&fs_info->scrub_lock);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		scrub_workers_put(fs_info);
 		return -EINPROGRESS;
 	}
 	btrfs_dev_replace_unlock(&fs_info->dev_replace);
+
+	ret = scrub_workers_get(fs_info, is_dev_replace);
+	if (ret) {
+		mutex_unlock(&fs_info->scrub_lock);
+		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+		return ret;
+	}
+
 	sctx = scrub_setup_ctx(dev, is_dev_replace);
 	if (IS_ERR(sctx)) {
 		mutex_unlock(&fs_info->scrub_lock);
@@ -2957,10 +2954,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 
 	mutex_lock(&fs_info->scrub_lock);
 	dev->scrub_device = NULL;
+	scrub_workers_put(fs_info);
 	mutex_unlock(&fs_info->scrub_lock);
 
 	scrub_free_ctx(sctx);
-	scrub_workers_put(fs_info);
 
 	return ret;
 }

From d3ee29e39653455b7d799f77872d7df8c4060613 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Mon, 4 Nov 2013 23:13:20 +0800
Subject: [PATCH 92/98] Btrfs: remove unnecessary initialization and memory
 barrior in shrink_delalloc()

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent-tree.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fb5c76795eda..5e0e6bcf67b9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4034,15 +4034,14 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 	u64 delalloc_bytes;
 	u64 max_reclaim;
 	long time_left;
-	unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
-	int loops = 0;
+	unsigned long nr_pages;
+	int loops;
 	enum btrfs_reserve_flush_enum flush;
 
 	trans = (struct btrfs_trans_handle *)current->journal_info;
 	block_rsv = &root->fs_info->delalloc_block_rsv;
 	space_info = block_rsv->space_info;
 
-	smp_mb();
 	delalloc_bytes = percpu_counter_sum_positive(
 						&root->fs_info->delalloc_bytes);
 	if (delalloc_bytes == 0) {
@@ -4052,6 +4051,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 		return;
 	}
 
+	loops = 0;
 	while (delalloc_bytes && loops < 3) {
 		max_reclaim = min(delalloc_bytes, to_reclaim);
 		nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
@@ -4082,7 +4082,6 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 			if (time_left)
 				break;
 		}
-		smp_mb();
 		delalloc_bytes = percpu_counter_sum_positive(
 						&root->fs_info->delalloc_bytes);
 	}

From 38c135af8e22baf2ef3ebf9b213e398997a90946 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Mon, 4 Nov 2013 23:13:21 +0800
Subject: [PATCH 93/98] Btrfs: wait for the ordered extent only when we want

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent-tree.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5e0e6bcf67b9..2564cbfdf417 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4047,7 +4047,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 	if (delalloc_bytes == 0) {
 		if (trans)
 			return;
-		btrfs_wait_all_ordered_extents(root->fs_info);
+		if (wait_ordered)
+			btrfs_wait_all_ordered_extents(root->fs_info);
 		return;
 	}
 

From 18cd8ea6df32e72548a4b055cba54d1a898c81e5 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Mon, 4 Nov 2013 23:13:22 +0800
Subject: [PATCH 94/98] Btrfs: pick up the code for the item number calculation
 in flush_space()

This patch picked up the code that was used to calculate the number of
the items for which we need reserve space, and we will use it in the next
patch.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent-tree.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2564cbfdf417..9183f9cbc121 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4022,6 +4022,18 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
 	}
 }
 
+static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
+{
+	u64 bytes;
+	int nr;
+
+	bytes = btrfs_calc_trans_metadata_size(root, 1);
+	nr = (int)div64_u64(to_reclaim, bytes);
+	if (!nr)
+		nr = 1;
+	return nr;
+}
+
 /*
  * shrink metadata reservation for delalloc
  */
@@ -4167,16 +4179,11 @@ static int flush_space(struct btrfs_root *root,
 	switch (state) {
 	case FLUSH_DELAYED_ITEMS_NR:
 	case FLUSH_DELAYED_ITEMS:
-		if (state == FLUSH_DELAYED_ITEMS_NR) {
-			u64 bytes = btrfs_calc_trans_metadata_size(root, 1);
-
-			nr = (int)div64_u64(num_bytes, bytes);
-			if (!nr)
-				nr = 1;
-			nr *= 2;
-		} else {
+		if (state == FLUSH_DELAYED_ITEMS_NR)
+			nr = calc_reclaim_items_nr(root, num_bytes) * 2;
+		else
 			nr = -1;
-		}
+
 		trans = btrfs_join_transaction(root);
 		if (IS_ERR(trans)) {
 			ret = PTR_ERR(trans);

From c61a16a701a12659e2933c5965afe8d45284146a Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Mon, 4 Nov 2013 23:13:23 +0800
Subject: [PATCH 95/98] Btrfs: fix the confusion between delalloc bytes and
 metadata bytes

In shrink_delalloc(), what we need reclaim is the metadata space, so
flushing pages by to_reclaim is not reasonable, it is very likely that
the pages we flush are not enough. And then we had to invoke the flush
function for several times, at the worst, we need call flush_space for
several times. It wasted time.

We improve this problem by converting the metadata space size we need
reserve to the delalloc bytes, By this way, we can flush the pages
by a reasonable number.

(Now we use a fixed number to do conversion, it is not flexible, maybe
 we can find a good way to improve it in the future.)

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent-tree.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9183f9cbc121..d8da538d01fb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4034,6 +4034,8 @@ static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
 	return nr;
 }
 
+#define EXTENT_SIZE_PER_ITEM	(256 * 1024)
+
 /*
  * shrink metadata reservation for delalloc
  */
@@ -4050,6 +4052,10 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 	int loops;
 	enum btrfs_reserve_flush_enum flush;
 
+	/* Calc the number of the pages we need flush for space reservation */
+	to_reclaim = calc_reclaim_items_nr(root, to_reclaim);
+	to_reclaim *= EXTENT_SIZE_PER_ITEM;
+
 	trans = (struct btrfs_trans_handle *)current->journal_info;
 	block_rsv = &root->fs_info->delalloc_block_rsv;
 	space_info = block_rsv->space_info;

From 9f3a074d108810139ad4af49a29d347a4cf41e9a Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Mon, 4 Nov 2013 23:13:24 +0800
Subject: [PATCH 96/98] Btrfs: don't wait for all the async delalloc when
 shrinking delalloc

It was very likely that there were lots of async delalloc pages in the
filesystem, if we waited until all the pages were flushed, we would be
blocked for a long time, and the performance would also drop down.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/extent-tree.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d8da538d01fb..83bffbea7d97 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4079,9 +4079,19 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 		 * We need to wait for the async pages to actually start before
 		 * we do anything.
 		 */
-		wait_event(root->fs_info->async_submit_wait,
-			   !atomic_read(&root->fs_info->async_delalloc_pages));
+		max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages);
+		if (!max_reclaim)
+			goto skip_async;
 
+		if (max_reclaim <= nr_pages)
+			max_reclaim = 0;
+		else
+			max_reclaim -= nr_pages;
+
+		wait_event(root->fs_info->async_submit_wait,
+			   atomic_read(&root->fs_info->async_delalloc_pages) <=
+			   (int)max_reclaim);
+skip_async:
 		if (!trans)
 			flush = BTRFS_RESERVE_FLUSH_ALL;
 		else

From b02441999efcc6152b87cd58e7970bb7843f76cf Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Mon, 4 Nov 2013 23:13:25 +0800
Subject: [PATCH 97/98] Btrfs: don't wait for the completion of all the ordered
 extents

It is very likely that there are lots of ordered extents in the filesytem,
if we wait for the completion of all of them when we want to reclaim some
space for the metadata space reservation, we would be blocked for a long
time. The performance would drop down suddenly for a long time.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/dev-replace.c  |  4 ++--
 fs/btrfs/extent-tree.c  | 11 ++++++-----
 fs/btrfs/ioctl.c        |  2 +-
 fs/btrfs/ordered-data.c | 22 +++++++++++++++++-----
 fs/btrfs/ordered-data.h |  4 ++--
 fs/btrfs/relocation.c   |  2 +-
 fs/btrfs/super.c        |  2 +-
 fs/btrfs/transaction.c  |  2 +-
 8 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index cb94310db072..3d2495e83e62 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -391,7 +391,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
 	btrfs_dev_replace_unlock(dev_replace);
 
-	btrfs_wait_all_ordered_extents(root->fs_info);
+	btrfs_wait_ordered_roots(root->fs_info, -1);
 
 	/* force writing the updated state information to disk */
 	trans = btrfs_start_transaction(root, 0);
@@ -466,7 +466,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 		return ret;
 	}
-	btrfs_wait_all_ordered_extents(root->fs_info);
+	btrfs_wait_ordered_roots(root->fs_info, -1);
 
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 83bffbea7d97..a21bbf83fdc2 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4018,7 +4018,7 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
 		 */
 		btrfs_start_all_delalloc_inodes(root->fs_info, 0);
 		if (!current->journal_info)
-			btrfs_wait_all_ordered_extents(root->fs_info);
+			btrfs_wait_ordered_roots(root->fs_info, -1);
 	}
 }
 
@@ -4050,11 +4050,12 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 	long time_left;
 	unsigned long nr_pages;
 	int loops;
+	int items;
 	enum btrfs_reserve_flush_enum flush;
 
 	/* Calc the number of the pages we need flush for space reservation */
-	to_reclaim = calc_reclaim_items_nr(root, to_reclaim);
-	to_reclaim *= EXTENT_SIZE_PER_ITEM;
+	items = calc_reclaim_items_nr(root, to_reclaim);
+	to_reclaim = items * EXTENT_SIZE_PER_ITEM;
 
 	trans = (struct btrfs_trans_handle *)current->journal_info;
 	block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -4066,7 +4067,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 		if (trans)
 			return;
 		if (wait_ordered)
-			btrfs_wait_all_ordered_extents(root->fs_info);
+			btrfs_wait_ordered_roots(root->fs_info, items);
 		return;
 	}
 
@@ -4105,7 +4106,7 @@ skip_async:
 
 		loops++;
 		if (wait_ordered && !trans) {
-			btrfs_wait_all_ordered_extents(root->fs_info);
+			btrfs_wait_ordered_roots(root->fs_info, items);
 		} else {
 			time_left = schedule_timeout_killable(1);
 			if (time_left)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 6523108d2984..d4f2861b0d9e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -572,7 +572,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 	if (ret)
 		return ret;
 
-	btrfs_wait_ordered_extents(root);
+	btrfs_wait_ordered_extents(root, -1);
 
 	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
 	if (!pending_snapshot)
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 8a5eff366596..25a8f3812f14 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -565,10 +565,11 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
  * wait for all the ordered extents in a root.  This is done when balancing
  * space between drives.
  */
-void btrfs_wait_ordered_extents(struct btrfs_root *root)
+int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
 {
 	struct list_head splice, works;
 	struct btrfs_ordered_extent *ordered, *next;
+	int count = 0;
 
 	INIT_LIST_HEAD(&splice);
 	INIT_LIST_HEAD(&works);
@@ -576,7 +577,7 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root)
 	mutex_lock(&root->fs_info->ordered_operations_mutex);
 	spin_lock(&root->ordered_extent_lock);
 	list_splice_init(&root->ordered_extents, &splice);
-	while (!list_empty(&splice)) {
+	while (!list_empty(&splice) && nr) {
 		ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
 					   root_extent_list);
 		list_move_tail(&ordered->root_extent_list,
@@ -591,7 +592,11 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root)
 
 		cond_resched();
 		spin_lock(&root->ordered_extent_lock);
+		if (nr != -1)
+			nr--;
+		count++;
 	}
+	list_splice_tail(&splice, &root->ordered_extents);
 	spin_unlock(&root->ordered_extent_lock);
 
 	list_for_each_entry_safe(ordered, next, &works, work_list) {
@@ -601,18 +606,21 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root)
 		cond_resched();
 	}
 	mutex_unlock(&root->fs_info->ordered_operations_mutex);
+
+	return count;
 }
 
-void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info)
+void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
 {
 	struct btrfs_root *root;
 	struct list_head splice;
+	int done;
 
 	INIT_LIST_HEAD(&splice);
 
 	spin_lock(&fs_info->ordered_root_lock);
 	list_splice_init(&fs_info->ordered_roots, &splice);
-	while (!list_empty(&splice)) {
+	while (!list_empty(&splice) && nr) {
 		root = list_first_entry(&splice, struct btrfs_root,
 					ordered_root);
 		root = btrfs_grab_fs_root(root);
@@ -621,10 +629,14 @@ void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info)
 			       &fs_info->ordered_roots);
 		spin_unlock(&fs_info->ordered_root_lock);
 
-		btrfs_wait_ordered_extents(root);
+		done = btrfs_wait_ordered_extents(root, nr);
 		btrfs_put_fs_root(root);
 
 		spin_lock(&fs_info->ordered_root_lock);
+		if (nr != -1) {
+			nr -= done;
+			WARN_ON(nr < 0);
+		}
 	}
 	spin_unlock(&fs_info->ordered_root_lock);
 }
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 3982db1c1b36..9b0450f7ac20 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -195,8 +195,8 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
 void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct inode *inode);
-void btrfs_wait_ordered_extents(struct btrfs_root *root);
-void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info);
+int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
+void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
 void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
 void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
 void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 70eca79bae52..e1b3c2c73c44 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4227,7 +4227,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 		err = ret;
 		goto out;
 	}
-	btrfs_wait_all_ordered_extents(fs_info);
+	btrfs_wait_ordered_roots(fs_info, -1);
 
 	while (1) {
 		mutex_lock(&fs_info->cleaner_mutex);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1de6d4d67431..2d8ac1bf0cf9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -920,7 +920,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 		return 0;
 	}
 
-	btrfs_wait_all_ordered_extents(fs_info);
+	btrfs_wait_ordered_roots(fs_info, -1);
 
 	trans = btrfs_attach_transaction_barrier(root);
 	if (IS_ERR(trans)) {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 277fe812d047..32c100b8c563 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1636,7 +1636,7 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
 static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
 {
 	if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
-		btrfs_wait_all_ordered_extents(fs_info);
+		btrfs_wait_ordered_roots(fs_info, -1);
 }
 
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,

From 91aef86f3b8ab0685d930a5468254384513d1c97 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Mon, 4 Nov 2013 23:13:26 +0800
Subject: [PATCH 98/98] Btrfs: rename btrfs_start_all_delalloc_inodes

rename the function -- btrfs_start_all_delalloc_inodes(), and make its
name be compatible to btrfs_wait_ordered_roots(), since they are always
used at the same place.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
 fs/btrfs/ctree.h       | 3 +--
 fs/btrfs/dev-replace.c | 2 +-
 fs/btrfs/extent-tree.c | 2 +-
 fs/btrfs/inode.c       | 3 +--
 fs/btrfs/ioctl.c       | 2 +-
 fs/btrfs/relocation.c  | 2 +-
 fs/btrfs/transaction.c | 2 +-
 7 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 34279665fd69..aea4433081dc 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3680,8 +3680,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       u32 min_type);
 
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
-				    int delay_iput);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 			      struct extent_state **cached_state);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 3d2495e83e62..342f9fd411e3 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -461,7 +461,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	 * flush all outstanding I/O and inode extent mappings before the
 	 * copy operation is declared as being finished
 	 */
-	ret = btrfs_start_all_delalloc_inodes(root->fs_info, 0);
+	ret = btrfs_start_delalloc_roots(root->fs_info, 0);
 	if (ret) {
 		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 		return ret;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a21bbf83fdc2..45d98d01028f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4016,7 +4016,7 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
 		 * the filesystem is readonly(all dirty pages are written to
 		 * the disk).
 		 */
-		btrfs_start_all_delalloc_inodes(root->fs_info, 0);
+		btrfs_start_delalloc_roots(root->fs_info, 0);
 		if (!current->journal_info)
 			btrfs_wait_ordered_roots(root->fs_info, -1);
 	}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 57fa19d69e90..da8d2f696ac5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8306,8 +8306,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 	return ret;
 }
 
-int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
-				    int delay_iput)
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
 {
 	struct btrfs_root *root;
 	struct list_head splice;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d4f2861b0d9e..1d04b5559e61 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4542,7 +4542,7 @@ long btrfs_ioctl(struct file *file, unsigned int
 	case BTRFS_IOC_SYNC: {
 		int ret;
 
-		ret = btrfs_start_all_delalloc_inodes(root->fs_info, 0);
+		ret = btrfs_start_delalloc_roots(root->fs_info, 0);
 		if (ret)
 			return ret;
 		ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index e1b3c2c73c44..ce459a7cb16d 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4222,7 +4222,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 	printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n",
 	       rc->block_group->key.objectid, rc->block_group->flags);
 
-	ret = btrfs_start_all_delalloc_inodes(fs_info, 0);
+	ret = btrfs_start_delalloc_roots(fs_info, 0);
 	if (ret < 0) {
 		err = ret;
 		goto out;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 32c100b8c563..57c16b46afbd 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1629,7 +1629,7 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
 static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
 {
 	if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
-		return btrfs_start_all_delalloc_inodes(fs_info, 1);
+		return btrfs_start_delalloc_roots(fs_info, 1);
 	return 0;
 }