bcachefs: Initial commit

Initially forked from drivers/md/bcache, bcachefs is a new copy-on-write
filesystem with every feature you could possibly want.

Website: https://bcachefs.org

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

parent 0d29a833b7
commit 1c6fdbd8f2
fs/Kconfig
@@ -48,6 +48,7 @@ source "fs/ocfs2/Kconfig"
source "fs/btrfs/Kconfig"
source "fs/nilfs2/Kconfig"
source "fs/f2fs/Kconfig"
source "fs/bcachefs/Kconfig"
source "fs/zonefs/Kconfig"

endif # BLOCK
fs/Makefile
@@ -123,6 +123,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
obj-$(CONFIG_F2FS_FS) += f2fs/
obj-$(CONFIG_BCACHEFS_FS) += bcachefs/
obj-$(CONFIG_CEPH_FS) += ceph/
obj-$(CONFIG_PSTORE) += pstore/
obj-$(CONFIG_EFIVAR_FS) += efivarfs/
52 fs/bcachefs/Kconfig Normal file
@@ -0,0 +1,52 @@

config BCACHEFS_FS
	tristate "bcachefs filesystem support"
	depends on BLOCK
	select EXPORTFS
	select CLOSURES
	select LIBCRC32C
	select FS_POSIX_ACL
	select LZ4_COMPRESS
	select LZ4_DECOMPRESS
	select ZLIB_DEFLATE
	select ZLIB_INFLATE
	select ZSTD_COMPRESS
	select ZSTD_DECOMPRESS
	select CRYPTO_SHA256
	select CRYPTO_CHACHA20
	select CRYPTO_POLY1305
	select KEYS
	help
	The bcachefs filesystem - a modern, copy on write filesystem, with
	support for multiple devices, compression, checksumming, etc.

config BCACHEFS_QUOTA
	bool "bcachefs quota support"
	depends on BCACHEFS_FS
	select QUOTACTL

config BCACHEFS_POSIX_ACL
	bool "bcachefs POSIX ACL support"
	depends on BCACHEFS_FS
	select FS_POSIX_ACL

config BCACHEFS_DEBUG
	bool "bcachefs debugging"
	depends on BCACHEFS_FS
	help
	Enables many extra debugging checks and assertions.

	The resulting code will be significantly slower than normal; you
	probably shouldn't select this option unless you're a developer.

config BCACHEFS_TESTS
	bool "bcachefs unit and performance tests"
	depends on BCACHEFS_FS
	help
	Include some unit and performance tests for the core btree code

config BCACHEFS_NO_LATENCY_ACCT
	bool "disable latency accounting and time stats"
	depends on BCACHEFS_FS
	help
	This disables device latency tracking and time stats, only for performance testing
53 fs/bcachefs/Makefile Normal file
@@ -0,0 +1,53 @@

obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o

bcachefs-y := \
	acl.o \
	alloc.o \
	bkey.o \
	bkey_methods.o \
	bset.o \
	btree_cache.o \
	btree_gc.o \
	btree_io.o \
	btree_iter.o \
	btree_update_interior.o \
	btree_update_leaf.o \
	buckets.o \
	chardev.o \
	checksum.o \
	clock.o \
	compress.o \
	debug.o \
	dirent.o \
	disk_groups.o \
	error.o \
	extents.o \
	fs.o \
	fs-ioctl.o \
	fs-io.o \
	fsck.o \
	inode.o \
	io.o \
	journal.o \
	journal_io.o \
	journal_reclaim.o \
	journal_seq_blacklist.o \
	keylist.o \
	migrate.o \
	move.o \
	movinggc.o \
	opts.o \
	quota.o \
	rebalance.o \
	recovery.o \
	replicas.o \
	siphash.o \
	six.o \
	super.o \
	super-io.o \
	sysfs.o \
	tests.o \
	trace.o \
	util.o \
	xattr.o
387 fs/bcachefs/acl.c Normal file
@@ -0,0 +1,387 @@
// SPDX-License-Identifier: GPL-2.0
#ifdef CONFIG_BCACHEFS_POSIX_ACL

#include "bcachefs.h"

#include <linux/fs.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/sched.h>
#include <linux/slab.h>

#include "acl.h"
#include "fs.h"
#include "xattr.h"

static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long)
{
	return sizeof(bch_acl_header) +
		sizeof(bch_acl_entry_short) * nr_short +
		sizeof(bch_acl_entry) * nr_long;
}

static inline int acl_to_xattr_type(int type)
{
	switch (type) {
	case ACL_TYPE_ACCESS:
		return BCH_XATTR_INDEX_POSIX_ACL_ACCESS;
	case ACL_TYPE_DEFAULT:
		return BCH_XATTR_INDEX_POSIX_ACL_DEFAULT;
	default:
		BUG();
	}
}

/*
 * Convert from filesystem to in-memory representation.
 */
static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size)
{
	const void *p, *end = value + size;
	struct posix_acl *acl;
	struct posix_acl_entry *out;
	unsigned count = 0;

	if (!value)
		return NULL;
	if (size < sizeof(bch_acl_header))
		goto invalid;
	if (((bch_acl_header *)value)->a_version !=
	    cpu_to_le32(BCH_ACL_VERSION))
		goto invalid;

	p = value + sizeof(bch_acl_header);
	while (p < end) {
		const bch_acl_entry *entry = p;

		if (p + sizeof(bch_acl_entry_short) > end)
			goto invalid;

		switch (le16_to_cpu(entry->e_tag)) {
		case ACL_USER_OBJ:
		case ACL_GROUP_OBJ:
		case ACL_MASK:
		case ACL_OTHER:
			p += sizeof(bch_acl_entry_short);
			break;
		case ACL_USER:
		case ACL_GROUP:
			p += sizeof(bch_acl_entry);
			break;
		default:
			goto invalid;
		}

		count++;
	}

	if (p > end)
		goto invalid;

	if (!count)
		return NULL;

	acl = posix_acl_alloc(count, GFP_KERNEL);
	if (!acl)
		return ERR_PTR(-ENOMEM);

	out = acl->a_entries;

	p = value + sizeof(bch_acl_header);
	while (p < end) {
		const bch_acl_entry *in = p;

		out->e_tag = le16_to_cpu(in->e_tag);
		out->e_perm = le16_to_cpu(in->e_perm);

		switch (out->e_tag) {
		case ACL_USER_OBJ:
		case ACL_GROUP_OBJ:
		case ACL_MASK:
		case ACL_OTHER:
			p += sizeof(bch_acl_entry_short);
			break;
		case ACL_USER:
			out->e_uid = make_kuid(&init_user_ns,
					       le32_to_cpu(in->e_id));
			p += sizeof(bch_acl_entry);
			break;
		case ACL_GROUP:
			out->e_gid = make_kgid(&init_user_ns,
					       le32_to_cpu(in->e_id));
			p += sizeof(bch_acl_entry);
			break;
		}

		out++;
	}

	BUG_ON(out != acl->a_entries + acl->a_count);

	return acl;
invalid:
	pr_err("invalid acl entry");
	return ERR_PTR(-EINVAL);
}

#define acl_for_each_entry(acl, acl_e)				\
	for (acl_e = acl->a_entries;				\
	     acl_e < acl->a_entries + acl->a_count;		\
	     acl_e++)

/*
 * Convert from in-memory to filesystem representation.
 */
static struct bkey_i_xattr *
bch2_acl_to_xattr(struct btree_trans *trans,
		  const struct posix_acl *acl,
		  int type)
{
	struct bkey_i_xattr *xattr;
	bch_acl_header *acl_header;
	const struct posix_acl_entry *acl_e;
	void *outptr;
	unsigned nr_short = 0, nr_long = 0, acl_len, u64s;

	acl_for_each_entry(acl, acl_e) {
		switch (acl_e->e_tag) {
		case ACL_USER:
		case ACL_GROUP:
			nr_long++;
			break;
		case ACL_USER_OBJ:
		case ACL_GROUP_OBJ:
		case ACL_MASK:
		case ACL_OTHER:
			nr_short++;
			break;
		default:
			return ERR_PTR(-EINVAL);
		}
	}

	acl_len = bch2_acl_size(nr_short, nr_long);
	u64s = BKEY_U64s + xattr_val_u64s(0, acl_len);

	if (u64s > U8_MAX)
		return ERR_PTR(-E2BIG);

	xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
	if (IS_ERR(xattr))
		return xattr;

	bkey_xattr_init(&xattr->k_i);
	xattr->k.u64s		= u64s;
	xattr->v.x_type		= acl_to_xattr_type(type);
	xattr->v.x_name_len	= 0,
	xattr->v.x_val_len	= cpu_to_le16(acl_len);

	acl_header = xattr_val(&xattr->v);
	acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION);

	outptr = (void *) acl_header + sizeof(*acl_header);

	acl_for_each_entry(acl, acl_e) {
		bch_acl_entry *entry = outptr;

		entry->e_tag = cpu_to_le16(acl_e->e_tag);
		entry->e_perm = cpu_to_le16(acl_e->e_perm);
		switch (acl_e->e_tag) {
		case ACL_USER:
			entry->e_id = cpu_to_le32(
				from_kuid(&init_user_ns, acl_e->e_uid));
			outptr += sizeof(bch_acl_entry);
			break;
		case ACL_GROUP:
			entry->e_id = cpu_to_le32(
				from_kgid(&init_user_ns, acl_e->e_gid));
			outptr += sizeof(bch_acl_entry);
			break;

		case ACL_USER_OBJ:
		case ACL_GROUP_OBJ:
		case ACL_MASK:
		case ACL_OTHER:
			outptr += sizeof(bch_acl_entry_short);
			break;
		}
	}

	BUG_ON(outptr != xattr_val(&xattr->v) + acl_len);

	return xattr;
}

struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,
			       struct dentry *dentry, int type)
{
	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c_xattr xattr;
	struct posix_acl *acl = NULL;

	bch2_trans_init(&trans, c);
retry:
	bch2_trans_begin(&trans);

	iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc,
			&inode->ei_str_hash, inode->v.i_ino,
			&X_SEARCH(acl_to_xattr_type(type), "", 0),
			0);
	if (IS_ERR(iter)) {
		if (PTR_ERR(iter) == -EINTR)
			goto retry;

		if (PTR_ERR(iter) != -ENOENT)
			acl = ERR_CAST(iter);
		goto out;
	}

	xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));

	acl = bch2_acl_from_disk(xattr_val(xattr.v),
			le16_to_cpu(xattr.v->x_val_len));

	if (!IS_ERR(acl))
		set_cached_acl(&inode->v, type, acl);
out:
	bch2_trans_exit(&trans);
	return acl;
}

int bch2_set_acl_trans(struct btree_trans *trans,
		       struct bch_inode_unpacked *inode_u,
		       const struct bch_hash_info *hash_info,
		       struct posix_acl *acl, int type)
{
	int ret;

	if (type == ACL_TYPE_DEFAULT &&
	    !S_ISDIR(inode_u->bi_mode))
		return acl ? -EACCES : 0;

	if (acl) {
		struct bkey_i_xattr *xattr =
			bch2_acl_to_xattr(trans, acl, type);
		if (IS_ERR(xattr))
			return PTR_ERR(xattr);

		ret = __bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
				      inode_u->bi_inum, &xattr->k_i, 0);
	} else {
		struct xattr_search_key search =
			X_SEARCH(acl_to_xattr_type(type), "", 0);

		ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info,
				       inode_u->bi_inum, &search);
	}

	return ret == -ENOENT ? 0 : ret;
}

static int inode_update_for_set_acl_fn(struct bch_inode_info *inode,
				       struct bch_inode_unpacked *bi,
				       void *p)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct timespec64 now = current_time(&inode->v);
	umode_t mode = (unsigned long) p;

	bi->bi_ctime = timespec_to_bch2_time(c, now);
	bi->bi_mode = mode;
	return 0;
}

int bch2_set_acl(struct mnt_idmap *idmap,
		 struct dentry *dentry,
		 struct posix_acl *acl, int type)
{
	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct btree_trans trans;
	struct bch_inode_unpacked inode_u;
	umode_t mode = inode->v.i_mode;
	int ret;

	if (type == ACL_TYPE_ACCESS && acl) {
		ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl);
		if (ret)
			return ret;
	}

	bch2_trans_init(&trans, c);
retry:
	bch2_trans_begin(&trans);

	ret = bch2_set_acl_trans(&trans,
			&inode->ei_inode,
			&inode->ei_str_hash,
			acl, type) ?:
		bch2_write_inode_trans(&trans, inode, &inode_u,
			inode_update_for_set_acl_fn,
			(void *)(unsigned long) mode) ?:
		bch2_trans_commit(&trans, NULL, NULL,
			&inode->ei_journal_seq,
			BTREE_INSERT_ATOMIC|
			BTREE_INSERT_NOUNLOCK);
	if (ret == -EINTR)
		goto retry;
	if (unlikely(ret))
		goto err;

	bch2_inode_update_after_write(c, inode, &inode_u,
				      ATTR_CTIME|ATTR_MODE);

	set_cached_acl(&inode->v, type, acl);
err:
	bch2_trans_exit(&trans);

	return ret;
}

int bch2_acl_chmod(struct btree_trans *trans,
		   struct bch_inode_info *inode,
		   umode_t mode,
		   struct posix_acl **new_acl)
{
	struct btree_iter *iter;
	struct bkey_s_c_xattr xattr;
	struct bkey_i_xattr *new;
	struct posix_acl *acl;
	int ret = 0;

	iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc,
			&inode->ei_str_hash, inode->v.i_ino,
			&X_SEARCH(BCH_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0),
			BTREE_ITER_INTENT);
	if (IS_ERR(iter))
		return PTR_ERR(iter) != -ENOENT ? PTR_ERR(iter) : 0;

	xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));

	acl = bch2_acl_from_disk(xattr_val(xattr.v),
			le16_to_cpu(xattr.v->x_val_len));
	if (IS_ERR_OR_NULL(acl))
		return PTR_ERR(acl);

	ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
	if (ret)
		goto err;

	new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto err;
	}

	bch2_trans_update(trans, iter, &new->k_i, 0);
	*new_acl = acl;
	acl = NULL;
err:
	kfree(acl);
	return ret;
}

#endif /* CONFIG_BCACHEFS_POSIX_ACL */
59 fs/bcachefs/acl.h Normal file
@@ -0,0 +1,59 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_ACL_H
#define _BCACHEFS_ACL_H

struct bch_inode_unpacked;
struct bch_hash_info;
struct bch_inode_info;
struct posix_acl;

#ifdef CONFIG_BCACHEFS_POSIX_ACL

#define BCH_ACL_VERSION	0x0001

typedef struct {
	__le16		e_tag;
	__le16		e_perm;
	__le32		e_id;
} bch_acl_entry;

typedef struct {
	__le16		e_tag;
	__le16		e_perm;
} bch_acl_entry_short;

typedef struct {
	__le32		a_version;
} bch_acl_header;

struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int);

int bch2_set_acl_trans(struct btree_trans *,
		       struct bch_inode_unpacked *,
		       const struct bch_hash_info *,
		       struct posix_acl *, int);
int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int);
int bch2_acl_chmod(struct btree_trans *, struct bch_inode_info *,
		   umode_t, struct posix_acl **);

#else

static inline int bch2_set_acl_trans(struct btree_trans *trans,
				     struct bch_inode_unpacked *inode_u,
				     const struct bch_hash_info *hash_info,
				     struct posix_acl *acl, int type)
{
	return 0;
}

static inline int bch2_acl_chmod(struct btree_trans *trans,
				 struct bch_inode_info *inode,
				 umode_t mode,
				 struct posix_acl **new_acl)
{
	return 0;
}

#endif /* CONFIG_BCACHEFS_POSIX_ACL */

#endif /* _BCACHEFS_ACL_H */
2205 fs/bcachefs/alloc.c Normal file
File diff suppressed because it is too large
141 fs/bcachefs/alloc.h Normal file
@@ -0,0 +1,141 @@
#ifndef _BCACHEFS_ALLOC_H
#define _BCACHEFS_ALLOC_H

#include "bcachefs.h"
#include "alloc_types.h"

struct bkey;
struct bch_dev;
struct bch_fs;
struct bch_devs_list;

const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);

#define bch2_bkey_alloc_ops (struct bkey_ops) {		\
	.key_invalid	= bch2_alloc_invalid,		\
	.val_to_text	= bch2_alloc_to_text,		\
}

struct dev_alloc_list {
	unsigned	nr;
	u8		devs[BCH_SB_MEMBERS_MAX];
};

struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *,
					 struct write_point *,
					 struct bch_devs_mask *);
void bch2_wp_rescale(struct bch_fs *, struct bch_dev *,
		     struct write_point *);

int bch2_alloc_read(struct bch_fs *, struct list_head *);
int bch2_alloc_replay_key(struct bch_fs *, struct bpos);

enum bucket_alloc_ret {
	ALLOC_SUCCESS		= 0,
	OPEN_BUCKETS_EMPTY	= -1,
	FREELIST_EMPTY		= -2,	/* Allocator thread not keeping up */
	NO_DEVICES		= -3,	/* -EROFS */
};

long bch2_bucket_alloc_new_fs(struct bch_dev *);

int bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve, bool,
		      struct closure *);

#define __writepoint_for_each_ptr(_wp, _ob, _i, _start)			\
	for ((_i) = (_start);						\
	     (_i) < (_wp)->nr_ptrs && ((_ob) = (_wp)->ptrs[_i], true);	\
	     (_i)++)

#define writepoint_for_each_ptr_all(_wp, _ob, _i)			\
	__writepoint_for_each_ptr(_wp, _ob, _i, 0)

#define writepoint_for_each_ptr(_wp, _ob, _i)				\
	__writepoint_for_each_ptr(_wp, _ob, _i, wp->first_ptr)

void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);

static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
	if (atomic_dec_and_test(&ob->pin))
		__bch2_open_bucket_put(c, ob);
}

static inline void bch2_open_bucket_put_refs(struct bch_fs *c, u8 *nr, u8 *refs)
{
	unsigned i;

	for (i = 0; i < *nr; i++)
		bch2_open_bucket_put(c, c->open_buckets + refs[i]);

	*nr = 0;
}

static inline void bch2_open_bucket_get(struct bch_fs *c,
					struct write_point *wp,
					u8 *nr, u8 *refs)
{
	struct open_bucket *ob;
	unsigned i;

	writepoint_for_each_ptr(wp, ob, i) {
		atomic_inc(&ob->pin);
		refs[(*nr)++] = ob - c->open_buckets;
	}
}

struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
					     unsigned,
					     struct write_point_specifier,
					     struct bch_devs_list *,
					     unsigned, unsigned,
					     enum alloc_reserve,
					     unsigned,
					     struct closure *);

void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
				    struct bkey_i_extent *, unsigned);
void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);

static inline void bch2_wake_allocator(struct bch_dev *ca)
{
	struct task_struct *p;

	rcu_read_lock();
	p = rcu_dereference(ca->alloc_thread);
	if (p)
		wake_up_process(p);
	rcu_read_unlock();
}

static inline struct write_point_specifier writepoint_hashed(unsigned long v)
{
	return (struct write_point_specifier) { .v = v | 1 };
}

static inline struct write_point_specifier writepoint_ptr(struct write_point *wp)
{
	return (struct write_point_specifier) { .v = (unsigned long) wp };
}

void bch2_recalc_capacity(struct bch_fs *);

void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);

void bch2_dev_allocator_stop(struct bch_dev *);
int bch2_dev_allocator_start(struct bch_dev *);

static inline void writepoint_init(struct write_point *wp,
				   enum bch_data_type type)
{
	mutex_init(&wp->lock);
	wp->type = type;
}

int bch2_alloc_write(struct bch_fs *);
int bch2_fs_allocator_start(struct bch_fs *);
void bch2_fs_allocator_init(struct bch_fs *);

#endif /* _BCACHEFS_ALLOC_H */
90 fs/bcachefs/alloc_types.h Normal file
@@ -0,0 +1,90 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_ALLOC_TYPES_H
#define _BCACHEFS_ALLOC_TYPES_H

#include <linux/mutex.h>
#include <linux/spinlock.h>

#include "clock_types.h"
#include "fifo.h"

/* There's two of these clocks, one for reads and one for writes: */
struct bucket_clock {
	/*
	 * "now" in (read/write) IO time - incremented whenever we do X amount
	 * of reads or writes.
	 *
	 * Goes with the bucket read/write prios: when we read or write to a
	 * bucket we reset the bucket's prio to the current hand; thus hand -
	 * prio = time since bucket was last read/written.
	 *
	 * The units are some amount (bytes/sectors) of data read/written, and
	 * the units can change on the fly if we need to rescale to fit
	 * everything in a u16 - your only guarantee is that the units are
	 * consistent.
	 */
	u16			hand;
	u16			max_last_io;

	int			rw;

	struct io_timer		rescale;
	struct mutex		lock;
};

/* There is one reserve for each type of btree, one for prios and gens
 * and one for moving GC */
enum alloc_reserve {
	RESERVE_ALLOC		= -1,
	RESERVE_BTREE		= 0,
	RESERVE_MOVINGGC	= 1,
	RESERVE_NONE		= 2,
	RESERVE_NR		= 3,
};

typedef FIFO(long)	alloc_fifo;

/* Enough for 16 cache devices, 2 tiers and some left over for pipelining */
#define OPEN_BUCKETS_COUNT	256
#define WRITE_POINT_COUNT	32

struct open_bucket {
	spinlock_t		lock;
	atomic_t		pin;
	u8			freelist;
	bool			valid;
	bool			on_partial_list;
	unsigned		sectors_free;
	struct bch_extent_ptr	ptr;
};

struct write_point {
	struct hlist_node	node;
	struct mutex		lock;
	u64			last_used;
	unsigned long		write_point;
	enum bch_data_type	type;

	u8			nr_ptrs;
	u8			first_ptr;

	/* calculated based on how many pointers we're actually going to use: */
	unsigned		sectors_free;

	struct open_bucket	*ptrs[BCH_REPLICAS_MAX * 2];
	u64			next_alloc[BCH_SB_MEMBERS_MAX];
};

struct write_point_specifier {
	unsigned long		v;
};

struct alloc_heap_entry {
	size_t			bucket;
	size_t			nr;
	unsigned long		key;
};

typedef HEAP(struct alloc_heap_entry) alloc_heap;

#endif /* _BCACHEFS_ALLOC_TYPES_H */
785 fs/bcachefs/bcachefs.h Normal file
@@ -0,0 +1,785 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_H
|
||||
#define _BCACHEFS_H
|
||||
|
||||
/*
|
||||
* SOME HIGH LEVEL CODE DOCUMENTATION:
|
||||
*
|
||||
* Bcache mostly works with cache sets, cache devices, and backing devices.
|
||||
*
|
||||
* Support for multiple cache devices hasn't quite been finished off yet, but
|
||||
* it's about 95% plumbed through. A cache set and its cache devices is sort of
|
||||
* like a md raid array and its component devices. Most of the code doesn't care
|
||||
* about individual cache devices, the main abstraction is the cache set.
|
||||
*
|
||||
* Multiple cache devices is intended to give us the ability to mirror dirty
|
||||
* cached data and metadata, without mirroring clean cached data.
|
||||
*
|
||||
* Backing devices are different, in that they have a lifetime independent of a
|
||||
* cache set. When you register a newly formatted backing device it'll come up
|
||||
* in passthrough mode, and then you can attach and detach a backing device from
|
||||
* a cache set at runtime - while it's mounted and in use. Detaching implicitly
|
||||
* invalidates any cached data for that backing device.
|
||||
*
|
||||
* A cache set can have multiple (many) backing devices attached to it.
|
||||
*
|
||||
* There's also flash only volumes - this is the reason for the distinction
|
||||
* between struct cached_dev and struct bcache_device. A flash only volume
|
||||
* works much like a bcache device that has a backing device, except the
|
||||
* "cached" data is always dirty. The end result is that we get thin
|
||||
* provisioning with very little additional code.
|
||||
*
|
||||
* Flash only volumes work but they're not production ready because the moving
|
||||
* garbage collector needs more work. More on that later.
|
||||
*
|
||||
* BUCKETS/ALLOCATION:
|
||||
*
|
||||
* Bcache is primarily designed for caching, which means that in normal
|
||||
* operation all of our available space will be allocated. Thus, we need an
|
||||
* efficient way of deleting things from the cache so we can write new things to
|
||||
* it.
|
||||
*
|
||||
* To do this, we first divide the cache device up into buckets. A bucket is the
|
||||
* unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+
|
||||
* works efficiently.
|
||||
*
|
||||
* Each bucket has a 16 bit priority, and an 8 bit generation associated with
|
||||
* it. The gens and priorities for all the buckets are stored contiguously and
|
||||
* packed on disk (in a linked list of buckets - aside from the superblock, all
|
||||
* of bcache's metadata is stored in buckets).
|
||||
*
|
||||
* The priority is used to implement an LRU. We reset a bucket's priority when
|
||||
* we allocate it or on cache it, and every so often we decrement the priority
|
||||
* of each bucket. It could be used to implement something more sophisticated,
|
||||
* if anyone ever gets around to it.
|
||||
*
|
||||
* The generation is used for invalidating buckets. Each pointer also has an 8
|
||||
* bit generation embedded in it; for a pointer to be considered valid, its gen
|
||||
* must match the gen of the bucket it points into. Thus, to reuse a bucket all
|
||||
* we have to do is increment its gen (and write its new gen to disk; we batch
|
||||
* this up).
|
||||
*
|
||||
* Bcache is entirely COW - we never write twice to a bucket, even buckets that
|
||||
* contain metadata (including btree nodes).
|
||||
*
|
||||
* THE BTREE:
|
||||
*
|
||||
* Bcache is in large part designed around the btree.
|
||||
*
|
||||
* At a high level, the btree is just an index of key -> ptr tuples.
|
||||
*
|
||||
* Keys represent extents, and thus have a size field. Keys also have a variable
|
||||
* number of pointers attached to them (potentially zero, which is handy for
|
||||
* invalidating the cache).
|
||||
*
|
||||
* The key itself is an inode:offset pair. The inode number corresponds to a
|
||||
* backing device or a flash only volume. The offset is the ending offset of the
|
||||
* extent within the inode - not the starting offset; this makes lookups
|
||||
* slightly more convenient.
|
||||
*
|
||||
* Pointers contain the cache device id, the offset on that device, and an 8 bit
|
||||
* generation number. More on the gen later.
|
||||
*
|
||||
* Index lookups are not fully abstracted - cache lookups in particular are
|
||||
* still somewhat mixed in with the btree code, but things are headed in that
|
||||
* direction.
|
||||
*
|
||||
* Updates are fairly well abstracted, though. There are two different ways of
|
||||
* updating the btree; insert and replace.
|
||||
*
|
||||
* BTREE_INSERT will just take a list of keys and insert them into the btree -
|
||||
* overwriting (possibly only partially) any extents they overlap with. This is
|
||||
* used to update the index after a write.
|
||||
*
|
||||
* BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
|
||||
* overwriting a key that matches another given key. This is used for inserting
|
||||
* data into the cache after a cache miss, and for background writeback, and for
|
||||
* the moving garbage collector.
|
||||
*
|
||||
* There is no "delete" operation; deleting things from the index is
|
||||
* accomplished by either by invalidating pointers (by incrementing a bucket's
|
||||
* gen) or by inserting a key with 0 pointers - which will overwrite anything
|
||||
* previously present at that location in the index.
|
||||
*
|
||||
* This means that there are always stale/invalid keys in the btree. They're
|
||||
* filtered out by the code that iterates through a btree node, and removed when
|
||||
* a btree node is rewritten.
|
||||
*
|
||||
* BTREE NODES:
|
||||
*
|
||||
* Our unit of allocation is a bucket, and we can't arbitrarily allocate and
|
||||
* free smaller than a bucket - so, that's how big our btree nodes are.
|
||||
*
|
||||
* (If buckets are really big we'll only use part of the bucket for a btree node
|
||||
* - no less than 1/4th - but a bucket still contains no more than a single
|
||||
* btree node. I'd actually like to change this, but for now we rely on the
|
||||
* bucket's gen for deleting btree nodes when we rewrite/split a node.)
|
||||
*
|
||||
* Anyways, btree nodes are big - big enough to be inefficient with a textbook
|
||||
* btree implementation.
|
||||
*
|
||||
* The way this is solved is that btree nodes are internally log structured; we
|
||||
* can append new keys to an existing btree node without rewriting it. This
|
||||
* means each set of keys we write is sorted, but the node is not.
|
||||
*
|
||||
* We maintain this log structure in memory - keeping 1Mb of keys sorted would
|
||||
* be expensive, and we have to distinguish between the keys we have written and
|
||||
* the keys we haven't. So to do a lookup in a btree node, we have to search
|
||||
* each sorted set. But we do merge written sets together lazily, so the cost of
|
||||
* these extra searches is quite low (normally most of the keys in a btree node
|
||||
* will be in one big set, and then there'll be one or two sets that are much
|
||||
* smaller).
|
||||
*
|
||||
* This log structure makes bcache's btree more of a hybrid between a
|
||||
* conventional btree and a compacting data structure, with some of the
|
||||
* advantages of both.
|
||||
*
|
||||
* GARBAGE COLLECTION:
|
||||
*
|
||||
* We can't just invalidate any bucket - it might contain dirty data or
|
||||
* metadata. If it once contained dirty data, other writes might overwrite it
|
||||
* later, leaving no valid pointers into that bucket in the index.
|
||||
*
|
||||
* Thus, the primary purpose of garbage collection is to find buckets to reuse.
|
||||
* It also counts how much valid data each bucket currently contains, so that
|
||||
* allocation can reuse buckets sooner when they've been mostly overwritten.
|
||||
*
|
||||
* It also does some things that are really internal to the btree
|
||||
* implementation. If a btree node contains pointers that are stale by more than
|
||||
* some threshold, it rewrites the btree node to avoid the bucket's generation
|
||||
* wrapping around. It also merges adjacent btree nodes if they're empty enough.
|
||||
*
|
||||
* THE JOURNAL:
|
||||
*
|
||||
* Bcache's journal is not necessary for consistency; we always strictly
|
||||
* order metadata writes so that the btree and everything else is consistent on
|
||||
* disk in the event of an unclean shutdown, and in fact bcache had writeback
|
||||
* caching (with recovery from unclean shutdown) before journalling was
|
||||
* implemented.
|
||||
*
|
||||
* Rather, the journal is purely a performance optimization; we can't complete a
|
||||
* write until we've updated the index on disk, otherwise the cache would be
|
||||
* inconsistent in the event of an unclean shutdown. This means that without the
|
||||
* journal, on random write workloads we constantly have to update all the leaf
|
||||
* nodes in the btree, and those writes will be mostly empty (appending at most
|
||||
* a few keys each) - highly inefficient in terms of amount of metadata writes,
|
||||
* and it puts more strain on the various btree resorting/compacting code.
|
||||
*
|
||||
* The journal is just a log of keys we've inserted; on startup we just reinsert
|
||||
* all the keys in the open journal entries. That means that when we're updating
|
||||
* a node in the btree, we can wait until a 4k block of keys fills up before
|
||||
* writing them out.
|
||||
*
|
||||
* For simplicity, we only journal updates to leaf nodes; updates to parent
|
||||
* nodes are rare enough (since our leaf nodes are huge) that it wasn't worth
|
||||
* the complexity to deal with journalling them (in particular, journal replay)
|
||||
* - updates to non leaf nodes just happen synchronously (see btree_split()).
|
||||
*/
|
||||
|
||||
#undef pr_fmt
|
||||
#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__
|
||||
|
||||
#include <linux/backing-dev-defs.h>
|
||||
#include <linux/bug.h>
|
||||
#include <linux/bio.h>
|
||||
#include <linux/closure.h>
|
||||
#include <linux/kobject.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/percpu-refcount.h>
|
||||
#include <linux/percpu-rwsem.h>
|
||||
#include <linux/rhashtable.h>
|
||||
#include <linux/rwsem.h>
|
||||
#include <linux/seqlock.h>
|
||||
#include <linux/shrinker.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/workqueue.h>
|
||||
#include <linux/zstd.h>
|
||||
|
||||
#include "bcachefs_format.h"
|
||||
#include "fifo.h"
|
||||
#include "opts.h"
|
||||
#include "util.h"
|
||||
|
||||
#define dynamic_fault(...) 0
|
||||
#define race_fault(...) 0
|
||||
|
||||
#define bch2_fs_init_fault(name) \
|
||||
dynamic_fault("bcachefs:bch_fs_init:" name)
|
||||
#define bch2_meta_read_fault(name) \
|
||||
dynamic_fault("bcachefs:meta:read:" name)
|
||||
#define bch2_meta_write_fault(name) \
|
||||
dynamic_fault("bcachefs:meta:write:" name)
|
||||
|
||||
#ifdef __KERNEL__
|
||||
#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name)
|
||||
#else
|
||||
#define bch2_fmt(_c, fmt) fmt "\n"
|
||||
#endif
|
||||
|
||||
#define bch_info(c, fmt, ...) \
|
||||
printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
|
||||
#define bch_notice(c, fmt, ...) \
|
||||
printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
|
||||
#define bch_warn(c, fmt, ...) \
|
||||
printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
|
||||
#define bch_err(c, fmt, ...) \
|
||||
printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
|
||||
|
||||
#define bch_verbose(c, fmt, ...) \
|
||||
do { \
|
||||
if ((c)->opts.verbose_recovery) \
|
||||
bch_info(c, fmt, ##__VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
#define pr_verbose_init(opts, fmt, ...) \
|
||||
do { \
|
||||
if (opt_get(opts, verbose_init)) \
|
||||
pr_info(fmt, ##__VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
/* Parameters that are useful for debugging, but should always be compiled in: */
|
||||
#define BCH_DEBUG_PARAMS_ALWAYS() \
|
||||
BCH_DEBUG_PARAM(key_merging_disabled, \
|
||||
"Disables merging of extents") \
|
||||
BCH_DEBUG_PARAM(btree_gc_always_rewrite, \
|
||||
"Causes mark and sweep to compact and rewrite every " \
|
||||
"btree node it traverses") \
|
||||
BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \
|
||||
"Disables rewriting of btree nodes during mark and sweep")\
|
||||
BCH_DEBUG_PARAM(btree_shrinker_disabled, \
|
||||
"Disables the shrinker callback for the btree node cache")
|
||||
|
||||
/* Parameters that should only be compiled in in debug mode: */
|
||||
#define BCH_DEBUG_PARAMS_DEBUG() \
|
||||
BCH_DEBUG_PARAM(expensive_debug_checks, \
|
||||
"Enables various runtime debugging checks that " \
|
||||
"significantly affect performance") \
|
||||
BCH_DEBUG_PARAM(debug_check_bkeys, \
|
||||
"Run bkey_debugcheck (primarily checking GC/allocation "\
|
||||
"information) when iterating over keys") \
|
||||
BCH_DEBUG_PARAM(verify_btree_ondisk, \
|
||||
"Reread btree nodes at various points to verify the " \
|
||||
"mergesort in the read path against modifications " \
|
||||
"done in memory") \
|
||||
BCH_DEBUG_PARAM(journal_seq_verify, \
|
||||
"Store the journal sequence number in the version " \
|
||||
"number of every btree key, and verify that btree " \
|
||||
"update ordering is preserved during recovery") \
|
||||
BCH_DEBUG_PARAM(inject_invalid_keys, \
|
||||
"Store the journal sequence number in the version " \
|
||||
"number of every btree key, and verify that btree " \
|
||||
"update ordering is preserved during recovery") \
|
||||
|
||||
#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL()
|
||||
#else
|
||||
#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
|
||||
#endif
|
||||
|
||||
#define BCH_TIME_STATS() \
|
||||
x(btree_node_mem_alloc) \
|
||||
x(btree_gc) \
|
||||
x(btree_split) \
|
||||
x(btree_sort) \
|
||||
x(btree_read) \
|
||||
x(btree_lock_contended_read) \
|
||||
x(btree_lock_contended_intent) \
|
||||
x(btree_lock_contended_write) \
|
||||
x(data_write) \
|
||||
x(data_read) \
|
||||
x(data_promote) \
|
||||
x(journal_write) \
|
||||
x(journal_delay) \
|
||||
x(journal_blocked) \
|
||||
x(journal_flush_seq)
|
||||
|
||||
enum bch_time_stats {
|
||||
#define x(name) BCH_TIME_##name,
|
||||
BCH_TIME_STATS()
|
||||
#undef x
|
||||
BCH_TIME_STAT_NR
|
||||
};
|
||||
|
||||
#include "alloc_types.h"
|
||||
#include "btree_types.h"
|
||||
#include "buckets_types.h"
|
||||
#include "clock_types.h"
|
||||
#include "journal_types.h"
|
||||
#include "keylist_types.h"
|
||||
#include "quota_types.h"
|
||||
#include "rebalance_types.h"
|
||||
#include "super_types.h"
|
||||
|
||||
/* Number of nodes btree coalesce will try to coalesce at once */
|
||||
#define GC_MERGE_NODES 4U
|
||||
|
||||
/* Maximum number of nodes we might need to allocate atomically: */
|
||||
#define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1))
|
||||
|
||||
/* Size of the freelist we allocate btree nodes from: */
|
||||
#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4)
|
||||
|
||||
struct btree;
|
||||
|
||||
enum gc_phase {
|
||||
GC_PHASE_START,
|
||||
GC_PHASE_SB,
|
||||
|
||||
#define DEF_BTREE_ID(kwd, val, name) GC_PHASE_BTREE_##kwd,
|
||||
DEFINE_BCH_BTREE_IDS()
|
||||
#undef DEF_BTREE_ID
|
||||
|
||||
GC_PHASE_PENDING_DELETE,
|
||||
GC_PHASE_ALLOC,
|
||||
GC_PHASE_DONE
|
||||
};
|
||||
|
||||
struct gc_pos {
|
||||
enum gc_phase phase;
|
||||
struct bpos pos;
|
||||
unsigned level;
|
||||
};
|
||||
|
||||
struct io_count {
|
||||
u64 sectors[2][BCH_DATA_NR];
|
||||
};
|
||||
|
||||
struct bch_dev {
|
||||
struct kobject kobj;
|
||||
struct percpu_ref ref;
|
||||
struct completion ref_completion;
|
||||
struct percpu_ref io_ref;
|
||||
struct completion io_ref_completion;
|
||||
|
||||
struct bch_fs *fs;
|
||||
|
||||
u8 dev_idx;
|
||||
/*
|
||||
* Cached version of this device's member info from superblock
|
||||
* Committed by bch2_write_super() -> bch_fs_mi_update()
|
||||
*/
|
||||
struct bch_member_cpu mi;
|
||||
__uuid_t uuid;
|
||||
char name[BDEVNAME_SIZE];
|
||||
|
||||
struct bch_sb_handle disk_sb;
|
||||
int sb_write_error;
|
||||
|
||||
struct bch_devs_mask self;
|
||||
|
||||
/* biosets used in cloned bios for writing multiple replicas */
|
||||
struct bio_set replica_set;
|
||||
|
||||
/*
|
||||
* Buckets:
|
||||
* Per-bucket arrays are protected by c->usage_lock, bucket_lock and
|
||||
* gc_lock, for device resize - holding any is sufficient for access:
|
||||
* Or rcu_read_lock(), but only for ptr_stale():
|
||||
*/
|
||||
struct bucket_array __rcu *buckets;
|
||||
unsigned long *buckets_dirty;
|
||||
/* most out of date gen in the btree */
|
||||
u8 *oldest_gens;
|
||||
struct rw_semaphore bucket_lock;
|
||||
|
||||
struct bch_dev_usage __percpu *usage_percpu;
|
||||
struct bch_dev_usage usage_cached;
|
||||
|
||||
/* Allocator: */
|
||||
struct task_struct __rcu *alloc_thread;
|
||||
|
||||
/*
|
||||
* free: Buckets that are ready to be used
|
||||
*
|
||||
* free_inc: Incoming buckets - these are buckets that currently have
|
||||
* cached data in them, and we can't reuse them until after we write
|
||||
* their new gen to disk. After prio_write() finishes writing the new
|
||||
* gens/prios, they'll be moved to the free list (and possibly discarded
|
||||
* in the process)
|
||||
*/
|
||||
alloc_fifo free[RESERVE_NR];
|
||||
alloc_fifo free_inc;
|
||||
spinlock_t freelist_lock;
|
||||
size_t nr_invalidated;
|
||||
|
||||
u8 open_buckets_partial[OPEN_BUCKETS_COUNT];
|
||||
unsigned open_buckets_partial_nr;
|
||||
|
||||
size_t fifo_last_bucket;
|
||||
|
||||
/* last calculated minimum prio */
|
||||
u16 max_last_bucket_io[2];
|
||||
|
||||
atomic_long_t saturated_count;
|
||||
size_t inc_gen_needs_gc;
|
||||
size_t inc_gen_really_needs_gc;
|
||||
u64 allocator_journal_seq_flush;
|
||||
bool allocator_invalidating_data;
|
||||
bool allocator_blocked;
|
||||
|
||||
alloc_heap alloc_heap;
|
||||
|
||||
/* Copying GC: */
|
||||
struct task_struct *copygc_thread;
|
||||
copygc_heap copygc_heap;
|
||||
struct bch_pd_controller copygc_pd;
|
||||
struct write_point copygc_write_point;
|
||||
|
||||
atomic64_t rebalance_work;
|
||||
|
||||
struct journal_device journal;
|
||||
|
||||
struct work_struct io_error_work;
|
||||
|
||||
/* The rest of this all shows up in sysfs */
|
||||
atomic64_t cur_latency[2];
|
||||
struct bch2_time_stats io_latency[2];
|
||||
|
||||
#define CONGESTED_MAX 1024
|
||||
atomic_t congested;
|
||||
u64 congested_last;
|
||||
|
||||
struct io_count __percpu *io_done;
|
||||
};
|
||||
|
||||
/*
|
||||
* Flag bits for what phase of startup/shutdown the cache set is at, how we're
|
||||
* shutting down, etc.:
|
||||
*
|
||||
* BCH_FS_UNREGISTERING means we're not just shutting down, we're detaching
|
||||
* all the backing devices first (their cached data gets invalidated, and they
|
||||
* won't automatically reattach).
|
||||
*/
|
||||
enum {
|
||||
/* startup: */
|
||||
BCH_FS_ALLOC_READ_DONE,
|
||||
BCH_FS_ALLOCATOR_STARTED,
|
||||
BCH_FS_INITIAL_GC_DONE,
|
||||
BCH_FS_FSCK_DONE,
|
||||
BCH_FS_STARTED,
|
||||
|
||||
/* shutdown: */
|
||||
BCH_FS_EMERGENCY_RO,
|
||||
BCH_FS_WRITE_DISABLE_COMPLETE,
|
||||
|
||||
/* errors: */
|
||||
BCH_FS_ERROR,
|
||||
BCH_FS_GC_FAILURE,
|
||||
|
||||
/* misc: */
|
||||
BCH_FS_BDEV_MOUNTED,
|
||||
BCH_FS_FSCK_FIXED_ERRORS,
|
||||
BCH_FS_FIXED_GENS,
|
||||
BCH_FS_REBUILD_REPLICAS,
|
||||
BCH_FS_HOLD_BTREE_WRITES,
|
||||
};
|
||||
|
||||
struct btree_debug {
|
||||
unsigned id;
|
||||
struct dentry *btree;
|
||||
struct dentry *btree_format;
|
||||
struct dentry *failed;
|
||||
};
|
||||
|
||||
enum bch_fs_state {
|
||||
BCH_FS_STARTING = 0,
|
||||
BCH_FS_STOPPING,
|
||||
BCH_FS_RO,
|
||||
BCH_FS_RW,
|
||||
};
|
||||
|
||||
struct bch_fs {
|
||||
struct closure cl;
|
||||
|
||||
struct list_head list;
|
||||
struct kobject kobj;
|
||||
struct kobject internal;
|
||||
struct kobject opts_dir;
|
||||
struct kobject time_stats;
|
||||
unsigned long flags;
|
||||
|
||||
int minor;
|
||||
struct device *chardev;
|
||||
struct super_block *vfs_sb;
|
||||
char name[40];
|
||||
|
||||
/* ro/rw, add/remove devices: */
|
||||
struct mutex state_lock;
|
||||
enum bch_fs_state state;
|
||||
|
||||
/* Counts outstanding writes, for clean transition to read-only */
|
||||
struct percpu_ref writes;
|
||||
struct work_struct read_only_work;
|
||||
|
||||
struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX];
|
||||
|
||||
struct bch_replicas_cpu __rcu *replicas;
|
||||
struct bch_replicas_cpu __rcu *replicas_gc;
|
||||
struct mutex replicas_gc_lock;
|
||||
|
||||
struct bch_disk_groups_cpu __rcu *disk_groups;
|
||||
|
||||
struct bch_opts opts;
|
||||
|
||||
/* Updated by bch2_sb_update():*/
|
||||
struct {
|
||||
__uuid_t uuid;
|
||||
__uuid_t user_uuid;
|
||||
|
||||
u16 encoded_extent_max;
|
||||
|
||||
u8 nr_devices;
|
||||
u8 clean;
|
||||
|
||||
u8 encryption_type;
|
||||
|
||||
u64 time_base_lo;
|
||||
u32 time_base_hi;
|
||||
u32 time_precision;
|
||||
u64 features;
|
||||
} sb;
|
||||
|
||||
struct bch_sb_handle disk_sb;
|
||||
|
||||
unsigned short block_bits; /* ilog2(block_size) */
|
||||
|
||||
u16 btree_foreground_merge_threshold;
|
||||
|
||||
struct closure sb_write;
|
||||
struct mutex sb_lock;
|
||||
|
||||
/* BTREE CACHE */
|
||||
struct bio_set btree_bio;
|
||||
|
||||
struct btree_root btree_roots[BTREE_ID_NR];
|
||||
bool btree_roots_dirty;
|
||||
struct mutex btree_root_lock;
|
||||
|
||||
struct btree_cache btree_cache;
|
||||
|
||||
mempool_t btree_reserve_pool;
|
||||
|
||||
/*
|
||||
* Cache of allocated btree nodes - if we allocate a btree node and
|
||||
* don't use it, if we free it that space can't be reused until going
|
||||
* _all_ the way through the allocator (which exposes us to a livelock
|
||||
* when allocating btree reserves fail halfway through) - instead, we
|
||||
* can stick them here:
|
||||
*/
|
||||
struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2];
|
||||
unsigned btree_reserve_cache_nr;
|
||||
struct mutex btree_reserve_cache_lock;
|
||||
|
||||
mempool_t btree_interior_update_pool;
|
||||
struct list_head btree_interior_update_list;
|
||||
struct mutex btree_interior_update_lock;
|
||||
struct closure_waitlist btree_interior_update_wait;
|
||||
|
||||
struct workqueue_struct *wq;
|
||||
/* copygc needs its own workqueue for index updates.. */
|
||||
struct workqueue_struct *copygc_wq;
|
||||
|
||||
/* ALLOCATION */
|
||||
struct delayed_work pd_controllers_update;
|
||||
unsigned pd_controllers_update_seconds;
|
||||
|
||||
struct bch_devs_mask rw_devs[BCH_DATA_NR];
|
||||
|
||||
u64 capacity; /* sectors */
|
||||
|
||||
/*
|
||||
* When capacity _decreases_ (due to a disk being removed), we
|
||||
* increment capacity_gen - this invalidates outstanding reservations
|
||||
* and forces them to be revalidated
|
||||
*/
|
||||
u32 capacity_gen;
|
||||
|
||||
atomic64_t sectors_available;
|
||||
|
||||
struct bch_fs_usage __percpu *usage_percpu;
|
||||
struct bch_fs_usage usage_cached;
|
||||
struct percpu_rw_semaphore usage_lock;
|
||||
|
||||
struct closure_waitlist freelist_wait;
|
||||
|
||||
/*
|
||||
* When we invalidate buckets, we use both the priority and the amount
|
||||
* of good data to determine which buckets to reuse first - to weight
|
||||
* those together consistently we keep track of the smallest nonzero
|
||||
* priority of any bucket.
|
||||
*/
|
||||
struct bucket_clock bucket_clock[2];
|
||||
|
||||
struct io_clock io_clock[2];
|
||||
|
||||
/* ALLOCATOR */
|
||||
spinlock_t freelist_lock;
|
||||
u8 open_buckets_freelist;
|
||||
u8 open_buckets_nr_free;
|
||||
struct closure_waitlist open_buckets_wait;
|
||||
struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];
|
||||
|
||||
struct write_point btree_write_point;
|
||||
struct write_point rebalance_write_point;
|
||||
|
||||
struct write_point write_points[WRITE_POINT_COUNT];
|
||||
struct hlist_head write_points_hash[WRITE_POINT_COUNT];
|
||||
struct mutex write_points_hash_lock;
|
||||
|
||||
/* GARBAGE COLLECTION */
|
||||
struct task_struct *gc_thread;
|
||||
atomic_t kick_gc;
|
||||
unsigned long gc_count;
|
||||
|
||||
/*
|
||||
* Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos]
|
||||
* has been marked by GC.
|
||||
*
|
||||
* gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.)
|
||||
*
|
||||
* gc_cur_phase == GC_PHASE_DONE indicates that gc is finished/not
|
||||
* currently running, and gc marks are currently valid
|
||||
*
|
||||
* Protected by gc_pos_lock. Only written to by GC thread, so GC thread
|
||||
* can read without a lock.
|
||||
*/
|
||||
seqcount_t gc_pos_lock;
|
||||
struct gc_pos gc_pos;
|
||||
|
||||
/*
|
||||
* The allocation code needs gc_mark in struct bucket to be correct, but
|
||||
* it's not while a gc is in progress.
|
||||
*/
|
||||
struct rw_semaphore gc_lock;
|
||||
|
||||
/* IO PATH */
|
||||
struct bio_set bio_read;
|
||||
struct bio_set bio_read_split;
|
||||
struct bio_set bio_write;
|
||||
struct mutex bio_bounce_pages_lock;
|
||||
mempool_t bio_bounce_pages;
|
||||
struct rhashtable promote_table;
|
||||
|
||||
mempool_t compression_bounce[2];
|
||||
mempool_t compress_workspace[BCH_COMPRESSION_NR];
|
||||
mempool_t decompress_workspace;
|
||||
ZSTD_parameters zstd_params;
|
||||
|
||||
struct crypto_shash *sha256;
|
||||
struct crypto_sync_skcipher *chacha20;
|
||||
struct crypto_shash *poly1305;
|
||||
|
||||
atomic64_t key_version;
|
||||
|
||||
/* REBALANCE */
|
||||
struct bch_fs_rebalance rebalance;
|
||||
|
||||
/* VFS IO PATH - fs-io.c */
|
||||
struct bio_set writepage_bioset;
|
||||
struct bio_set dio_write_bioset;
|
||||
struct bio_set dio_read_bioset;
|
||||
|
||||
struct bio_list btree_write_error_list;
|
||||
struct work_struct btree_write_error_work;
|
||||
spinlock_t btree_write_error_lock;
|
||||
|
||||
/* ERRORS */
|
||||
struct list_head fsck_errors;
|
||||
struct mutex fsck_error_lock;
|
||||
bool fsck_alloc_err;
|
||||
|
||||
/* FILESYSTEM */
|
||||
atomic_long_t nr_inodes;
|
||||
|
||||
/* QUOTAS */
|
||||
struct bch_memquota_type quotas[QTYP_NR];
|
||||
|
||||
/* DEBUG JUNK */
|
||||
struct dentry *debug;
|
||||
struct btree_debug btree_debug[BTREE_ID_NR];
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
struct btree *verify_data;
|
||||
struct btree_node *verify_ondisk;
|
||||
struct mutex verify_lock;
|
||||
#endif
|
||||
|
||||
u64 unused_inode_hint;
|
||||
|
||||
/*
|
||||
* A btree node on disk could have too many bsets for an iterator to fit
|
||||
* on the stack - have to dynamically allocate them
|
||||
*/
|
||||
mempool_t fill_iter;
|
||||
|
||||
mempool_t btree_bounce_pool;
|
||||
|
||||
struct journal journal;
|
||||
|
||||
unsigned bucket_journal_seq;
|
||||
|
||||
/* The rest of this all shows up in sysfs */
|
||||
atomic_long_t read_realloc_races;
|
||||
atomic_long_t extent_migrate_done;
|
||||
atomic_long_t extent_migrate_raced;
|
||||
|
||||
unsigned btree_gc_periodic:1;
|
||||
unsigned copy_gc_enabled:1;
|
||||
bool promote_whole_extents;
|
||||
|
||||
#define BCH_DEBUG_PARAM(name, description) bool name;
|
||||
BCH_DEBUG_PARAMS_ALL()
|
||||
#undef BCH_DEBUG_PARAM
|
||||
|
||||
struct bch2_time_stats times[BCH_TIME_STAT_NR];
|
||||
};
|
||||
|
||||
static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
|
||||
{
|
||||
#ifndef NO_BCACHEFS_FS
|
||||
if (c->vfs_sb)
|
||||
c->vfs_sb->s_bdi->ra_pages = ra_pages;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline bool bch2_fs_running(struct bch_fs *c)
|
||||
{
|
||||
return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
|
||||
}
|
||||
|
||||
static inline unsigned bucket_bytes(const struct bch_dev *ca)
|
||||
{
|
||||
return ca->mi.bucket_size << 9;
|
||||
}
|
||||
|
||||
static inline unsigned block_bytes(const struct bch_fs *c)
|
||||
{
|
||||
return c->opts.block_size << 9;
|
||||
}
|
||||
|
||||
static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, u64 time)
|
||||
{
|
||||
return ns_to_timespec64(time * c->sb.time_precision + c->sb.time_base_lo);
|
||||
}
|
||||
|
||||
static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts)
|
||||
{
|
||||
s64 ns = timespec64_to_ns(&ts) - c->sb.time_base_lo;
|
||||
|
||||
if (c->sb.time_precision == 1)
|
||||
return ns;
|
||||
|
||||
return div_s64(ns, c->sb.time_precision);
|
||||
}
|
||||
|
||||
static inline s64 bch2_current_time(struct bch_fs *c)
|
||||
{
|
||||
struct timespec64 now;
|
||||
|
||||
ktime_get_real_ts64(&now);
|
||||
return timespec_to_bch2_time(c, now);
|
||||
}
|
||||
|
||||
#endif /* _BCACHEFS_H */
|
1498 fs/bcachefs/bcachefs_format.h Normal file
File diff suppressed because it is too large
310 fs/bcachefs/bcachefs_ioctl.h Normal file
@@ -0,0 +1,310 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_IOCTL_H
|
||||
#define _BCACHEFS_IOCTL_H
|
||||
|
||||
#include <linux/uuid.h>
|
||||
#include <asm/ioctl.h>
|
||||
#include "bcachefs_format.h"
|
||||
|
||||
/*
|
||||
* Flags common to multiple ioctls:
|
||||
*/
|
||||
#define BCH_FORCE_IF_DATA_LOST (1 << 0)
|
||||
#define BCH_FORCE_IF_METADATA_LOST (1 << 1)
|
||||
#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2)
|
||||
#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3)
|
||||
|
||||
#define BCH_FORCE_IF_DEGRADED \
|
||||
(BCH_FORCE_IF_DATA_DEGRADED| \
|
||||
BCH_FORCE_IF_METADATA_DEGRADED)
|
||||
|
||||
/*
|
||||
* If cleared, ioctls that refer to a device pass it as a pointer to a pathname
|
||||
* (e.g. /dev/sda1); if set, the dev field is the device's index within the
|
||||
* filesystem:
|
||||
*/
|
||||
#define BCH_BY_INDEX (1 << 4)
|
||||
|
||||
/*
|
||||
* For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem
|
||||
* wide superblock:
|
||||
*/
|
||||
#define BCH_READ_DEV (1 << 5)
|
||||
|
||||
/* global control dev: */
|
||||
|
||||
/* These are currently broken, and probably unnecessary: */
|
||||
#if 0
|
||||
#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble)
|
||||
#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental)
|
||||
|
||||
struct bch_ioctl_assemble {
|
||||
__u32 flags;
|
||||
__u32 nr_devs;
|
||||
__u64 pad;
|
||||
__u64 devs[];
|
||||
};
|
||||
|
||||
struct bch_ioctl_incremental {
|
||||
__u32 flags;
|
||||
__u64 pad;
|
||||
__u64 dev;
|
||||
};
|
||||
#endif
|
||||
|
||||
/* filesystem ioctls: */
|
||||
|
||||
#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid)
|
||||
|
||||
/* These only make sense when we also have incremental assembly */
|
||||
#if 0
|
||||
#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start)
|
||||
#define BCH_IOCTL_STOP _IO(0xbc, 3)
|
||||
#endif
|
||||
|
||||
#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk)
|
||||
#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk)
|
||||
#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk)
|
||||
#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk)
|
||||
#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state)
|
||||
#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data)
|
||||
#define BCH_IOCTL_USAGE _IOWR(0xbc, 11, struct bch_ioctl_usage)
|
||||
#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super)
|
||||
#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx)
|
||||
#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 13, struct bch_ioctl_disk_resize)
|
||||
|
||||
/*
|
||||
* BCH_IOCTL_QUERY_UUID: get filesystem UUID
|
||||
*
|
||||
* Returns user visible UUID, not internal UUID (which may not ever be changed);
|
||||
* the filesystem's sysfs directory may be found under /sys/fs/bcachefs with
|
||||
* this UUID.
|
||||
*/
|
||||
struct bch_ioctl_query_uuid {
|
||||
__uuid_t uuid;
|
||||
};
|
||||
|
||||
#if 0
|
||||
struct bch_ioctl_start {
|
||||
__u32 flags;
|
||||
__u32 pad;
|
||||
};
|
||||
#endif
|
||||
|
||||
/*
|
||||
* BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem
|
||||
*
|
||||
* The specified device must not be open or in use. On success, the new device
|
||||
* will be an online member of the filesystem just like any other member.
|
||||
*
|
||||
* The device must first be prepared by userspace by formatting with a bcachefs
|
||||
* superblock, which is only used for passing in superblock options/parameters
|
||||
* for that device (in struct bch_member). The new device's superblock should
|
||||
* not claim to be a member of any existing filesystem - UUIDs on it will be
|
||||
* ignored.
|
||||
*/
|
||||
|
||||
/*
|
||||
* BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem
|
||||
*
|
||||
* Any data present on @dev will be permanently deleted, and @dev will be
|
||||
* removed from its slot in the filesystem's list of member devices. The device
|
||||
* may be either online or offline.
|
||||
*
|
||||
* Will fail if removing @dev would leave us with insufficient read write devices
|
||||
* or degraded/unavailable data, unless the appropriate BCH_FORCE_IF_* flags are
|
||||
* set.
|
||||
*/
|
||||
|
||||
/*
|
||||
* BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem
|
||||
* but is not open (e.g. because we started in degraded mode), bring it online
|
||||
*
|
||||
* all existing data on @dev will be available once the device is online,
|
||||
* exactly as if @dev was present when the filesystem was first mounted
|
||||
*/
|
||||
|
||||
/*
|
||||
* BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that
|
||||
* block device, without removing it from the filesystem (so it can be brought
|
||||
* back online later)
|
||||
*
|
||||
* Data present on @dev will be unavailable while @dev is offline (unless
|
||||
* replicated), but will still be intact and untouched if @dev is brought back
|
||||
* online
|
||||
*
|
||||
* Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would
|
||||
* leave us with insufficient read write devices or degraded/unavailable data,
|
||||
* unless the appropriate BCH_FORCE_IF_* flags are set.
|
||||
*/
|
||||
|
||||
struct bch_ioctl_disk {
|
||||
__u32 flags;
|
||||
__u32 pad;
|
||||
__u64 dev;
|
||||
};
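/*
 * A userspace sketch of the above, assuming @fs_fd accepts these ioctls:
 * offline member device 0 by index, even if that leaves data degraded.
 */
#if 0
static int offline_dev0_sketch(int fs_fd)
{
	struct bch_ioctl_disk i = {
		.flags	= BCH_BY_INDEX|BCH_FORCE_IF_DEGRADED,
		.dev	= 0,	/* index within the filesystem, per BCH_BY_INDEX */
	};

	return ioctl(fs_fd, BCH_IOCTL_DISK_OFFLINE, &i);
}
#endif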
|
||||
|
||||
/*
|
||||
* BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
|
||||
*
|
||||
* @new_state - one of the bch_member_state states (rw, ro, failed,
|
||||
* spare)
|
||||
*
|
||||
* Will refuse to change member state if we would then have insufficient devices
|
||||
* to write to, or if it would result in degraded data (when @new_state is
|
||||
* failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set.
|
||||
*/
|
||||
struct bch_ioctl_disk_set_state {
|
||||
__u32 flags;
|
||||
__u8 new_state;
|
||||
__u8 pad[3];
|
||||
__u64 dev;
|
||||
};
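/*
 * Sketch of setting a member read-only by pathname (BCH_BY_INDEX not set, so
 * @dev carries a pointer); the state constant is assumed to be the RO value
 * of enum bch_member_state from bcachefs_format.h:
 */
#if 0
static int set_dev_ro_sketch(int fs_fd, const char *path)
{
	struct bch_ioctl_disk_set_state i = {
		.new_state	= BCH_MEMBER_STATE_RO,	/* assumed constant name */
		.dev		= (__u64) (unsigned long) path,
	};

	return ioctl(fs_fd, BCH_IOCTL_DISK_SET_STATE, &i);
}
#endif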
|
||||
|
||||
enum bch_data_ops {
|
||||
BCH_DATA_OP_SCRUB = 0,
|
||||
BCH_DATA_OP_REREPLICATE = 1,
|
||||
BCH_DATA_OP_MIGRATE = 2,
|
||||
BCH_DATA_OP_NR = 3,
|
||||
};
|
||||
|
||||
/*
|
||||
* BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g.
|
||||
* scrub, rereplicate, migrate).
|
||||
*
|
||||
* This ioctl kicks off a job in the background, and returns a file descriptor.
|
||||
* Reading from the file descriptor returns a struct bch_ioctl_data_event,
|
||||
* indicating current progress, and closing the file descriptor will stop the
|
||||
* job. The file descriptor is O_CLOEXEC.
|
||||
*/
|
||||
struct bch_ioctl_data {
|
||||
__u32 op;
|
||||
__u32 flags;
|
||||
|
||||
struct bpos start;
|
||||
struct bpos end;
|
||||
|
||||
union {
|
||||
struct {
|
||||
__u32 dev;
|
||||
__u32 pad;
|
||||
} migrate;
|
||||
struct {
|
||||
__u64 pad[8];
|
||||
};
|
||||
};
|
||||
} __attribute__((packed, aligned(8)));
|
||||
|
||||
enum bch_data_event {
|
||||
BCH_DATA_EVENT_PROGRESS = 0,
|
||||
/* XXX: add an event for reporting errors */
|
||||
BCH_DATA_EVENT_NR = 1,
|
||||
};
|
||||
|
||||
struct bch_ioctl_data_progress {
|
||||
__u8 data_type;
|
||||
__u8 btree_id;
|
||||
__u8 pad[2];
|
||||
struct bpos pos;
|
||||
|
||||
__u64 sectors_done;
|
||||
__u64 sectors_total;
|
||||
} __attribute__((packed, aligned(8)));
|
||||
|
||||
struct bch_ioctl_data_event {
|
||||
__u8 type;
|
||||
__u8 pad[7];
|
||||
union {
|
||||
struct bch_ioctl_data_progress p;
|
||||
__u64 pad2[15];
|
||||
};
|
||||
} __attribute__((packed, aligned(8)));
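/*
 * Sketch of driving a data job from userspace, assuming @fs_fd accepts these
 * ioctls and POS_MIN/POS_MAX are the bpos constants from bcachefs_format.h:
 * start a rereplicate over the whole keyspace, sample progress once, then
 * stop the job by closing its file descriptor.
 */
#if 0
static __u64 rereplicate_sketch(int fs_fd)
{
	struct bch_ioctl_data job = {
		.op	= BCH_DATA_OP_REREPLICATE,
		.start	= POS_MIN,
		.end	= POS_MAX,
	};
	struct bch_ioctl_data_event e;
	int job_fd = ioctl(fs_fd, BCH_IOCTL_DATA, &job);

	if (job_fd < 0)
		return 0;

	/* each read() reports current progress; one sample is taken here: */
	if (read(job_fd, &e, sizeof(e)) != sizeof(e) ||
	    e.type != BCH_DATA_EVENT_PROGRESS)
		e.p.sectors_done = 0;

	close(job_fd);		/* closing the fd stops the job */
	return e.p.sectors_done;
}
#endif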
|
||||
|
||||
struct bch_ioctl_dev_usage {
|
||||
__u8 state;
|
||||
__u8 alive;
|
||||
__u8 pad[6];
|
||||
__u32 dev;
|
||||
|
||||
__u32 bucket_size;
|
||||
__u64 nr_buckets;
|
||||
|
||||
__u64 buckets[BCH_DATA_NR];
|
||||
__u64 sectors[BCH_DATA_NR];
|
||||
};
|
||||
|
||||
struct bch_ioctl_fs_usage {
|
||||
__u64 capacity;
|
||||
__u64 used;
|
||||
__u64 online_reserved;
|
||||
__u64 persistent_reserved[BCH_REPLICAS_MAX];
|
||||
__u64 sectors[BCH_DATA_NR][BCH_REPLICAS_MAX];
|
||||
};
|
||||
|
||||
/*
|
||||
* BCH_IOCTL_USAGE: query filesystem disk space usage
|
||||
*
|
||||
* Returns disk space usage broken out by data type, number of replicas, and
|
||||
* by component device
|
||||
*
|
||||
* @nr_devices - number of devices userspace allocated space for in @devs
|
||||
*
|
||||
* On success, @fs and @devs will be filled out appropriately and devs[i].alive
|
||||
* will indicate if a device was present in that slot
|
||||
*
|
||||
* Returns -ERANGE if @nr_devices was too small
|
||||
*/
|
||||
struct bch_ioctl_usage {
|
||||
__u16 nr_devices;
|
||||
__u16 pad[3];
|
||||
|
||||
struct bch_ioctl_fs_usage fs;
|
||||
struct bch_ioctl_dev_usage devs[0];
|
||||
};
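/*
 * Sketch of querying usage from userspace: the caller sizes devs[] and
 * retries with a larger @nr if the ioctl fails with ERANGE.
 */
#if 0
static struct bch_ioctl_usage *query_usage_sketch(int fs_fd, unsigned nr)
{
	struct bch_ioctl_usage *u =
		calloc(1, sizeof(*u) + nr * sizeof(u->devs[0]));

	if (!u)
		return NULL;

	u->nr_devices = nr;

	if (ioctl(fs_fd, BCH_IOCTL_USAGE, u)) {
		free(u);	/* retry with a larger @nr if errno == ERANGE */
		return NULL;
	}

	return u;
}
#endif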
|
||||
|
||||
/*
|
||||
* BCH_IOCTL_READ_SUPER: read filesystem superblock
|
||||
*
|
||||
* Equivalent to reading the superblock directly from the block device, except
|
||||
* avoids racing with the kernel writing the superblock or having to figure out
|
||||
* which block device to read
|
||||
*
|
||||
* @sb - buffer to read into
|
||||
* @size - size of userspace allocated buffer
|
||||
* @dev - device to read superblock for, if BCH_READ_DEV flag is
|
||||
* specified
|
||||
*
|
||||
* Returns -ERANGE if buffer provided is too small
|
||||
*/
|
||||
struct bch_ioctl_read_super {
|
||||
__u32 flags;
|
||||
__u32 pad;
|
||||
__u64 dev;
|
||||
__u64 size;
|
||||
__u64 sb;
|
||||
};
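/*
 * Sketch of reading the filesystem-wide superblock into a caller supplied
 * buffer; setting BCH_READ_DEV|BCH_BY_INDEX in .flags and an index in .dev
 * would return that device's superblock instead.
 */
#if 0
static int read_super_sketch(int fs_fd, void *buf, size_t len)
{
	struct bch_ioctl_read_super i = {
		.size	= len,
		.sb	= (__u64) (unsigned long) buf,
	};

	/* fails with ERANGE if @len is too small for the superblock */
	return ioctl(fs_fd, BCH_IOCTL_READ_SUPER, &i);
}
#endif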
|
||||
|
||||
/*
|
||||
* BCH_IOCTL_DISK_GET_IDX: given a path to a block device, query the filesystem to
|
||||
* determine if the disk is an (online) member - if so, returns the device's index
|
||||
*
|
||||
* Returns -ENOENT if not found
|
||||
*/
|
||||
struct bch_ioctl_disk_get_idx {
|
||||
__u64 dev;
|
||||
};
|
||||
|
||||
/*
|
||||
* BCH_IOCTL_DISK_RESIZE: resize filesystem on a device
|
||||
*
|
||||
* @dev - member to resize
|
||||
* @nbuckets - new number of buckets
|
||||
*/
|
||||
struct bch_ioctl_disk_resize {
|
||||
__u32 flags;
|
||||
__u32 pad;
|
||||
__u64 dev;
|
||||
__u64 nbuckets;
|
||||
};
|
||||
|
||||
#endif /* _BCACHEFS_IOCTL_H */
|
1164
fs/bcachefs/bkey.c
Normal file
1164
fs/bcachefs/bkey.c
Normal file
File diff suppressed because it is too large
627
fs/bcachefs/bkey.h
Normal file
627
fs/bcachefs/bkey.h
Normal file
@ -0,0 +1,627 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_BKEY_H
|
||||
#define _BCACHEFS_BKEY_H
|
||||
|
||||
#include <linux/bug.h>
|
||||
#include "bcachefs_format.h"
|
||||
|
||||
#include "util.h"
|
||||
#include "vstructs.h"
|
||||
|
||||
#if 0
|
||||
|
||||
/*
|
||||
* compiled unpack functions are disabled, pending a new interface for
|
||||
* dynamically allocating executable memory:
|
||||
*/
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
#define HAVE_BCACHEFS_COMPILED_UNPACK 1
|
||||
#endif
|
||||
#endif
|
||||
|
||||
void bch2_to_binary(char *, const u64 *, unsigned);
|
||||
|
||||
/* bkey with split value, const */
|
||||
struct bkey_s_c {
|
||||
const struct bkey *k;
|
||||
const struct bch_val *v;
|
||||
};
|
||||
|
||||
/* bkey with split value */
|
||||
struct bkey_s {
|
||||
union {
|
||||
struct {
|
||||
struct bkey *k;
|
||||
struct bch_val *v;
|
||||
};
|
||||
struct bkey_s_c s_c;
|
||||
};
|
||||
};
|
||||
|
||||
#define bkey_next(_k) vstruct_next(_k)
|
||||
|
||||
static inline unsigned bkey_val_u64s(const struct bkey *k)
|
||||
{
|
||||
return k->u64s - BKEY_U64s;
|
||||
}
|
||||
|
||||
static inline size_t bkey_val_bytes(const struct bkey *k)
|
||||
{
|
||||
return bkey_val_u64s(k) * sizeof(u64);
|
||||
}
|
||||
|
||||
static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
|
||||
{
|
||||
k->u64s = BKEY_U64s + val_u64s;
|
||||
}
|
||||
|
||||
static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
|
||||
{
|
||||
k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64));
|
||||
}
|
||||
|
||||
#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_DELETED)
|
||||
|
||||
#define bkey_whiteout(_k) \
|
||||
((_k)->type == KEY_TYPE_DELETED || (_k)->type == KEY_TYPE_DISCARD)
|
||||
|
||||
#define bkey_packed_typecheck(_k) \
|
||||
({ \
|
||||
BUILD_BUG_ON(!type_is(_k, struct bkey *) && \
|
||||
!type_is(_k, struct bkey_packed *)); \
|
||||
type_is(_k, struct bkey_packed *); \
|
||||
})
|
||||
|
||||
enum bkey_lr_packed {
|
||||
BKEY_PACKED_BOTH,
|
||||
BKEY_PACKED_RIGHT,
|
||||
BKEY_PACKED_LEFT,
|
||||
BKEY_PACKED_NONE,
|
||||
};
|
||||
|
||||
#define bkey_lr_packed_typecheck(_l, _r) \
|
||||
(!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1))
|
||||
|
||||
#define bkey_lr_packed(_l, _r) \
|
||||
((_l)->format + ((_r)->format << 1))
|
||||
|
||||
#define bkey_copy(_dst, _src) \
|
||||
do { \
|
||||
BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) && \
|
||||
!type_is(_dst, struct bkey_packed *)); \
|
||||
BUILD_BUG_ON(!type_is(_src, struct bkey_i *) && \
|
||||
!type_is(_src, struct bkey_packed *)); \
|
||||
EBUG_ON((u64 *) (_dst) > (u64 *) (_src) && \
|
||||
(u64 *) (_dst) < (u64 *) (_src) + \
|
||||
((struct bkey *) (_src))->u64s); \
|
||||
\
|
||||
__memmove_u64s_down((_dst), (_src), \
|
||||
((struct bkey *) (_src))->u64s); \
|
||||
} while (0)
|
||||
|
||||
struct btree;
|
||||
|
||||
struct bkey_format_state {
|
||||
u64 field_min[BKEY_NR_FIELDS];
|
||||
u64 field_max[BKEY_NR_FIELDS];
|
||||
};
|
||||
|
||||
void bch2_bkey_format_init(struct bkey_format_state *);
|
||||
void bch2_bkey_format_add_key(struct bkey_format_state *, const struct bkey *);
|
||||
void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
|
||||
struct bkey_format bch2_bkey_format_done(struct bkey_format_state *);
|
||||
const char *bch2_bkey_format_validate(struct bkey_format *);
|
||||
|
||||
__pure
|
||||
unsigned bch2_bkey_greatest_differing_bit(const struct btree *,
|
||||
const struct bkey_packed *,
|
||||
const struct bkey_packed *);
|
||||
__pure
|
||||
unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *);
|
||||
|
||||
__pure
|
||||
int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *,
|
||||
const struct bkey_packed *,
|
||||
const struct btree *);
|
||||
|
||||
__pure
|
||||
int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *,
|
||||
const struct bkey_packed *,
|
||||
const struct bpos *);
|
||||
|
||||
__pure
|
||||
int __bch2_bkey_cmp_packed(const struct bkey_packed *,
|
||||
const struct bkey_packed *,
|
||||
const struct btree *);
|
||||
|
||||
__pure
|
||||
int __bch2_bkey_cmp_left_packed(const struct btree *,
|
||||
const struct bkey_packed *,
|
||||
const struct bpos *);
|
||||
|
||||
static inline __pure
|
||||
int bkey_cmp_left_packed(const struct btree *b,
|
||||
const struct bkey_packed *l, const struct bpos *r)
|
||||
{
|
||||
return __bch2_bkey_cmp_left_packed(b, l, r);
|
||||
}
|
||||
|
||||
/*
|
||||
* we prefer to pass bpos by ref, but it's often enough terribly convenient to
|
||||
* pass it by val... as much as I hate c++, const ref would be nice here:
|
||||
*/
|
||||
__pure __flatten
|
||||
static inline int bkey_cmp_left_packed_byval(const struct btree *b,
|
||||
const struct bkey_packed *l,
|
||||
struct bpos r)
|
||||
{
|
||||
return bkey_cmp_left_packed(b, l, &r);
|
||||
}
|
||||
|
||||
/*
|
||||
* If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to
|
||||
* skip dispatching on k->format:
|
||||
*/
|
||||
#define bkey_cmp_packed(_b, _l, _r) \
|
||||
({ \
|
||||
int _cmp; \
|
||||
\
|
||||
switch (bkey_lr_packed_typecheck(_l, _r)) { \
|
||||
case BKEY_PACKED_NONE: \
|
||||
_cmp = bkey_cmp(((struct bkey *) (_l))->p, \
|
||||
((struct bkey *) (_r))->p); \
|
||||
break; \
|
||||
case BKEY_PACKED_LEFT: \
|
||||
_cmp = bkey_cmp_left_packed((_b), \
|
||||
(struct bkey_packed *) (_l), \
|
||||
&((struct bkey *) (_r))->p); \
|
||||
break; \
|
||||
case BKEY_PACKED_RIGHT: \
|
||||
_cmp = -bkey_cmp_left_packed((_b), \
|
||||
(struct bkey_packed *) (_r), \
|
||||
&((struct bkey *) (_l))->p); \
|
||||
break; \
|
||||
case BKEY_PACKED_BOTH: \
|
||||
_cmp = __bch2_bkey_cmp_packed((void *) (_l), \
|
||||
(void *) (_r), (_b)); \
|
||||
break; \
|
||||
} \
|
||||
_cmp; \
|
||||
})
|
||||
|
||||
#if 1
|
||||
static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
|
||||
{
|
||||
if (l.inode != r.inode)
|
||||
return l.inode < r.inode ? -1 : 1;
|
||||
if (l.offset != r.offset)
|
||||
return l.offset < r.offset ? -1 : 1;
|
||||
if (l.snapshot != r.snapshot)
|
||||
return l.snapshot < r.snapshot ? -1 : 1;
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
int bkey_cmp(struct bpos l, struct bpos r);
|
||||
#endif
|
||||
|
||||
static inline struct bpos bpos_min(struct bpos l, struct bpos r)
|
||||
{
|
||||
return bkey_cmp(l, r) < 0 ? l : r;
|
||||
}
|
||||
|
||||
void bch2_bpos_swab(struct bpos *);
|
||||
void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
|
||||
|
||||
static __always_inline int bversion_cmp(struct bversion l, struct bversion r)
|
||||
{
|
||||
return (l.hi > r.hi) - (l.hi < r.hi) ?:
|
||||
(l.lo > r.lo) - (l.lo < r.lo);
|
||||
}
|
||||
|
||||
#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 })
|
||||
#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL })
|
||||
|
||||
static __always_inline int bversion_zero(struct bversion v)
|
||||
{
|
||||
return !bversion_cmp(v, ZERO_VERSION);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
/* statement expressions confusing unlikely()? */
|
||||
#define bkey_packed(_k) \
|
||||
({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \
|
||||
(_k)->format != KEY_FORMAT_CURRENT; })
|
||||
#else
|
||||
#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* It's safe to treat an unpacked bkey as a packed one, but not the reverse
|
||||
*/
|
||||
static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k)
|
||||
{
|
||||
return (struct bkey_packed *) k;
|
||||
}
|
||||
|
||||
static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k)
|
||||
{
|
||||
return (const struct bkey_packed *) k;
|
||||
}
|
||||
|
||||
static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k)
|
||||
{
|
||||
return bkey_packed(k) ? NULL : (struct bkey_i *) k;
|
||||
}
|
||||
|
||||
static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k)
|
||||
{
|
||||
return bkey_packed(k) ? NULL : (const struct bkey *) k;
|
||||
}
|
||||
|
||||
static inline unsigned bkey_format_key_bits(const struct bkey_format *format)
|
||||
{
|
||||
return format->bits_per_field[BKEY_FIELD_INODE] +
|
||||
format->bits_per_field[BKEY_FIELD_OFFSET] +
|
||||
format->bits_per_field[BKEY_FIELD_SNAPSHOT];
|
||||
}
|
||||
|
||||
static inline struct bpos bkey_successor(struct bpos p)
|
||||
{
|
||||
struct bpos ret = p;
|
||||
|
||||
if (!++ret.offset)
|
||||
BUG_ON(!++ret.inode);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline struct bpos bkey_predecessor(struct bpos p)
|
||||
{
|
||||
struct bpos ret = p;
|
||||
|
||||
if (!ret.offset--)
|
||||
BUG_ON(!ret.inode--);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline u64 bkey_start_offset(const struct bkey *k)
|
||||
{
|
||||
return k->p.offset - k->size;
|
||||
}
|
||||
|
||||
static inline struct bpos bkey_start_pos(const struct bkey *k)
|
||||
{
|
||||
return (struct bpos) {
|
||||
.inode = k->p.inode,
|
||||
.offset = bkey_start_offset(k),
|
||||
.snapshot = k->p.snapshot,
|
||||
};
|
||||
}
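/*
 * Note: for extents, k->p is the *end* position and k->size the length in
 * sectors - e.g. an extent with p.offset == 24 and size == 8 covers sectors
 * [16, 24), so bkey_start_offset() returns 16.
 */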
|
||||
|
||||
/* Packed helpers */
|
||||
|
||||
static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
|
||||
const struct bkey_packed *k)
|
||||
{
|
||||
unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s;
|
||||
|
||||
EBUG_ON(k->u64s < ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
|
||||
const struct bkey_packed *k)
|
||||
{
|
||||
return bkeyp_key_u64s(format, k) * sizeof(u64);
|
||||
}
|
||||
|
||||
static inline unsigned bkeyp_val_u64s(const struct bkey_format *format,
|
||||
const struct bkey_packed *k)
|
||||
{
|
||||
return k->u64s - bkeyp_key_u64s(format, k);
|
||||
}
|
||||
|
||||
static inline size_t bkeyp_val_bytes(const struct bkey_format *format,
|
||||
const struct bkey_packed *k)
|
||||
{
|
||||
return bkeyp_val_u64s(format, k) * sizeof(u64);
|
||||
}
|
||||
|
||||
static inline void set_bkeyp_val_u64s(const struct bkey_format *format,
|
||||
struct bkey_packed *k, unsigned val_u64s)
|
||||
{
|
||||
k->u64s = bkeyp_key_u64s(format, k) + val_u64s;
|
||||
}
|
||||
|
||||
#define bkeyp_val(_format, _k) \
|
||||
((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k)))
|
||||
|
||||
extern const struct bkey_format bch2_bkey_format_current;
|
||||
|
||||
bool bch2_bkey_transform(const struct bkey_format *,
|
||||
struct bkey_packed *,
|
||||
const struct bkey_format *,
|
||||
const struct bkey_packed *);
|
||||
|
||||
struct bkey __bch2_bkey_unpack_key(const struct bkey_format *,
|
||||
const struct bkey_packed *);
|
||||
|
||||
#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
|
||||
struct bpos __bkey_unpack_pos(const struct bkey_format *,
|
||||
const struct bkey_packed *);
|
||||
#endif
|
||||
|
||||
bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *,
|
||||
const struct bkey_format *);
|
||||
|
||||
enum bkey_pack_pos_ret {
|
||||
BKEY_PACK_POS_EXACT,
|
||||
BKEY_PACK_POS_SMALLER,
|
||||
BKEY_PACK_POS_FAIL,
|
||||
};
|
||||
|
||||
enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos,
|
||||
const struct btree *);
|
||||
|
||||
static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in,
|
||||
const struct btree *b)
|
||||
{
|
||||
return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT;
|
||||
}
|
||||
|
||||
void bch2_bkey_unpack(const struct btree *, struct bkey_i *,
|
||||
const struct bkey_packed *);
|
||||
bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *,
|
||||
const struct bkey_format *);
|
||||
|
||||
static inline u64 bkey_field_max(const struct bkey_format *f,
|
||||
enum bch_bkey_fields nr)
|
||||
{
|
||||
return f->bits_per_field[nr] < 64
|
||||
? (le64_to_cpu(f->field_offset[nr]) +
|
||||
~(~0ULL << f->bits_per_field[nr]))
|
||||
: U64_MAX;
|
||||
}
|
||||
|
||||
#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
|
||||
|
||||
int bch2_compile_bkey_format(const struct bkey_format *, void *);
|
||||
|
||||
#else
|
||||
|
||||
static inline int bch2_compile_bkey_format(const struct bkey_format *format,
|
||||
void *out) { return 0; }
|
||||
|
||||
#endif
|
||||
|
||||
static inline void bkey_reassemble(struct bkey_i *dst,
|
||||
struct bkey_s_c src)
|
||||
{
|
||||
BUG_ON(bkey_packed(src.k));
|
||||
dst->k = *src.k;
|
||||
memcpy_u64s(&dst->v, src.v, bkey_val_u64s(src.k));
|
||||
}
|
||||
|
||||
#define bkey_s_null ((struct bkey_s) { .k = NULL })
|
||||
#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL })
|
||||
|
||||
#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) })
|
||||
#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) })
|
||||
|
||||
static inline struct bkey_s bkey_to_s(struct bkey *k)
|
||||
{
|
||||
return (struct bkey_s) { .k = k, .v = NULL };
|
||||
}
|
||||
|
||||
static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
|
||||
{
|
||||
return (struct bkey_s_c) { .k = k, .v = NULL };
|
||||
}
|
||||
|
||||
static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
|
||||
{
|
||||
return (struct bkey_s) { .k = &k->k, .v = &k->v };
|
||||
}
|
||||
|
||||
static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
|
||||
{
|
||||
return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
|
||||
}
|
||||
|
||||
/*
|
||||
* For a given type of value (e.g. struct bch_extent), generates the types for
|
||||
* bkey + bch_extent - inline, split, split const - and also all the conversion
|
||||
* functions, which also check that the value is of the correct type.
|
||||
*
|
||||
* We use anonymous unions for upcasting - e.g. converting from e.g. a
|
||||
* bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
|
||||
* functions.
|
||||
*/
|
||||
#define __BKEY_VAL_ACCESSORS(name, nr, _assert) \
|
||||
struct bkey_s_c_##name { \
|
||||
union { \
|
||||
struct { \
|
||||
const struct bkey *k; \
|
||||
const struct bch_##name *v; \
|
||||
}; \
|
||||
struct bkey_s_c s_c; \
|
||||
}; \
|
||||
}; \
|
||||
\
|
||||
struct bkey_s_##name { \
|
||||
union { \
|
||||
struct { \
|
||||
struct bkey *k; \
|
||||
struct bch_##name *v; \
|
||||
}; \
|
||||
struct bkey_s_c_##name c; \
|
||||
struct bkey_s s; \
|
||||
struct bkey_s_c s_c; \
|
||||
}; \
|
||||
}; \
|
||||
\
|
||||
static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
|
||||
{ \
|
||||
_assert(k->k.type, nr); \
|
||||
return container_of(&k->k, struct bkey_i_##name, k); \
|
||||
} \
|
||||
\
|
||||
static inline const struct bkey_i_##name * \
|
||||
bkey_i_to_##name##_c(const struct bkey_i *k) \
|
||||
{ \
|
||||
_assert(k->k.type, nr); \
|
||||
return container_of(&k->k, struct bkey_i_##name, k); \
|
||||
} \
|
||||
\
|
||||
static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \
|
||||
{ \
|
||||
_assert(k.k->type, nr); \
|
||||
return (struct bkey_s_##name) { \
|
||||
.k = k.k, \
|
||||
.v = container_of(k.v, struct bch_##name, v), \
|
||||
}; \
|
||||
} \
|
||||
\
|
||||
static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
|
||||
{ \
|
||||
_assert(k.k->type, nr); \
|
||||
return (struct bkey_s_c_##name) { \
|
||||
.k = k.k, \
|
||||
.v = container_of(k.v, struct bch_##name, v), \
|
||||
}; \
|
||||
} \
|
||||
\
|
||||
static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
|
||||
{ \
|
||||
return (struct bkey_s_##name) { \
|
||||
.k = &k->k, \
|
||||
.v = &k->v, \
|
||||
}; \
|
||||
} \
|
||||
\
|
||||
static inline struct bkey_s_c_##name \
|
||||
name##_i_to_s_c(const struct bkey_i_##name *k) \
|
||||
{ \
|
||||
return (struct bkey_s_c_##name) { \
|
||||
.k = &k->k, \
|
||||
.v = &k->v, \
|
||||
}; \
|
||||
} \
|
||||
\
|
||||
static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \
|
||||
{ \
|
||||
_assert(k->k.type, nr); \
|
||||
return (struct bkey_s_##name) { \
|
||||
.k = &k->k, \
|
||||
.v = container_of(&k->v, struct bch_##name, v), \
|
||||
}; \
|
||||
} \
|
||||
\
|
||||
static inline struct bkey_s_c_##name \
|
||||
bkey_i_to_s_c_##name(const struct bkey_i *k) \
|
||||
{ \
|
||||
_assert(k->k.type, nr); \
|
||||
return (struct bkey_s_c_##name) { \
|
||||
.k = &k->k, \
|
||||
.v = container_of(&k->v, struct bch_##name, v), \
|
||||
}; \
|
||||
} \
|
||||
\
|
||||
static inline struct bch_##name * \
|
||||
bkey_p_##name##_val(const struct bkey_format *f, \
|
||||
struct bkey_packed *k) \
|
||||
{ \
|
||||
return container_of(bkeyp_val(f, k), struct bch_##name, v); \
|
||||
} \
|
||||
\
|
||||
static inline const struct bch_##name * \
|
||||
bkey_p_c_##name##_val(const struct bkey_format *f, \
|
||||
const struct bkey_packed *k) \
|
||||
{ \
|
||||
return container_of(bkeyp_val(f, k), struct bch_##name, v); \
|
||||
} \
|
||||
\
|
||||
static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
|
||||
{ \
|
||||
struct bkey_i_##name *k = \
|
||||
container_of(&_k->k, struct bkey_i_##name, k); \
|
||||
\
|
||||
bkey_init(&k->k); \
|
||||
memset(&k->v, 0, sizeof(k->v)); \
|
||||
k->k.type = nr; \
|
||||
set_bkey_val_bytes(&k->k, sizeof(k->v)); \
|
||||
\
|
||||
return k; \
|
||||
}
|
||||
|
||||
#define __BKEY_VAL_ASSERT(_type, _nr) EBUG_ON(_type != _nr)
|
||||
|
||||
#define BKEY_VAL_ACCESSORS(name, _nr) \
|
||||
static inline void __bch_##name##_assert(u8 type, u8 nr) \
|
||||
{ \
|
||||
EBUG_ON(type != _nr); \
|
||||
} \
|
||||
\
|
||||
__BKEY_VAL_ACCESSORS(name, _nr, __bch_##name##_assert)
|
||||
|
||||
BKEY_VAL_ACCESSORS(cookie, KEY_TYPE_COOKIE);
|
||||
|
||||
static inline void __bch2_extent_assert(u8 type, u8 nr)
|
||||
{
|
||||
EBUG_ON(type != BCH_EXTENT && type != BCH_EXTENT_CACHED);
|
||||
}
|
||||
|
||||
__BKEY_VAL_ACCESSORS(extent, BCH_EXTENT, __bch2_extent_assert);
|
||||
BKEY_VAL_ACCESSORS(reservation, BCH_RESERVATION);
|
||||
|
||||
BKEY_VAL_ACCESSORS(inode, BCH_INODE_FS);
|
||||
BKEY_VAL_ACCESSORS(inode_blockdev, BCH_INODE_BLOCKDEV);
|
||||
BKEY_VAL_ACCESSORS(inode_generation, BCH_INODE_GENERATION);
|
||||
|
||||
BKEY_VAL_ACCESSORS(dirent, BCH_DIRENT);
|
||||
|
||||
BKEY_VAL_ACCESSORS(xattr, BCH_XATTR);
|
||||
|
||||
BKEY_VAL_ACCESSORS(alloc, BCH_ALLOC);
|
||||
|
||||
BKEY_VAL_ACCESSORS(quota, BCH_QUOTA);
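/*
 * Usage sketch for the generated accessors (assuming a key already known to
 * hold a dirent; field names per struct bch_dirent in bcachefs_format.h):
 */
#if 0
static u8 dirent_type_sketch(struct bkey_s_c k)
{
	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);

	/* d.k is the key, d.v the typed struct bch_dirent value: */
	return d.v->d_type;
}
#endif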
|
||||
|
||||
/* byte order helpers */
|
||||
|
||||
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
||||
|
||||
static inline unsigned high_word_offset(const struct bkey_format *f)
|
||||
{
|
||||
return f->key_u64s - 1;
|
||||
}
|
||||
|
||||
#define high_bit_offset 0
|
||||
#define nth_word(p, n) ((p) - (n))
|
||||
|
||||
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
|
||||
static inline unsigned high_word_offset(const struct bkey_format *f)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define high_bit_offset KEY_PACKED_BITS_START
|
||||
#define nth_word(p, n) ((p) + (n))
|
||||
|
||||
#else
|
||||
#error edit for your odd byteorder.
|
||||
#endif
|
||||
|
||||
#define high_word(f, k) ((k)->_data + high_word_offset(f))
|
||||
#define next_word(p) nth_word(p, 1)
|
||||
#define prev_word(p) nth_word(p, -1)
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
void bch2_bkey_pack_test(void);
|
||||
#else
|
||||
static inline void bch2_bkey_pack_test(void) {}
|
||||
#endif
|
||||
|
||||
#endif /* _BCACHEFS_BKEY_H */
|
192
fs/bcachefs/bkey_methods.c
Normal file
192
fs/bcachefs/bkey_methods.c
Normal file
@ -0,0 +1,192 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "bkey_methods.h"
|
||||
#include "btree_types.h"
|
||||
#include "alloc.h"
|
||||
#include "dirent.h"
|
||||
#include "error.h"
|
||||
#include "extents.h"
|
||||
#include "inode.h"
|
||||
#include "quota.h"
|
||||
#include "xattr.h"
|
||||
|
||||
const struct bkey_ops bch2_bkey_ops[] = {
|
||||
[BKEY_TYPE_EXTENTS] = bch2_bkey_extent_ops,
|
||||
[BKEY_TYPE_INODES] = bch2_bkey_inode_ops,
|
||||
[BKEY_TYPE_DIRENTS] = bch2_bkey_dirent_ops,
|
||||
[BKEY_TYPE_XATTRS] = bch2_bkey_xattr_ops,
|
||||
[BKEY_TYPE_ALLOC] = bch2_bkey_alloc_ops,
|
||||
[BKEY_TYPE_QUOTAS] = bch2_bkey_quota_ops,
|
||||
[BKEY_TYPE_BTREE] = bch2_bkey_btree_ops,
|
||||
};
|
||||
|
||||
const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type,
|
||||
struct bkey_s_c k)
|
||||
{
|
||||
const struct bkey_ops *ops = &bch2_bkey_ops[type];
|
||||
|
||||
switch (k.k->type) {
|
||||
case KEY_TYPE_DELETED:
|
||||
case KEY_TYPE_DISCARD:
|
||||
return NULL;
|
||||
|
||||
case KEY_TYPE_ERROR:
|
||||
return bkey_val_bytes(k.k) != 0
|
||||
? "value size should be zero"
|
||||
: NULL;
|
||||
|
||||
case KEY_TYPE_COOKIE:
|
||||
return bkey_val_bytes(k.k) != sizeof(struct bch_cookie)
|
||||
? "incorrect value size"
|
||||
: NULL;
|
||||
|
||||
default:
|
||||
if (k.k->type < KEY_TYPE_GENERIC_NR)
|
||||
return "invalid type";
|
||||
|
||||
return ops->key_invalid(c, k);
|
||||
}
|
||||
}
|
||||
|
||||
const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
|
||||
struct bkey_s_c k)
|
||||
{
|
||||
const struct bkey_ops *ops = &bch2_bkey_ops[type];
|
||||
|
||||
if (k.k->u64s < BKEY_U64s)
|
||||
return "u64s too small";
|
||||
|
||||
if (!ops->is_extents) {
|
||||
if (k.k->size)
|
||||
return "nonzero size field";
|
||||
} else {
|
||||
if ((k.k->size == 0) != bkey_deleted(k.k))
|
||||
return "bad size field";
|
||||
}
|
||||
|
||||
if (ops->is_extents &&
|
||||
!k.k->size &&
|
||||
!bkey_deleted(k.k))
|
||||
return "zero size field";
|
||||
|
||||
if (k.k->p.snapshot)
|
||||
return "nonzero snapshot";
|
||||
|
||||
if (type != BKEY_TYPE_BTREE &&
|
||||
!bkey_cmp(k.k->p, POS_MAX))
|
||||
return "POS_MAX key";
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
|
||||
struct bkey_s_c k)
|
||||
{
|
||||
return __bch2_bkey_invalid(c, type, k) ?:
|
||||
bch2_bkey_val_invalid(c, type, k);
|
||||
}
|
||||
|
||||
const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
|
||||
{
|
||||
if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0)
|
||||
return "key before start of btree node";
|
||||
|
||||
if (bkey_cmp(k.k->p, b->data->max_key) > 0)
|
||||
return "key past end of btree node";
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
|
||||
{
|
||||
enum bkey_type type = btree_node_type(b);
|
||||
const struct bkey_ops *ops = &bch2_bkey_ops[type];
|
||||
const char *invalid;
|
||||
|
||||
BUG_ON(!k.k->u64s);
|
||||
|
||||
invalid = bch2_bkey_invalid(c, type, k) ?:
|
||||
bch2_bkey_in_btree_node(b, k);
|
||||
if (invalid) {
|
||||
char buf[160];
|
||||
|
||||
bch2_bkey_val_to_text(c, type, buf, sizeof(buf), k);
|
||||
bch2_fs_bug(c, "invalid bkey %s: %s", buf, invalid);
|
||||
return;
|
||||
}
|
||||
|
||||
if (k.k->type >= KEY_TYPE_GENERIC_NR &&
|
||||
ops->key_debugcheck)
|
||||
ops->key_debugcheck(c, b, k);
|
||||
}
|
||||
|
||||
#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
|
||||
|
||||
int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
|
||||
{
|
||||
char *out = buf, *end = buf + size;
|
||||
|
||||
p("u64s %u type %u ", k->u64s, k->type);
|
||||
|
||||
if (bkey_cmp(k->p, POS_MAX))
|
||||
p("%llu:%llu", k->p.inode, k->p.offset);
|
||||
else
|
||||
p("POS_MAX");
|
||||
|
||||
p(" snap %u len %u ver %llu", k->p.snapshot, k->size, k->version.lo);
|
||||
|
||||
return out - buf;
|
||||
}
|
||||
|
||||
int bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
|
||||
char *buf, size_t size, struct bkey_s_c k)
|
||||
{
|
||||
const struct bkey_ops *ops = &bch2_bkey_ops[type];
|
||||
char *out = buf, *end = buf + size;
|
||||
|
||||
switch (k.k->type) {
|
||||
case KEY_TYPE_DELETED:
|
||||
p(" deleted");
|
||||
break;
|
||||
case KEY_TYPE_DISCARD:
|
||||
p(" discard");
|
||||
break;
|
||||
case KEY_TYPE_ERROR:
|
||||
p(" error");
|
||||
break;
|
||||
case KEY_TYPE_COOKIE:
|
||||
p(" cookie");
|
||||
break;
|
||||
default:
|
||||
if (k.k->type >= KEY_TYPE_GENERIC_NR && ops->val_to_text)
|
||||
ops->val_to_text(c, buf, size, k);
|
||||
break;
|
||||
}
|
||||
|
||||
return out - buf;
|
||||
}
|
||||
|
||||
int bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
|
||||
char *buf, size_t size, struct bkey_s_c k)
|
||||
{
|
||||
char *out = buf, *end = buf + size;
|
||||
|
||||
out += bch2_bkey_to_text(out, end - out, k.k);
|
||||
out += scnprintf(out, end - out, ": ");
|
||||
out += bch2_val_to_text(c, type, out, end - out, k);
|
||||
|
||||
return out - buf;
|
||||
}
|
||||
|
||||
void bch2_bkey_swab(enum bkey_type type,
|
||||
const struct bkey_format *f,
|
||||
struct bkey_packed *k)
|
||||
{
|
||||
const struct bkey_ops *ops = &bch2_bkey_ops[type];
|
||||
|
||||
bch2_bkey_swab_key(f, k);
|
||||
|
||||
if (ops->swab)
|
||||
ops->swab(f, k);
|
||||
}
|
87
fs/bcachefs/bkey_methods.h
Normal file
87
fs/bcachefs/bkey_methods.h
Normal file
@ -0,0 +1,87 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_BKEY_METHODS_H
|
||||
#define _BCACHEFS_BKEY_METHODS_H
|
||||
|
||||
#include "bkey.h"
|
||||
|
||||
#define DEF_BTREE_ID(kwd, val, name) BKEY_TYPE_##kwd = val,
|
||||
|
||||
enum bkey_type {
|
||||
DEFINE_BCH_BTREE_IDS()
|
||||
BKEY_TYPE_BTREE,
|
||||
};
|
||||
|
||||
#undef DEF_BTREE_ID
|
||||
|
||||
/* Type of a key in btree @id at level @level: */
|
||||
static inline enum bkey_type bkey_type(unsigned level, enum btree_id id)
|
||||
{
|
||||
return level ? BKEY_TYPE_BTREE : (enum bkey_type) id;
|
||||
}
|
||||
|
||||
static inline bool btree_type_has_ptrs(enum bkey_type type)
|
||||
{
|
||||
switch (type) {
|
||||
case BKEY_TYPE_BTREE:
|
||||
case BKEY_TYPE_EXTENTS:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
struct bch_fs;
|
||||
struct btree;
|
||||
struct bkey;
|
||||
|
||||
enum merge_result {
|
||||
BCH_MERGE_NOMERGE,
|
||||
|
||||
/*
|
||||
* The keys were mergeable, but would have overflowed size - so instead
|
||||
* l was changed to the maximum size, and both keys were modified:
|
||||
*/
|
||||
BCH_MERGE_PARTIAL,
|
||||
BCH_MERGE_MERGE,
|
||||
};
|
||||
|
||||
typedef bool (*key_filter_fn)(struct bch_fs *, struct btree *,
|
||||
struct bkey_s);
|
||||
typedef enum merge_result (*key_merge_fn)(struct bch_fs *,
|
||||
struct btree *,
|
||||
struct bkey_i *, struct bkey_i *);
|
||||
|
||||
struct bkey_ops {
|
||||
/* Returns reason for being invalid if invalid, else NULL: */
|
||||
const char * (*key_invalid)(const struct bch_fs *,
|
||||
struct bkey_s_c);
|
||||
void (*key_debugcheck)(struct bch_fs *, struct btree *,
|
||||
struct bkey_s_c);
|
||||
void (*val_to_text)(struct bch_fs *, char *,
|
||||
size_t, struct bkey_s_c);
|
||||
void (*swab)(const struct bkey_format *, struct bkey_packed *);
|
||||
key_filter_fn key_normalize;
|
||||
key_merge_fn key_merge;
|
||||
bool is_extents;
|
||||
};
|
||||
|
||||
const char *bch2_bkey_val_invalid(struct bch_fs *, enum bkey_type,
|
||||
struct bkey_s_c);
|
||||
const char *__bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
|
||||
const char *bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
|
||||
const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c);
|
||||
|
||||
void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
|
||||
|
||||
int bch2_bkey_to_text(char *, size_t, const struct bkey *);
|
||||
int bch2_val_to_text(struct bch_fs *, enum bkey_type,
|
||||
char *, size_t, struct bkey_s_c);
|
||||
int bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
|
||||
char *, size_t, struct bkey_s_c);
|
||||
|
||||
void bch2_bkey_swab(enum bkey_type, const struct bkey_format *,
|
||||
struct bkey_packed *);
|
||||
|
||||
extern const struct bkey_ops bch2_bkey_ops[];
|
||||
|
||||
#endif /* _BCACHEFS_BKEY_METHODS_H */
|
1849
fs/bcachefs/bset.c
Normal file
1849
fs/bcachefs/bset.c
Normal file
File diff suppressed because it is too large
668
fs/bcachefs/bset.h
Normal file
668
fs/bcachefs/bset.h
Normal file
@ -0,0 +1,668 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_BSET_H
|
||||
#define _BCACHEFS_BSET_H
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/types.h>
|
||||
|
||||
#include "bcachefs_format.h"
|
||||
#include "bkey.h"
|
||||
#include "bkey_methods.h"
|
||||
#include "btree_types.h"
|
||||
#include "util.h" /* for time_stats */
|
||||
#include "vstructs.h"
|
||||
|
||||
/*
|
||||
* BKEYS:
|
||||
*
|
||||
* A bkey contains a key, a size field, a variable number of pointers, and some
|
||||
* ancillary flag bits.
|
||||
*
|
||||
* We use two different functions for validating bkeys, bkey_invalid and
|
||||
* bkey_deleted().
|
||||
*
|
||||
* The one exception to the rule that ptr_invalid() filters out invalid keys is
|
||||
* that it also filters out keys of size 0 - these are keys that have been
|
||||
* completely overwritten. It'd be safe to delete these in memory while leaving
|
||||
* them on disk, just unnecessary work - so we filter them out when resorting
|
||||
* instead.
|
||||
*
|
||||
* We can't filter out stale keys when we're resorting, because garbage
|
||||
* collection needs to find them to ensure bucket gens don't wrap around -
|
||||
* unless we're rewriting the btree node those stale keys still exist on disk.
|
||||
*
|
||||
* We also implement functions here for removing some number of sectors from the
|
||||
* front or the back of a bkey - this is mainly used for fixing overlapping
|
||||
* extents, by removing the overlapping sectors from the older key.
|
||||
*
|
||||
* BSETS:
|
||||
*
|
||||
* A bset is an array of bkeys laid out contiguously in memory in sorted order,
|
||||
* along with a header. A btree node is made up of a number of these, written at
|
||||
* different times.
|
||||
*
|
||||
* There could be many of them on disk, but we never allow there to be more than
|
||||
* 4 in memory - we lazily resort as needed.
|
||||
*
|
||||
* We implement code here for creating and maintaining auxiliary search trees
|
||||
* (described below) for searching an individual bset, and on top of that we
|
||||
* implement a btree iterator.
|
||||
*
|
||||
* BTREE ITERATOR:
|
||||
*
|
||||
* Most of the code in bcache doesn't care about an individual bset - it needs
|
||||
* to search entire btree nodes and iterate over them in sorted order.
|
||||
*
|
||||
* The btree iterator code serves both functions; it iterates through the keys
|
||||
* in a btree node in sorted order, starting from either keys after a specific
|
||||
* point (if you pass it a search key) or the start of the btree node.
|
||||
*
|
||||
* AUXILIARY SEARCH TREES:
|
||||
*
|
||||
* Since keys are variable length, we can't use a binary search on a bset - we
|
||||
* wouldn't be able to find the start of the next key. But binary searches are
|
||||
* slow anyways, due to terrible cache behaviour; bcache originally used binary
|
||||
* searches and that code topped out at under 50k lookups/second.
|
||||
*
|
||||
* So we need to construct some sort of lookup table. Since we only insert keys
|
||||
* into the last (unwritten) set, most of the keys within a given btree node are
|
||||
* usually in sets that are mostly constant. We use two different types of
|
||||
* lookup tables to take advantage of this.
|
||||
*
|
||||
* Both lookup tables share in common that they don't index every key in the
|
||||
* set; they index one key every BSET_CACHELINE bytes, and then a linear search
|
||||
* is used for the rest.
|
||||
*
|
||||
* For sets that have been written to disk and are no longer being inserted
|
||||
* into, we construct a binary search tree in an array - traversing a binary
|
||||
* search tree in an array gives excellent locality of reference and is very
|
||||
* fast, since both children of any node are adjacent to each other in memory
|
||||
* (and their grandchildren, and great grandchildren...) - this means
|
||||
* prefetching can be used to great effect.
|
||||
*
|
||||
* It's quite useful performance wise to keep these nodes small - not just
|
||||
* because they're more likely to be in L2, but also because we can prefetch
|
||||
* more nodes on a single cacheline and thus prefetch more iterations in advance
|
||||
* when traversing this tree.
|
||||
*
|
||||
* Nodes in the auxiliary search tree must contain both a key to compare against
|
||||
* (we don't want to fetch the key from the set, that would defeat the purpose),
|
||||
* and a pointer to the key. We use a few tricks to compress both of these.
|
||||
*
|
||||
* To compress the pointer, we take advantage of the fact that one node in the
|
||||
* search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
|
||||
* a function (to_inorder()) that takes the index of a node in a binary tree and
|
||||
* returns what its index would be in an inorder traversal, so we only have to
|
||||
* store the low bits of the offset.
|
||||
*
|
||||
* The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
|
||||
* compress that, we take advantage of the fact that when we're traversing the
|
||||
* search tree at every iteration we know that both our search key and the key
|
||||
* we're looking for lie within some range - bounded by our previous
|
||||
* comparisons. (We special case the start of a search so that this is true even
|
||||
* at the root of the tree).
|
||||
*
|
||||
* So if we know the key we're looking for is between a and b, and a and b don't
|
||||
* differ higher than bit 50, we don't need to check anything higher than bit
|
||||
* 50.
|
||||
*
|
||||
* We don't usually need the rest of the bits, either; we only need enough bits
|
||||
* to partition the key range we're currently checking. Consider key n - the
|
||||
* key our auxiliary search tree node corresponds to, and key p, the key
|
||||
* immediately preceding n. The lowest bit we need to store in the auxiliary
|
||||
* search tree is the highest bit that differs between n and p.
|
||||
*
|
||||
* Note that this could be bit 0 - we might sometimes need all 80 bits to do the
|
||||
* comparison. But we'd really like our nodes in the auxiliary search tree to be
|
||||
* of fixed size.
|
||||
*
|
||||
* The solution is to make them fixed size, and when we're constructing a node
|
||||
* check if p and n differed in the bits we needed them to. If they don't we
|
||||
* flag that node, and when doing lookups we fall back to comparing against the
|
||||
* real key. As long as this doesn't happen too often (and it seems to reliably
|
||||
* happen a bit less than 1% of the time), we win - even on failures, that key
|
||||
* is then more likely to be in cache than if we were doing binary searches all
|
||||
* the way, since we're touching so much less memory.
|
||||
*
|
||||
* The keys in the auxiliary search tree are stored in (software) floating
|
||||
* point, with an exponent and a mantissa. The exponent needs to be big enough
|
||||
* to address all the bits in the original key, but the number of bits in the
|
||||
* mantissa is somewhat arbitrary; more bits just gets us fewer failures.
|
||||
*
|
||||
* We need 7 bits for the exponent and 3 bits for the key's offset (since keys
|
||||
* are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
|
||||
* We need one node per 128 bytes in the btree node, which means the auxiliary
|
||||
* search trees take up 3% as much memory as the btree itself.
|
||||
*
|
||||
* Constructing these auxiliary search trees is moderately expensive, and we
|
||||
* don't want to be constantly rebuilding the search tree for the last set
|
||||
* whenever we insert another key into it. For the unwritten set, we use a much
|
||||
* simpler lookup table - it's just a flat array, so index i in the lookup table
|
||||
* corresponds to the i'th range of BSET_CACHELINE bytes in the set. Indexing
|
||||
* within each byte range works the same as with the auxiliary search trees.
|
||||
*
|
||||
* These are much easier to keep up to date when we insert a key - we do it
|
||||
* somewhat lazily; when we shift a key up we usually just increment the pointer
|
||||
* to it, only when it would overflow do we go to the trouble of finding the
|
||||
* first key in that range of bytes again.
|
||||
*/
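/*
 * A standalone sketch of the "binary search tree in an array" idea above,
 * over a plain sorted array of u64s rather than real bkeys or the floating
 * point encoding: node i's children live at 2*i and 2*i + 1, so a lookup
 * walks contiguous, easily prefetched memory instead of hopping around the
 * way a conventional binary search does. Not the actual implementation,
 * just an illustration of the layout.
 */
#if 0
/*
 * @tree is 1-indexed, in "heap" order; returns the index of the first
 * element >= @search, or 0 if no such element exists:
 */
static unsigned array_tree_search(const u64 *tree, unsigned nr, u64 search)
{
	unsigned i = 1;

	while (i <= nr)
		i = i * 2 + (tree[i] < search);

	/* back out of the trailing run of right-descents: */
	i >>= __builtin_ctz(~i) + 1;
	return i;
}
#endif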
|
||||
|
||||
extern bool bch2_expensive_debug_checks;
|
||||
|
||||
static inline bool btree_keys_expensive_checks(const struct btree *b)
|
||||
{
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
return bch2_expensive_debug_checks || *b->expensive_debug_checks;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
enum bset_aux_tree_type {
|
||||
BSET_NO_AUX_TREE,
|
||||
BSET_RO_AUX_TREE,
|
||||
BSET_RW_AUX_TREE,
|
||||
};
|
||||
|
||||
#define BSET_TREE_NR_TYPES 3
|
||||
|
||||
#define BSET_NO_AUX_TREE_VAL (U16_MAX)
|
||||
#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1)
|
||||
|
||||
static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t)
|
||||
{
|
||||
switch (t->extra) {
|
||||
case BSET_NO_AUX_TREE_VAL:
|
||||
EBUG_ON(t->size);
|
||||
return BSET_NO_AUX_TREE;
|
||||
case BSET_RW_AUX_TREE_VAL:
|
||||
EBUG_ON(!t->size);
|
||||
return BSET_RW_AUX_TREE;
|
||||
default:
|
||||
EBUG_ON(!t->size);
|
||||
return BSET_RO_AUX_TREE;
|
||||
}
|
||||
}
|
||||
|
||||
typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *);
|
||||
|
||||
static inline void
|
||||
__bkey_unpack_key_format_checked(const struct btree *b,
|
||||
struct bkey *dst,
|
||||
const struct bkey_packed *src)
|
||||
{
|
||||
#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
|
||||
{
|
||||
compiled_unpack_fn unpack_fn = b->aux_data;
|
||||
unpack_fn(dst, src);
|
||||
|
||||
if (btree_keys_expensive_checks(b)) {
|
||||
struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
|
||||
|
||||
/*
|
||||
* hack around a harmless race when compacting whiteouts
|
||||
* for a write:
|
||||
*/
|
||||
dst2.needs_whiteout = dst->needs_whiteout;
|
||||
|
||||
BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
|
||||
}
|
||||
}
|
||||
#else
|
||||
*dst = __bch2_bkey_unpack_key(&b->format, src);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline struct bkey
|
||||
bkey_unpack_key_format_checked(const struct btree *b,
|
||||
const struct bkey_packed *src)
|
||||
{
|
||||
struct bkey dst;
|
||||
|
||||
__bkey_unpack_key_format_checked(b, &dst, src);
|
||||
return dst;
|
||||
}
|
||||
|
||||
static inline void __bkey_unpack_key(const struct btree *b,
|
||||
struct bkey *dst,
|
||||
const struct bkey_packed *src)
|
||||
{
|
||||
if (likely(bkey_packed(src)))
|
||||
__bkey_unpack_key_format_checked(b, dst, src);
|
||||
else
|
||||
*dst = *packed_to_bkey_c(src);
|
||||
}
|
||||
|
||||
/**
|
||||
* bkey_unpack_key -- unpack just the key, not the value
|
||||
*/
|
||||
static inline struct bkey bkey_unpack_key(const struct btree *b,
|
||||
const struct bkey_packed *src)
|
||||
{
|
||||
return likely(bkey_packed(src))
|
||||
? bkey_unpack_key_format_checked(b, src)
|
||||
: *packed_to_bkey_c(src);
|
||||
}
|
||||
|
||||
static inline struct bpos
|
||||
bkey_unpack_pos_format_checked(const struct btree *b,
|
||||
const struct bkey_packed *src)
|
||||
{
|
||||
#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
|
||||
return bkey_unpack_key_format_checked(b, src).p;
|
||||
#else
|
||||
return __bkey_unpack_pos(&b->format, src);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline struct bpos bkey_unpack_pos(const struct btree *b,
|
||||
const struct bkey_packed *src)
|
||||
{
|
||||
return likely(bkey_packed(src))
|
||||
? bkey_unpack_pos_format_checked(b, src)
|
||||
: packed_to_bkey_c(src)->p;
|
||||
}
|
||||
|
||||
/* Disassembled bkeys */
|
||||
|
||||
static inline struct bkey_s_c bkey_disassemble(struct btree *b,
|
||||
const struct bkey_packed *k,
|
||||
struct bkey *u)
|
||||
{
|
||||
__bkey_unpack_key(b, u, k);
|
||||
|
||||
return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), };
|
||||
}
|
||||
|
||||
/* non const version: */
|
||||
static inline struct bkey_s __bkey_disassemble(struct btree *b,
|
||||
struct bkey_packed *k,
|
||||
struct bkey *u)
|
||||
{
|
||||
__bkey_unpack_key(b, u, k);
|
||||
|
||||
return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };
|
||||
}
|
||||
|
||||
#define for_each_bset(_b, _t) \
|
||||
for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
|
||||
|
||||
static inline bool bset_has_ro_aux_tree(struct bset_tree *t)
|
||||
{
|
||||
return bset_aux_tree_type(t) == BSET_RO_AUX_TREE;
|
||||
}
|
||||
|
||||
static inline bool bset_has_rw_aux_tree(struct bset_tree *t)
|
||||
{
|
||||
return bset_aux_tree_type(t) == BSET_RW_AUX_TREE;
|
||||
}
|
||||
|
||||
static inline void bch2_bset_set_no_aux_tree(struct btree *b,
|
||||
struct bset_tree *t)
|
||||
{
|
||||
BUG_ON(t < b->set);
|
||||
|
||||
for (; t < b->set + ARRAY_SIZE(b->set); t++) {
|
||||
t->size = 0;
|
||||
t->extra = BSET_NO_AUX_TREE_VAL;
|
||||
t->aux_data_offset = U16_MAX;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void btree_node_set_format(struct btree *b,
|
||||
struct bkey_format f)
|
||||
{
|
||||
int len;
|
||||
|
||||
b->format = f;
|
||||
b->nr_key_bits = bkey_format_key_bits(&f);
|
||||
|
||||
len = bch2_compile_bkey_format(&b->format, b->aux_data);
|
||||
BUG_ON(len < 0 || len > U8_MAX);
|
||||
|
||||
b->unpack_fn_len = len;
|
||||
|
||||
bch2_bset_set_no_aux_tree(b, b->set);
|
||||
}
|
||||
|
||||
static inline struct bset *bset_next_set(struct btree *b,
|
||||
unsigned block_bytes)
|
||||
{
|
||||
struct bset *i = btree_bset_last(b);
|
||||
|
||||
EBUG_ON(!is_power_of_2(block_bytes));
|
||||
|
||||
return ((void *) i) + round_up(vstruct_bytes(i), block_bytes);
|
||||
}
|
||||
|
||||
void bch2_btree_keys_free(struct btree *);
|
||||
int bch2_btree_keys_alloc(struct btree *, unsigned, gfp_t);
|
||||
void bch2_btree_keys_init(struct btree *, bool *);
|
||||
|
||||
void bch2_bset_init_first(struct btree *, struct bset *);
|
||||
void bch2_bset_init_next(struct bch_fs *, struct btree *,
|
||||
struct btree_node_entry *);
|
||||
void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
|
||||
void bch2_bset_fix_invalidated_key(struct btree *, struct bset_tree *,
|
||||
struct bkey_packed *);
|
||||
|
||||
void bch2_bset_insert(struct btree *, struct btree_node_iter *,
|
||||
struct bkey_packed *, struct bkey_i *, unsigned);
|
||||
void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned);
|
||||
|
||||
/* Bkey utility code */
|
||||
|
||||
/* packed or unpacked */
|
||||
static inline int bkey_cmp_p_or_unp(const struct btree *b,
|
||||
const struct bkey_packed *l,
|
||||
const struct bkey_packed *r_packed,
|
||||
struct bpos *r)
|
||||
{
|
||||
EBUG_ON(r_packed && !bkey_packed(r_packed));
|
||||
|
||||
if (unlikely(!bkey_packed(l)))
|
||||
return bkey_cmp(packed_to_bkey_c(l)->p, *r);
|
||||
|
||||
if (likely(r_packed))
|
||||
return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b);
|
||||
|
||||
return __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
|
||||
}
|
||||
|
||||
/* Returns true if @k is after iterator position @pos */
|
||||
static inline bool btree_iter_pos_cmp_packed(const struct btree *b,
|
||||
struct bpos *pos,
|
||||
const struct bkey_packed *k,
|
||||
bool strictly_greater)
|
||||
{
|
||||
int cmp = bkey_cmp_left_packed(b, k, pos);
|
||||
|
||||
return cmp > 0 ||
|
||||
(cmp == 0 && !strictly_greater && !bkey_deleted(k));
|
||||
}
|
||||
|
||||
static inline bool btree_iter_pos_cmp_p_or_unp(const struct btree *b,
|
||||
struct bpos pos,
|
||||
const struct bkey_packed *pos_packed,
|
||||
const struct bkey_packed *k,
|
||||
bool strictly_greater)
|
||||
{
|
||||
int cmp = bkey_cmp_p_or_unp(b, k, pos_packed, &pos);
|
||||
|
||||
return cmp > 0 ||
|
||||
(cmp == 0 && !strictly_greater && !bkey_deleted(k));
|
||||
}
|
||||
|
||||
struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *);
|
||||
|
||||
struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *,
|
||||
struct bkey_packed *, unsigned);
|
||||
|
||||
static inline struct bkey_packed *
|
||||
bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
|
||||
{
|
||||
return bch2_bkey_prev_filter(b, t, k, 0);
|
||||
}
|
||||
|
||||
static inline struct bkey_packed *
|
||||
bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
|
||||
{
|
||||
return bch2_bkey_prev_filter(b, t, k, KEY_TYPE_DISCARD + 1);
|
||||
}
|
||||
|
||||
enum bch_extent_overlap {
|
||||
BCH_EXTENT_OVERLAP_ALL = 0,
|
||||
BCH_EXTENT_OVERLAP_BACK = 1,
|
||||
BCH_EXTENT_OVERLAP_FRONT = 2,
|
||||
BCH_EXTENT_OVERLAP_MIDDLE = 3,
|
||||
};
|
||||
|
||||
/* Returns how k overlaps with m */
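/*
 * The low bit is set when k starts after m starts, the high bit when k ends
 * before m ends, which maps onto the enum: 0 = k covers all of m, 1 = k
 * overlaps the back of m, 2 = the front of m, 3 = k sits in the middle of m.
 */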
|
||||
static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
|
||||
const struct bkey *m)
|
||||
{
|
||||
int cmp1 = bkey_cmp(k->p, m->p) < 0;
|
||||
int cmp2 = bkey_cmp(bkey_start_pos(k),
|
||||
bkey_start_pos(m)) > 0;
|
||||
|
||||
return (cmp1 << 1) + cmp2;
|
||||
}
|
||||
|
||||
/* Btree key iteration */
|
||||
|
||||
static inline void __bch2_btree_node_iter_init(struct btree_node_iter *iter,
|
||||
bool is_extents)
|
||||
{
|
||||
iter->is_extents = is_extents;
|
||||
memset(iter->data, 0, sizeof(iter->data));
|
||||
}
|
||||
|
||||
void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *,
|
||||
const struct bkey_packed *,
|
||||
const struct bkey_packed *);
|
||||
void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *,
|
||||
struct bpos, bool, bool);
|
||||
void bch2_btree_node_iter_init_from_start(struct btree_node_iter *,
|
||||
struct btree *, bool);
|
||||
struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *,
|
||||
struct btree *,
|
||||
struct bset_tree *);
|
||||
|
||||
void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *);
|
||||
void bch2_btree_node_iter_set_drop(struct btree_node_iter *,
|
||||
struct btree_node_iter_set *);
|
||||
void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *);
|
||||
|
||||
#define btree_node_iter_for_each(_iter, _set) \
|
||||
for (_set = (_iter)->data; \
|
||||
_set < (_iter)->data + ARRAY_SIZE((_iter)->data) && \
|
||||
(_set)->k != (_set)->end; \
|
||||
_set++)
|
||||
|
||||
static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter,
|
||||
unsigned i)
|
||||
{
|
||||
return iter->data[i].k == iter->data[i].end;
|
||||
}
|
||||
|
||||
static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter)
|
||||
{
|
||||
return __btree_node_iter_set_end(iter, 0);
|
||||
}
|
||||
|
||||
static inline int __btree_node_iter_cmp(bool is_extents,
|
||||
struct btree *b,
|
||||
struct bkey_packed *l,
|
||||
struct bkey_packed *r)
|
||||
{
|
||||
/*
|
||||
* For non extents, when keys compare equal the deleted keys have to
|
||||
* come first - so that bch2_btree_node_iter_next_check() can detect
|
||||
* duplicate nondeleted keys (and possibly other reasons?)
|
||||
*
|
||||
* For extents, bkey_deleted() is used as a proxy for k->size == 0, so
|
||||
* deleted keys have to sort last.
|
||||
*/
|
||||
return bkey_cmp_packed(b, l, r)
|
||||
?: (is_extents
|
||||
? (int) bkey_deleted(l) - (int) bkey_deleted(r)
|
||||
: (int) bkey_deleted(r) - (int) bkey_deleted(l))
|
||||
?: (l > r) - (l < r);
|
||||
}
|
||||
|
||||
static inline int btree_node_iter_cmp(struct btree_node_iter *iter,
|
||||
struct btree *b,
|
||||
struct btree_node_iter_set l,
|
||||
struct btree_node_iter_set r)
|
||||
{
|
||||
return __btree_node_iter_cmp(iter->is_extents, b,
|
||||
__btree_node_offset_to_key(b, l.k),
|
||||
__btree_node_offset_to_key(b, r.k));
|
||||
}
|
||||
|
||||
static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter,
|
||||
struct btree *b,
|
||||
const struct bkey_packed *k,
|
||||
const struct bkey_packed *end)
|
||||
{
|
||||
if (k != end) {
|
||||
struct btree_node_iter_set *pos;
|
||||
|
||||
btree_node_iter_for_each(iter, pos)
|
||||
;
|
||||
|
||||
BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data));
|
||||
*pos = (struct btree_node_iter_set) {
|
||||
__btree_node_key_to_offset(b, k),
|
||||
__btree_node_key_to_offset(b, end)
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
static inline struct bkey_packed *
|
||||
__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter,
|
||||
struct btree *b)
|
||||
{
|
||||
return __btree_node_offset_to_key(b, iter->data->k);
|
||||
}
|
||||
|
||||
static inline struct bkey_packed *
|
||||
bch2_btree_node_iter_peek_filter(struct btree_node_iter *iter,
|
||||
struct btree *b,
|
||||
unsigned min_key_type)
|
||||
{
|
||||
while (!bch2_btree_node_iter_end(iter)) {
|
||||
struct bkey_packed *k = __bch2_btree_node_iter_peek_all(iter, b);
|
||||
|
||||
if (k->type >= min_key_type)
|
||||
return k;
|
||||
|
||||
bch2_btree_node_iter_advance(iter, b);
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline struct bkey_packed *
|
||||
bch2_btree_node_iter_peek_all(struct btree_node_iter *iter,
|
||||
struct btree *b)
|
||||
{
|
||||
return bch2_btree_node_iter_peek_filter(iter, b, 0);
|
||||
}
|
||||
|
||||
static inline struct bkey_packed *
|
||||
bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b)
|
||||
{
|
||||
return bch2_btree_node_iter_peek_filter(iter, b, KEY_TYPE_DISCARD + 1);
|
||||
}
|
||||
|
||||
static inline struct bkey_packed *
|
||||
bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b)
|
||||
{
|
||||
struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b);
|
||||
|
||||
if (ret)
|
||||
bch2_btree_node_iter_advance(iter, b);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *,
|
||||
struct btree *, unsigned);
|
||||
|
||||
static inline struct bkey_packed *
|
||||
bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct btree *b)
|
||||
{
|
||||
return bch2_btree_node_iter_prev_filter(iter, b, 0);
|
||||
}
|
||||
|
||||
static inline struct bkey_packed *
|
||||
bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b)
|
||||
{
|
||||
return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_DISCARD + 1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Iterates over all _live_ keys - skipping deleted (and potentially
|
||||
* overlapping) keys
|
||||
*/
|
||||
#define for_each_btree_node_key(b, k, iter, _is_extents) \
|
||||
for (bch2_btree_node_iter_init_from_start((iter), (b), (_is_extents));\
|
||||
((k) = bch2_btree_node_iter_peek(iter, b)); \
|
||||
bch2_btree_node_iter_advance(iter, b))
|
||||
|
||||
struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
|
||||
struct btree *,
|
||||
struct bkey *);
|
||||
|
||||
#define for_each_btree_node_key_unpack(b, k, iter, _is_extents, unpacked)\
|
||||
for (bch2_btree_node_iter_init_from_start((iter), (b), (_is_extents));\
|
||||
(k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\
|
||||
bch2_btree_node_iter_advance(iter, b))
|
||||
|
||||
/* Accounting: */
|
||||
|
||||
static inline void btree_keys_account_key(struct btree_nr_keys *n,
|
||||
unsigned bset,
|
||||
struct bkey_packed *k,
|
||||
int sign)
|
||||
{
|
||||
n->live_u64s += k->u64s * sign;
|
||||
n->bset_u64s[bset] += k->u64s * sign;
|
||||
|
||||
if (bkey_packed(k))
|
||||
n->packed_keys += sign;
|
||||
else
|
||||
n->unpacked_keys += sign;
|
||||
}
|
||||
|
||||
#define btree_keys_account_key_add(_nr, _bset_idx, _k) \
|
||||
btree_keys_account_key(_nr, _bset_idx, _k, 1)
|
||||
#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \
|
||||
btree_keys_account_key(_nr, _bset_idx, _k, -1)
|
||||
|
||||
struct bset_stats {
|
||||
struct {
|
||||
size_t nr, bytes;
|
||||
} sets[BSET_TREE_NR_TYPES];
|
||||
|
||||
size_t floats;
|
||||
size_t failed_unpacked;
|
||||
size_t failed_prev;
|
||||
size_t failed_overflow;
|
||||
};
|
||||
|
||||
void bch2_btree_keys_stats(struct btree *, struct bset_stats *);
|
||||
int bch2_bkey_print_bfloat(struct btree *, struct bkey_packed *,
|
||||
char *, size_t);
|
||||
|
||||
/* Debug stuff */
|
||||
|
||||
void bch2_dump_bset(struct btree *, struct bset *, unsigned);
|
||||
void bch2_dump_btree_node(struct btree *);
|
||||
void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *);
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
|
||||
void __bch2_verify_btree_nr_keys(struct btree *);
|
||||
void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *);
|
||||
void bch2_verify_key_order(struct btree *, struct btree_node_iter *,
|
||||
struct bkey_packed *);
|
||||
|
||||
#else
|
||||
|
||||
static inline void __bch2_verify_btree_nr_keys(struct btree *b) {}
|
||||
static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
|
||||
struct btree *b) {}
|
||||
static inline void bch2_verify_key_order(struct btree *b,
|
||||
struct btree_node_iter *iter,
|
||||
struct bkey_packed *where) {}
|
||||
#endif
|
||||
|
||||
static inline void bch2_verify_btree_nr_keys(struct btree *b)
|
||||
{
|
||||
if (btree_keys_expensive_checks(b))
|
||||
__bch2_verify_btree_nr_keys(b);
|
||||
}
|
||||
|
||||
#endif /* _BCACHEFS_BSET_H */
941
fs/bcachefs/btree_cache.c
Normal file
@ -0,0 +1,941 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "btree_cache.h"
|
||||
#include "btree_io.h"
|
||||
#include "btree_iter.h"
|
||||
#include "btree_locking.h"
|
||||
#include "debug.h"
|
||||
#include "extents.h"
|
||||
#include "trace.h"
|
||||
|
||||
#include <linux/prefetch.h>
|
||||
|
||||
#define DEF_BTREE_ID(kwd, val, name) name,
|
||||
|
||||
const char * const bch2_btree_ids[] = {
|
||||
DEFINE_BCH_BTREE_IDS()
|
||||
NULL
|
||||
};
|
||||
|
||||
#undef DEF_BTREE_ID
|
||||
|
||||
void bch2_recalc_btree_reserve(struct bch_fs *c)
|
||||
{
|
||||
unsigned i, reserve = 16;
|
||||
|
||||
if (!c->btree_roots[0].b)
|
||||
reserve += 8;
|
||||
|
||||
for (i = 0; i < BTREE_ID_NR; i++)
|
||||
if (c->btree_roots[i].b)
|
||||
reserve += min_t(unsigned, 1,
|
||||
c->btree_roots[i].b->level) * 8;
|
||||
|
||||
c->btree_cache.reserve = reserve;
|
||||
}
|
||||
|
||||
static inline unsigned btree_cache_can_free(struct btree_cache *bc)
|
||||
{
|
||||
return max_t(int, 0, bc->used - bc->reserve);
|
||||
}
|
||||
|
||||
static void __btree_node_data_free(struct bch_fs *c, struct btree *b)
|
||||
{
|
||||
EBUG_ON(btree_node_write_in_flight(b));
|
||||
|
||||
kvpfree(b->data, btree_bytes(c));
|
||||
b->data = NULL;
|
||||
bch2_btree_keys_free(b);
|
||||
}
|
||||
|
||||
static void btree_node_data_free(struct bch_fs *c, struct btree *b)
|
||||
{
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
|
||||
__btree_node_data_free(c, b);
|
||||
bc->used--;
|
||||
list_move(&b->list, &bc->freed);
|
||||
}
|
||||
|
||||
static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
|
||||
const void *obj)
|
||||
{
|
||||
const struct btree *b = obj;
|
||||
const u64 *v = arg->key;
|
||||
|
||||
return PTR_HASH(&b->key) == *v ? 0 : 1;
|
||||
}
|
||||
|
||||
static const struct rhashtable_params bch_btree_cache_params = {
|
||||
.head_offset = offsetof(struct btree, hash),
|
||||
.key_offset = offsetof(struct btree, key.v),
|
||||
.key_len = sizeof(struct bch_extent_ptr),
|
||||
.obj_cmpfn = bch2_btree_cache_cmp_fn,
|
||||
};
|
||||
|
||||
static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
|
||||
{
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
|
||||
b->data = kvpmalloc(btree_bytes(c), gfp);
|
||||
if (!b->data)
|
||||
goto err;
|
||||
|
||||
if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp))
|
||||
goto err;
|
||||
|
||||
bc->used++;
|
||||
list_move(&b->list, &bc->freeable);
|
||||
return;
|
||||
err:
|
||||
kvpfree(b->data, btree_bytes(c));
|
||||
b->data = NULL;
|
||||
list_move(&b->list, &bc->freed);
|
||||
}
|
||||
|
||||
static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
|
||||
{
|
||||
struct btree *b = kzalloc(sizeof(struct btree), gfp);
|
||||
if (!b)
|
||||
return NULL;
|
||||
|
||||
bkey_extent_init(&b->key);
|
||||
six_lock_init(&b->lock);
|
||||
lockdep_set_novalidate_class(&b->lock);
|
||||
INIT_LIST_HEAD(&b->list);
|
||||
INIT_LIST_HEAD(&b->write_blocked);
|
||||
|
||||
btree_node_data_alloc(c, b, gfp);
|
||||
return b->data ? b : NULL;
|
||||
}
|
||||
|
||||
/* Btree in memory cache - hash table */
|
||||
|
||||
void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
|
||||
{
|
||||
rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
|
||||
|
||||
/* Cause future lookups for this node to fail: */
|
||||
bkey_i_to_extent(&b->key)->v._data[0] = 0;
|
||||
}
|
||||
|
||||
int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
|
||||
{
|
||||
return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
|
||||
bch_btree_cache_params);
|
||||
}
|
||||
|
||||
int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
|
||||
unsigned level, enum btree_id id)
|
||||
{
|
||||
int ret;
|
||||
|
||||
b->level = level;
|
||||
b->btree_id = id;
|
||||
|
||||
mutex_lock(&bc->lock);
|
||||
ret = __bch2_btree_node_hash_insert(bc, b);
|
||||
if (!ret)
|
||||
list_add(&b->list, &bc->live);
|
||||
mutex_unlock(&bc->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
__flatten
|
||||
static inline struct btree *btree_cache_find(struct btree_cache *bc,
|
||||
const struct bkey_i *k)
|
||||
{
|
||||
return rhashtable_lookup_fast(&bc->table, &PTR_HASH(k),
|
||||
bch_btree_cache_params);
|
||||
}
|
||||
|
||||
/*
|
||||
* this version is for btree nodes that have already been freed (we're not
|
||||
* reaping a real btree node)
|
||||
*/
|
||||
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
|
||||
{
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
int ret = 0;
|
||||
|
||||
lockdep_assert_held(&bc->lock);
|
||||
|
||||
if (!six_trylock_intent(&b->lock))
|
||||
return -ENOMEM;
|
||||
|
||||
if (!six_trylock_write(&b->lock))
|
||||
goto out_unlock_intent;
|
||||
|
||||
if (btree_node_noevict(b))
|
||||
goto out_unlock;
|
||||
|
||||
if (!btree_node_may_write(b))
|
||||
goto out_unlock;
|
||||
|
||||
if (btree_node_dirty(b) ||
|
||||
btree_node_write_in_flight(b) ||
|
||||
btree_node_read_in_flight(b)) {
|
||||
if (!flush)
|
||||
goto out_unlock;
|
||||
|
||||
wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
|
||||
TASK_UNINTERRUPTIBLE);
|
||||
|
||||
/*
|
||||
* Using the underscore version because we don't want to compact
|
||||
* bsets after the write, since this node is about to be evicted
|
||||
* - unless btree verify mode is enabled, since it runs out of
|
||||
* the post write cleanup:
|
||||
*/
|
||||
if (verify_btree_ondisk(c))
|
||||
bch2_btree_node_write(c, b, SIX_LOCK_intent);
|
||||
else
|
||||
__bch2_btree_node_write(c, b, SIX_LOCK_read);
|
||||
|
||||
/* wait for any in flight btree write */
|
||||
btree_node_wait_on_io(b);
|
||||
}
|
||||
out:
|
||||
if (PTR_HASH(&b->key) && !ret)
|
||||
trace_btree_node_reap(c, b);
|
||||
return ret;
|
||||
out_unlock:
|
||||
six_unlock_write(&b->lock);
|
||||
out_unlock_intent:
|
||||
six_unlock_intent(&b->lock);
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
|
||||
{
|
||||
return __btree_node_reclaim(c, b, false);
|
||||
}
|
||||
|
||||
static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
|
||||
{
|
||||
return __btree_node_reclaim(c, b, true);
|
||||
}
|
||||
|
||||
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
|
||||
struct shrink_control *sc)
|
||||
{
|
||||
struct bch_fs *c = container_of(shrink, struct bch_fs,
|
||||
btree_cache.shrink);
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
struct btree *b, *t;
|
||||
unsigned long nr = sc->nr_to_scan;
|
||||
unsigned long can_free;
|
||||
unsigned long touched = 0;
|
||||
unsigned long freed = 0;
|
||||
unsigned i;
|
||||
|
||||
if (btree_shrinker_disabled(c))
|
||||
return SHRINK_STOP;
|
||||
|
||||
/* Return -1 if we can't do anything right now */
|
||||
if (sc->gfp_mask & __GFP_IO)
|
||||
mutex_lock(&bc->lock);
|
||||
else if (!mutex_trylock(&bc->lock))
|
||||
return -1;
|
||||
|
||||
/*
|
||||
* It's _really_ critical that we don't free too many btree nodes - we
|
||||
* have to always leave ourselves a reserve. The reserve is how we
|
||||
* guarantee that allocating memory for a new btree node can always
|
||||
* succeed, so that inserting keys into the btree can always succeed and
|
||||
* IO can always make forward progress:
|
||||
*/
|
||||
nr /= btree_pages(c);
|
||||
can_free = btree_cache_can_free(bc);
|
||||
nr = min_t(unsigned long, nr, can_free);
|
||||
|
||||
i = 0;
|
||||
list_for_each_entry_safe(b, t, &bc->freeable, list) {
|
||||
touched++;
|
||||
|
||||
if (freed >= nr)
|
||||
break;
|
||||
|
||||
if (++i > 3 &&
|
||||
!btree_node_reclaim(c, b)) {
|
||||
btree_node_data_free(c, b);
|
||||
six_unlock_write(&b->lock);
|
||||
six_unlock_intent(&b->lock);
|
||||
freed++;
|
||||
}
|
||||
}
|
||||
restart:
|
||||
list_for_each_entry_safe(b, t, &bc->live, list) {
|
||||
touched++;
|
||||
|
||||
if (freed >= nr) {
|
||||
/* Save position */
|
||||
if (&t->list != &bc->live)
|
||||
list_move_tail(&bc->live, &t->list);
|
||||
break;
|
||||
}
|
||||
|
||||
if (!btree_node_accessed(b) &&
|
||||
!btree_node_reclaim(c, b)) {
|
||||
/* can't call bch2_btree_node_hash_remove under lock */
|
||||
freed++;
|
||||
if (&t->list != &bc->live)
|
||||
list_move_tail(&bc->live, &t->list);
|
||||
|
||||
btree_node_data_free(c, b);
|
||||
mutex_unlock(&bc->lock);
|
||||
|
||||
bch2_btree_node_hash_remove(bc, b);
|
||||
six_unlock_write(&b->lock);
|
||||
six_unlock_intent(&b->lock);
|
||||
|
||||
if (freed >= nr)
|
||||
goto out;
|
||||
|
||||
if (sc->gfp_mask & __GFP_IO)
|
||||
mutex_lock(&bc->lock);
|
||||
else if (!mutex_trylock(&bc->lock))
|
||||
goto out;
|
||||
goto restart;
|
||||
} else
|
||||
clear_btree_node_accessed(b);
|
||||
}
|
||||
|
||||
mutex_unlock(&bc->lock);
|
||||
out:
|
||||
return (unsigned long) freed * btree_pages(c);
|
||||
}
|
||||
|
||||
static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
|
||||
struct shrink_control *sc)
|
||||
{
|
||||
struct bch_fs *c = container_of(shrink, struct bch_fs,
|
||||
btree_cache.shrink);
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
|
||||
if (btree_shrinker_disabled(c))
|
||||
return 0;
|
||||
|
||||
return btree_cache_can_free(bc) * btree_pages(c);
|
||||
}
|
||||
|
||||
void bch2_fs_btree_cache_exit(struct bch_fs *c)
|
||||
{
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
struct btree *b;
|
||||
unsigned i;
|
||||
|
||||
if (bc->shrink.list.next)
|
||||
unregister_shrinker(&bc->shrink);
|
||||
|
||||
mutex_lock(&bc->lock);
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
if (c->verify_data)
|
||||
list_move(&c->verify_data->list, &bc->live);
|
||||
|
||||
kvpfree(c->verify_ondisk, btree_bytes(c));
|
||||
#endif
|
||||
|
||||
for (i = 0; i < BTREE_ID_NR; i++)
|
||||
if (c->btree_roots[i].b)
|
||||
list_add(&c->btree_roots[i].b->list, &bc->live);
|
||||
|
||||
list_splice(&bc->freeable, &bc->live);
|
||||
|
||||
while (!list_empty(&bc->live)) {
|
||||
b = list_first_entry(&bc->live, struct btree, list);
|
||||
|
||||
BUG_ON(btree_node_read_in_flight(b) ||
|
||||
btree_node_write_in_flight(b));
|
||||
|
||||
if (btree_node_dirty(b))
|
||||
bch2_btree_complete_write(c, b, btree_current_write(b));
|
||||
clear_btree_node_dirty(b);
|
||||
|
||||
btree_node_data_free(c, b);
|
||||
}
|
||||
|
||||
while (!list_empty(&bc->freed)) {
|
||||
b = list_first_entry(&bc->freed, struct btree, list);
|
||||
list_del(&b->list);
|
||||
kfree(b);
|
||||
}
|
||||
|
||||
mutex_unlock(&bc->lock);
|
||||
|
||||
if (bc->table_init_done)
|
||||
rhashtable_destroy(&bc->table);
|
||||
}
|
||||
|
||||
int bch2_fs_btree_cache_init(struct bch_fs *c)
|
||||
{
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
unsigned i;
|
||||
int ret = 0;
|
||||
|
||||
pr_verbose_init(c->opts, "");
|
||||
|
||||
ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
bc->table_init_done = true;
|
||||
|
||||
bch2_recalc_btree_reserve(c);
|
||||
|
||||
for (i = 0; i < bc->reserve; i++)
|
||||
if (!btree_node_mem_alloc(c, GFP_KERNEL)) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
list_splice_init(&bc->live, &bc->freeable);
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
mutex_init(&c->verify_lock);
|
||||
|
||||
c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
|
||||
if (!c->verify_ondisk) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
c->verify_data = btree_node_mem_alloc(c, GFP_KERNEL);
|
||||
if (!c->verify_data) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
list_del_init(&c->verify_data->list);
|
||||
#endif
|
||||
|
||||
bc->shrink.count_objects = bch2_btree_cache_count;
|
||||
bc->shrink.scan_objects = bch2_btree_cache_scan;
|
||||
bc->shrink.seeks = 4;
|
||||
bc->shrink.batch = btree_pages(c) * 2;
|
||||
register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
|
||||
out:
|
||||
pr_verbose_init(c->opts, "ret %i", ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
|
||||
{
|
||||
mutex_init(&bc->lock);
|
||||
INIT_LIST_HEAD(&bc->live);
|
||||
INIT_LIST_HEAD(&bc->freeable);
|
||||
INIT_LIST_HEAD(&bc->freed);
|
||||
}
|
||||
|
||||
/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which
 * bch2_btree_cache_cannibalize_lock() will take. This means every time we
 * unlock the root of the btree, we need to release this lock if we have it
 * held.
 */
void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;

	if (bc->alloc_lock == current) {
		trace_btree_node_cannibalize_unlock(c);
		bc->alloc_lock = NULL;
		closure_wake_up(&bc->alloc_wait);
	}
}

int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
{
	struct btree_cache *bc = &c->btree_cache;
	struct task_struct *old;

	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current)
		goto success;

	if (!cl) {
		trace_btree_node_cannibalize_lock_fail(c);
		return -ENOMEM;
	}

	closure_wait(&bc->alloc_wait, cl);

	/* Try again, after adding ourselves to waitlist */
	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current) {
		/* We raced */
		closure_wake_up(&bc->alloc_wait);
		goto success;
	}

	trace_btree_node_cannibalize_lock_fail(c);
	return -EAGAIN;

success:
	trace_btree_node_cannibalize_lock(c);
	return 0;
}
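
As a rough sketch of the intended calling convention (illustrative only, not code taken from this commit's call sites; closure_init_stack() is from the closures library this filesystem already depends on):

	struct closure cl;
	struct btree *b;
	int ret;

	closure_init_stack(&cl);

	/* may return -EAGAIN after putting @cl on bc->alloc_wait */
	ret = bch2_btree_cache_cannibalize_lock(c, &cl);
	if (ret)
		return ret;

	b = bch2_btree_node_mem_alloc(c);	/* may now steal from bc->live */
	...
	bch2_btree_cache_cannibalize_unlock(c);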
|
||||
|
||||
static struct btree *btree_node_cannibalize(struct bch_fs *c)
|
||||
{
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
struct btree *b;
|
||||
|
||||
list_for_each_entry_reverse(b, &bc->live, list)
|
||||
if (!btree_node_reclaim(c, b))
|
||||
return b;
|
||||
|
||||
while (1) {
|
||||
list_for_each_entry_reverse(b, &bc->live, list)
|
||||
if (!btree_node_write_and_reclaim(c, b))
|
||||
return b;
|
||||
|
||||
/*
|
||||
* Rare case: all nodes were intent-locked.
|
||||
* Just busy-wait.
|
||||
*/
|
||||
WARN_ONCE(1, "btree cache cannibalize failed\n");
|
||||
cond_resched();
|
||||
}
|
||||
}
|
||||
|
||||
struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
|
||||
{
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
struct btree *b;
|
||||
u64 start_time = local_clock();
|
||||
|
||||
mutex_lock(&bc->lock);
|
||||
|
||||
/*
|
||||
* btree_free() doesn't free memory; it sticks the node on the end of
|
||||
* the list. Check if there's any freed nodes there:
|
||||
*/
|
||||
list_for_each_entry(b, &bc->freeable, list)
|
||||
if (!btree_node_reclaim(c, b))
|
||||
goto out_unlock;
|
||||
|
||||
/*
|
||||
* We never free struct btree itself, just the memory that holds the on
|
||||
* disk node. Check the freed list before allocating a new one:
|
||||
*/
|
||||
list_for_each_entry(b, &bc->freed, list)
|
||||
if (!btree_node_reclaim(c, b)) {
|
||||
btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO);
|
||||
if (b->data)
|
||||
goto out_unlock;
|
||||
|
||||
six_unlock_write(&b->lock);
|
||||
six_unlock_intent(&b->lock);
|
||||
goto err;
|
||||
}
|
||||
|
||||
b = btree_node_mem_alloc(c, __GFP_NOWARN|GFP_NOIO);
|
||||
if (!b)
|
||||
goto err;
|
||||
|
||||
BUG_ON(!six_trylock_intent(&b->lock));
|
||||
BUG_ON(!six_trylock_write(&b->lock));
|
||||
out_unlock:
|
||||
BUG_ON(btree_node_hashed(b));
|
||||
BUG_ON(btree_node_write_in_flight(b));
|
||||
|
||||
list_del_init(&b->list);
|
||||
mutex_unlock(&bc->lock);
|
||||
out:
|
||||
b->flags = 0;
|
||||
b->written = 0;
|
||||
b->nsets = 0;
|
||||
b->sib_u64s[0] = 0;
|
||||
b->sib_u64s[1] = 0;
|
||||
b->whiteout_u64s = 0;
|
||||
b->uncompacted_whiteout_u64s = 0;
|
||||
bch2_btree_keys_init(b, &c->expensive_debug_checks);
|
||||
|
||||
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
|
||||
start_time);
|
||||
|
||||
return b;
|
||||
err:
|
||||
/* Try to cannibalize another cached btree node: */
|
||||
if (bc->alloc_lock == current) {
|
||||
b = btree_node_cannibalize(c);
|
||||
list_del_init(&b->list);
|
||||
mutex_unlock(&bc->lock);
|
||||
|
||||
bch2_btree_node_hash_remove(bc, b);
|
||||
|
||||
trace_btree_node_cannibalize(c);
|
||||
goto out;
|
||||
}
|
||||
|
||||
mutex_unlock(&bc->lock);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
/* Slowpath, don't want it inlined into btree_iter_traverse() */
|
||||
static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
|
||||
struct btree_iter *iter,
|
||||
const struct bkey_i *k,
|
||||
unsigned level,
|
||||
enum six_lock_type lock_type,
|
||||
bool sync)
|
||||
{
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
struct btree *b;
|
||||
|
||||
/*
|
||||
* Parent node must be locked, else we could read in a btree node that's
|
||||
* been freed:
|
||||
*/
|
||||
BUG_ON(!btree_node_locked(iter, level + 1));
|
||||
BUG_ON(level >= BTREE_MAX_DEPTH);
|
||||
|
||||
b = bch2_btree_node_mem_alloc(c);
|
||||
if (IS_ERR(b))
|
||||
return b;
|
||||
|
||||
bkey_copy(&b->key, k);
|
||||
if (bch2_btree_node_hash_insert(bc, b, level, iter->btree_id)) {
|
||||
/* raced with another fill: */
|
||||
|
||||
/* mark as unhashed... */
|
||||
bkey_i_to_extent(&b->key)->v._data[0] = 0;
|
||||
|
||||
mutex_lock(&bc->lock);
|
||||
list_add(&b->list, &bc->freeable);
|
||||
mutex_unlock(&bc->lock);
|
||||
|
||||
six_unlock_write(&b->lock);
|
||||
six_unlock_intent(&b->lock);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the btree node wasn't cached, we can't drop our lock on
|
||||
* the parent until after it's added to the cache - because
|
||||
* otherwise we could race with a btree_split() freeing the node
|
||||
* we're trying to lock.
|
||||
*
|
||||
* But the deadlock described below doesn't exist in this case,
|
||||
* so it's safe to not drop the parent lock until here:
|
||||
*/
|
||||
if (btree_node_read_locked(iter, level + 1))
|
||||
btree_node_unlock(iter, level + 1);
|
||||
|
||||
bch2_btree_node_read(c, b, sync);
|
||||
|
||||
six_unlock_write(&b->lock);
|
||||
|
||||
if (!sync) {
|
||||
six_unlock_intent(&b->lock);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (lock_type == SIX_LOCK_read)
|
||||
six_lock_downgrade(&b->lock);
|
||||
|
||||
return b;
|
||||
}
|
||||
|
||||
/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * If IO is necessary and running under generic_make_request, returns -EAGAIN.
 *
 * The btree node will have either a read or an intent lock held, depending on
 * @lock_type.
 */
|
||||
struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
|
||||
const struct bkey_i *k, unsigned level,
|
||||
enum six_lock_type lock_type,
|
||||
bool may_drop_locks)
|
||||
{
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
struct btree *b;
|
||||
struct bset_tree *t;
|
||||
|
||||
/*
|
||||
* XXX: locking optimization
|
||||
*
|
||||
* we can make the locking looser here - caller can drop lock on parent
|
||||
* node before locking child node (and potentially blocking): we just
|
||||
* have to have bch2_btree_node_fill() call relock on the parent and
|
||||
* return -EINTR if that fails
|
||||
*/
|
||||
EBUG_ON(!btree_node_locked(iter, level + 1));
|
||||
EBUG_ON(level >= BTREE_MAX_DEPTH);
|
||||
retry:
|
||||
rcu_read_lock();
|
||||
b = btree_cache_find(bc, k);
|
||||
rcu_read_unlock();
|
||||
|
||||
if (unlikely(!b)) {
|
||||
/*
|
||||
* We must have the parent locked to call bch2_btree_node_fill(),
|
||||
* else we could read in a btree node from disk that's been
|
||||
* freed:
|
||||
*/
|
||||
b = bch2_btree_node_fill(c, iter, k, level, lock_type, true);
|
||||
|
||||
/* We raced and found the btree node in the cache */
|
||||
if (!b)
|
||||
goto retry;
|
||||
|
||||
if (IS_ERR(b))
|
||||
return b;
|
||||
} else {
|
||||
/*
|
||||
* There's a potential deadlock with splits and insertions into
|
||||
* interior nodes we have to avoid:
|
||||
*
|
||||
* The other thread might be holding an intent lock on the node
|
||||
* we want, and they want to update its parent node so they're
|
||||
* going to upgrade their intent lock on the parent node to a
|
||||
* write lock.
|
||||
*
|
||||
* But if we're holding a read lock on the parent, and we're
|
||||
* trying to get the intent lock they're holding, we deadlock.
|
||||
*
|
||||
* So to avoid this we drop the read locks on parent nodes when
|
||||
* we're starting to take intent locks - and handle the race.
|
||||
*
|
||||
* The race is that they might be about to free the node we
|
||||
* want, and dropping our read lock on the parent node lets them
|
||||
* update the parent marking the node we want as freed, and then
|
||||
* free it:
|
||||
*
|
||||
* To guard against this, btree nodes are evicted from the cache
|
||||
* when they're freed - and PTR_HASH() is zeroed out, which we
|
||||
* check for after we lock the node.
|
||||
*
|
||||
* Then, bch2_btree_node_relock() on the parent will fail - because
|
||||
* the parent was modified, when the pointer to the node we want
|
||||
* was removed - and we'll bail out:
|
||||
*/
|
||||
if (btree_node_read_locked(iter, level + 1))
|
||||
btree_node_unlock(iter, level + 1);
|
||||
|
||||
if (!btree_node_lock(b, k->k.p, level, iter,
|
||||
lock_type, may_drop_locks))
|
||||
return ERR_PTR(-EINTR);
|
||||
|
||||
if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) ||
|
||||
b->level != level ||
|
||||
race_fault())) {
|
||||
six_unlock_type(&b->lock, lock_type);
|
||||
if (bch2_btree_node_relock(iter, level + 1))
|
||||
goto retry;
|
||||
|
||||
return ERR_PTR(-EINTR);
|
||||
}
|
||||
}
|
||||
|
||||
wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
|
||||
TASK_UNINTERRUPTIBLE);
|
||||
|
||||
prefetch(b->aux_data);
|
||||
|
||||
for_each_bset(b, t) {
|
||||
void *p = (u64 *) b->aux_data + t->aux_data_offset;
|
||||
|
||||
prefetch(p + L1_CACHE_BYTES * 0);
|
||||
prefetch(p + L1_CACHE_BYTES * 1);
|
||||
prefetch(p + L1_CACHE_BYTES * 2);
|
||||
}
|
||||
|
||||
/* avoid atomic set bit if it's not needed: */
|
||||
if (btree_node_accessed(b))
|
||||
set_btree_node_accessed(b);
|
||||
|
||||
if (unlikely(btree_node_read_error(b))) {
|
||||
six_unlock_type(&b->lock, lock_type);
|
||||
return ERR_PTR(-EIO);
|
||||
}
|
||||
|
||||
EBUG_ON(b->btree_id != iter->btree_id ||
|
||||
BTREE_NODE_LEVEL(b->data) != level ||
|
||||
bkey_cmp(b->data->max_key, k->k.p));
|
||||
|
||||
return b;
|
||||
}
|
||||
|
||||
struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
|
||||
struct btree_iter *iter,
|
||||
struct btree *b,
|
||||
bool may_drop_locks,
|
||||
enum btree_node_sibling sib)
|
||||
{
|
||||
struct btree *parent;
|
||||
struct btree_node_iter node_iter;
|
||||
struct bkey_packed *k;
|
||||
BKEY_PADDED(k) tmp;
|
||||
struct btree *ret = NULL;
|
||||
unsigned level = b->level;
|
||||
|
||||
parent = btree_iter_node(iter, level + 1);
|
||||
if (!parent)
|
||||
return NULL;
|
||||
|
||||
if (!bch2_btree_node_relock(iter, level + 1))
|
||||
goto out_upgrade;
|
||||
|
||||
node_iter = iter->l[parent->level].iter;
|
||||
|
||||
k = bch2_btree_node_iter_peek_all(&node_iter, parent);
|
||||
BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p));
|
||||
|
||||
k = sib == btree_prev_sib
|
||||
? bch2_btree_node_iter_prev(&node_iter, parent)
|
||||
: (bch2_btree_node_iter_advance(&node_iter, parent),
|
||||
bch2_btree_node_iter_peek(&node_iter, parent));
|
||||
if (!k)
|
||||
goto out;
|
||||
|
||||
bch2_bkey_unpack(parent, &tmp.k, k);
|
||||
|
||||
ret = bch2_btree_node_get(c, iter, &tmp.k, level,
|
||||
SIX_LOCK_intent, may_drop_locks);
|
||||
|
||||
if (PTR_ERR_OR_ZERO(ret) == -EINTR && may_drop_locks) {
|
||||
struct btree_iter *linked;
|
||||
|
||||
if (!bch2_btree_node_relock(iter, level + 1))
|
||||
goto out_upgrade;
|
||||
|
||||
/*
|
||||
* We might have got -EINTR because trylock failed, and we're
|
||||
* holding other locks that would cause us to deadlock:
|
||||
*/
|
||||
for_each_linked_btree_iter(iter, linked)
|
||||
if (btree_iter_cmp(iter, linked) < 0)
|
||||
__bch2_btree_iter_unlock(linked);
|
||||
|
||||
if (sib == btree_prev_sib)
|
||||
btree_node_unlock(iter, level);
|
||||
|
||||
ret = bch2_btree_node_get(c, iter, &tmp.k, level,
|
||||
SIX_LOCK_intent, may_drop_locks);
|
||||
|
||||
/*
|
||||
* before btree_iter_relock() calls btree_iter_verify_locks():
|
||||
*/
|
||||
if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
|
||||
btree_node_unlock(iter, level + 1);
|
||||
|
||||
if (!bch2_btree_node_relock(iter, level)) {
|
||||
btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
|
||||
|
||||
if (!IS_ERR(ret)) {
|
||||
six_unlock_intent(&ret->lock);
|
||||
ret = ERR_PTR(-EINTR);
|
||||
}
|
||||
}
|
||||
|
||||
bch2_btree_iter_relock(iter);
|
||||
}
|
||||
out:
|
||||
if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
|
||||
btree_node_unlock(iter, level + 1);
|
||||
|
||||
bch2_btree_iter_verify_locks(iter);
|
||||
|
||||
BUG_ON((!may_drop_locks || !IS_ERR(ret)) &&
|
||||
(iter->uptodate >= BTREE_ITER_NEED_RELOCK ||
|
||||
!btree_node_locked(iter, level)));
|
||||
|
||||
if (!IS_ERR_OR_NULL(ret)) {
|
||||
struct btree *n1 = ret, *n2 = b;
|
||||
|
||||
if (sib != btree_prev_sib)
|
||||
swap(n1, n2);
|
||||
|
||||
BUG_ON(bkey_cmp(btree_type_successor(n1->btree_id,
|
||||
n1->key.k.p),
|
||||
n2->data->min_key));
|
||||
}
|
||||
|
||||
return ret;
|
||||
out_upgrade:
|
||||
if (may_drop_locks)
|
||||
bch2_btree_iter_upgrade(iter, level + 2, true);
|
||||
ret = ERR_PTR(-EINTR);
|
||||
goto out;
|
||||
}
|
||||
|
||||
void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
|
||||
const struct bkey_i *k, unsigned level)
|
||||
{
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
struct btree *b;
|
||||
|
||||
BUG_ON(!btree_node_locked(iter, level + 1));
|
||||
BUG_ON(level >= BTREE_MAX_DEPTH);
|
||||
|
||||
rcu_read_lock();
|
||||
b = btree_cache_find(bc, k);
|
||||
rcu_read_unlock();
|
||||
|
||||
if (b)
|
||||
return;
|
||||
|
||||
bch2_btree_node_fill(c, iter, k, level, SIX_LOCK_read, false);
|
||||
}
|
||||
|
||||
int bch2_print_btree_node(struct bch_fs *c, struct btree *b,
|
||||
char *buf, size_t len)
|
||||
{
|
||||
const struct bkey_format *f = &b->format;
|
||||
struct bset_stats stats;
|
||||
char ptrs[100];
|
||||
|
||||
memset(&stats, 0, sizeof(stats));
|
||||
|
||||
bch2_val_to_text(c, BKEY_TYPE_BTREE, ptrs, sizeof(ptrs),
|
||||
bkey_i_to_s_c(&b->key));
|
||||
bch2_btree_keys_stats(b, &stats);
|
||||
|
||||
return scnprintf(buf, len,
|
||||
"l %u %llu:%llu - %llu:%llu:\n"
|
||||
" ptrs: %s\n"
|
||||
" format: u64s %u fields %u %u %u %u %u\n"
|
||||
" unpack fn len: %u\n"
|
||||
" bytes used %zu/%zu (%zu%% full)\n"
|
||||
" sib u64s: %u, %u (merge threshold %zu)\n"
|
||||
" nr packed keys %u\n"
|
||||
" nr unpacked keys %u\n"
|
||||
" floats %zu\n"
|
||||
" failed unpacked %zu\n"
|
||||
" failed prev %zu\n"
|
||||
" failed overflow %zu\n",
|
||||
b->level,
|
||||
b->data->min_key.inode,
|
||||
b->data->min_key.offset,
|
||||
b->data->max_key.inode,
|
||||
b->data->max_key.offset,
|
||||
ptrs,
|
||||
f->key_u64s,
|
||||
f->bits_per_field[0],
|
||||
f->bits_per_field[1],
|
||||
f->bits_per_field[2],
|
||||
f->bits_per_field[3],
|
||||
f->bits_per_field[4],
|
||||
b->unpack_fn_len,
|
||||
b->nr.live_u64s * sizeof(u64),
|
||||
btree_bytes(c) - sizeof(struct btree_node),
|
||||
b->nr.live_u64s * 100 / btree_max_u64s(c),
|
||||
b->sib_u64s[0],
|
||||
b->sib_u64s[1],
|
||||
BTREE_FOREGROUND_MERGE_THRESHOLD(c),
|
||||
b->nr.packed_keys,
|
||||
b->nr.unpacked_keys,
|
||||
stats.floats,
|
||||
stats.failed_unpacked,
|
||||
stats.failed_prev,
|
||||
stats.failed_overflow);
|
||||
}
91
fs/bcachefs/btree_cache.h
Normal file
@ -0,0 +1,91 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_BTREE_CACHE_H
|
||||
#define _BCACHEFS_BTREE_CACHE_H
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "btree_types.h"
|
||||
#include "extents.h"
|
||||
|
||||
struct btree_iter;
|
||||
|
||||
extern const char * const bch2_btree_ids[];
|
||||
|
||||
void bch2_recalc_btree_reserve(struct bch_fs *);
|
||||
|
||||
void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
|
||||
int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
|
||||
int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
|
||||
unsigned, enum btree_id);
|
||||
|
||||
void bch2_btree_cache_cannibalize_unlock(struct bch_fs *);
|
||||
int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
|
||||
|
||||
struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
|
||||
|
||||
struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
|
||||
const struct bkey_i *, unsigned,
|
||||
enum six_lock_type, bool);
|
||||
|
||||
struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
|
||||
struct btree *, bool,
|
||||
enum btree_node_sibling);
|
||||
|
||||
void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *,
|
||||
const struct bkey_i *, unsigned);
|
||||
|
||||
void bch2_fs_btree_cache_exit(struct bch_fs *);
|
||||
int bch2_fs_btree_cache_init(struct bch_fs *);
|
||||
void bch2_fs_btree_cache_init_early(struct btree_cache *);
|
||||
|
||||
#define PTR_HASH(_k) (bkey_i_to_extent_c(_k)->v._data[0])
|
||||
|
||||
/* is btree node in hash table? */
|
||||
static inline bool btree_node_hashed(struct btree *b)
|
||||
{
|
||||
return bkey_extent_is_data(&b->key.k) && PTR_HASH(&b->key);
|
||||
}
|
||||
|
||||
#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \
|
||||
for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \
|
||||
&(_c)->btree_cache.table), \
|
||||
_iter = 0; _iter < (_tbl)->size; _iter++) \
|
||||
rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash)

static inline size_t btree_bytes(struct bch_fs *c)
{
	return c->opts.btree_node_size << 9;
}

static inline size_t btree_max_u64s(struct bch_fs *c)
{
	return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64);
}

static inline size_t btree_page_order(struct bch_fs *c)
{
	return get_order(btree_bytes(c));
}

static inline size_t btree_pages(struct bch_fs *c)
{
	return 1 << btree_page_order(c);
}

static inline unsigned btree_blocks(struct bch_fs *c)
{
	return c->opts.btree_node_size >> c->block_bits;
}
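
To make these helpers concrete, one illustrative configuration (the numbers are an assumption for the example, not defaults taken from this commit): btree_node_size == 512 (it is shifted by 9, so it is in 512-byte sectors, i.e. 256KiB nodes), 4KiB pages, and block_bits == 3 (4KiB blocks):

	btree_bytes(c)      = 512 << 9          = 262144 bytes (256KiB)
	btree_max_u64s(c)   = (262144 - sizeof(struct btree_node)) / 8
	btree_page_order(c) = get_order(262144) = 6    (64 pages of 4KiB)
	btree_pages(c)      = 1 << 6            = 64
	btree_blocks(c)     = 512 >> 3          = 64   (4KiB blocks per node)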
|
||||
|
||||
#define BTREE_SPLIT_THRESHOLD(c) (btree_blocks(c) * 3 / 4)
|
||||
|
||||
#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3)
|
||||
#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \
|
||||
(BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \
|
||||
(BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2))
|
||||
|
||||
#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->btree_id].b)
|
||||
|
||||
int bch2_print_btree_node(struct bch_fs *, struct btree *,
|
||||
char *, size_t);
|
||||
|
||||
#endif /* _BCACHEFS_BTREE_CACHE_H */
1099
fs/bcachefs/btree_gc.c
Normal file
File diff suppressed because it is too large
113
fs/bcachefs/btree_gc.h
Normal file
@ -0,0 +1,113 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_BTREE_GC_H
|
||||
#define _BCACHEFS_BTREE_GC_H
|
||||
|
||||
#include "btree_types.h"
|
||||
|
||||
enum bkey_type;
|
||||
|
||||
void bch2_coalesce(struct bch_fs *);
|
||||
void bch2_gc(struct bch_fs *);
|
||||
void bch2_gc_thread_stop(struct bch_fs *);
|
||||
int bch2_gc_thread_start(struct bch_fs *);
|
||||
int bch2_initial_gc(struct bch_fs *, struct list_head *);
|
||||
u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *, struct bkey_s_c);
|
||||
int bch2_btree_mark_key_initial(struct bch_fs *, enum bkey_type,
|
||||
struct bkey_s_c);
|
||||
void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned);
|
||||
|
||||
/*
|
||||
* For concurrent mark and sweep (with other index updates), we define a total
|
||||
* ordering of _all_ references GC walks:
|
||||
*
|
||||
* Note that some references will have the same GC position as others - e.g.
|
||||
* everything within the same btree node; in those cases we're relying on
|
||||
* whatever locking exists for where those references live, i.e. the write lock
|
||||
* on a btree node.
|
||||
*
|
||||
* That locking is also required to ensure GC doesn't pass the updater in
|
||||
* between the updater adding/removing the reference and updating the GC marks;
|
||||
* without that, we would at best double count sometimes.
|
||||
*
|
||||
* That part is important - whenever calling bch2_mark_pointers(), a lock _must_
|
||||
* be held that prevents GC from passing the position the updater is at.
|
||||
*
|
||||
* (What about the start of gc, when we're clearing all the marks? GC clears the
|
||||
* mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc
|
||||
* position inside its cmpxchg loop, so crap magically works).
|
||||
*/
|
||||
|
||||
/* Position of (the start of) a gc phase: */
|
||||
static inline struct gc_pos gc_phase(enum gc_phase phase)
|
||||
{
|
||||
return (struct gc_pos) {
|
||||
.phase = phase,
|
||||
.pos = POS_MIN,
|
||||
.level = 0,
|
||||
};
|
||||
}
|
||||
|
||||
static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
|
||||
{
|
||||
if (l.phase != r.phase)
|
||||
return l.phase < r.phase ? -1 : 1;
|
||||
if (bkey_cmp(l.pos, r.pos))
|
||||
return bkey_cmp(l.pos, r.pos);
|
||||
if (l.level != r.level)
|
||||
return l.level < r.level ? -1 : 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline struct gc_pos gc_pos_btree(enum btree_id id,
|
||||
struct bpos pos, unsigned level)
|
||||
{
|
||||
return (struct gc_pos) {
|
||||
.phase = GC_PHASE_BTREE_EXTENTS + id,
|
||||
.pos = pos,
|
||||
.level = level,
|
||||
};
|
||||
}
|
||||
|
||||
/*
|
||||
* GC position of the pointers within a btree node: note, _not_ for &b->key
|
||||
* itself, that lives in the parent node:
|
||||
*/
|
||||
static inline struct gc_pos gc_pos_btree_node(struct btree *b)
|
||||
{
|
||||
return gc_pos_btree(b->btree_id, b->key.k.p, b->level);
|
||||
}
|
||||
|
||||
/*
|
||||
* GC position of the pointer to a btree root: we don't use
|
||||
* gc_pos_pointer_to_btree_node() here to avoid a potential race with
|
||||
* btree_split() increasing the tree depth - the new root will have level > the
|
||||
* old root and thus have a greater gc position than the old root, but that
|
||||
* would be incorrect since once gc has marked the root it's not coming back.
|
||||
*/
|
||||
static inline struct gc_pos gc_pos_btree_root(enum btree_id id)
|
||||
{
|
||||
return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH);
|
||||
}
|
||||
|
||||
static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob)
|
||||
{
|
||||
return (struct gc_pos) {
|
||||
.phase = GC_PHASE_ALLOC,
|
||||
.pos = POS(ob ? ob - c->open_buckets : 0, 0),
|
||||
};
|
||||
}
|
||||
|
||||
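/*
 * Returns true if gc's current position is still strictly before @pos, i.e.
 * this pass of gc has not yet marked the references at @pos:
 */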
static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos)
{
	unsigned seq;
	bool ret;

	do {
		seq = read_seqcount_begin(&c->gc_pos_lock);
		ret = gc_pos_cmp(c->gc_pos, pos) < 0;
	} while (read_seqcount_retry(&c->gc_pos_lock, seq));

	return ret;
}
|
||||
|
||||
#endif /* _BCACHEFS_BTREE_GC_H */
2095
fs/bcachefs/btree_io.c
Normal file
File diff suppressed because it is too large
197
fs/bcachefs/btree_io.h
Normal file
@ -0,0 +1,197 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_BTREE_IO_H
|
||||
#define _BCACHEFS_BTREE_IO_H
|
||||
|
||||
#include "bset.h"
|
||||
#include "extents.h"
|
||||
#include "io_types.h"
|
||||
|
||||
struct bch_fs;
|
||||
struct btree_write;
|
||||
struct btree;
|
||||
struct btree_iter;
|
||||
|
||||
struct btree_read_bio {
|
||||
struct bch_fs *c;
|
||||
u64 start_time;
|
||||
unsigned have_ioref:1;
|
||||
struct extent_pick_ptr pick;
|
||||
struct work_struct work;
|
||||
struct bio bio;
|
||||
};
|
||||
|
||||
struct btree_write_bio {
|
||||
void *data;
|
||||
struct work_struct work;
|
||||
struct bch_write_bio wbio;
|
||||
};
|
||||
|
||||
static inline void btree_node_io_unlock(struct btree *b)
|
||||
{
|
||||
EBUG_ON(!btree_node_write_in_flight(b));
|
||||
clear_btree_node_write_in_flight(b);
|
||||
wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
|
||||
}
|
||||
|
||||
static inline void btree_node_io_lock(struct btree *b)
|
||||
{
|
||||
wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
|
||||
TASK_UNINTERRUPTIBLE);
|
||||
}
|
||||
|
||||
static inline void btree_node_wait_on_io(struct btree *b)
|
||||
{
|
||||
wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
|
||||
TASK_UNINTERRUPTIBLE);
|
||||
}
|
||||
|
||||
static inline bool btree_node_may_write(struct btree *b)
|
||||
{
|
||||
return list_empty_careful(&b->write_blocked) &&
|
||||
!b->will_make_reachable;
|
||||
}
|
||||
|
||||
enum compact_mode {
|
||||
COMPACT_LAZY,
|
||||
COMPACT_WRITTEN,
|
||||
COMPACT_WRITTEN_NO_WRITE_LOCK,
|
||||
};
|
||||
|
||||
bool __bch2_compact_whiteouts(struct bch_fs *, struct btree *, enum compact_mode);
|
||||
|
||||
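/*
 * Compaction heuristic: a bset is worth lazily compacting once more than a
 * third of its u64s (and at least 128 u64s, i.e. 1KiB) are dead:
 */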
static inline unsigned should_compact_bset_lazy(struct btree *b, struct bset_tree *t)
{
	unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s);
	unsigned dead_u64s = bset_u64s - b->nr.bset_u64s[t - b->set];

	return dead_u64s > 128 && dead_u64s * 3 > bset_u64s;
}
|
||||
|
||||
static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b)
|
||||
{
|
||||
struct bset_tree *t;
|
||||
|
||||
for_each_bset(b, t)
|
||||
if (should_compact_bset_lazy(b, t))
|
||||
return __bch2_compact_whiteouts(c, b, COMPACT_LAZY);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
|
||||
|
||||
void bch2_btree_build_aux_trees(struct btree *);
|
||||
void bch2_btree_init_next(struct bch_fs *, struct btree *,
|
||||
struct btree_iter *);
|
||||
|
||||
int bch2_btree_node_read_done(struct bch_fs *, struct btree *, bool);
|
||||
void bch2_btree_node_read(struct bch_fs *, struct btree *, bool);
|
||||
int bch2_btree_root_read(struct bch_fs *, enum btree_id,
|
||||
const struct bkey_i *, unsigned);
|
||||
|
||||
void bch2_btree_complete_write(struct bch_fs *, struct btree *,
|
||||
struct btree_write *);
|
||||
void bch2_btree_write_error_work(struct work_struct *);
|
||||
|
||||
void __bch2_btree_node_write(struct bch_fs *, struct btree *,
|
||||
enum six_lock_type);
|
||||
bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
|
||||
|
||||
void bch2_btree_node_write(struct bch_fs *, struct btree *,
|
||||
enum six_lock_type);
|
||||
|
||||
/*
|
||||
* btree_node_dirty() can be cleared with only a read lock,
|
||||
* and for bch2_btree_node_write_cond() we want to set need_write iff it's
|
||||
* still dirty:
|
||||
*/
|
||||
static inline void set_btree_node_need_write_if_dirty(struct btree *b)
|
||||
{
|
||||
unsigned long old, new, v = READ_ONCE(b->flags);
|
||||
|
||||
do {
|
||||
old = new = v;
|
||||
|
||||
if (!(old & (1 << BTREE_NODE_dirty)))
|
||||
return;
|
||||
|
||||
new |= (1 << BTREE_NODE_need_write);
|
||||
} while ((v = cmpxchg(&b->flags, old, new)) != old);
|
||||
}
|
||||
|
||||
#define bch2_btree_node_write_cond(_c, _b, cond) \
|
||||
do { \
|
||||
while ((_b)->written && btree_node_dirty(_b) && (cond)) { \
|
||||
if (!btree_node_may_write(_b)) { \
|
||||
set_btree_node_need_write_if_dirty(_b); \
|
||||
break; \
|
||||
} \
|
||||
\
|
||||
if (!btree_node_write_in_flight(_b)) { \
|
||||
bch2_btree_node_write(_c, _b, SIX_LOCK_read); \
|
||||
break; \
|
||||
} \
|
||||
\
|
||||
six_unlock_read(&(_b)->lock); \
|
||||
btree_node_wait_on_io(_b); \
|
||||
btree_node_lock_type(c, b, SIX_LOCK_read); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
void bch2_btree_flush_all_reads(struct bch_fs *);
|
||||
void bch2_btree_flush_all_writes(struct bch_fs *);
|
||||
void bch2_btree_verify_flushed(struct bch_fs *);
|
||||
ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *);
|
||||
|
||||
/* Sorting */
|
||||
|
||||
struct btree_node_iter_large {
|
||||
u8 is_extents;
|
||||
u16 used;
|
||||
|
||||
struct btree_node_iter_set data[MAX_BSETS];
|
||||
};
|
||||
|
||||
static inline void
|
||||
__bch2_btree_node_iter_large_init(struct btree_node_iter_large *iter,
|
||||
bool is_extents)
|
||||
{
|
||||
iter->used = 0;
|
||||
iter->is_extents = is_extents;
|
||||
}
|
||||
|
||||
void bch2_btree_node_iter_large_advance(struct btree_node_iter_large *,
|
||||
struct btree *);
|
||||
|
||||
void bch2_btree_node_iter_large_push(struct btree_node_iter_large *,
|
||||
struct btree *,
|
||||
const struct bkey_packed *,
|
||||
const struct bkey_packed *);
|
||||
|
||||
static inline bool bch2_btree_node_iter_large_end(struct btree_node_iter_large *iter)
|
||||
{
|
||||
return !iter->used;
|
||||
}
|
||||
|
||||
static inline struct bkey_packed *
|
||||
bch2_btree_node_iter_large_peek_all(struct btree_node_iter_large *iter,
|
||||
struct btree *b)
|
||||
{
|
||||
return bch2_btree_node_iter_large_end(iter)
|
||||
? NULL
|
||||
: __btree_node_offset_to_key(b, iter->data->k);
|
||||
}
|
||||
|
||||
static inline struct bkey_packed *
|
||||
bch2_btree_node_iter_large_next_all(struct btree_node_iter_large *iter,
|
||||
struct btree *b)
|
||||
{
|
||||
struct bkey_packed *ret = bch2_btree_node_iter_large_peek_all(iter, b);
|
||||
|
||||
if (ret)
|
||||
bch2_btree_node_iter_large_advance(iter, b);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
#endif /* _BCACHEFS_BTREE_IO_H */
1844
fs/bcachefs/btree_iter.c
Normal file
File diff suppressed because it is too large
314
fs/bcachefs/btree_iter.h
Normal file
@ -0,0 +1,314 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_BTREE_ITER_H
|
||||
#define _BCACHEFS_BTREE_ITER_H
|
||||
|
||||
#include "btree_types.h"
|
||||
|
||||
static inline void btree_iter_set_dirty(struct btree_iter *iter,
|
||||
enum btree_iter_uptodate u)
|
||||
{
|
||||
iter->uptodate = max_t(unsigned, iter->uptodate, u);
|
||||
}
|
||||
|
||||
static inline struct btree *btree_iter_node(struct btree_iter *iter,
|
||||
unsigned level)
|
||||
{
|
||||
return level < BTREE_MAX_DEPTH ? iter->l[level].b : NULL;
|
||||
}
|
||||
|
||||
static inline struct btree *btree_node_parent(struct btree_iter *iter,
|
||||
struct btree *b)
|
||||
{
|
||||
return btree_iter_node(iter, b->level + 1);
|
||||
}
|
||||
|
||||
static inline bool btree_iter_linked(const struct btree_iter *iter)
|
||||
{
|
||||
return iter->next != iter;
|
||||
}
|
||||
|
||||
static inline bool __iter_has_node(const struct btree_iter *iter,
|
||||
const struct btree *b)
|
||||
{
|
||||
/*
|
||||
* We don't compare the low bits of the lock sequence numbers because
|
||||
* @iter might have taken a write lock on @b, and we don't want to skip
|
||||
* the linked iterator if the sequence numbers were equal before taking
|
||||
* that write lock. The lock sequence number is incremented by taking
|
||||
* and releasing write locks and is even when unlocked:
|
||||
*/
|
||||
|
||||
return iter->l[b->level].b == b &&
|
||||
iter->lock_seq[b->level] >> 1 == b->lock.state.seq >> 1;
|
||||
}
|
||||
|
||||
static inline struct btree_iter *
|
||||
__next_linked_iter(struct btree_iter *iter, struct btree_iter *linked)
|
||||
{
|
||||
return linked->next != iter ? linked->next : NULL;
|
||||
}
|
||||
|
||||
static inline struct btree_iter *
|
||||
__next_iter_with_node(struct btree_iter *iter, struct btree *b,
|
||||
struct btree_iter *linked)
|
||||
{
|
||||
while (linked && !__iter_has_node(linked, b))
|
||||
linked = __next_linked_iter(iter, linked);
|
||||
|
||||
return linked;
|
||||
}
|
||||
|
||||
/**
|
||||
* for_each_btree_iter - iterate over all iterators linked with @_iter,
|
||||
* including @_iter
|
||||
*/
|
||||
#define for_each_btree_iter(_iter, _linked) \
|
||||
for ((_linked) = (_iter); (_linked); \
|
||||
(_linked) = __next_linked_iter(_iter, _linked))
|
||||
|
||||
/**
|
||||
* for_each_btree_iter_with_node - iterate over all iterators linked with @_iter
|
||||
* that also point to @_b
|
||||
*
|
||||
* @_b is assumed to be locked by @_iter
|
||||
*
|
||||
* Filters out iterators that don't have a valid btree_node iterator for @_b -
|
||||
* i.e. iterators for which bch2_btree_node_relock() would not succeed.
|
||||
*/
|
||||
#define for_each_btree_iter_with_node(_iter, _b, _linked) \
|
||||
for ((_linked) = (_iter); \
|
||||
((_linked) = __next_iter_with_node(_iter, _b, _linked)); \
|
||||
(_linked) = __next_linked_iter(_iter, _linked))
|
||||
|
||||
/**
|
||||
* for_each_linked_btree_iter - iterate over all iterators linked with @_iter,
|
||||
* _not_ including @_iter
|
||||
*/
|
||||
#define for_each_linked_btree_iter(_iter, _linked) \
|
||||
for ((_linked) = (_iter)->next; \
|
||||
(_linked) != (_iter); \
|
||||
(_linked) = (_linked)->next)
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
void bch2_btree_iter_verify(struct btree_iter *, struct btree *);
|
||||
void bch2_btree_iter_verify_locks(struct btree_iter *);
|
||||
#else
|
||||
static inline void bch2_btree_iter_verify(struct btree_iter *iter,
|
||||
struct btree *b) {}
|
||||
static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {}
|
||||
#endif
|
||||
|
||||
void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *,
|
||||
struct btree_node_iter *, struct bset_tree *,
|
||||
struct bkey_packed *, unsigned, unsigned);
|
||||
|
||||
int bch2_btree_iter_unlock(struct btree_iter *);
|
||||
|
||||
bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned);
|
||||
bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned);
|
||||
|
||||
static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter,
|
||||
unsigned new_locks_want,
|
||||
bool may_drop_locks)
|
||||
{
|
||||
new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
|
||||
|
||||
return iter->locks_want < new_locks_want
|
||||
? (may_drop_locks
|
||||
? __bch2_btree_iter_upgrade(iter, new_locks_want)
|
||||
: __bch2_btree_iter_upgrade_nounlock(iter, new_locks_want))
|
||||
: iter->uptodate <= BTREE_ITER_NEED_PEEK;
|
||||
}
|
||||
|
||||
void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned);
|
||||
|
||||
static inline void bch2_btree_iter_downgrade(struct btree_iter *iter)
|
||||
{
|
||||
if (iter->locks_want > (iter->flags & BTREE_ITER_INTENT) ? 1 : 0)
|
||||
__bch2_btree_iter_downgrade(iter, 0);
|
||||
}
|
||||
|
||||
void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *);
|
||||
void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *);
|
||||
|
||||
void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *);
|
||||
|
||||
int __must_check bch2_btree_iter_traverse(struct btree_iter *);
|
||||
|
||||
struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
|
||||
struct btree *bch2_btree_iter_next_node(struct btree_iter *, unsigned);
|
||||
|
||||
struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *);
|
||||
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
|
||||
struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
|
||||
|
||||
struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);
|
||||
struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *);
|
||||
|
||||
void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos);
|
||||
void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos);
|
||||
|
||||
void __bch2_btree_iter_init(struct btree_iter *, struct bch_fs *,
|
||||
enum btree_id, struct bpos,
|
||||
unsigned , unsigned, unsigned);
|
||||
|
||||
static inline void bch2_btree_iter_init(struct btree_iter *iter,
|
||||
struct bch_fs *c, enum btree_id btree_id,
|
||||
struct bpos pos, unsigned flags)
|
||||
{
|
||||
__bch2_btree_iter_init(iter, c, btree_id, pos,
|
||||
flags & BTREE_ITER_INTENT ? 1 : 0, 0,
|
||||
(btree_id == BTREE_ID_EXTENTS
|
||||
? BTREE_ITER_IS_EXTENTS : 0)|flags);
|
||||
}
|
||||
|
||||
void bch2_btree_iter_link(struct btree_iter *, struct btree_iter *);
|
||||
void bch2_btree_iter_unlink(struct btree_iter *);
|
||||
void bch2_btree_iter_copy(struct btree_iter *, struct btree_iter *);
|
||||
|
||||
static inline struct bpos btree_type_successor(enum btree_id id,
|
||||
struct bpos pos)
|
||||
{
|
||||
if (id == BTREE_ID_INODES) {
|
||||
pos.inode++;
|
||||
pos.offset = 0;
|
||||
} else if (id != BTREE_ID_EXTENTS) {
|
||||
pos = bkey_successor(pos);
|
||||
}
|
||||
|
||||
return pos;
|
||||
}
|
||||
|
||||
static inline struct bpos btree_type_predecessor(enum btree_id id,
|
||||
struct bpos pos)
|
||||
{
|
||||
if (id == BTREE_ID_INODES) {
|
||||
--pos.inode;
|
||||
pos.offset = 0;
|
||||
} else /* if (id != BTREE_ID_EXTENTS) */ {
|
||||
pos = bkey_predecessor(pos);
|
||||
}
|
||||
|
||||
return pos;
|
||||
}
|
||||
|
||||
static inline int __btree_iter_cmp(enum btree_id id,
|
||||
struct bpos pos,
|
||||
const struct btree_iter *r)
|
||||
{
|
||||
if (id != r->btree_id)
|
||||
return id < r->btree_id ? -1 : 1;
|
||||
return bkey_cmp(pos, r->pos);
|
||||
}
|
||||
|
||||
static inline int btree_iter_cmp(const struct btree_iter *l,
|
||||
const struct btree_iter *r)
|
||||
{
|
||||
return __btree_iter_cmp(l->btree_id, l->pos, r);
|
||||
}
|
||||
|
||||
/*
|
||||
* Unlocks before scheduling
|
||||
* Note: does not revalidate iterator
|
||||
*/
|
||||
static inline void bch2_btree_iter_cond_resched(struct btree_iter *iter)
|
||||
{
|
||||
if (need_resched()) {
|
||||
bch2_btree_iter_unlock(iter);
|
||||
schedule();
|
||||
} else if (race_fault()) {
|
||||
bch2_btree_iter_unlock(iter);
|
||||
}
|
||||
}
|
||||
|
||||
#define __for_each_btree_node(_iter, _c, _btree_id, _start, \
|
||||
_locks_want, _depth, _flags, _b) \
|
||||
for (__bch2_btree_iter_init((_iter), (_c), (_btree_id), _start, \
|
||||
_locks_want, _depth, \
|
||||
_flags|BTREE_ITER_NODES), \
|
||||
_b = bch2_btree_iter_peek_node(_iter); \
|
||||
(_b); \
|
||||
(_b) = bch2_btree_iter_next_node(_iter, _depth))
|
||||
|
||||
#define for_each_btree_node(_iter, _c, _btree_id, _start, _flags, _b) \
|
||||
__for_each_btree_node(_iter, _c, _btree_id, _start, 0, 0, _flags, _b)
|
||||
|
||||
static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter,
|
||||
unsigned flags)
|
||||
{
|
||||
return flags & BTREE_ITER_SLOTS
|
||||
? bch2_btree_iter_peek_slot(iter)
|
||||
: bch2_btree_iter_peek(iter);
|
||||
}
|
||||
|
||||
static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter,
|
||||
unsigned flags)
|
||||
{
|
||||
bch2_btree_iter_cond_resched(iter);
|
||||
|
||||
return flags & BTREE_ITER_SLOTS
|
||||
? bch2_btree_iter_next_slot(iter)
|
||||
: bch2_btree_iter_next(iter);
|
||||
}
|
||||
|
||||
#define for_each_btree_key(_iter, _c, _btree_id, _start, _flags, _k)	\
	for (bch2_btree_iter_init((_iter), (_c), (_btree_id),		\
				  (_start), (_flags)),			\
	     (_k) = __bch2_btree_iter_peek(_iter, _flags);		\
	     !IS_ERR_OR_NULL((_k).k);					\
	     (_k) = __bch2_btree_iter_next(_iter, _flags))
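
A minimal sketch of walking a btree with this macro (illustrative only; it assumes a struct bch_fs *c and elides the error handling a real caller would add):

	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 0, k) {
		/* inspect k.k / k.v here */
	}
	ret = bch2_btree_iter_unlock(&iter);

The loop terminates once the iterator yields a NULL or error key (the IS_ERR_OR_NULL check above); bch2_btree_iter_unlock() drops the remaining node locks, and its int return is presumably how callers pick up an error that cut the walk short (an assumption, since btree_iter.c's diff is suppressed above).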
|
||||
|
||||
#define for_each_btree_key_continue(_iter, _flags, _k) \
|
||||
for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \
|
||||
!IS_ERR_OR_NULL((_k).k); \
|
||||
(_k) = __bch2_btree_iter_next(_iter, _flags))
|
||||
|
||||
static inline int btree_iter_err(struct bkey_s_c k)
|
||||
{
|
||||
return PTR_ERR_OR_ZERO(k.k);
|
||||
}
|
||||
|
||||
/* new multiple iterator interface: */
|
||||
|
||||
int bch2_trans_preload_iters(struct btree_trans *);
|
||||
void bch2_trans_iter_free(struct btree_trans *,
|
||||
struct btree_iter *);
|
||||
|
||||
struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id,
|
||||
struct bpos, unsigned, u64);
|
||||
struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *,
|
||||
struct btree_iter *, u64);
|
||||
|
||||
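/*
 * The 64 bit id passed to __bch2_trans_get_iter()/__bch2_trans_copy_iter()
 * below identifies the call site: the caller's return address ends up in the
 * high 32 bits and the current instruction pointer in the low 32 bits (the
 * first shift of the zero-initialized ret is a no-op):
 */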
static __always_inline u64 __btree_iter_id(void)
{
	u64 ret = 0;

	ret <<= 32;
	ret |= _RET_IP_ & U32_MAX;
	ret <<= 32;
	ret |= _THIS_IP_ & U32_MAX;
	return ret;
}
|
||||
|
||||
static __always_inline struct btree_iter *
|
||||
bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id,
|
||||
struct bpos pos, unsigned flags)
|
||||
{
|
||||
return __bch2_trans_get_iter(trans, btree_id, pos, flags,
|
||||
__btree_iter_id());
|
||||
}
|
||||
|
||||
static __always_inline struct btree_iter *
|
||||
bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src)
|
||||
{
|
||||
|
||||
return __bch2_trans_copy_iter(trans, src, __btree_iter_id());
|
||||
}
|
||||
|
||||
void *bch2_trans_kmalloc(struct btree_trans *, size_t);
|
||||
int bch2_trans_unlock(struct btree_trans *);
|
||||
void bch2_trans_begin(struct btree_trans *);
|
||||
void bch2_trans_init(struct btree_trans *, struct bch_fs *);
|
||||
int bch2_trans_exit(struct btree_trans *);
|
||||
|
||||
#endif /* _BCACHEFS_BTREE_ITER_H */
196
fs/bcachefs/btree_locking.h
Normal file
@ -0,0 +1,196 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_BTREE_LOCKING_H
|
||||
#define _BCACHEFS_BTREE_LOCKING_H
|
||||
|
||||
/*
|
||||
* Only for internal btree use:
|
||||
*
|
||||
* The btree iterator tracks what locks it wants to take, and what locks it
|
||||
* currently has - here we have wrappers for locking/unlocking btree nodes and
|
||||
* updating the iterator state
|
||||
*/
|
||||
|
||||
#include "btree_iter.h"
|
||||
#include "btree_io.h"
|
||||
#include "six.h"
|
||||
|
||||
/* matches six lock types */
|
||||
enum btree_node_locked_type {
|
||||
BTREE_NODE_UNLOCKED = -1,
|
||||
BTREE_NODE_READ_LOCKED = SIX_LOCK_read,
|
||||
BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent,
|
||||
};
|
||||
|
||||
static inline int btree_node_locked_type(struct btree_iter *iter,
|
||||
unsigned level)
|
||||
{
|
||||
/*
|
||||
* We're relying on the fact that if nodes_intent_locked is set
|
||||
* nodes_locked must be set as well, so that we can compute without
|
||||
* branches:
|
||||
*/
|
||||
return BTREE_NODE_UNLOCKED +
|
||||
((iter->nodes_locked >> level) & 1) +
|
||||
((iter->nodes_intent_locked >> level) & 1);
|
||||
}
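/*
 * Worked example of the branchless computation above: given the invariant
 * that an intent lock also sets the corresponding nodes_locked bit,
 *
 *	locked bit	intent bit	result
 *	    0		    0		-1  BTREE_NODE_UNLOCKED
 *	    1		    0		 0  BTREE_NODE_READ_LOCKED
 *	    1		    1		 1  BTREE_NODE_INTENT_LOCKED
 */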
|
||||
|
||||
static inline bool btree_node_intent_locked(struct btree_iter *iter,
|
||||
unsigned level)
|
||||
{
|
||||
return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED;
|
||||
}
|
||||
|
||||
static inline bool btree_node_read_locked(struct btree_iter *iter,
|
||||
unsigned level)
|
||||
{
|
||||
return btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED;
|
||||
}
|
||||
|
||||
static inline bool btree_node_locked(struct btree_iter *iter, unsigned level)
|
||||
{
|
||||
return iter->nodes_locked & (1 << level);
|
||||
}
|
||||
|
||||
static inline void mark_btree_node_unlocked(struct btree_iter *iter,
|
||||
unsigned level)
|
||||
{
|
||||
iter->nodes_locked &= ~(1 << level);
|
||||
iter->nodes_intent_locked &= ~(1 << level);
|
||||
}
|
||||
|
||||
static inline void mark_btree_node_locked(struct btree_iter *iter,
|
||||
unsigned level,
|
||||
enum six_lock_type type)
|
||||
{
|
||||
/* relying on this to avoid a branch */
|
||||
BUILD_BUG_ON(SIX_LOCK_read != 0);
|
||||
BUILD_BUG_ON(SIX_LOCK_intent != 1);
|
||||
|
||||
iter->nodes_locked |= 1 << level;
|
||||
iter->nodes_intent_locked |= type << level;
|
||||
}
|
||||
|
||||
static inline void mark_btree_node_intent_locked(struct btree_iter *iter,
|
||||
unsigned level)
|
||||
{
|
||||
mark_btree_node_locked(iter, level, SIX_LOCK_intent);
|
||||
}
|
||||
|
||||
static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level)
|
||||
{
|
||||
return level < iter->locks_want
|
||||
? SIX_LOCK_intent
|
||||
: SIX_LOCK_read;
|
||||
}
|
||||
|
||||
static inline enum btree_node_locked_type
|
||||
btree_lock_want(struct btree_iter *iter, int level)
|
||||
{
|
||||
if (level < iter->level)
|
||||
return BTREE_NODE_UNLOCKED;
|
||||
if (level < iter->locks_want)
|
||||
return BTREE_NODE_INTENT_LOCKED;
|
||||
if (level == iter->level)
|
||||
return BTREE_NODE_READ_LOCKED;
|
||||
return BTREE_NODE_UNLOCKED;
|
||||
}
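/*
 * Example (sketch): with iter->level == 1 and iter->locks_want == 1,
 * btree_lock_want() returns BTREE_NODE_UNLOCKED for level 0,
 * BTREE_NODE_READ_LOCKED for level 1 and BTREE_NODE_UNLOCKED above;
 * raising locks_want to 2 turns level 1 into BTREE_NODE_INTENT_LOCKED.
 */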
|
||||
|
||||
static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
|
||||
{
|
||||
int lock_type = btree_node_locked_type(iter, level);
|
||||
|
||||
EBUG_ON(level >= BTREE_MAX_DEPTH);
|
||||
|
||||
if (lock_type != BTREE_NODE_UNLOCKED)
|
||||
six_unlock_type(&iter->l[level].b->lock, lock_type);
|
||||
mark_btree_node_unlocked(iter, level);
|
||||
}
|
||||
|
||||
static inline void __bch2_btree_iter_unlock(struct btree_iter *iter)
|
||||
{
|
||||
btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
|
||||
|
||||
while (iter->nodes_locked)
|
||||
btree_node_unlock(iter, __ffs(iter->nodes_locked));
|
||||
}
|
||||
|
||||
static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
|
||||
{
|
||||
switch (type) {
|
||||
case SIX_LOCK_read:
|
||||
return BCH_TIME_btree_lock_contended_read;
|
||||
case SIX_LOCK_intent:
|
||||
return BCH_TIME_btree_lock_contended_intent;
|
||||
case SIX_LOCK_write:
|
||||
return BCH_TIME_btree_lock_contended_write;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* wrapper around six locks that just traces lock contended time
|
||||
*/
|
||||
static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b,
|
||||
enum six_lock_type type)
|
||||
{
|
||||
u64 start_time = local_clock();
|
||||
|
||||
six_lock_type(&b->lock, type, NULL, NULL);
|
||||
bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time);
|
||||
}
|
||||
|
||||
static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b,
|
||||
enum six_lock_type type)
|
||||
{
|
||||
if (!six_trylock_type(&b->lock, type))
|
||||
__btree_node_lock_type(c, b, type);
|
||||
}
|
||||
|
||||
bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned,
|
||||
struct btree_iter *, enum six_lock_type, bool);
|
||||
|
||||
static inline bool btree_node_lock(struct btree *b, struct bpos pos,
|
||||
unsigned level,
|
||||
struct btree_iter *iter,
|
||||
enum six_lock_type type,
|
||||
bool may_drop_locks)
|
||||
{
|
||||
EBUG_ON(level >= BTREE_MAX_DEPTH);
|
||||
|
||||
return likely(six_trylock_type(&b->lock, type)) ||
|
||||
__bch2_btree_node_lock(b, pos, level, iter,
|
||||
type, may_drop_locks);
|
||||
}
|
||||
|
||||
bool __bch2_btree_node_relock(struct btree_iter *, unsigned);
|
||||
|
||||
static inline bool bch2_btree_node_relock(struct btree_iter *iter,
|
||||
unsigned level)
|
||||
{
|
||||
EBUG_ON(btree_node_locked(iter, level) &&
|
||||
btree_node_locked_type(iter, level) !=
|
||||
__btree_lock_want(iter, level));
|
||||
|
||||
return likely(btree_node_locked(iter, level)) ||
|
||||
__bch2_btree_node_relock(iter, level);
|
||||
}
|
||||
|
||||
bool bch2_btree_iter_relock(struct btree_iter *);
|
||||
|
||||
void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
|
||||
|
||||
void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
|
||||
|
||||
static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
|
||||
{
|
||||
EBUG_ON(iter->l[b->level].b != b);
|
||||
EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq);
|
||||
|
||||
if (!six_trylock_write(&b->lock))
|
||||
__bch2_btree_node_lock_write(b, iter);
|
||||
}
|
||||
|
||||
#endif /* _BCACHEFS_BTREE_LOCKING_H */
|
||||
|
||||
|
479
fs/bcachefs/btree_types.h
Normal file
@ -0,0 +1,479 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_BTREE_TYPES_H
|
||||
#define _BCACHEFS_BTREE_TYPES_H
|
||||
|
||||
#include <linux/list.h>
|
||||
#include <linux/rhashtable.h>
|
||||
|
||||
#include "bkey_methods.h"
|
||||
#include "journal_types.h"
|
||||
#include "six.h"
|
||||
|
||||
struct open_bucket;
|
||||
struct btree_update;
|
||||
|
||||
#define MAX_BSETS 3U
|
||||
|
||||
struct btree_nr_keys {
|
||||
|
||||
/*
|
||||
* Amount of live metadata (i.e. size of node after a compaction) in
|
||||
* units of u64s
|
||||
*/
|
||||
u16 live_u64s;
|
||||
u16 bset_u64s[MAX_BSETS];
|
||||
|
||||
/* live keys only: */
|
||||
u16 packed_keys;
|
||||
u16 unpacked_keys;
|
||||
};
|
||||
|
||||
struct bset_tree {
|
||||
/*
|
||||
* We construct a binary tree in an array as if the array
|
||||
* started at 1, so that things line up on the same cachelines
|
||||
* better: see comments in bset.c at cacheline_to_bkey() for
|
||||
* details
|
||||
*/
|
||||
|
||||
/* size of the binary tree and prev array */
|
||||
u16 size;
|
||||
|
||||
/* function of size - precalculated for to_inorder() */
|
||||
u16 extra;
|
||||
|
||||
u16 data_offset;
|
||||
u16 aux_data_offset;
|
||||
u16 end_offset;
|
||||
|
||||
struct bpos max_key;
|
||||
};
|
||||
|
||||
struct btree_write {
|
||||
struct journal_entry_pin journal;
|
||||
struct closure_waitlist wait;
|
||||
};
|
||||
|
||||
struct btree_ob_ref {
|
||||
u8 nr;
|
||||
u8 refs[BCH_REPLICAS_MAX];
|
||||
};
|
||||
|
||||
struct btree_alloc {
|
||||
struct btree_ob_ref ob;
|
||||
BKEY_PADDED(k);
|
||||
};
|
||||
|
||||
struct btree {
|
||||
/* Hottest entries first */
|
||||
struct rhash_head hash;
|
||||
|
||||
/* Key/pointer for this btree node */
|
||||
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
|
||||
|
||||
struct six_lock lock;
|
||||
|
||||
unsigned long flags;
|
||||
u16 written;
|
||||
u8 level;
|
||||
u8 btree_id;
|
||||
u8 nsets;
|
||||
u8 nr_key_bits;
|
||||
|
||||
struct bkey_format format;
|
||||
|
||||
struct btree_node *data;
|
||||
void *aux_data;
|
||||
|
||||
/*
|
||||
* Sets of sorted keys - the real btree node - plus a binary search tree
|
||||
*
|
||||
* set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
|
||||
* to the memory we have allocated for this btree node. Additionally,
|
||||
* set[0]->data points to the entire btree node as it exists on disk.
|
||||
*/
|
||||
struct bset_tree set[MAX_BSETS];
|
||||
|
||||
struct btree_nr_keys nr;
|
||||
u16 sib_u64s[2];
|
||||
u16 whiteout_u64s;
|
||||
u16 uncompacted_whiteout_u64s;
|
||||
u8 page_order;
|
||||
u8 unpack_fn_len;
|
||||
|
||||
/*
|
||||
* XXX: add a delete sequence number, so when bch2_btree_node_relock()
|
||||
* fails because the lock sequence number has changed - i.e. the
|
||||
* contents were modified - we can still relock the node if it's still
|
||||
* the one we want, without redoing the traversal
|
||||
*/
|
||||
|
||||
/*
|
||||
* For asynchronous splits/interior node updates:
|
||||
* When we do a split, we allocate new child nodes and update the parent
|
||||
* node to point to them: we update the parent in memory immediately,
|
||||
* but then we must wait until the children have been written out before
|
||||
* the update to the parent can be written - this is a list of the
|
||||
* btree_updates that are blocking this node from being
|
||||
* written:
|
||||
*/
|
||||
struct list_head write_blocked;
|
||||
|
||||
/*
|
||||
* Also for asynchronous splits/interior node updates:
|
||||
* If a btree node isn't reachable yet, we don't want to kick off
|
||||
* another write - because that write also won't yet be reachable and
|
||||
* marking it as completed before it's reachable would be incorrect:
|
||||
*/
|
||||
unsigned long will_make_reachable;
|
||||
|
||||
struct btree_ob_ref ob;
|
||||
|
||||
/* lru list */
|
||||
struct list_head list;
|
||||
|
||||
struct btree_write writes[2];
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
bool *expensive_debug_checks;
|
||||
#endif
|
||||
};
|
||||
|
||||
struct btree_cache {
|
||||
struct rhashtable table;
|
||||
bool table_init_done;
|
||||
/*
|
||||
* We never free a struct btree, except on shutdown - we just put it on
|
||||
* the btree_cache_freed list and reuse it later. This simplifies the
|
||||
* code, and it doesn't cost us much memory as the memory usage is
|
||||
* dominated by buffers that hold the actual btree node data and those
|
||||
* can be freed - and the number of struct btrees allocated is
|
||||
* effectively bounded.
|
||||
*
|
||||
* btree_cache_freeable effectively is a small cache - we use it because
|
||||
* high order page allocations can be rather expensive, and it's quite
|
||||
* common to delete and allocate btree nodes in quick succession. It
|
||||
* should never grow past ~2-3 nodes in practice.
|
||||
*/
|
||||
struct mutex lock;
|
||||
struct list_head live;
|
||||
struct list_head freeable;
|
||||
struct list_head freed;
|
||||
|
||||
/* Number of elements in live + freeable lists */
|
||||
unsigned used;
|
||||
unsigned reserve;
|
||||
struct shrinker shrink;
|
||||
|
||||
/*
|
||||
* If we need to allocate memory for a new btree node and that
|
||||
* allocation fails, we can cannibalize another node in the btree cache
|
||||
* to satisfy the allocation - lock to guarantee only one thread does
|
||||
* this at a time:
|
||||
*/
|
||||
struct task_struct *alloc_lock;
|
||||
struct closure_waitlist alloc_wait;
|
||||
};
|
||||
|
||||
struct btree_node_iter {
|
||||
u8 is_extents;
|
||||
|
||||
struct btree_node_iter_set {
|
||||
u16 k, end;
|
||||
} data[MAX_BSETS];
|
||||
};
|
||||
|
||||
enum btree_iter_type {
|
||||
BTREE_ITER_KEYS,
|
||||
BTREE_ITER_SLOTS,
|
||||
BTREE_ITER_NODES,
|
||||
};
|
||||
|
||||
#define BTREE_ITER_TYPE ((1 << 2) - 1)
|
||||
|
||||
#define BTREE_ITER_INTENT (1 << 2)
|
||||
#define BTREE_ITER_PREFETCH (1 << 3)
|
||||
/*
|
||||
* Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
|
||||
* @pos or the first key strictly greater than @pos
|
||||
*/
|
||||
#define BTREE_ITER_IS_EXTENTS (1 << 4)
|
||||
/*
|
||||
* indicates we need to call bch2_btree_iter_traverse() to revalidate iterator:
|
||||
*/
|
||||
#define BTREE_ITER_AT_END_OF_LEAF (1 << 5)
|
||||
#define BTREE_ITER_ERROR (1 << 6)
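/*
 * Example (sketch): the iterator type and flags are ORed together when an
 * iterator is initialized - e.g. an update path that wants to see empty
 * slots might pass BTREE_ITER_SLOTS|BTREE_ITER_INTENT, and a long
 * sequential scan might add BTREE_ITER_PREFETCH.
 */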
|
||||
|
||||
enum btree_iter_uptodate {
|
||||
BTREE_ITER_UPTODATE = 0,
|
||||
BTREE_ITER_NEED_PEEK = 1,
|
||||
BTREE_ITER_NEED_RELOCK = 2,
|
||||
BTREE_ITER_NEED_TRAVERSE = 3,
|
||||
};
|
||||
|
||||
/*
|
||||
* @pos - iterator's current position
|
||||
* @level - current btree depth
|
||||
* @locks_want - btree level below which we start taking intent locks
|
||||
* @nodes_locked - bitmask indicating which nodes in @nodes are locked
|
||||
* @nodes_intent_locked - bitmask indicating which locks are intent locks
|
||||
*/
|
||||
struct btree_iter {
|
||||
struct bch_fs *c;
|
||||
struct bpos pos;
|
||||
|
||||
u8 flags;
|
||||
enum btree_iter_uptodate uptodate:4;
|
||||
enum btree_id btree_id:4;
|
||||
unsigned level:4,
|
||||
locks_want:4,
|
||||
nodes_locked:4,
|
||||
nodes_intent_locked:4;
|
||||
|
||||
struct btree_iter_level {
|
||||
struct btree *b;
|
||||
struct btree_node_iter iter;
|
||||
} l[BTREE_MAX_DEPTH];
|
||||
|
||||
u32 lock_seq[BTREE_MAX_DEPTH];
|
||||
|
||||
/*
|
||||
* Current unpacked key - so that bch2_btree_iter_next()/
|
||||
* bch2_btree_iter_next_slot() can correctly advance pos.
|
||||
*/
|
||||
struct bkey k;
|
||||
|
||||
/*
|
||||
* Circular linked list of linked iterators: linked iterators share
|
||||
* locks (e.g. two linked iterators may have the same node intent
|
||||
* locked, or read and write locked, at the same time), and insertions
|
||||
* through one iterator won't invalidate the other linked iterators.
|
||||
*/
|
||||
|
||||
/* Must come last: */
|
||||
struct btree_iter *next;
|
||||
};
|
||||
|
||||
#define BTREE_ITER_MAX 8
|
||||
|
||||
struct btree_insert_entry {
|
||||
struct btree_iter *iter;
|
||||
struct bkey_i *k;
|
||||
unsigned extra_res;
|
||||
/*
|
||||
* true if entire key was inserted - can only be false for
|
||||
* extents
|
||||
*/
|
||||
bool done;
|
||||
};
|
||||
|
||||
struct btree_trans {
|
||||
struct bch_fs *c;
|
||||
|
||||
u8 nr_iters;
|
||||
u8 iters_live;
|
||||
u8 iters_linked;
|
||||
u8 nr_updates;
|
||||
|
||||
unsigned mem_top;
|
||||
unsigned mem_bytes;
|
||||
void *mem;
|
||||
|
||||
struct btree_iter *iters;
|
||||
u64 iter_ids[BTREE_ITER_MAX];
|
||||
|
||||
struct btree_insert_entry updates[BTREE_ITER_MAX];
|
||||
|
||||
struct btree_iter iters_onstack[2];
|
||||
};
|
||||
|
||||
#define BTREE_FLAG(flag) \
|
||||
static inline bool btree_node_ ## flag(struct btree *b) \
|
||||
{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
|
||||
\
|
||||
static inline void set_btree_node_ ## flag(struct btree *b) \
|
||||
{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \
|
||||
\
|
||||
static inline void clear_btree_node_ ## flag(struct btree *b) \
|
||||
{ clear_bit(BTREE_NODE_ ## flag, &b->flags); }
|
||||
|
||||
enum btree_flags {
|
||||
BTREE_NODE_read_in_flight,
|
||||
BTREE_NODE_read_error,
|
||||
BTREE_NODE_dirty,
|
||||
BTREE_NODE_need_write,
|
||||
BTREE_NODE_noevict,
|
||||
BTREE_NODE_write_idx,
|
||||
BTREE_NODE_accessed,
|
||||
BTREE_NODE_write_in_flight,
|
||||
BTREE_NODE_just_written,
|
||||
BTREE_NODE_dying,
|
||||
BTREE_NODE_fake,
|
||||
};
|
||||
|
||||
BTREE_FLAG(read_in_flight);
|
||||
BTREE_FLAG(read_error);
|
||||
BTREE_FLAG(dirty);
|
||||
BTREE_FLAG(need_write);
|
||||
BTREE_FLAG(noevict);
|
||||
BTREE_FLAG(write_idx);
|
||||
BTREE_FLAG(accessed);
|
||||
BTREE_FLAG(write_in_flight);
|
||||
BTREE_FLAG(just_written);
|
||||
BTREE_FLAG(dying);
|
||||
BTREE_FLAG(fake);
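/*
 * Example (sketch): BTREE_FLAG(dirty) above generates the three accessors
 * used throughout the btree code:
 *
 *	btree_node_dirty(b)		test_bit(BTREE_NODE_dirty, &b->flags)
 *	set_btree_node_dirty(b)		set_bit(BTREE_NODE_dirty, &b->flags)
 *	clear_btree_node_dirty(b)	clear_bit(BTREE_NODE_dirty, &b->flags)
 */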
|
||||
|
||||
static inline struct btree_write *btree_current_write(struct btree *b)
|
||||
{
|
||||
return b->writes + btree_node_write_idx(b);
|
||||
}
|
||||
|
||||
static inline struct btree_write *btree_prev_write(struct btree *b)
|
||||
{
|
||||
return b->writes + (btree_node_write_idx(b) ^ 1);
|
||||
}
|
||||
|
||||
static inline struct bset_tree *bset_tree_last(struct btree *b)
|
||||
{
|
||||
EBUG_ON(!b->nsets);
|
||||
return b->set + b->nsets - 1;
|
||||
}
|
||||
|
||||
static inline struct bset *bset(const struct btree *b,
|
||||
const struct bset_tree *t)
|
||||
{
|
||||
return (void *) b->data + t->data_offset * sizeof(u64);
|
||||
}
|
||||
|
||||
static inline struct bset *btree_bset_first(struct btree *b)
|
||||
{
|
||||
return bset(b, b->set);
|
||||
}
|
||||
|
||||
static inline struct bset *btree_bset_last(struct btree *b)
|
||||
{
|
||||
return bset(b, bset_tree_last(b));
|
||||
}
|
||||
|
||||
static inline u16
|
||||
__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k)
|
||||
{
|
||||
size_t ret = (u64 *) k - (u64 *) b->data - 1;
|
||||
|
||||
EBUG_ON(ret > U16_MAX);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline struct bkey_packed *
|
||||
__btree_node_offset_to_key(const struct btree *b, u16 k)
|
||||
{
|
||||
return (void *) ((u64 *) b->data + k + 1);
|
||||
}
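/*
 * Example (sketch): the two helpers above are inverses - for any packed key
 * k in node b,
 *
 *	__btree_node_offset_to_key(b, __btree_node_key_to_offset(b, k)) == k
 *
 * offsets are counted in u64s from b->data, biased by one u64.
 */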
|
||||
|
||||
#define btree_bkey_first(_b, _t) (bset(_b, _t)->start)
|
||||
|
||||
#define btree_bkey_last(_b, _t) \
|
||||
({ \
|
||||
EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \
|
||||
vstruct_last(bset(_b, _t))); \
|
||||
\
|
||||
__btree_node_offset_to_key(_b, (_t)->end_offset); \
|
||||
})
|
||||
|
||||
static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
|
||||
{
|
||||
t->end_offset =
|
||||
__btree_node_key_to_offset(b, vstruct_last(bset(b, t)));
|
||||
btree_bkey_last(b, t);
|
||||
}
|
||||
|
||||
static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
|
||||
const struct bset *i)
|
||||
{
|
||||
t->data_offset = (u64 *) i - (u64 *) b->data;
|
||||
|
||||
EBUG_ON(bset(b, t) != i);
|
||||
|
||||
set_btree_bset_end(b, t);
|
||||
}
|
||||
|
||||
static inline unsigned bset_byte_offset(struct btree *b, void *i)
|
||||
{
|
||||
return i - (void *) b->data;
|
||||
}
|
||||
|
||||
/* Type of keys @b contains: */
|
||||
static inline enum bkey_type btree_node_type(struct btree *b)
|
||||
{
|
||||
return b->level ? BKEY_TYPE_BTREE : b->btree_id;
|
||||
}
|
||||
|
||||
static inline const struct bkey_ops *btree_node_ops(struct btree *b)
|
||||
{
|
||||
return &bch2_bkey_ops[btree_node_type(b)];
|
||||
}
|
||||
|
||||
static inline bool btree_node_has_ptrs(struct btree *b)
|
||||
{
|
||||
return btree_type_has_ptrs(btree_node_type(b));
|
||||
}
|
||||
|
||||
static inline bool btree_node_is_extents(struct btree *b)
|
||||
{
|
||||
return btree_node_type(b) == BKEY_TYPE_EXTENTS;
|
||||
}
|
||||
|
||||
struct btree_root {
|
||||
struct btree *b;
|
||||
|
||||
struct btree_update *as;
|
||||
|
||||
/* On disk root - see async splits: */
|
||||
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
|
||||
u8 level;
|
||||
u8 alive;
|
||||
};
|
||||
|
||||
/*
|
||||
* Optional hook that will be called just prior to a btree node update, when
|
||||
* we're holding the write lock and we know what key is about to be overwritten:
|
||||
*/
|
||||
|
||||
struct btree_iter;
|
||||
struct btree_node_iter;
|
||||
|
||||
enum btree_insert_ret {
|
||||
BTREE_INSERT_OK,
|
||||
/* extent spanned multiple leaf nodes: have to traverse to next node: */
|
||||
BTREE_INSERT_NEED_TRAVERSE,
|
||||
/* write lock held for too long */
|
||||
BTREE_INSERT_NEED_RESCHED,
|
||||
/* leaf node needs to be split */
|
||||
BTREE_INSERT_BTREE_NODE_FULL,
|
||||
BTREE_INSERT_JOURNAL_RES_FULL,
|
||||
BTREE_INSERT_ENOSPC,
|
||||
BTREE_INSERT_NEED_GC_LOCK,
|
||||
};
|
||||
|
||||
struct extent_insert_hook {
|
||||
enum btree_insert_ret
|
||||
(*fn)(struct extent_insert_hook *, struct bpos, struct bpos,
|
||||
struct bkey_s_c, const struct bkey_i *);
|
||||
};
|
||||
|
||||
enum btree_gc_coalesce_fail_reason {
|
||||
BTREE_GC_COALESCE_FAIL_RESERVE_GET,
|
||||
BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
|
||||
BTREE_GC_COALESCE_FAIL_FORMAT_FITS,
|
||||
};
|
||||
|
||||
enum btree_node_sibling {
|
||||
btree_prev_sib,
|
||||
btree_next_sib,
|
||||
};
|
||||
|
||||
typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *,
|
||||
struct btree *,
|
||||
struct btree_node_iter *);
|
||||
|
||||
#endif /* _BCACHEFS_BTREE_TYPES_H */
|
168
fs/bcachefs/btree_update.h
Normal file
@ -0,0 +1,168 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_BTREE_UPDATE_H
|
||||
#define _BCACHEFS_BTREE_UPDATE_H
|
||||
|
||||
#include "btree_iter.h"
|
||||
#include "journal.h"
|
||||
|
||||
struct bch_fs;
|
||||
struct btree;
|
||||
struct btree_insert;
|
||||
|
||||
void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *,
|
||||
struct btree_iter *);
|
||||
bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
|
||||
struct btree_node_iter *, struct bkey_i *);
|
||||
void bch2_btree_journal_key(struct btree_insert *trans, struct btree_iter *,
|
||||
struct bkey_i *);
|
||||
|
||||
/* Normal update interface: */
|
||||
|
||||
struct btree_insert {
|
||||
struct bch_fs *c;
|
||||
struct disk_reservation *disk_res;
|
||||
struct journal_res journal_res;
|
||||
u64 *journal_seq;
|
||||
struct extent_insert_hook *hook;
|
||||
unsigned flags;
|
||||
bool did_work;
|
||||
|
||||
unsigned short nr;
|
||||
struct btree_insert_entry *entries;
|
||||
};
|
||||
|
||||
int __bch2_btree_insert_at(struct btree_insert *);
|
||||
|
||||
#define BTREE_INSERT_ENTRY(_iter, _k) \
|
||||
((struct btree_insert_entry) { \
|
||||
.iter = (_iter), \
|
||||
.k = (_k), \
|
||||
.done = false, \
|
||||
})
|
||||
|
||||
#define BTREE_INSERT_ENTRY_EXTRA_RES(_iter, _k, _extra) \
|
||||
((struct btree_insert_entry) { \
|
||||
.iter = (_iter), \
|
||||
.k = (_k), \
|
||||
.extra_res = (_extra), \
|
||||
.done = false, \
|
||||
})
|
||||
|
||||
/**
|
||||
* bch2_btree_insert_at - insert one or more keys at iterator positions
|
||||
* @iter: btree iterator
|
||||
* @insert_key: key to insert
|
||||
* @disk_res: disk reservation
|
||||
* @hook: extent insert callback
|
||||
*
|
||||
* Return values:
|
||||
* -EINTR: locking changed, this function should be called again. Only returned
|
||||
* if passed BTREE_INSERT_ATOMIC.
|
||||
* -EROFS: filesystem read only
|
||||
* -EIO: journal or btree node IO error
|
||||
*/
|
||||
#define bch2_btree_insert_at(_c, _disk_res, _hook, \
|
||||
_journal_seq, _flags, ...) \
|
||||
__bch2_btree_insert_at(&(struct btree_insert) { \
|
||||
.c = (_c), \
|
||||
.disk_res = (_disk_res), \
|
||||
.journal_seq = (_journal_seq), \
|
||||
.hook = (_hook), \
|
||||
.flags = (_flags), \
|
||||
.nr = COUNT_ARGS(__VA_ARGS__), \
|
||||
.entries = (struct btree_insert_entry[]) { \
|
||||
__VA_ARGS__ \
|
||||
}})
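/*
 * Example (illustrative sketch): inserting a single key through the wrapper
 * above, with no disk reservation or insert hook; iter is assumed to be
 * positioned at bkey_start_pos(&k->k):
 *
 *	ret = bch2_btree_insert_at(c, NULL, NULL, &journal_seq,
 *				   BTREE_INSERT_NOFAIL,
 *				   BTREE_INSERT_ENTRY(&iter, k));
 */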
|
||||
|
||||
enum {
|
||||
__BTREE_INSERT_ATOMIC,
|
||||
__BTREE_INSERT_NOUNLOCK,
|
||||
__BTREE_INSERT_NOFAIL,
|
||||
__BTREE_INSERT_USE_RESERVE,
|
||||
__BTREE_INSERT_USE_ALLOC_RESERVE,
|
||||
__BTREE_INSERT_JOURNAL_REPLAY,
|
||||
__BTREE_INSERT_NOWAIT,
|
||||
__BTREE_INSERT_GC_LOCK_HELD,
|
||||
__BCH_HASH_SET_MUST_CREATE,
|
||||
__BCH_HASH_SET_MUST_REPLACE,
|
||||
};
|
||||
|
||||
/*
|
||||
* Don't drop/retake locks before doing btree update, instead return -EINTR if
|
||||
* we had to drop locks for any reason
|
||||
*/
|
||||
#define BTREE_INSERT_ATOMIC (1 << __BTREE_INSERT_ATOMIC)
|
||||
|
||||
/*
|
||||
* Don't drop locks _after_ successfully updating btree:
|
||||
*/
|
||||
#define BTREE_INSERT_NOUNLOCK (1 << __BTREE_INSERT_NOUNLOCK)
|
||||
|
||||
/* Don't check for -ENOSPC: */
|
||||
#define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL)
|
||||
|
||||
/* for copygc, or when merging btree nodes */
|
||||
#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE)
|
||||
#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE)
|
||||
|
||||
/*
|
||||
* Insert is for journal replay: don't get journal reservations, or mark extents
|
||||
* (bch2_mark_key)
|
||||
*/
|
||||
#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY)
|
||||
|
||||
/* Don't block on allocation failure (for new btree nodes): */
|
||||
#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT)
|
||||
#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD)
|
||||
|
||||
#define BCH_HASH_SET_MUST_CREATE (1 << __BCH_HASH_SET_MUST_CREATE)
|
||||
#define BCH_HASH_SET_MUST_REPLACE (1 << __BCH_HASH_SET_MUST_REPLACE)
|
||||
|
||||
int bch2_btree_delete_at(struct btree_iter *, unsigned);
|
||||
|
||||
int bch2_btree_insert_list_at(struct btree_iter *, struct keylist *,
|
||||
struct disk_reservation *,
|
||||
struct extent_insert_hook *, u64 *, unsigned);
|
||||
|
||||
int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
|
||||
struct disk_reservation *,
|
||||
struct extent_insert_hook *, u64 *, int flags);
|
||||
|
||||
int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
|
||||
struct bpos, struct bpos, struct bversion,
|
||||
struct disk_reservation *,
|
||||
struct extent_insert_hook *, u64 *);
|
||||
|
||||
int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
|
||||
__le64, unsigned);
|
||||
int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
|
||||
struct btree *, struct bkey_i_extent *);
|
||||
|
||||
/* new transactional interface: */
|
||||
|
||||
void bch2_trans_update(struct btree_trans *, struct btree_iter *,
|
||||
struct bkey_i *, unsigned);
|
||||
int bch2_trans_commit(struct btree_trans *,
|
||||
struct disk_reservation *,
|
||||
struct extent_insert_hook *,
|
||||
u64 *, unsigned);
|
||||
|
||||
#define bch2_trans_do(_c, _journal_seq, _flags, _do) \
|
||||
({ \
|
||||
struct btree_trans trans; \
|
||||
int _ret; \
|
||||
\
|
||||
bch2_trans_init(&trans, (_c)); \
|
||||
\
|
||||
do { \
|
||||
bch2_trans_begin(&trans); \
|
||||
\
|
||||
_ret = (_do) ?: bch2_trans_commit(&trans, NULL, NULL, \
|
||||
(_journal_seq), (_flags)); \
|
||||
} while (_ret == -EINTR); \
|
||||
\
|
||||
bch2_trans_exit(&trans); \
|
||||
_ret; \
|
||||
})
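/*
 * Example (illustrative sketch): bch2_trans_do() wraps the begin/commit/
 * retry-on--EINTR loop around a caller supplied expression that references
 * the local 'trans'; do_one_update() is a placeholder that queues updates
 * with bch2_trans_update():
 *
 *	ret = bch2_trans_do(c, &journal_seq, BTREE_INSERT_ATOMIC,
 *			    do_one_update(&trans, inum));
 */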
|
||||
|
||||
#endif /* _BCACHEFS_BTREE_UPDATE_H */
|
2171
fs/bcachefs/btree_update_interior.c
Normal file
File diff suppressed because it is too large
374
fs/bcachefs/btree_update_interior.h
Normal file
@ -0,0 +1,374 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H
|
||||
#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H
|
||||
|
||||
#include "btree_cache.h"
|
||||
#include "btree_locking.h"
|
||||
#include "btree_update.h"
|
||||
|
||||
struct btree_reserve {
|
||||
struct disk_reservation disk_res;
|
||||
unsigned nr;
|
||||
struct btree *b[BTREE_RESERVE_MAX];
|
||||
};
|
||||
|
||||
void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *);
|
||||
bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *,
|
||||
struct bkey_format *);
|
||||
|
||||
/* Btree node freeing/allocation: */
|
||||
|
||||
/*
|
||||
* Tracks a btree node that has been (or is about to be) freed in memory, but
|
||||
* has _not_ yet been freed on disk (because the write that makes the new
|
||||
* node(s) visible and frees the old hasn't completed yet)
|
||||
*/
|
||||
struct pending_btree_node_free {
|
||||
bool index_update_done;
|
||||
|
||||
__le64 seq;
|
||||
enum btree_id btree_id;
|
||||
unsigned level;
|
||||
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
|
||||
};
|
||||
|
||||
/*
|
||||
* Tracks an in progress split/rewrite of a btree node and the update to the
|
||||
* parent node:
|
||||
*
|
||||
* When we split/rewrite a node, we do all the updates in memory without
|
||||
* waiting for any writes to complete - we allocate the new node(s) and update
|
||||
* the parent node, possibly recursively up to the root.
|
||||
*
|
||||
* The end result is that we have one or more new nodes being written -
|
||||
* possibly several, if there were multiple splits - and then a write (updating
|
||||
* an interior node) which will make all these new nodes visible.
|
||||
*
|
||||
* Additionally, as we split/rewrite nodes we free the old nodes - but the old
|
||||
* nodes can't be freed (their space on disk can't be reclaimed) until the
|
||||
* update to the interior node that makes the new node visible completes -
|
||||
* until then, the old nodes are still reachable on disk.
|
||||
*
|
||||
*/
|
||||
struct btree_update {
|
||||
struct closure cl;
|
||||
struct bch_fs *c;
|
||||
|
||||
struct list_head list;
|
||||
|
||||
/* What kind of update are we doing? */
|
||||
enum {
|
||||
BTREE_INTERIOR_NO_UPDATE,
|
||||
BTREE_INTERIOR_UPDATING_NODE,
|
||||
BTREE_INTERIOR_UPDATING_ROOT,
|
||||
BTREE_INTERIOR_UPDATING_AS,
|
||||
} mode;
|
||||
|
||||
unsigned must_rewrite:1;
|
||||
unsigned nodes_written:1;
|
||||
|
||||
enum btree_id btree_id;
|
||||
|
||||
struct btree_reserve *reserve;
|
||||
|
||||
/*
|
||||
* BTREE_INTERIOR_UPDATING_NODE:
|
||||
* The update that made the new nodes visible was a regular update to an
|
||||
* existing interior node - @b. We can't write out the update to @b
|
||||
* until the new nodes we created are finished writing, so we block @b
|
||||
* from writing by putting this btree_interior update on the
|
||||
* @b->write_blocked list with @write_blocked_list:
|
||||
*/
|
||||
struct btree *b;
|
||||
struct list_head write_blocked_list;
|
||||
|
||||
/*
|
||||
* BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now
|
||||
* we're blocking another btree_update
|
||||
* @parent_as - btree_update that's waiting on our nodes to finish
|
||||
* writing, before it can make new nodes visible on disk
|
||||
* @wait - list of child btree_updates that are waiting on this
|
||||
* btree_update to make all the new nodes visible before they can free
|
||||
* their old btree nodes
|
||||
*/
|
||||
struct btree_update *parent_as;
|
||||
struct closure_waitlist wait;
|
||||
|
||||
/*
|
||||
* We may be freeing nodes that were dirty, and thus had journal entries
|
||||
* pinned: we need to transfer the oldest of those pins to the
|
||||
* btree_update operation, and release it when the new node(s)
|
||||
* are all persistent and reachable:
|
||||
*/
|
||||
struct journal_entry_pin journal;
|
||||
|
||||
u64 journal_seq;
|
||||
|
||||
/*
|
||||
* Nodes being freed:
|
||||
* Protected by c->btree_node_pending_free_lock
|
||||
*/
|
||||
struct pending_btree_node_free pending[BTREE_MAX_DEPTH + GC_MERGE_NODES];
|
||||
unsigned nr_pending;
|
||||
|
||||
/* New nodes, that will be made reachable by this update: */
|
||||
struct btree *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES];
|
||||
unsigned nr_new_nodes;
|
||||
|
||||
/* Only here to reduce stack usage on recursive splits: */
|
||||
struct keylist parent_keys;
|
||||
/*
|
||||
* Enough room for btree_split's keys without realloc - btree node
|
||||
* pointers never have crc/compression info, so we only need to account
|
||||
* for the pointers for three keys
|
||||
*/
|
||||
u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
|
||||
};
|
||||
|
||||
#define for_each_pending_btree_node_free(c, as, p) \
|
||||
list_for_each_entry(as, &c->btree_interior_update_list, list) \
|
||||
for (p = as->pending; p < as->pending + as->nr_pending; p++)
|
||||
|
||||
void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *,
|
||||
struct btree_iter *);
|
||||
void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *);
|
||||
void bch2_btree_open_bucket_put(struct bch_fs *, struct btree *);
|
||||
|
||||
struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
|
||||
struct btree *,
|
||||
struct bkey_format);
|
||||
|
||||
void bch2_btree_update_done(struct btree_update *);
|
||||
struct btree_update *
|
||||
bch2_btree_update_start(struct bch_fs *, enum btree_id, unsigned,
|
||||
unsigned, struct closure *);
|
||||
|
||||
void bch2_btree_interior_update_will_free_node(struct btree_update *,
|
||||
struct btree *);
|
||||
|
||||
void bch2_btree_insert_node(struct btree_update *, struct btree *,
|
||||
struct btree_iter *, struct keylist *,
|
||||
unsigned);
|
||||
int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned);
|
||||
|
||||
void __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *,
|
||||
unsigned, unsigned, enum btree_node_sibling);
|
||||
|
||||
static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c,
|
||||
struct btree_iter *iter,
|
||||
unsigned level, unsigned flags,
|
||||
enum btree_node_sibling sib)
|
||||
{
|
||||
struct btree *b;
|
||||
|
||||
/*
|
||||
* iterators are inconsistent when they hit end of leaf, until
|
||||
* traversed again
|
||||
*
|
||||
* XXX inconsistent how?
|
||||
*/
|
||||
if (iter->flags & BTREE_ITER_AT_END_OF_LEAF)
|
||||
return;
|
||||
|
||||
if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE)
|
||||
return;
|
||||
|
||||
if (!bch2_btree_node_relock(iter, level))
|
||||
return;
|
||||
|
||||
b = iter->l[level].b;
|
||||
if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
|
||||
return;
|
||||
|
||||
__bch2_foreground_maybe_merge(c, iter, level, flags, sib);
|
||||
}
|
||||
|
||||
static inline void bch2_foreground_maybe_merge(struct bch_fs *c,
|
||||
struct btree_iter *iter,
|
||||
unsigned level,
|
||||
unsigned flags)
|
||||
{
|
||||
bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
|
||||
btree_prev_sib);
|
||||
bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
|
||||
btree_next_sib);
|
||||
}
|
||||
|
||||
void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
|
||||
void bch2_btree_root_alloc(struct bch_fs *, enum btree_id);
|
||||
|
||||
static inline unsigned btree_update_reserve_required(struct bch_fs *c,
|
||||
struct btree *b)
|
||||
{
|
||||
unsigned depth = btree_node_root(c, b)->level + 1;
|
||||
|
||||
/*
|
||||
* Number of nodes we might have to allocate in a worst case btree
|
||||
* split operation - we split all the way up to the root, then allocate
|
||||
* a new root, unless we're already at max depth:
|
||||
*/
|
||||
if (depth < BTREE_MAX_DEPTH)
|
||||
return (depth - b->level) * 2 + 1;
|
||||
else
|
||||
return (depth - b->level) * 2 - 1;
|
||||
}
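/*
 * Worked example (sketch): with the root at level 2 (depth == 3) and a
 * split starting from a leaf (b->level == 0), the worst case is
 * (3 - 0) * 2 + 1 = 7 reserved nodes: two at each of levels 0-2 plus a
 * new root.
 */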
|
||||
|
||||
static inline void btree_node_reset_sib_u64s(struct btree *b)
|
||||
{
|
||||
b->sib_u64s[0] = b->nr.live_u64s;
|
||||
b->sib_u64s[1] = b->nr.live_u64s;
|
||||
}
|
||||
|
||||
static inline void *btree_data_end(struct bch_fs *c, struct btree *b)
|
||||
{
|
||||
return (void *) b->data + btree_bytes(c);
|
||||
}
|
||||
|
||||
static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c,
|
||||
struct btree *b)
|
||||
{
|
||||
return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
|
||||
}
|
||||
|
||||
static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c,
|
||||
struct btree *b)
|
||||
{
|
||||
return btree_data_end(c, b);
|
||||
}
|
||||
|
||||
static inline void *write_block(struct btree *b)
|
||||
{
|
||||
return (void *) b->data + (b->written << 9);
|
||||
}
|
||||
|
||||
static inline bool bset_written(struct btree *b, struct bset *i)
|
||||
{
|
||||
return (void *) i < write_block(b);
|
||||
}
|
||||
|
||||
static inline bool bset_unwritten(struct btree *b, struct bset *i)
|
||||
{
|
||||
return (void *) i > write_block(b);
|
||||
}
|
||||
|
||||
static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
|
||||
struct btree *b,
|
||||
void *end)
|
||||
{
|
||||
ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
|
||||
b->whiteout_u64s +
|
||||
b->uncompacted_whiteout_u64s;
|
||||
ssize_t total = c->opts.btree_node_size << 6;
|
||||
|
||||
return total - used;
|
||||
}
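/*
 * Note (sketch, assuming btree_node_size is in 512 byte sectors): the shift
 * above converts sectors to u64s - 512 / sizeof(u64) == 64, i.e. << 6.
 */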
|
||||
|
||||
static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
|
||||
struct btree *b)
|
||||
{
|
||||
ssize_t remaining = __bch_btree_u64s_remaining(c, b,
|
||||
btree_bkey_last(b, bset_tree_last(b)));
|
||||
|
||||
BUG_ON(remaining < 0);
|
||||
|
||||
if (bset_written(b, btree_bset_last(b)))
|
||||
return 0;
|
||||
|
||||
return remaining;
|
||||
}
|
||||
|
||||
static inline unsigned btree_write_set_buffer(struct btree *b)
|
||||
{
|
||||
/*
|
||||
* Could buffer up larger amounts of keys for btrees with larger keys,
|
||||
* pending benchmarking:
|
||||
*/
|
||||
return 4 << 10;
|
||||
}
|
||||
|
||||
static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
|
||||
struct btree *b)
|
||||
{
|
||||
struct bset *i = btree_bset_last(b);
|
||||
struct btree_node_entry *bne = max(write_block(b),
|
||||
(void *) btree_bkey_last(b, bset_tree_last(b)));
|
||||
ssize_t remaining_space =
|
||||
__bch_btree_u64s_remaining(c, b, &bne->keys.start[0]);
|
||||
|
||||
if (unlikely(bset_written(b, i))) {
|
||||
if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
|
||||
return bne;
|
||||
} else {
|
||||
if (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) &&
|
||||
remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3))
|
||||
return bne;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t,
|
||||
struct bkey_packed *k)
|
||||
{
|
||||
if (bset_written(b, bset(b, t))) {
|
||||
EBUG_ON(b->uncompacted_whiteout_u64s <
|
||||
bkeyp_key_u64s(&b->format, k));
|
||||
b->uncompacted_whiteout_u64s -=
|
||||
bkeyp_key_u64s(&b->format, k);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void reserve_whiteout(struct btree *b, struct bset_tree *t,
|
||||
struct bkey_packed *k)
|
||||
{
|
||||
if (bset_written(b, bset(b, t))) {
|
||||
BUG_ON(!k->needs_whiteout);
|
||||
b->uncompacted_whiteout_u64s +=
|
||||
bkeyp_key_u64s(&b->format, k);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* write lock must be held on @b (else the dirty bset that we were going to
|
||||
* insert into could be written out from under us)
|
||||
*/
|
||||
static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
|
||||
struct btree *b, unsigned u64s)
|
||||
{
|
||||
if (unlikely(btree_node_fake(b)))
|
||||
return false;
|
||||
|
||||
if (btree_node_is_extents(b)) {
|
||||
/* The insert key might split an existing key
|
||||
* (bch2_insert_fixup_extent() -> BCH_EXTENT_OVERLAP_MIDDLE case):
|
||||
*/
|
||||
u64s += BKEY_EXTENT_U64s_MAX;
|
||||
}
|
||||
|
||||
return u64s <= bch_btree_keys_u64s_remaining(c, b);
|
||||
}
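/*
 * Example (sketch): on an extents leaf, inserting into the middle of an
 * existing extent splits it in two, so a key of N u64s only "fits" if there
 * is room for N + BKEY_EXTENT_U64s_MAX u64s - which is what the helper
 * above reserves before checking remaining space.
 */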
|
||||
|
||||
static inline bool journal_res_insert_fits(struct btree_insert *trans,
|
||||
struct btree_insert_entry *insert)
|
||||
{
|
||||
unsigned u64s = 0;
|
||||
struct btree_insert_entry *i;
|
||||
|
||||
/*
|
||||
* If we didn't get a journal reservation, we're in journal replay and
|
||||
* we're not journalling updates:
|
||||
*/
|
||||
if (!trans->journal_res.ref)
|
||||
return true;
|
||||
|
||||
for (i = insert; i < trans->entries + trans->nr; i++)
|
||||
u64s += jset_u64s(i->k->k.u64s + i->extra_res);
|
||||
|
||||
return u64s <= trans->journal_res.u64s;
|
||||
}
|
||||
|
||||
ssize_t bch2_btree_updates_print(struct bch_fs *, char *);
|
||||
|
||||
size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *);
|
||||
|
||||
#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */
|
737
fs/bcachefs/btree_update_leaf.c
Normal file
@ -0,0 +1,737 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "btree_update.h"
|
||||
#include "btree_update_interior.h"
|
||||
#include "btree_io.h"
|
||||
#include "btree_iter.h"
|
||||
#include "btree_locking.h"
|
||||
#include "debug.h"
|
||||
#include "extents.h"
|
||||
#include "journal.h"
|
||||
#include "journal_reclaim.h"
|
||||
#include "keylist.h"
|
||||
#include "trace.h"
|
||||
|
||||
#include <linux/sort.h>
|
||||
|
||||
/* Inserting into a given leaf node (last stage of insert): */
|
||||
|
||||
/* Handle overwrites and do insert, for non extents: */
|
||||
bool bch2_btree_bset_insert_key(struct btree_iter *iter,
|
||||
struct btree *b,
|
||||
struct btree_node_iter *node_iter,
|
||||
struct bkey_i *insert)
|
||||
{
|
||||
const struct bkey_format *f = &b->format;
|
||||
struct bkey_packed *k;
|
||||
struct bset_tree *t;
|
||||
unsigned clobber_u64s;
|
||||
|
||||
EBUG_ON(btree_node_just_written(b));
|
||||
EBUG_ON(bset_written(b, btree_bset_last(b)));
|
||||
EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
|
||||
EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 ||
|
||||
bkey_cmp(insert->k.p, b->data->max_key) > 0);
|
||||
|
||||
k = bch2_btree_node_iter_peek_all(node_iter, b);
|
||||
if (k && !bkey_cmp_packed(b, k, &insert->k)) {
|
||||
BUG_ON(bkey_whiteout(k));
|
||||
|
||||
t = bch2_bkey_to_bset(b, k);
|
||||
|
||||
if (bset_unwritten(b, bset(b, t)) &&
|
||||
bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k) &&
|
||||
!bkey_whiteout(&insert->k)) {
|
||||
k->type = insert->k.type;
|
||||
memcpy_u64s(bkeyp_val(f, k), &insert->v,
|
||||
bkey_val_u64s(&insert->k));
|
||||
return true;
|
||||
}
|
||||
|
||||
insert->k.needs_whiteout = k->needs_whiteout;
|
||||
|
||||
btree_keys_account_key_drop(&b->nr, t - b->set, k);
|
||||
|
||||
if (t == bset_tree_last(b)) {
|
||||
clobber_u64s = k->u64s;
|
||||
|
||||
/*
|
||||
* If we're deleting, and the key we're deleting doesn't
|
||||
* need a whiteout (it wasn't overwriting a key that had
|
||||
* been written to disk) - just delete it:
|
||||
*/
|
||||
if (bkey_whiteout(&insert->k) && !k->needs_whiteout) {
|
||||
bch2_bset_delete(b, k, clobber_u64s);
|
||||
bch2_btree_node_iter_fix(iter, b, node_iter, t,
|
||||
k, clobber_u64s, 0);
|
||||
return true;
|
||||
}
|
||||
|
||||
goto overwrite;
|
||||
}
|
||||
|
||||
k->type = KEY_TYPE_DELETED;
|
||||
bch2_btree_node_iter_fix(iter, b, node_iter, t, k,
|
||||
k->u64s, k->u64s);
|
||||
|
||||
if (bkey_whiteout(&insert->k)) {
|
||||
reserve_whiteout(b, t, k);
|
||||
return true;
|
||||
} else {
|
||||
k->needs_whiteout = false;
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* Deleting, but the key to delete wasn't found - nothing to do:
|
||||
*/
|
||||
if (bkey_whiteout(&insert->k))
|
||||
return false;
|
||||
|
||||
insert->k.needs_whiteout = false;
|
||||
}
|
||||
|
||||
t = bset_tree_last(b);
|
||||
k = bch2_btree_node_iter_bset_pos(node_iter, b, t);
|
||||
clobber_u64s = 0;
|
||||
overwrite:
|
||||
bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
|
||||
if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k))
|
||||
bch2_btree_node_iter_fix(iter, b, node_iter, t, k,
|
||||
clobber_u64s, k->u64s);
|
||||
return true;
|
||||
}
|
||||
|
||||
static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
|
||||
unsigned i, u64 seq)
|
||||
{
|
||||
struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
||||
struct btree_write *w = container_of(pin, struct btree_write, journal);
|
||||
struct btree *b = container_of(w, struct btree, writes[i]);
|
||||
|
||||
btree_node_lock_type(c, b, SIX_LOCK_read);
|
||||
bch2_btree_node_write_cond(c, b,
|
||||
(btree_current_write(b) == w &&
|
||||
w->journal.pin_list == journal_seq_pin(j, seq)));
|
||||
six_unlock_read(&b->lock);
|
||||
}
|
||||
|
||||
static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
|
||||
{
|
||||
return __btree_node_flush(j, pin, 0, seq);
|
||||
}
|
||||
|
||||
static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
|
||||
{
|
||||
return __btree_node_flush(j, pin, 1, seq);
|
||||
}
|
||||
|
||||
void bch2_btree_journal_key(struct btree_insert *trans,
|
||||
struct btree_iter *iter,
|
||||
struct bkey_i *insert)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct journal *j = &c->journal;
|
||||
struct btree *b = iter->l[0].b;
|
||||
struct btree_write *w = btree_current_write(b);
|
||||
|
||||
EBUG_ON(iter->level || b->level);
|
||||
EBUG_ON(trans->journal_res.ref !=
|
||||
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
|
||||
|
||||
if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
|
||||
u64 seq = trans->journal_res.seq;
|
||||
bool needs_whiteout = insert->k.needs_whiteout;
|
||||
|
||||
/* ick */
|
||||
insert->k.needs_whiteout = false;
|
||||
bch2_journal_add_keys(j, &trans->journal_res,
|
||||
iter->btree_id, insert);
|
||||
insert->k.needs_whiteout = needs_whiteout;
|
||||
|
||||
bch2_journal_set_has_inode(j, &trans->journal_res,
|
||||
insert->k.p.inode);
|
||||
|
||||
if (trans->journal_seq)
|
||||
*trans->journal_seq = seq;
|
||||
btree_bset_last(b)->journal_seq = cpu_to_le64(seq);
|
||||
}
|
||||
|
||||
if (unlikely(!journal_pin_active(&w->journal))) {
|
||||
u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
|
||||
? trans->journal_res.seq
|
||||
: j->replay_journal_seq;
|
||||
|
||||
bch2_journal_pin_add(j, seq, &w->journal,
|
||||
btree_node_write_idx(b) == 0
|
||||
? btree_node_flush0
|
||||
: btree_node_flush1);
|
||||
}
|
||||
|
||||
if (unlikely(!btree_node_dirty(b)))
|
||||
set_btree_node_dirty(b);
|
||||
}
|
||||
|
||||
static enum btree_insert_ret
|
||||
bch2_insert_fixup_key(struct btree_insert *trans,
|
||||
struct btree_insert_entry *insert)
|
||||
{
|
||||
struct btree_iter *iter = insert->iter;
|
||||
struct btree_iter_level *l = &iter->l[0];
|
||||
|
||||
EBUG_ON(iter->level);
|
||||
EBUG_ON(insert->k->k.u64s >
|
||||
bch_btree_keys_u64s_remaining(trans->c, l->b));
|
||||
|
||||
if (bch2_btree_bset_insert_key(iter, l->b, &l->iter,
|
||||
insert->k))
|
||||
bch2_btree_journal_key(trans, iter, insert->k);
|
||||
|
||||
trans->did_work = true;
|
||||
return BTREE_INSERT_OK;
|
||||
}
|
||||
|
||||
/**
|
||||
* btree_insert_key_leaf - insert a single key into a leaf node
|
||||
*/
|
||||
static enum btree_insert_ret
|
||||
btree_insert_key_leaf(struct btree_insert *trans,
|
||||
struct btree_insert_entry *insert)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct btree_iter *iter = insert->iter;
|
||||
struct btree *b = iter->l[0].b;
|
||||
enum btree_insert_ret ret;
|
||||
int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
|
||||
int old_live_u64s = b->nr.live_u64s;
|
||||
int live_u64s_added, u64s_added;
|
||||
|
||||
ret = !btree_node_is_extents(b)
|
||||
? bch2_insert_fixup_key(trans, insert)
|
||||
: bch2_insert_fixup_extent(trans, insert);
|
||||
|
||||
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
|
||||
u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
|
||||
|
||||
if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
|
||||
b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
|
||||
if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
|
||||
b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
|
||||
|
||||
if (u64s_added > live_u64s_added &&
|
||||
bch2_maybe_compact_whiteouts(c, b))
|
||||
bch2_btree_iter_reinit_node(iter, b);
|
||||
|
||||
trace_btree_insert_key(c, b, insert->k);
|
||||
return ret;
|
||||
}
|
||||
|
||||
#define trans_for_each_entry(trans, i) \
|
||||
for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++)
|
||||
|
||||
/*
|
||||
* We sort transaction entries so that if multiple iterators point to the same
|
||||
* leaf node they'll be adjacent:
|
||||
*/
|
||||
static bool same_leaf_as_prev(struct btree_insert *trans,
|
||||
struct btree_insert_entry *i)
|
||||
{
|
||||
return i != trans->entries &&
|
||||
i[0].iter->l[0].b == i[-1].iter->l[0].b;
|
||||
}
|
||||
|
||||
static inline struct btree_insert_entry *trans_next_leaf(struct btree_insert *trans,
|
||||
struct btree_insert_entry *i)
|
||||
{
|
||||
struct btree *b = i->iter->l[0].b;
|
||||
|
||||
do {
|
||||
i++;
|
||||
} while (i < trans->entries + trans->nr && b == i->iter->l[0].b);
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
#define trans_for_each_leaf(trans, i) \
|
||||
for ((i) = (trans)->entries; \
|
||||
(i) < (trans)->entries + (trans)->nr; \
|
||||
(i) = trans_next_leaf(trans, i))
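/*
 * Example (sketch): after the bubble_sort() in __bch2_btree_insert_at(),
 * entries hitting the same leaf are adjacent - for updates A, B on leaf X
 * and C on leaf Y, trans_for_each_entry() visits A, B, C while
 * trans_for_each_leaf() visits only A and C, one entry per distinct leaf.
 */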
|
||||
|
||||
inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
|
||||
struct btree_iter *iter)
|
||||
{
|
||||
bch2_btree_node_lock_write(b, iter);
|
||||
|
||||
if (btree_node_just_written(b) &&
|
||||
bch2_btree_post_write_cleanup(c, b))
|
||||
bch2_btree_iter_reinit_node(iter, b);
|
||||
|
||||
/*
|
||||
* If the last bset has been written, or if it's gotten too big - start
|
||||
* a new bset to insert into:
|
||||
*/
|
||||
if (want_new_bset(c, b))
|
||||
bch2_btree_init_next(c, b, iter);
|
||||
}
|
||||
|
||||
static void multi_lock_write(struct bch_fs *c, struct btree_insert *trans)
|
||||
{
|
||||
struct btree_insert_entry *i;
|
||||
|
||||
trans_for_each_leaf(trans, i)
|
||||
bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter);
|
||||
}
|
||||
|
||||
static void multi_unlock_write(struct btree_insert *trans)
|
||||
{
|
||||
struct btree_insert_entry *i;
|
||||
|
||||
trans_for_each_leaf(trans, i)
|
||||
bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter);
|
||||
}
|
||||
|
||||
static inline int btree_trans_cmp(struct btree_insert_entry l,
|
||||
struct btree_insert_entry r)
|
||||
{
|
||||
return btree_iter_cmp(l.iter, r.iter);
|
||||
}
|
||||
|
||||
/* Normal update interface: */
|
||||
|
||||
/*
|
||||
* Get journal reservation, take write locks, and attempt to do btree update(s):
|
||||
*/
|
||||
static inline int do_btree_insert_at(struct btree_insert *trans,
|
||||
struct btree_iter **split,
|
||||
bool *cycle_gc_lock)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct btree_insert_entry *i;
|
||||
unsigned u64s;
|
||||
int ret;
|
||||
|
||||
trans_for_each_entry(trans, i) {
|
||||
BUG_ON(i->done);
|
||||
BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK);
|
||||
}
|
||||
|
||||
u64s = 0;
|
||||
trans_for_each_entry(trans, i)
|
||||
u64s += jset_u64s(i->k->k.u64s + i->extra_res);
|
||||
|
||||
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
|
||||
|
||||
ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)
|
||||
? bch2_journal_res_get(&c->journal,
|
||||
&trans->journal_res,
|
||||
u64s, u64s)
|
||||
: 0;
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
multi_lock_write(c, trans);
|
||||
|
||||
if (race_fault()) {
|
||||
ret = -EINTR;
|
||||
goto out;
|
||||
}
|
||||
|
||||
u64s = 0;
|
||||
trans_for_each_entry(trans, i) {
|
||||
/* Multiple inserts might go to the same leaf: */
|
||||
if (!same_leaf_as_prev(trans, i))
|
||||
u64s = 0;
|
||||
|
||||
/*
|
||||
* bch2_btree_node_insert_fits() must be called under write lock:
|
||||
* with only an intent lock, another thread can still call
|
||||
* bch2_btree_node_write(), converting an unwritten bset to a
|
||||
* written one
|
||||
*/
|
||||
u64s += i->k->k.u64s + i->extra_res;
|
||||
if (!bch2_btree_node_insert_fits(c,
|
||||
i->iter->l[0].b, u64s)) {
|
||||
ret = -EINTR;
|
||||
*split = i->iter;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
|
||||
if (journal_seq_verify(c))
|
||||
trans_for_each_entry(trans, i)
|
||||
i->k->k.version.lo = trans->journal_res.seq;
|
||||
else if (inject_invalid_keys(c))
|
||||
trans_for_each_entry(trans, i)
|
||||
i->k->k.version = MAX_VERSION;
|
||||
}
|
||||
|
||||
trans_for_each_entry(trans, i) {
|
||||
switch (btree_insert_key_leaf(trans, i)) {
|
||||
case BTREE_INSERT_OK:
|
||||
i->done = true;
|
||||
break;
|
||||
case BTREE_INSERT_JOURNAL_RES_FULL:
|
||||
case BTREE_INSERT_NEED_TRAVERSE:
|
||||
case BTREE_INSERT_NEED_RESCHED:
|
||||
ret = -EINTR;
|
||||
break;
|
||||
case BTREE_INSERT_BTREE_NODE_FULL:
|
||||
ret = -EINTR;
|
||||
*split = i->iter;
|
||||
break;
|
||||
case BTREE_INSERT_ENOSPC:
|
||||
ret = -ENOSPC;
|
||||
break;
|
||||
case BTREE_INSERT_NEED_GC_LOCK:
|
||||
ret = -EINTR;
|
||||
*cycle_gc_lock = true;
|
||||
break;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
|
||||
/*
|
||||
* If we did some work (i.e. inserted part of an extent),
|
||||
* we have to do all the other updates as well:
|
||||
*/
|
||||
if (!trans->did_work && (ret || *split))
|
||||
break;
|
||||
}
|
||||
out:
|
||||
multi_unlock_write(trans);
|
||||
bch2_journal_res_put(&c->journal, &trans->journal_res);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void btree_insert_entry_checks(struct bch_fs *c,
|
||||
struct btree_insert_entry *i)
|
||||
{
|
||||
BUG_ON(i->iter->level);
|
||||
BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
|
||||
BUG_ON(debug_check_bkeys(c) &&
|
||||
!bkey_deleted(&i->k->k) &&
|
||||
bch2_bkey_invalid(c, (enum bkey_type) i->iter->btree_id,
|
||||
bkey_i_to_s_c(i->k)));
|
||||
}
|
||||
|
||||
/**
|
||||
* __bch2_btree_insert_at - insert keys at given iterator positions
*
* This is the main entry point for btree updates.
|
||||
*
|
||||
* Return values:
|
||||
* -EINTR: locking changed, this function should be called again. Only returned
|
||||
* if passed BTREE_INSERT_ATOMIC.
|
||||
* -EROFS: filesystem read only
|
||||
* -EIO: journal or btree node IO error
|
||||
*/
|
||||
int __bch2_btree_insert_at(struct btree_insert *trans)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct btree_insert_entry *i;
|
||||
struct btree_iter *linked, *split = NULL;
|
||||
bool cycle_gc_lock = false;
|
||||
unsigned flags;
|
||||
int ret;
|
||||
|
||||
BUG_ON(!trans->nr);
|
||||
|
||||
for_each_btree_iter(trans->entries[0].iter, linked)
|
||||
bch2_btree_iter_verify_locks(linked);
|
||||
|
||||
/* for the sake of sanity: */
|
||||
BUG_ON(trans->nr > 1 && !(trans->flags & BTREE_INSERT_ATOMIC));
|
||||
|
||||
trans_for_each_entry(trans, i)
|
||||
btree_insert_entry_checks(c, i);
|
||||
|
||||
bubble_sort(trans->entries, trans->nr, btree_trans_cmp);
|
||||
|
||||
if (unlikely(!percpu_ref_tryget(&c->writes)))
|
||||
return -EROFS;
|
||||
retry:
|
||||
split = NULL;
|
||||
cycle_gc_lock = false;
|
||||
|
||||
trans_for_each_entry(trans, i) {
|
||||
if (!bch2_btree_iter_upgrade(i->iter, 1, true)) {
|
||||
ret = -EINTR;
|
||||
goto err;
|
||||
}
|
||||
|
||||
if (i->iter->flags & BTREE_ITER_ERROR) {
|
||||
ret = -EIO;
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
|
||||
ret = do_btree_insert_at(trans, &split, &cycle_gc_lock);
|
||||
if (unlikely(ret))
|
||||
goto err;
|
||||
|
||||
trans_for_each_leaf(trans, i)
|
||||
bch2_foreground_maybe_merge(c, i->iter, 0, trans->flags);
|
||||
|
||||
trans_for_each_entry(trans, i)
|
||||
bch2_btree_iter_downgrade(i->iter);
|
||||
out:
|
||||
percpu_ref_put(&c->writes);
|
||||
|
||||
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
|
||||
/* make sure we didn't drop or screw up locks: */
|
||||
for_each_btree_iter(trans->entries[0].iter, linked) {
|
||||
bch2_btree_iter_verify_locks(linked);
|
||||
BUG_ON((trans->flags & BTREE_INSERT_NOUNLOCK) &&
|
||||
trans->did_work &&
|
||||
linked->uptodate >= BTREE_ITER_NEED_RELOCK);
|
||||
}
|
||||
|
||||
/* make sure we didn't lose an error: */
|
||||
if (!ret)
|
||||
trans_for_each_entry(trans, i)
|
||||
BUG_ON(!i->done);
|
||||
}
|
||||
|
||||
BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR);
|
||||
|
||||
return ret;
|
||||
err:
|
||||
flags = trans->flags;
|
||||
|
||||
/*
|
||||
* BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree
|
||||
* update; if we haven't done anything yet it doesn't apply
|
||||
*/
|
||||
if (!trans->did_work)
|
||||
flags &= ~BTREE_INSERT_NOUNLOCK;
|
||||
|
||||
if (split) {
|
||||
ret = bch2_btree_split_leaf(c, split, flags);
|
||||
|
||||
/*
|
||||
* if the split succeeded without dropping locks the insert will
|
||||
* still be atomic (in the BTREE_INSERT_ATOMIC sense, what the
|
||||
* caller peeked() and is overwriting won't have changed)
|
||||
*/
|
||||
#if 0
|
||||
/*
|
||||
* XXX:
|
||||
* split -> btree node merging (of parent node) might still drop
|
||||
* locks when we're not passing it BTREE_INSERT_NOUNLOCK
|
||||
*/
|
||||
if (!ret && !trans->did_work)
|
||||
goto retry;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* don't care if we got ENOSPC because we told split it
|
||||
* couldn't block:
|
||||
*/
|
||||
if (!ret || (flags & BTREE_INSERT_NOUNLOCK))
|
||||
ret = -EINTR;
|
||||
}
|
||||
|
||||
if (cycle_gc_lock) {
|
||||
if (!down_read_trylock(&c->gc_lock)) {
|
||||
if (flags & BTREE_INSERT_NOUNLOCK)
|
||||
goto out;
|
||||
|
||||
bch2_btree_iter_unlock(trans->entries[0].iter);
|
||||
down_read(&c->gc_lock);
|
||||
}
|
||||
up_read(&c->gc_lock);
|
||||
}
|
||||
|
||||
if (ret == -EINTR) {
|
||||
if (flags & BTREE_INSERT_NOUNLOCK)
|
||||
goto out;
|
||||
|
||||
trans_for_each_entry(trans, i) {
|
||||
int ret2 = bch2_btree_iter_traverse(i->iter);
|
||||
if (ret2) {
|
||||
ret = ret2;
|
||||
goto out;
|
||||
}
|
||||
|
||||
BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK);
|
||||
}
|
||||
|
||||
/*
|
||||
* BTREE_INSERT_ATOMIC means we have to return -EINTR if we
|
||||
* dropped locks:
|
||||
*/
|
||||
if (!(flags & BTREE_INSERT_ATOMIC))
|
||||
goto retry;
|
||||
}
|
||||
|
||||
goto out;
|
||||
}
|
||||
|
||||
void bch2_trans_update(struct btree_trans *trans,
|
||||
struct btree_iter *iter,
|
||||
struct bkey_i *k,
|
||||
unsigned extra_journal_res)
|
||||
{
|
||||
struct btree_insert_entry *i;
|
||||
|
||||
BUG_ON(trans->nr_updates >= ARRAY_SIZE(trans->updates));
|
||||
|
||||
i = &trans->updates[trans->nr_updates++];
|
||||
|
||||
*i = (struct btree_insert_entry) {
|
||||
.iter = iter,
|
||||
.k = k,
|
||||
.extra_res = extra_journal_res,
|
||||
};
|
||||
|
||||
btree_insert_entry_checks(trans->c, i);
|
||||
}
|
||||
|
||||
int bch2_trans_commit(struct btree_trans *trans,
|
||||
struct disk_reservation *disk_res,
|
||||
struct extent_insert_hook *hook,
|
||||
u64 *journal_seq,
|
||||
unsigned flags)
|
||||
{
|
||||
struct btree_insert insert = {
|
||||
.c = trans->c,
|
||||
.disk_res = disk_res,
|
||||
.journal_seq = journal_seq,
|
||||
.flags = flags,
|
||||
.nr = trans->nr_updates,
|
||||
.entries = trans->updates,
|
||||
};
|
||||
|
||||
if (!trans->nr_updates)
|
||||
return 0;
|
||||
|
||||
trans->nr_updates = 0;
|
||||
|
||||
return __bch2_btree_insert_at(&insert);
|
||||
}
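/*
 * Illustrative sketch (not part of the original commit): the intended calling
 * pattern for the transaction interface above - queue one or more updates
 * with bch2_trans_update(), then flush them with bch2_trans_commit().
 * Assumes @trans and @iter were initialized by the caller.
 */
#if 0
static int example_trans_insert(struct btree_trans *trans,
				struct btree_iter *iter,
				struct bkey_i *k)
{
	/* no extra journal reservation beyond the key itself: */
	bch2_trans_update(trans, iter, k, 0);

	/* no disk reservation, insert hook or journal_seq in this sketch: */
	return bch2_trans_commit(trans, NULL, NULL, NULL,
				 BTREE_INSERT_NOFAIL);
}
#endif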
|
||||
|
||||
int bch2_btree_delete_at(struct btree_iter *iter, unsigned flags)
|
||||
{
|
||||
struct bkey_i k;
|
||||
|
||||
bkey_init(&k.k);
|
||||
k.k.p = iter->pos;
|
||||
|
||||
return bch2_btree_insert_at(iter->c, NULL, NULL, NULL,
|
||||
BTREE_INSERT_NOFAIL|
|
||||
BTREE_INSERT_USE_RESERVE|flags,
|
||||
BTREE_INSERT_ENTRY(iter, &k));
|
||||
}
|
||||
|
||||
int bch2_btree_insert_list_at(struct btree_iter *iter,
|
||||
struct keylist *keys,
|
||||
struct disk_reservation *disk_res,
|
||||
struct extent_insert_hook *hook,
|
||||
u64 *journal_seq, unsigned flags)
|
||||
{
|
||||
BUG_ON(flags & BTREE_INSERT_ATOMIC);
|
||||
BUG_ON(bch2_keylist_empty(keys));
|
||||
bch2_verify_keylist_sorted(keys);
|
||||
|
||||
while (!bch2_keylist_empty(keys)) {
|
||||
int ret = bch2_btree_insert_at(iter->c, disk_res, hook,
|
||||
journal_seq, flags,
|
||||
BTREE_INSERT_ENTRY(iter, bch2_keylist_front(keys)));
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
bch2_keylist_pop_front(keys);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* bch2_btree_insert - insert a single key into a btree
|
||||
* @c: pointer to struct bch_fs
|
||||
* @id: btree to insert into
|
||||
* @k: key to insert
|
||||
* @hook: insert callback
|
||||
*/
|
||||
int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
|
||||
struct bkey_i *k,
|
||||
struct disk_reservation *disk_res,
|
||||
struct extent_insert_hook *hook,
|
||||
u64 *journal_seq, int flags)
|
||||
{
|
||||
struct btree_iter iter;
|
||||
int ret;
|
||||
|
||||
bch2_btree_iter_init(&iter, c, id, bkey_start_pos(&k->k),
|
||||
BTREE_ITER_INTENT);
|
||||
ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, flags,
|
||||
BTREE_INSERT_ENTRY(&iter, k));
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* bch2_btree_delete_range - delete everything within a given range
|
||||
*
|
||||
* Range is a half open interval - [start, end)
|
||||
*/
|
||||
int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
|
||||
struct bpos start,
|
||||
struct bpos end,
|
||||
struct bversion version,
|
||||
struct disk_reservation *disk_res,
|
||||
struct extent_insert_hook *hook,
|
||||
u64 *journal_seq)
|
||||
{
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
int ret = 0;
|
||||
|
||||
bch2_btree_iter_init(&iter, c, id, start,
|
||||
BTREE_ITER_INTENT);
|
||||
|
||||
while ((k = bch2_btree_iter_peek(&iter)).k &&
|
||||
!(ret = btree_iter_err(k))) {
|
||||
unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
|
||||
/* really shouldn't be using a bare, unpadded bkey_i */
|
||||
struct bkey_i delete;
|
||||
|
||||
if (bkey_cmp(iter.pos, end) >= 0)
|
||||
break;
|
||||
|
||||
bkey_init(&delete.k);
|
||||
|
||||
/*
|
||||
* For extents, iter.pos won't necessarily be the same as
|
||||
* bkey_start_pos(k.k) (for non extents they always will be the
|
||||
* same). It's important that we delete starting from iter.pos
|
||||
* because the range we want to delete could start in the middle
|
||||
* of k.
|
||||
*
|
||||
* (bch2_btree_iter_peek() does guarantee that iter.pos >=
|
||||
* bkey_start_pos(k.k)).
|
||||
*/
|
||||
delete.k.p = iter.pos;
|
||||
delete.k.version = version;
|
||||
|
||||
if (iter.flags & BTREE_ITER_IS_EXTENTS) {
|
||||
/* create the biggest key we can */
|
||||
bch2_key_resize(&delete.k, max_sectors);
|
||||
bch2_cut_back(end, &delete.k);
|
||||
}
|
||||
|
||||
ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq,
|
||||
BTREE_INSERT_NOFAIL,
|
||||
BTREE_INSERT_ENTRY(&iter, &delete));
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
bch2_btree_iter_cond_resched(&iter);
|
||||
}
|
||||
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
return ret;
|
||||
}
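/*
 * Illustrative sketch (not part of the original commit): deleting every key
 * in [start, end) in the extents btree, with no disk reservation, insert
 * hook or journal_seq. BTREE_ID_EXTENTS and ZERO_VERSION are assumed to be
 * the usual definitions from the bcachefs headers.
 */
#if 0
static int example_delete_range(struct bch_fs *c,
				struct bpos start, struct bpos end)
{
	return bch2_btree_delete_range(c, BTREE_ID_EXTENTS, start, end,
				       ZERO_VERSION, NULL, NULL, NULL);
}
#endif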
|
975
fs/bcachefs/buckets.c
Normal file
@@ -0,0 +1,975 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Code for manipulating bucket marks for garbage collection.
|
||||
*
|
||||
* Copyright 2014 Datera, Inc.
|
||||
*
|
||||
* Bucket states:
|
||||
* - free bucket: mark == 0
|
||||
* The bucket contains no data and will not be read
|
||||
*
|
||||
* - allocator bucket: owned_by_allocator == 1
|
||||
* The bucket is on a free list, or it is an open bucket
|
||||
*
|
||||
* - cached bucket: owned_by_allocator == 0 &&
|
||||
* dirty_sectors == 0 &&
|
||||
* cached_sectors > 0
|
||||
* The bucket contains data but may be safely discarded as there are
|
||||
* enough replicas of the data on other cache devices, or it has been
|
||||
* written back to the backing device
|
||||
*
|
||||
* - dirty bucket: owned_by_allocator == 0 &&
|
||||
* dirty_sectors > 0
|
||||
* The bucket contains data that we must not discard (either the only copy,
|
||||
* or one of the 'main copies' for data requiring multiple replicas)
|
||||
*
|
||||
* - metadata bucket: owned_by_allocator == 0 && is_metadata == 1
|
||||
* This is a btree node, journal or gen/prio bucket
|
||||
*
|
||||
* Lifecycle:
|
||||
*
|
||||
* bucket invalidated => bucket on freelist => open bucket =>
|
||||
* [dirty bucket =>] cached bucket => bucket invalidated => ...
|
||||
*
|
||||
* Note that cache promotion can skip the dirty bucket step, as data
|
||||
* is copied from a deeper tier to a shallower tier, onto a cached
|
||||
* bucket.
|
||||
* Note also that a cached bucket can spontaneously become dirty --
|
||||
* see below.
|
||||
*
|
||||
* Only a traversal of the key space can determine whether a bucket is
|
||||
* truly dirty or cached.
|
||||
*
|
||||
* Transitions:
|
||||
*
|
||||
* - free => allocator: bucket was invalidated
|
||||
* - cached => allocator: bucket was invalidated
|
||||
*
|
||||
* - allocator => dirty: open bucket was filled up
|
||||
* - allocator => cached: open bucket was filled up
|
||||
* - allocator => metadata: metadata was allocated
|
||||
*
|
||||
* - dirty => cached: dirty sectors were copied to a deeper tier
|
||||
* - dirty => free: dirty sectors were overwritten or moved (copy gc)
|
||||
* - cached => free: cached sectors were overwritten
|
||||
*
|
||||
* - metadata => free: metadata was freed
|
||||
*
|
||||
* Oddities:
|
||||
* - cached => dirty: a device was removed so formerly replicated data
|
||||
* is no longer sufficiently replicated
|
||||
* - free => cached: cannot happen
|
||||
* - free => dirty: cannot happen
|
||||
* - free => metadata: cannot happen
|
||||
*/
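/*
 * Illustrative sketch (not part of the original commit) of the bucket state
 * classification described above, expressed in terms of struct bucket_mark;
 * the metadata check against BCH_DATA_USER is an assumption for the sketch.
 */
#if 0
static const char *example_bucket_state(struct bucket_mark m)
{
	if (m.owned_by_allocator)
		return "allocator";
	if (m.data_type && m.data_type != BCH_DATA_USER)
		return "metadata";
	if (m.dirty_sectors)
		return "dirty";
	if (m.cached_sectors)
		return "cached";
	return "free";
}
#endif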
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "alloc.h"
|
||||
#include "btree_gc.h"
|
||||
#include "buckets.h"
|
||||
#include "error.h"
|
||||
#include "movinggc.h"
|
||||
#include "trace.h"
|
||||
|
||||
#include <linux/preempt.h>
|
||||
|
||||
#ifdef DEBUG_BUCKETS
|
||||
|
||||
#define lg_local_lock lg_global_lock
|
||||
#define lg_local_unlock lg_global_unlock
|
||||
|
||||
static void bch2_fs_stats_verify(struct bch_fs *c)
|
||||
{
|
||||
struct bch_fs_usage stats =
|
||||
__bch2_fs_usage_read(c);
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(stats.s); i++) {
|
||||
if ((s64) stats.s[i].data[S_META] < 0)
|
||||
panic("replicas %u meta underflow: %lli\n",
|
||||
i + 1, stats.s[i].data[S_META]);
|
||||
|
||||
if ((s64) stats.s[i].data[S_DIRTY] < 0)
|
||||
panic("replicas %u dirty underflow: %lli\n",
|
||||
i + 1, stats.s[i].data[S_DIRTY]);
|
||||
|
||||
if ((s64) stats.s[i].persistent_reserved < 0)
|
||||
panic("replicas %u reserved underflow: %lli\n",
|
||||
i + 1, stats.s[i].persistent_reserved);
|
||||
}
|
||||
|
||||
if ((s64) stats.online_reserved < 0)
|
||||
panic("sectors_online_reserved underflow: %lli\n",
|
||||
stats.online_reserved);
|
||||
}
|
||||
|
||||
static void bch2_dev_stats_verify(struct bch_dev *ca)
|
||||
{
|
||||
struct bch_dev_usage stats =
|
||||
__bch2_dev_usage_read(ca);
|
||||
u64 n = ca->mi.nbuckets - ca->mi.first_bucket;
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(stats.buckets); i++)
|
||||
BUG_ON(stats.buckets[i] > n);
|
||||
BUG_ON(stats.buckets_alloc > n);
|
||||
BUG_ON(stats.buckets_unavailable > n);
|
||||
}
|
||||
|
||||
static void bch2_disk_reservations_verify(struct bch_fs *c, int flags)
|
||||
{
|
||||
if (!(flags & BCH_DISK_RESERVATION_NOFAIL)) {
|
||||
u64 used = __bch2_fs_sectors_used(c);
|
||||
u64 cached = 0;
|
||||
u64 avail = atomic64_read(&c->sectors_available);
|
||||
int cpu;
|
||||
|
||||
for_each_possible_cpu(cpu)
|
||||
cached += per_cpu_ptr(c->usage_percpu, cpu)->available_cache;
|
||||
|
||||
if (used + avail + cached > c->capacity)
|
||||
panic("used %llu avail %llu cached %llu capacity %llu\n",
|
||||
used, avail, cached, c->capacity);
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static void bch2_fs_stats_verify(struct bch_fs *c) {}
|
||||
static void bch2_dev_stats_verify(struct bch_dev *ca) {}
|
||||
static void bch2_disk_reservations_verify(struct bch_fs *c, int flags) {}
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Clear journal_seq_valid for buckets for which it's not needed, to prevent
|
||||
* wraparound:
|
||||
*/
|
||||
void bch2_bucket_seq_cleanup(struct bch_fs *c)
|
||||
{
|
||||
u16 last_seq_ondisk = c->journal.last_seq_ondisk;
|
||||
struct bch_dev *ca;
|
||||
struct bucket_array *buckets;
|
||||
struct bucket *g;
|
||||
struct bucket_mark m;
|
||||
unsigned i;
|
||||
|
||||
for_each_member_device(ca, c, i) {
|
||||
down_read(&ca->bucket_lock);
|
||||
buckets = bucket_array(ca);
|
||||
|
||||
for_each_bucket(g, buckets) {
|
||||
bucket_cmpxchg(g, m, ({
|
||||
if (!m.journal_seq_valid ||
|
||||
bucket_needs_journal_commit(m, last_seq_ondisk))
|
||||
break;
|
||||
|
||||
m.journal_seq_valid = 0;
|
||||
}));
|
||||
}
|
||||
up_read(&ca->bucket_lock);
|
||||
}
|
||||
}
|
||||
|
||||
#define bch2_usage_add(_acc, _stats) \
|
||||
do { \
|
||||
typeof(_acc) _a = (_acc), _s = (_stats); \
|
||||
unsigned i; \
|
||||
\
|
||||
for (i = 0; i < sizeof(*_a) / sizeof(u64); i++) \
|
||||
((u64 *) (_a))[i] += ((u64 *) (_s))[i]; \
|
||||
} while (0)
|
||||
|
||||
#define bch2_usage_read_raw(_stats) \
|
||||
({ \
|
||||
typeof(*this_cpu_ptr(_stats)) _acc; \
|
||||
int cpu; \
|
||||
\
|
||||
memset(&_acc, 0, sizeof(_acc)); \
|
||||
\
|
||||
for_each_possible_cpu(cpu) \
|
||||
bch2_usage_add(&_acc, per_cpu_ptr((_stats), cpu)); \
|
||||
\
|
||||
_acc; \
|
||||
})
|
||||
|
||||
#define bch2_usage_read_cached(_c, _cached, _uncached) \
|
||||
({ \
|
||||
typeof(_cached) _ret; \
|
||||
unsigned _seq; \
|
||||
\
|
||||
do { \
|
||||
_seq = read_seqcount_begin(&(_c)->gc_pos_lock); \
|
||||
_ret = (_c)->gc_pos.phase == GC_PHASE_DONE \
|
||||
? bch2_usage_read_raw(_uncached) \
|
||||
: (_cached); \
|
||||
} while (read_seqcount_retry(&(_c)->gc_pos_lock, _seq)); \
|
||||
\
|
||||
_ret; \
|
||||
})
|
||||
|
||||
struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca)
|
||||
{
|
||||
return bch2_usage_read_raw(ca->usage_percpu);
|
||||
}
|
||||
|
||||
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
return bch2_usage_read_cached(c, ca->usage_cached, ca->usage_percpu);
|
||||
}
|
||||
|
||||
struct bch_fs_usage
|
||||
__bch2_fs_usage_read(struct bch_fs *c)
|
||||
{
|
||||
return bch2_usage_read_raw(c->usage_percpu);
|
||||
}
|
||||
|
||||
struct bch_fs_usage
|
||||
bch2_fs_usage_read(struct bch_fs *c)
|
||||
{
|
||||
return bch2_usage_read_cached(c,
|
||||
c->usage_cached,
|
||||
c->usage_percpu);
|
||||
}
|
||||
|
||||
struct fs_usage_sum {
|
||||
u64 data;
|
||||
u64 reserved;
|
||||
};
|
||||
|
||||
static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats)
|
||||
{
|
||||
struct fs_usage_sum sum = { 0 };
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(stats.s); i++) {
|
||||
sum.data += (stats.s[i].data[S_META] +
|
||||
stats.s[i].data[S_DIRTY]) * (i + 1);
|
||||
sum.reserved += stats.s[i].persistent_reserved * (i + 1);
|
||||
}
|
||||
|
||||
sum.reserved += stats.online_reserved;
|
||||
return sum;
|
||||
}
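/*
 * Worked example (illustrative, not in the original): with
 * s[0].data[S_DIRTY] == 100 and s[1].data[S_DIRTY] == 50, i.e. 100 sectors
 * stored once and 50 sectors stored twice, sum.data is
 * 100 * 1 + 50 * 2 = 200 sectors of raw disk space.
 */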
|
||||
|
||||
#define RESERVE_FACTOR 6
|
||||
|
||||
static u64 reserve_factor(u64 r)
|
||||
{
|
||||
return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
|
||||
}
|
||||
|
||||
static u64 avail_factor(u64 r)
|
||||
{
|
||||
return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
|
||||
}
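/*
 * Worked example (illustrative, not in the original): with RESERVE_FACTOR = 6,
 * reserve_factor(6400) = 6400 + 6400/64 = 6500, and
 * avail_factor(6500) = (6500 * 64) / 65 = 6400, so the two functions are
 * (approximate) inverses of each other.
 */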
|
||||
|
||||
u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats)
|
||||
{
|
||||
struct fs_usage_sum sum = __fs_usage_sum(stats);
|
||||
|
||||
return sum.data + reserve_factor(sum.reserved);
|
||||
}
|
||||
|
||||
u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats)
|
||||
{
|
||||
return min(c->capacity, __bch2_fs_sectors_used(c, stats));
|
||||
}
|
||||
|
||||
u64 bch2_fs_sectors_free(struct bch_fs *c, struct bch_fs_usage stats)
|
||||
{
|
||||
return avail_factor(c->capacity - bch2_fs_sectors_used(c, stats));
|
||||
}
|
||||
|
||||
static inline int is_unavailable_bucket(struct bucket_mark m)
|
||||
{
|
||||
return !is_available_bucket(m);
|
||||
}
|
||||
|
||||
static inline int is_fragmented_bucket(struct bucket_mark m,
|
||||
struct bch_dev *ca)
|
||||
{
|
||||
if (!m.owned_by_allocator &&
|
||||
m.data_type == BCH_DATA_USER &&
|
||||
bucket_sectors_used(m))
|
||||
return max_t(int, 0, (int) ca->mi.bucket_size -
|
||||
bucket_sectors_used(m));
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline enum bch_data_type bucket_type(struct bucket_mark m)
|
||||
{
|
||||
return m.cached_sectors && !m.dirty_sectors
|
||||
? BCH_DATA_CACHED
|
||||
: m.data_type;
|
||||
}
|
||||
|
||||
static bool bucket_became_unavailable(struct bch_fs *c,
|
||||
struct bucket_mark old,
|
||||
struct bucket_mark new)
|
||||
{
|
||||
return is_available_bucket(old) &&
|
||||
!is_available_bucket(new) &&
|
||||
(!c || c->gc_pos.phase == GC_PHASE_DONE);
|
||||
}
|
||||
|
||||
void bch2_fs_usage_apply(struct bch_fs *c,
|
||||
struct bch_fs_usage *stats,
|
||||
struct disk_reservation *disk_res,
|
||||
struct gc_pos gc_pos)
|
||||
{
|
||||
struct fs_usage_sum sum = __fs_usage_sum(*stats);
|
||||
s64 added = sum.data + sum.reserved;
|
||||
|
||||
/*
|
||||
* Not allowed to reduce sectors_available except by getting a
|
||||
* reservation:
|
||||
*/
|
||||
BUG_ON(added > (s64) (disk_res ? disk_res->sectors : 0));
|
||||
|
||||
if (added > 0) {
|
||||
disk_res->sectors -= added;
|
||||
stats->online_reserved -= added;
|
||||
}
|
||||
|
||||
percpu_down_read(&c->usage_lock);
|
||||
preempt_disable();
|
||||
/* online_reserved not subject to gc: */
|
||||
this_cpu_add(c->usage_percpu->online_reserved, stats->online_reserved);
|
||||
stats->online_reserved = 0;
|
||||
|
||||
if (!gc_will_visit(c, gc_pos))
|
||||
bch2_usage_add(this_cpu_ptr(c->usage_percpu), stats);
|
||||
|
||||
bch2_fs_stats_verify(c);
|
||||
preempt_enable();
|
||||
percpu_up_read(&c->usage_lock);
|
||||
|
||||
memset(stats, 0, sizeof(*stats));
|
||||
}
|
||||
|
||||
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
|
||||
struct bucket_mark old, struct bucket_mark new)
|
||||
{
|
||||
struct bch_dev_usage *dev_usage;
|
||||
|
||||
if (c)
|
||||
percpu_rwsem_assert_held(&c->usage_lock);
|
||||
|
||||
if (old.data_type && new.data_type &&
|
||||
old.data_type != new.data_type) {
|
||||
BUG_ON(!c);
|
||||
bch2_fs_inconsistent(c,
|
||||
"different types of data in same bucket: %s, %s",
|
||||
bch2_data_types[old.data_type],
|
||||
bch2_data_types[new.data_type]);
|
||||
}
|
||||
|
||||
preempt_disable();
|
||||
dev_usage = this_cpu_ptr(ca->usage_percpu);
|
||||
|
||||
dev_usage->buckets[bucket_type(old)]--;
|
||||
dev_usage->buckets[bucket_type(new)]++;
|
||||
|
||||
dev_usage->buckets_alloc +=
|
||||
(int) new.owned_by_allocator - (int) old.owned_by_allocator;
|
||||
dev_usage->buckets_unavailable +=
|
||||
is_unavailable_bucket(new) - is_unavailable_bucket(old);
|
||||
|
||||
dev_usage->sectors[old.data_type] -= old.dirty_sectors;
|
||||
dev_usage->sectors[new.data_type] += new.dirty_sectors;
|
||||
dev_usage->sectors[BCH_DATA_CACHED] +=
|
||||
(int) new.cached_sectors - (int) old.cached_sectors;
|
||||
dev_usage->sectors_fragmented +=
|
||||
is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca);
|
||||
preempt_enable();
|
||||
|
||||
if (!is_available_bucket(old) && is_available_bucket(new))
|
||||
bch2_wake_allocator(ca);
|
||||
|
||||
bch2_dev_stats_verify(ca);
|
||||
}
|
||||
|
||||
#define bucket_data_cmpxchg(c, ca, g, new, expr) \
|
||||
({ \
|
||||
struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
|
||||
\
|
||||
bch2_dev_usage_update(c, ca, _old, new); \
|
||||
_old; \
|
||||
})
|
||||
|
||||
bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
|
||||
size_t b, struct bucket_mark *old)
|
||||
{
|
||||
struct bucket *g;
|
||||
struct bucket_mark new;
|
||||
|
||||
percpu_rwsem_assert_held(&c->usage_lock);
|
||||
|
||||
g = bucket(ca, b);
|
||||
|
||||
*old = bucket_data_cmpxchg(c, ca, g, new, ({
|
||||
if (!is_available_bucket(new))
|
||||
return false;
|
||||
|
||||
new.owned_by_allocator = 1;
|
||||
new.data_type = 0;
|
||||
new.cached_sectors = 0;
|
||||
new.dirty_sectors = 0;
|
||||
new.gen++;
|
||||
}));
|
||||
|
||||
if (!old->owned_by_allocator && old->cached_sectors)
|
||||
trace_invalidate(ca, bucket_to_sector(ca, b),
|
||||
old->cached_sectors);
|
||||
return true;
|
||||
}
|
||||
|
||||
void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
|
||||
size_t b, bool owned_by_allocator,
|
||||
struct gc_pos pos, unsigned flags)
|
||||
{
|
||||
struct bucket *g;
|
||||
struct bucket_mark old, new;
|
||||
|
||||
percpu_rwsem_assert_held(&c->usage_lock);
|
||||
g = bucket(ca, b);
|
||||
|
||||
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
|
||||
gc_will_visit(c, pos))
|
||||
return;
|
||||
|
||||
old = bucket_data_cmpxchg(c, ca, g, new, ({
|
||||
new.owned_by_allocator = owned_by_allocator;
|
||||
}));
|
||||
|
||||
BUG_ON(!owned_by_allocator && !old.owned_by_allocator &&
|
||||
c->gc_pos.phase == GC_PHASE_DONE);
|
||||
}
|
||||
|
||||
#define saturated_add(ca, dst, src, max) \
|
||||
do { \
|
||||
BUG_ON((int) (dst) + (src) < 0); \
|
||||
if ((dst) == (max)) \
|
||||
; \
|
||||
else if ((dst) + (src) <= (max)) \
|
||||
dst += (src); \
|
||||
else { \
|
||||
dst = (max); \
|
||||
trace_sectors_saturated(ca); \
|
||||
} \
|
||||
} while (0)
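/*
 * Worked example (illustrative, not in the original): with dst == 32000,
 * src == 1000 and max == GC_MAX_SECTORS_USED (32767), 32000 + 1000 exceeds
 * the max, so dst is clamped to 32767 and trace_sectors_saturated() fires;
 * a later btree GC recomputes the real count.
 */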
|
||||
|
||||
void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
|
||||
size_t b, enum bch_data_type type,
|
||||
unsigned sectors, struct gc_pos pos,
|
||||
unsigned flags)
|
||||
{
|
||||
struct bucket *g;
|
||||
struct bucket_mark old, new;
|
||||
|
||||
BUG_ON(!type);
|
||||
|
||||
if (likely(c)) {
|
||||
percpu_rwsem_assert_held(&c->usage_lock);
|
||||
|
||||
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
|
||||
gc_will_visit(c, pos))
|
||||
return;
|
||||
}
|
||||
|
||||
rcu_read_lock();
|
||||
|
||||
g = bucket(ca, b);
|
||||
old = bucket_data_cmpxchg(c, ca, g, new, ({
|
||||
saturated_add(ca, new.dirty_sectors, sectors,
|
||||
GC_MAX_SECTORS_USED);
|
||||
new.data_type = type;
|
||||
}));
|
||||
|
||||
rcu_read_unlock();
|
||||
|
||||
BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
|
||||
bucket_became_unavailable(c, old, new));
|
||||
}
|
||||
|
||||
/* Reverting this until the copygc + compression issue is fixed: */
|
||||
|
||||
static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors)
|
||||
{
|
||||
if (!sectors)
|
||||
return 0;
|
||||
|
||||
return max(1U, DIV_ROUND_UP(sectors * crc.compressed_size,
|
||||
crc.uncompressed_size));
|
||||
}
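/*
 * Worked example (illustrative, not in the original): for an extent with
 * crc.compressed_size == 8 and crc.uncompressed_size == 16, accounting for
 * 4 uncompressed sectors charges DIV_ROUND_UP(4 * 8, 16) = 2 sectors of
 * actual disk usage.
 */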
|
||||
|
||||
/*
|
||||
* Checking against gc's position has to be done here, inside the cmpxchg()
|
||||
* loop, to avoid racing with the start of gc clearing all the marks - GC does
|
||||
* that with the gc pos seqlock held.
|
||||
*/
|
||||
static void bch2_mark_pointer(struct bch_fs *c,
|
||||
struct bkey_s_c_extent e,
|
||||
const struct bch_extent_ptr *ptr,
|
||||
struct bch_extent_crc_unpacked crc,
|
||||
s64 sectors, enum s_alloc type,
|
||||
struct bch_fs_usage *stats,
|
||||
u64 journal_seq, unsigned flags)
|
||||
{
|
||||
struct bucket_mark old, new;
|
||||
unsigned saturated;
|
||||
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
|
||||
struct bucket *g = PTR_BUCKET(ca, ptr);
|
||||
enum bch_data_type data_type = type == S_META
|
||||
? BCH_DATA_BTREE : BCH_DATA_USER;
|
||||
u64 v;
|
||||
|
||||
if (crc.compression_type) {
|
||||
unsigned old_sectors, new_sectors;
|
||||
|
||||
if (sectors > 0) {
|
||||
old_sectors = 0;
|
||||
new_sectors = sectors;
|
||||
} else {
|
||||
old_sectors = e.k->size;
|
||||
new_sectors = e.k->size + sectors;
|
||||
}
|
||||
|
||||
sectors = -__disk_sectors(crc, old_sectors)
|
||||
+__disk_sectors(crc, new_sectors);
|
||||
}
|
||||
|
||||
if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) {
|
||||
if (journal_seq)
|
||||
bucket_cmpxchg(g, new, ({
|
||||
new.journal_seq_valid = 1;
|
||||
new.journal_seq = journal_seq;
|
||||
}));
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
v = atomic64_read(&g->_mark.v);
|
||||
do {
|
||||
new.v.counter = old.v.counter = v;
|
||||
saturated = 0;
|
||||
|
||||
/*
|
||||
* Check this after reading bucket mark to guard against
|
||||
* the allocator invalidating a bucket after we've already
|
||||
* checked the gen
|
||||
*/
|
||||
if (gen_after(new.gen, ptr->gen)) {
|
||||
BUG_ON(!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags));
|
||||
EBUG_ON(!ptr->cached &&
|
||||
test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
|
||||
return;
|
||||
}
|
||||
|
||||
if (!ptr->cached &&
|
||||
new.dirty_sectors == GC_MAX_SECTORS_USED &&
|
||||
sectors < 0)
|
||||
saturated = -sectors;
|
||||
|
||||
if (ptr->cached)
|
||||
saturated_add(ca, new.cached_sectors, sectors,
|
||||
GC_MAX_SECTORS_USED);
|
||||
else
|
||||
saturated_add(ca, new.dirty_sectors, sectors,
|
||||
GC_MAX_SECTORS_USED);
|
||||
|
||||
if (!new.dirty_sectors &&
|
||||
!new.cached_sectors) {
|
||||
new.data_type = 0;
|
||||
|
||||
if (journal_seq) {
|
||||
new.journal_seq_valid = 1;
|
||||
new.journal_seq = journal_seq;
|
||||
}
|
||||
} else {
|
||||
new.data_type = data_type;
|
||||
}
|
||||
|
||||
if (flags & BCH_BUCKET_MARK_NOATOMIC) {
|
||||
g->_mark = new;
|
||||
break;
|
||||
}
|
||||
} while ((v = atomic64_cmpxchg(&g->_mark.v,
|
||||
old.v.counter,
|
||||
new.v.counter)) != old.v.counter);
|
||||
|
||||
bch2_dev_usage_update(c, ca, old, new);
|
||||
|
||||
BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
|
||||
bucket_became_unavailable(c, old, new));
|
||||
|
||||
if (saturated &&
|
||||
atomic_long_add_return(saturated,
|
||||
&ca->saturated_count) >=
|
||||
bucket_to_sector(ca, ca->free_inc.size)) {
|
||||
if (c->gc_thread) {
|
||||
trace_gc_sectors_saturated(c);
|
||||
wake_up_process(c->gc_thread);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
|
||||
s64 sectors, bool metadata,
|
||||
struct gc_pos pos,
|
||||
struct bch_fs_usage *stats,
|
||||
u64 journal_seq, unsigned flags)
|
||||
{
|
||||
/*
|
||||
* synchronization w.r.t. GC:
|
||||
*
|
||||
* Normally, bucket sector counts/marks are updated on the fly, as
|
||||
* references are added/removed from the btree, the lists of buckets the
|
||||
* allocator owns, other metadata buckets, etc.
|
||||
*
|
||||
* When GC is in progress and going to mark this reference, we do _not_
|
||||
* mark this reference here, to avoid double counting - GC will count it
|
||||
* when it gets to it.
|
||||
*
|
||||
* To know whether we should mark a given reference (GC either isn't
|
||||
* running, or has already marked references at this position) we
|
||||
* construct a total order for everything GC walks. Then, we can simply
|
||||
* compare the position of the reference we're marking - @pos - with
|
||||
* GC's current position. If GC is going to mark this reference, GC's
|
||||
* current position will be less than @pos; if GC's current position is
|
||||
* greater than @pos GC has either already walked this position, or
|
||||
* isn't running.
|
||||
*
|
||||
* To avoid racing with GC's position changing, we have to deal with
|
||||
* - GC's position being set to GC_POS_MIN when GC starts:
|
||||
* usage_lock guards against this
|
||||
* - GC's position overtaking @pos: we guard against this with
|
||||
* whatever lock protects the data structure the reference lives in
|
||||
* (e.g. the btree node lock, or the relevant allocator lock).
|
||||
*/
|
||||
|
||||
percpu_down_read(&c->usage_lock);
|
||||
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
|
||||
gc_will_visit(c, pos))
|
||||
flags |= BCH_BUCKET_MARK_GC_WILL_VISIT;
|
||||
|
||||
if (!stats)
|
||||
stats = this_cpu_ptr(c->usage_percpu);
|
||||
|
||||
switch (k.k->type) {
|
||||
case BCH_EXTENT:
|
||||
case BCH_EXTENT_CACHED: {
|
||||
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
|
||||
const struct bch_extent_ptr *ptr;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
enum s_alloc type = metadata ? S_META : S_DIRTY;
|
||||
unsigned replicas = 0;
|
||||
|
||||
BUG_ON(metadata && bkey_extent_is_cached(e.k));
|
||||
BUG_ON(!sectors);
|
||||
|
||||
extent_for_each_ptr_crc(e, ptr, crc) {
|
||||
bch2_mark_pointer(c, e, ptr, crc, sectors, type,
|
||||
stats, journal_seq, flags);
|
||||
replicas += !ptr->cached;
|
||||
}
|
||||
|
||||
if (replicas) {
|
||||
BUG_ON(replicas - 1 > ARRAY_SIZE(stats->s));
|
||||
stats->s[replicas - 1].data[type] += sectors;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case BCH_RESERVATION: {
|
||||
struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
|
||||
|
||||
if (r.v->nr_replicas) {
|
||||
BUG_ON(r.v->nr_replicas - 1 > ARRAY_SIZE(stats->s));
|
||||
stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
percpu_up_read(&c->usage_lock);
|
||||
}
|
||||
|
||||
/* Disk reservations: */
|
||||
|
||||
static u64 __recalc_sectors_available(struct bch_fs *c)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
for_each_possible_cpu(cpu)
|
||||
per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0;
|
||||
|
||||
return bch2_fs_sectors_free(c, bch2_fs_usage_read(c));
|
||||
}
|
||||
|
||||
/* Used by gc when it's starting: */
|
||||
void bch2_recalc_sectors_available(struct bch_fs *c)
|
||||
{
|
||||
percpu_down_write(&c->usage_lock);
|
||||
atomic64_set(&c->sectors_available, __recalc_sectors_available(c));
|
||||
percpu_up_write(&c->usage_lock);
|
||||
}
|
||||
|
||||
void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
|
||||
{
|
||||
percpu_down_read(&c->usage_lock);
|
||||
this_cpu_sub(c->usage_percpu->online_reserved,
|
||||
res->sectors);
|
||||
|
||||
bch2_fs_stats_verify(c);
|
||||
percpu_up_read(&c->usage_lock);
|
||||
|
||||
res->sectors = 0;
|
||||
}
|
||||
|
||||
#define SECTORS_CACHE 1024
|
||||
|
||||
int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
|
||||
unsigned sectors, int flags)
|
||||
{
|
||||
struct bch_fs_usage *stats;
|
||||
u64 old, v, get;
|
||||
s64 sectors_available;
|
||||
int ret;
|
||||
|
||||
percpu_down_read(&c->usage_lock);
|
||||
preempt_disable();
|
||||
stats = this_cpu_ptr(c->usage_percpu);
|
||||
|
||||
if (sectors <= stats->available_cache)
|
||||
goto out;
|
||||
|
||||
v = atomic64_read(&c->sectors_available);
|
||||
do {
|
||||
old = v;
|
||||
get = min((u64) sectors + SECTORS_CACHE, old);
|
||||
|
||||
if (get < sectors) {
|
||||
preempt_enable();
|
||||
percpu_up_read(&c->usage_lock);
|
||||
goto recalculate;
|
||||
}
|
||||
} while ((v = atomic64_cmpxchg(&c->sectors_available,
|
||||
old, old - get)) != old);
|
||||
|
||||
stats->available_cache += get;
|
||||
|
||||
out:
|
||||
stats->available_cache -= sectors;
|
||||
stats->online_reserved += sectors;
|
||||
res->sectors += sectors;
|
||||
|
||||
bch2_disk_reservations_verify(c, flags);
|
||||
bch2_fs_stats_verify(c);
|
||||
preempt_enable();
|
||||
percpu_up_read(&c->usage_lock);
|
||||
return 0;
|
||||
|
||||
recalculate:
|
||||
/*
|
||||
* GC recalculates sectors_available when it starts, so that hopefully
|
||||
* we don't normally end up blocking here:
|
||||
*/
|
||||
|
||||
/*
|
||||
* Unfortunately, we can be called from extent_insert_fixup() with btree
|
||||
* locks held:
|
||||
*/
|
||||
|
||||
if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) {
|
||||
if (!(flags & BCH_DISK_RESERVATION_BTREE_LOCKS_HELD))
|
||||
down_read(&c->gc_lock);
|
||||
else if (!down_read_trylock(&c->gc_lock))
|
||||
return -EINTR;
|
||||
}
|
||||
|
||||
percpu_down_write(&c->usage_lock);
|
||||
sectors_available = __recalc_sectors_available(c);
|
||||
|
||||
if (sectors <= sectors_available ||
|
||||
(flags & BCH_DISK_RESERVATION_NOFAIL)) {
|
||||
atomic64_set(&c->sectors_available,
|
||||
max_t(s64, 0, sectors_available - sectors));
|
||||
stats->online_reserved += sectors;
|
||||
res->sectors += sectors;
|
||||
ret = 0;
|
||||
|
||||
bch2_disk_reservations_verify(c, flags);
|
||||
} else {
|
||||
atomic64_set(&c->sectors_available, sectors_available);
|
||||
ret = -ENOSPC;
|
||||
}
|
||||
|
||||
bch2_fs_stats_verify(c);
|
||||
percpu_up_write(&c->usage_lock);
|
||||
|
||||
if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD))
|
||||
up_read(&c->gc_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Startup/shutdown: */
|
||||
|
||||
static void buckets_free_rcu(struct rcu_head *rcu)
|
||||
{
|
||||
struct bucket_array *buckets =
|
||||
container_of(rcu, struct bucket_array, rcu);
|
||||
|
||||
kvpfree(buckets,
|
||||
sizeof(struct bucket_array) +
|
||||
buckets->nbuckets * sizeof(struct bucket));
|
||||
}
|
||||
|
||||
int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
|
||||
{
|
||||
struct bucket_array *buckets = NULL, *old_buckets = NULL;
|
||||
unsigned long *buckets_dirty = NULL;
|
||||
u8 *oldest_gens = NULL;
|
||||
alloc_fifo free[RESERVE_NR];
|
||||
alloc_fifo free_inc;
|
||||
alloc_heap alloc_heap;
|
||||
copygc_heap copygc_heap;
|
||||
|
||||
size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
|
||||
ca->mi.bucket_size / c->opts.btree_node_size);
|
||||
/* XXX: these should be tunable */
|
||||
size_t reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9);
|
||||
size_t copygc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7);
|
||||
size_t free_inc_reserve = copygc_reserve / 2;
|
||||
bool resize = ca->buckets != NULL,
|
||||
start_copygc = ca->copygc_thread != NULL;
|
||||
int ret = -ENOMEM;
|
||||
unsigned i;
|
||||
|
||||
memset(&free, 0, sizeof(free));
|
||||
memset(&free_inc, 0, sizeof(free_inc));
|
||||
memset(&alloc_heap, 0, sizeof(alloc_heap));
|
||||
memset(&copygc_heap, 0, sizeof(copygc_heap));
|
||||
|
||||
if (!(buckets = kvpmalloc(sizeof(struct bucket_array) +
|
||||
nbuckets * sizeof(struct bucket),
|
||||
GFP_KERNEL|__GFP_ZERO)) ||
|
||||
!(oldest_gens = kvpmalloc(nbuckets * sizeof(u8),
|
||||
GFP_KERNEL|__GFP_ZERO)) ||
|
||||
!(buckets_dirty = kvpmalloc(BITS_TO_LONGS(nbuckets) *
|
||||
sizeof(unsigned long),
|
||||
GFP_KERNEL|__GFP_ZERO)) ||
|
||||
!init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) ||
|
||||
!init_fifo(&free[RESERVE_MOVINGGC],
|
||||
copygc_reserve, GFP_KERNEL) ||
|
||||
!init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
|
||||
!init_fifo(&free_inc, free_inc_reserve, GFP_KERNEL) ||
|
||||
!init_heap(&alloc_heap, free_inc_reserve, GFP_KERNEL) ||
|
||||
!init_heap(&copygc_heap, copygc_reserve, GFP_KERNEL))
|
||||
goto err;
|
||||
|
||||
buckets->first_bucket = ca->mi.first_bucket;
|
||||
buckets->nbuckets = nbuckets;
|
||||
|
||||
bch2_copygc_stop(ca);
|
||||
|
||||
if (resize) {
|
||||
down_write(&c->gc_lock);
|
||||
down_write(&ca->bucket_lock);
|
||||
percpu_down_write(&c->usage_lock);
|
||||
}
|
||||
|
||||
old_buckets = bucket_array(ca);
|
||||
|
||||
if (resize) {
|
||||
size_t n = min(buckets->nbuckets, old_buckets->nbuckets);
|
||||
|
||||
memcpy(buckets->b,
|
||||
old_buckets->b,
|
||||
n * sizeof(struct bucket));
|
||||
memcpy(oldest_gens,
|
||||
ca->oldest_gens,
|
||||
n * sizeof(u8));
|
||||
memcpy(buckets_dirty,
|
||||
ca->buckets_dirty,
|
||||
BITS_TO_LONGS(n) * sizeof(unsigned long));
|
||||
}
|
||||
|
||||
rcu_assign_pointer(ca->buckets, buckets);
|
||||
buckets = old_buckets;
|
||||
|
||||
swap(ca->oldest_gens, oldest_gens);
|
||||
swap(ca->buckets_dirty, buckets_dirty);
|
||||
|
||||
if (resize)
|
||||
percpu_up_write(&c->usage_lock);
|
||||
|
||||
spin_lock(&c->freelist_lock);
|
||||
for (i = 0; i < RESERVE_NR; i++) {
|
||||
fifo_move(&free[i], &ca->free[i]);
|
||||
swap(ca->free[i], free[i]);
|
||||
}
|
||||
fifo_move(&free_inc, &ca->free_inc);
|
||||
swap(ca->free_inc, free_inc);
|
||||
spin_unlock(&c->freelist_lock);
|
||||
|
||||
/* with gc lock held, alloc_heap can't be in use: */
|
||||
swap(ca->alloc_heap, alloc_heap);
|
||||
|
||||
/* and we shut down copygc: */
|
||||
swap(ca->copygc_heap, copygc_heap);
|
||||
|
||||
nbuckets = ca->mi.nbuckets;
|
||||
|
||||
if (resize) {
|
||||
up_write(&ca->bucket_lock);
|
||||
up_write(&c->gc_lock);
|
||||
}
|
||||
|
||||
if (start_copygc &&
|
||||
bch2_copygc_start(c, ca))
|
||||
bch_err(ca, "error restarting copygc thread");
|
||||
|
||||
ret = 0;
|
||||
err:
|
||||
free_heap(&copygc_heap);
|
||||
free_heap(&alloc_heap);
|
||||
free_fifo(&free_inc);
|
||||
for (i = 0; i < RESERVE_NR; i++)
|
||||
free_fifo(&free[i]);
|
||||
kvpfree(buckets_dirty,
|
||||
BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
|
||||
kvpfree(oldest_gens,
|
||||
nbuckets * sizeof(u8));
|
||||
if (buckets)
|
||||
call_rcu(&old_buckets->rcu, buckets_free_rcu);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void bch2_dev_buckets_free(struct bch_dev *ca)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
free_heap(&ca->copygc_heap);
|
||||
free_heap(&ca->alloc_heap);
|
||||
free_fifo(&ca->free_inc);
|
||||
for (i = 0; i < RESERVE_NR; i++)
|
||||
free_fifo(&ca->free[i]);
|
||||
kvpfree(ca->buckets_dirty,
|
||||
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
|
||||
kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
|
||||
kvpfree(rcu_dereference_protected(ca->buckets, 1),
|
||||
sizeof(struct bucket_array) +
|
||||
ca->mi.nbuckets * sizeof(struct bucket));
|
||||
|
||||
free_percpu(ca->usage_percpu);
|
||||
}
|
||||
|
||||
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
if (!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)))
|
||||
return -ENOMEM;
|
||||
|
||||
return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
|
||||
}
|
276
fs/bcachefs/buckets.h
Normal file
@@ -0,0 +1,276 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* Code for manipulating bucket marks for garbage collection.
|
||||
*
|
||||
* Copyright 2014 Datera, Inc.
|
||||
*/
|
||||
|
||||
#ifndef _BUCKETS_H
|
||||
#define _BUCKETS_H
|
||||
|
||||
#include "buckets_types.h"
|
||||
#include "super.h"
|
||||
|
||||
#define for_each_bucket(_b, _buckets) \
|
||||
for (_b = (_buckets)->b + (_buckets)->first_bucket; \
|
||||
_b < (_buckets)->b + (_buckets)->nbuckets; _b++)
|
||||
|
||||
#define bucket_cmpxchg(g, new, expr) \
|
||||
({ \
|
||||
u64 _v = atomic64_read(&(g)->_mark.v); \
|
||||
struct bucket_mark _old; \
|
||||
\
|
||||
do { \
|
||||
(new).v.counter = _old.v.counter = _v; \
|
||||
expr; \
|
||||
} while ((_v = atomic64_cmpxchg(&(g)->_mark.v, \
|
||||
_old.v.counter, \
|
||||
(new).v.counter)) != _old.v.counter);\
|
||||
_old; \
|
||||
})
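/*
 * Illustrative sketch (not part of the original commit): typical use of
 * bucket_cmpxchg() - @expr modifies @new, and the macro loops until the
 * cmpxchg succeeds, returning the mark that was replaced:
 */
#if 0
static void example_add_cached_sectors(struct bucket *g, unsigned sectors)
{
	struct bucket_mark new, old;

	old = bucket_cmpxchg(g, new, ({
		new.cached_sectors += sectors;
	}));
	(void) old;
}
#endif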
|
||||
|
||||
static inline struct bucket_array *bucket_array(struct bch_dev *ca)
|
||||
{
|
||||
return rcu_dereference_check(ca->buckets,
|
||||
!ca->fs ||
|
||||
percpu_rwsem_is_held(&ca->fs->usage_lock) ||
|
||||
lockdep_is_held(&ca->fs->gc_lock) ||
|
||||
lockdep_is_held(&ca->bucket_lock));
|
||||
}
|
||||
|
||||
static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
|
||||
{
|
||||
struct bucket_array *buckets = bucket_array(ca);
|
||||
|
||||
BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
|
||||
return buckets->b + b;
|
||||
}
|
||||
|
||||
static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca,
|
||||
size_t b, int rw)
|
||||
{
|
||||
bucket(ca, b)->io_time[rw] = c->bucket_clock[rw].hand;
|
||||
}
|
||||
|
||||
static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
|
||||
{
|
||||
return c->bucket_clock[rw].hand - g->io_time[rw];
|
||||
}
|
||||
|
||||
/*
|
||||
* bucket_gc_gen() returns the difference between the bucket's current gen and
|
||||
* the oldest gen of any pointer into that bucket in the btree.
|
||||
*/
|
||||
|
||||
static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b)
|
||||
{
|
||||
return bucket(ca, b)->mark.gen - ca->oldest_gens[b];
|
||||
}
|
||||
|
||||
static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
|
||||
const struct bch_extent_ptr *ptr)
|
||||
{
|
||||
return sector_to_bucket(ca, ptr->offset);
|
||||
}
|
||||
|
||||
static inline struct bucket *PTR_BUCKET(struct bch_dev *ca,
|
||||
const struct bch_extent_ptr *ptr)
|
||||
{
|
||||
return bucket(ca, PTR_BUCKET_NR(ca, ptr));
|
||||
}
|
||||
|
||||
static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca,
|
||||
const struct bch_extent_ptr *ptr)
|
||||
{
|
||||
struct bucket_mark m;
|
||||
|
||||
rcu_read_lock();
|
||||
m = READ_ONCE(bucket(ca, PTR_BUCKET_NR(ca, ptr))->mark);
|
||||
rcu_read_unlock();
|
||||
|
||||
return m;
|
||||
}
|
||||
|
||||
static inline int gen_cmp(u8 a, u8 b)
|
||||
{
|
||||
return (s8) (a - b);
|
||||
}
|
||||
|
||||
static inline int gen_after(u8 a, u8 b)
|
||||
{
|
||||
int r = gen_cmp(a, b);
|
||||
|
||||
return r > 0 ? r : 0;
|
||||
}
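/*
 * Worked example (illustrative, not in the original): generation numbers are
 * 8 bit and wrap, so they are compared with signed 8 bit arithmetic:
 * gen_cmp(2, 250) = (s8) (2 - 250) = (s8) 8 = 8, i.e. gen 2 is 8 generations
 * newer than gen 250 despite the wraparound.
 */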
|
||||
|
||||
/**
|
||||
* ptr_stale() - check if a pointer points into a bucket that has been
|
||||
* invalidated.
|
||||
*/
|
||||
static inline u8 ptr_stale(struct bch_dev *ca,
|
||||
const struct bch_extent_ptr *ptr)
|
||||
{
|
||||
return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen);
|
||||
}
|
||||
|
||||
/* bucket gc marks */
|
||||
|
||||
/* The dirty and cached sector counts saturate. If this occurs,
|
||||
* reference counting alone will not free the bucket, and a btree
|
||||
* GC must be performed. */
|
||||
#define GC_MAX_SECTORS_USED ((1U << 15) - 1)
|
||||
|
||||
static inline unsigned bucket_sectors_used(struct bucket_mark mark)
|
||||
{
|
||||
return mark.dirty_sectors + mark.cached_sectors;
|
||||
}
|
||||
|
||||
static inline bool bucket_unused(struct bucket_mark mark)
|
||||
{
|
||||
return !mark.owned_by_allocator &&
|
||||
!mark.data_type &&
|
||||
!bucket_sectors_used(mark);
|
||||
}
|
||||
|
||||
/* Device usage: */
|
||||
|
||||
struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *);
|
||||
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *);
|
||||
|
||||
static inline u64 __dev_buckets_available(struct bch_dev *ca,
|
||||
struct bch_dev_usage stats)
|
||||
{
|
||||
u64 total = ca->mi.nbuckets - ca->mi.first_bucket;
|
||||
|
||||
if (WARN_ONCE(stats.buckets_unavailable > total,
|
||||
"buckets_unavailable overflow (%llu > %llu)\n",
|
||||
stats.buckets_unavailable, total))
|
||||
return 0;
|
||||
|
||||
return total - stats.buckets_unavailable;
|
||||
}
|
||||
|
||||
/*
|
||||
* Number of reclaimable buckets - only for use by the allocator thread:
|
||||
*/
|
||||
static inline u64 dev_buckets_available(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
return __dev_buckets_available(ca, bch2_dev_usage_read(c, ca));
|
||||
}
|
||||
|
||||
static inline u64 __dev_buckets_free(struct bch_dev *ca,
|
||||
struct bch_dev_usage stats)
|
||||
{
|
||||
return __dev_buckets_available(ca, stats) +
|
||||
fifo_used(&ca->free[RESERVE_NONE]) +
|
||||
fifo_used(&ca->free_inc);
|
||||
}
|
||||
|
||||
static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
return __dev_buckets_free(ca, bch2_dev_usage_read(c, ca));
|
||||
}
|
||||
|
||||
/* Filesystem usage: */
|
||||
|
||||
static inline enum bch_data_type s_alloc_to_data_type(enum s_alloc s)
|
||||
{
|
||||
switch (s) {
|
||||
case S_META:
|
||||
return BCH_DATA_BTREE;
|
||||
case S_DIRTY:
|
||||
return BCH_DATA_USER;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *);
|
||||
struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *);
|
||||
void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
|
||||
struct disk_reservation *, struct gc_pos);
|
||||
|
||||
u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
|
||||
u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
|
||||
u64 bch2_fs_sectors_free(struct bch_fs *, struct bch_fs_usage);
|
||||
|
||||
static inline bool is_available_bucket(struct bucket_mark mark)
|
||||
{
|
||||
return (!mark.owned_by_allocator &&
|
||||
!mark.dirty_sectors &&
|
||||
!mark.nouse);
|
||||
}
|
||||
|
||||
static inline bool bucket_needs_journal_commit(struct bucket_mark m,
|
||||
u16 last_seq_ondisk)
|
||||
{
|
||||
return m.journal_seq_valid &&
|
||||
((s16) m.journal_seq - (s16) last_seq_ondisk > 0);
|
||||
}
|
||||
|
||||
void bch2_bucket_seq_cleanup(struct bch_fs *);
|
||||
|
||||
bool bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *,
|
||||
size_t, struct bucket_mark *);
|
||||
void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *,
|
||||
size_t, bool, struct gc_pos, unsigned);
|
||||
void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
|
||||
size_t, enum bch_data_type, unsigned,
|
||||
struct gc_pos, unsigned);
|
||||
|
||||
#define BCH_BUCKET_MARK_NOATOMIC (1 << 0)
|
||||
#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 1)
|
||||
#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 2)
|
||||
#define BCH_BUCKET_MARK_GC_LOCK_HELD (1 << 3)
|
||||
|
||||
void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, struct gc_pos,
|
||||
struct bch_fs_usage *, u64, unsigned);
|
||||
|
||||
void bch2_recalc_sectors_available(struct bch_fs *);
|
||||
|
||||
void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *);
|
||||
|
||||
static inline void bch2_disk_reservation_put(struct bch_fs *c,
|
||||
struct disk_reservation *res)
|
||||
{
|
||||
if (res->sectors)
|
||||
__bch2_disk_reservation_put(c, res);
|
||||
}
|
||||
|
||||
#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
|
||||
#define BCH_DISK_RESERVATION_GC_LOCK_HELD (1 << 1)
|
||||
#define BCH_DISK_RESERVATION_BTREE_LOCKS_HELD (1 << 2)
|
||||
|
||||
int bch2_disk_reservation_add(struct bch_fs *,
|
||||
struct disk_reservation *,
|
||||
unsigned, int);
|
||||
|
||||
static inline struct disk_reservation
|
||||
bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
|
||||
{
|
||||
return (struct disk_reservation) {
|
||||
.sectors = 0,
|
||||
#if 0
|
||||
/* not used yet: */
|
||||
.gen = c->capacity_gen,
|
||||
#endif
|
||||
.nr_replicas = nr_replicas,
|
||||
};
|
||||
}
|
||||
|
||||
static inline int bch2_disk_reservation_get(struct bch_fs *c,
|
||||
struct disk_reservation *res,
|
||||
unsigned sectors,
|
||||
unsigned nr_replicas,
|
||||
int flags)
|
||||
{
|
||||
*res = bch2_disk_reservation_init(c, nr_replicas);
|
||||
|
||||
return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags);
|
||||
}
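/*
 * Illustrative sketch (not part of the original commit): reserving space for
 * an 8 sector write replicated twice, then releasing the reservation.
 */
#if 0
static int example_reserve(struct bch_fs *c)
{
	struct disk_reservation res;
	int ret;

	ret = bch2_disk_reservation_get(c, &res, 8, 2, 0);
	if (ret) /* e.g. -ENOSPC */
		return ret;

	/* ... do the replicated write ... */

	bch2_disk_reservation_put(c, &res);
	return 0;
}
#endif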
|
||||
|
||||
int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64);
|
||||
void bch2_dev_buckets_free(struct bch_dev *);
|
||||
int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *);
|
||||
|
||||
#endif /* _BUCKETS_H */
|
96
fs/bcachefs/buckets_types.h
Normal file
@@ -0,0 +1,96 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BUCKETS_TYPES_H
|
||||
#define _BUCKETS_TYPES_H
|
||||
|
||||
#include "util.h"
|
||||
|
||||
struct bucket_mark {
|
||||
union {
|
||||
struct {
|
||||
atomic64_t v;
|
||||
};
|
||||
|
||||
struct {
|
||||
u8 gen;
|
||||
u8 data_type:3,
|
||||
gen_valid:1,
|
||||
owned_by_allocator:1,
|
||||
nouse:1,
|
||||
journal_seq_valid:1;
|
||||
u16 dirty_sectors;
|
||||
u16 cached_sectors;
|
||||
|
||||
/*
|
||||
* low bits of journal sequence number when this bucket was most
|
||||
* recently modified: if journal_seq_valid is set, this bucket
|
||||
* can't be reused until the journal sequence number written to
|
||||
* disk is >= the bucket's journal sequence number:
|
||||
*/
|
||||
u16 journal_seq;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
struct bucket {
|
||||
union {
|
||||
struct bucket_mark _mark;
|
||||
const struct bucket_mark mark;
|
||||
};
|
||||
|
||||
u16 io_time[2];
|
||||
};
|
||||
|
||||
struct bucket_array {
|
||||
struct rcu_head rcu;
|
||||
u16 first_bucket;
|
||||
size_t nbuckets;
|
||||
struct bucket b[];
|
||||
};
|
||||
|
||||
struct bch_dev_usage {
|
||||
u64 buckets[BCH_DATA_NR];
|
||||
u64 buckets_alloc;
|
||||
u64 buckets_unavailable;
|
||||
|
||||
/* _compressed_ sectors: */
|
||||
u64 sectors[BCH_DATA_NR];
|
||||
u64 sectors_fragmented;
|
||||
};
|
||||
|
||||
/* kill, switch to bch_data_type? */
|
||||
enum s_alloc {
|
||||
S_META,
|
||||
S_DIRTY,
|
||||
S_ALLOC_NR,
|
||||
};
|
||||
|
||||
struct bch_fs_usage {
|
||||
/* all fields are in units of 512 byte sectors: */
|
||||
/* _uncompressed_ sectors: */
|
||||
u64 online_reserved;
|
||||
u64 available_cache;
|
||||
|
||||
struct {
|
||||
u64 data[S_ALLOC_NR];
|
||||
u64 persistent_reserved;
|
||||
} s[BCH_REPLICAS_MAX];
|
||||
};
|
||||
|
||||
/*
|
||||
* A reservation for space on disk:
|
||||
*/
|
||||
struct disk_reservation {
|
||||
u64 sectors;
|
||||
u32 gen;
|
||||
unsigned nr_replicas;
|
||||
};
|
||||
|
||||
struct copygc_heap_entry {
|
||||
u8 gen;
|
||||
u32 sectors;
|
||||
u64 offset;
|
||||
};
|
||||
|
||||
typedef HEAP(struct copygc_heap_entry) copygc_heap;
|
||||
|
||||
#endif /* _BUCKETS_TYPES_H */
|
663
fs/bcachefs/chardev.c
Normal file
@@ -0,0 +1,663 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#ifndef NO_BCACHEFS_CHARDEV
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "alloc.h"
|
||||
#include "bcachefs_ioctl.h"
|
||||
#include "buckets.h"
|
||||
#include "chardev.h"
|
||||
#include "move.h"
|
||||
#include "super.h"
|
||||
#include "super-io.h"
|
||||
|
||||
#include <linux/anon_inodes.h>
|
||||
#include <linux/cdev.h>
|
||||
#include <linux/device.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/ioctl.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/major.h>
|
||||
#include <linux/sched/task.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/uaccess.h>
|
||||
|
||||
/* returns with ref on ca->ref */
|
||||
static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
|
||||
unsigned flags)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
|
||||
if (flags & BCH_BY_INDEX) {
|
||||
if (dev >= c->sb.nr_devices)
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
rcu_read_lock();
|
||||
ca = rcu_dereference(c->devs[dev]);
|
||||
if (ca)
|
||||
percpu_ref_get(&ca->ref);
|
||||
rcu_read_unlock();
|
||||
|
||||
if (!ca)
|
||||
return ERR_PTR(-EINVAL);
|
||||
} else {
|
||||
char *path;
|
||||
|
||||
path = strndup_user((const char __user *)
|
||||
(unsigned long) dev, PATH_MAX);
|
||||
if (IS_ERR(path))
|
||||
return ERR_CAST(path);
|
||||
|
||||
ca = bch2_dev_lookup(c, path);
|
||||
kfree(path);
|
||||
}
|
||||
|
||||
return ca;
|
||||
}
|
||||
|
||||
#if 0
|
||||
static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
|
||||
{
|
||||
struct bch_ioctl_assemble arg;
|
||||
struct bch_fs *c;
|
||||
u64 *user_devs = NULL;
|
||||
char **devs = NULL;
|
||||
unsigned i;
|
||||
int ret = -EFAULT;
|
||||
|
||||
if (copy_from_user(&arg, user_arg, sizeof(arg)))
|
||||
return -EFAULT;
|
||||
|
||||
if (arg.flags || arg.pad)
|
||||
return -EINVAL;
|
||||
|
||||
user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL);
|
||||
if (!user_devs)
|
||||
return -ENOMEM;
|
||||
|
||||
devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL);
|
||||
|
||||
if (copy_from_user(user_devs, user_arg->devs,
|
||||
sizeof(u64) * arg.nr_devs))
|
||||
goto err;
|
||||
|
||||
for (i = 0; i < arg.nr_devs; i++) {
|
||||
devs[i] = strndup_user((const char __user *)(unsigned long)
|
||||
user_devs[i],
|
||||
PATH_MAX);
|
||||
if (!devs[i]) {
|
||||
ret = -ENOMEM;
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
|
||||
c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty());
|
||||
ret = PTR_ERR_OR_ZERO(c);
|
||||
if (!ret)
|
||||
closure_put(&c->cl);
|
||||
err:
|
||||
if (devs)
|
||||
for (i = 0; i < arg.nr_devs; i++)
|
||||
kfree(devs[i]);
|
||||
kfree(devs);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg)
|
||||
{
|
||||
struct bch_ioctl_incremental arg;
|
||||
const char *err;
|
||||
char *path;
|
||||
|
||||
if (copy_from_user(&arg, user_arg, sizeof(arg)))
|
||||
return -EFAULT;
|
||||
|
||||
if (arg.flags || arg.pad)
|
||||
return -EINVAL;
|
||||
|
||||
path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
|
||||
if (!path)
|
||||
return -ENOMEM;
|
||||
|
||||
err = bch2_fs_open_incremental(path);
|
||||
kfree(path);
|
||||
|
||||
if (err) {
|
||||
pr_err("Could not register bcachefs devices: %s", err);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
static long bch2_global_ioctl(unsigned cmd, void __user *arg)
|
||||
{
|
||||
switch (cmd) {
|
||||
#if 0
|
||||
case BCH_IOCTL_ASSEMBLE:
|
||||
return bch2_ioctl_assemble(arg);
|
||||
case BCH_IOCTL_INCREMENTAL:
|
||||
return bch2_ioctl_incremental(arg);
|
||||
#endif
|
||||
default:
|
||||
return -ENOTTY;
|
||||
}
|
||||
}
|
||||
|
||||
static long bch2_ioctl_query_uuid(struct bch_fs *c,
|
||||
struct bch_ioctl_query_uuid __user *user_arg)
|
||||
{
|
||||
return copy_to_user(&user_arg->uuid,
|
||||
&c->sb.user_uuid,
|
||||
sizeof(c->sb.user_uuid));
|
||||
}
|
||||
|
||||
#if 0
|
||||
static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg)
|
||||
{
|
||||
if (arg.flags || arg.pad)
|
||||
return -EINVAL;
|
||||
|
||||
return bch2_fs_start(c) ? -EIO : 0;
|
||||
}
|
||||
|
||||
static long bch2_ioctl_stop(struct bch_fs *c)
|
||||
{
|
||||
bch2_fs_stop(c);
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
|
||||
{
|
||||
char *path;
|
||||
int ret;
|
||||
|
||||
if (arg.flags || arg.pad)
|
||||
return -EINVAL;
|
||||
|
||||
path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
|
||||
if (!path)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = bch2_dev_add(c, path);
|
||||
kfree(path);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
|
||||
if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
|
||||
BCH_FORCE_IF_METADATA_LOST|
|
||||
BCH_FORCE_IF_DEGRADED|
|
||||
BCH_BY_INDEX)) ||
|
||||
arg.pad)
|
||||
return -EINVAL;
|
||||
|
||||
ca = bch2_device_lookup(c, arg.dev, arg.flags);
|
||||
if (IS_ERR(ca))
|
||||
return PTR_ERR(ca);
|
||||
|
||||
return bch2_dev_remove(c, ca, arg.flags);
|
||||
}
|
||||
|
||||
static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg)
|
||||
{
|
||||
char *path;
|
||||
int ret;
|
||||
|
||||
if (arg.flags || arg.pad)
|
||||
return -EINVAL;
|
||||
|
||||
path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
|
||||
if (!path)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = bch2_dev_online(c, path);
|
||||
kfree(path);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
int ret;
|
||||
|
||||
if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
|
||||
BCH_FORCE_IF_METADATA_LOST|
|
||||
BCH_FORCE_IF_DEGRADED|
|
||||
BCH_BY_INDEX)) ||
|
||||
arg.pad)
|
||||
return -EINVAL;
|
||||
|
||||
ca = bch2_device_lookup(c, arg.dev, arg.flags);
|
||||
if (IS_ERR(ca))
|
||||
return PTR_ERR(ca);
|
||||
|
||||
ret = bch2_dev_offline(c, ca, arg.flags);
|
||||
percpu_ref_put(&ca->ref);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static long bch2_ioctl_disk_set_state(struct bch_fs *c,
|
||||
struct bch_ioctl_disk_set_state arg)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
int ret;
|
||||
|
||||
if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
|
||||
BCH_FORCE_IF_METADATA_LOST|
|
||||
BCH_FORCE_IF_DEGRADED|
|
||||
BCH_BY_INDEX)) ||
|
||||
arg.pad[0] || arg.pad[1] || arg.pad[2])
|
||||
return -EINVAL;
|
||||
|
||||
ca = bch2_device_lookup(c, arg.dev, arg.flags);
|
||||
if (IS_ERR(ca))
|
||||
return PTR_ERR(ca);
|
||||
|
||||
ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags);
|
||||
|
||||
percpu_ref_put(&ca->ref);
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct bch_data_ctx {
|
||||
struct bch_fs *c;
|
||||
struct bch_ioctl_data arg;
|
||||
struct bch_move_stats stats;
|
||||
|
||||
int ret;
|
||||
|
||||
struct task_struct *thread;
|
||||
};
|
||||
|
||||
static int bch2_data_thread(void *arg)
|
||||
{
|
||||
struct bch_data_ctx *ctx = arg;
|
||||
|
||||
ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
|
||||
|
||||
ctx->stats.data_type = U8_MAX;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int bch2_data_job_release(struct inode *inode, struct file *file)
|
||||
{
|
||||
struct bch_data_ctx *ctx = file->private_data;
|
||||
|
||||
kthread_stop(ctx->thread);
|
||||
put_task_struct(ctx->thread);
|
||||
kfree(ctx);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
|
||||
size_t len, loff_t *ppos)
|
||||
{
|
||||
struct bch_data_ctx *ctx = file->private_data;
|
||||
struct bch_fs *c = ctx->c;
|
||||
struct bch_ioctl_data_event e = {
|
||||
.type = BCH_DATA_EVENT_PROGRESS,
|
||||
.p.data_type = ctx->stats.data_type,
|
||||
.p.btree_id = ctx->stats.iter.btree_id,
|
||||
.p.pos = ctx->stats.iter.pos,
|
||||
.p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
|
||||
.p.sectors_total = bch2_fs_sectors_used(c, bch2_fs_usage_read(c)),
|
||||
};
|
||||
|
||||
if (len < sizeof(e))
|
||||
return -EINVAL;
|
||||
|
||||
return copy_to_user(buf, &e, sizeof(e)) ?: sizeof(e);
|
||||
}
|
||||
|
||||
static const struct file_operations bcachefs_data_ops = {
|
||||
.release = bch2_data_job_release,
|
||||
.read = bch2_data_job_read,
|
||||
.llseek = no_llseek,
|
||||
};
|
||||
|
||||
static long bch2_ioctl_data(struct bch_fs *c,
|
||||
struct bch_ioctl_data arg)
|
||||
{
|
||||
struct bch_data_ctx *ctx = NULL;
|
||||
struct file *file = NULL;
|
||||
unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK;
|
||||
int ret, fd = -1;
|
||||
|
||||
if (arg.op >= BCH_DATA_OP_NR || arg.flags)
|
||||
return -EINVAL;
|
||||
|
||||
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
|
||||
if (!ctx)
|
||||
return -ENOMEM;
|
||||
|
||||
ctx->c = c;
|
||||
ctx->arg = arg;
|
||||
|
||||
ctx->thread = kthread_create(bch2_data_thread, ctx, "[bcachefs]");
|
||||
if (IS_ERR(ctx->thread)) {
|
||||
ret = PTR_ERR(ctx->thread);
|
||||
goto err;
|
||||
}
|
||||
|
||||
ret = get_unused_fd_flags(flags);
|
||||
if (ret < 0)
|
||||
goto err;
|
||||
fd = ret;
|
||||
|
||||
file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags);
|
||||
if (IS_ERR(file)) {
|
||||
ret = PTR_ERR(file);
|
||||
goto err;
|
||||
}
|
||||
|
||||
fd_install(fd, file);
|
||||
|
||||
get_task_struct(ctx->thread);
|
||||
wake_up_process(ctx->thread);
|
||||
|
||||
return fd;
|
||||
err:
|
||||
if (fd >= 0)
|
||||
put_unused_fd(fd);
|
||||
if (!IS_ERR_OR_NULL(ctx->thread))
|
||||
kthread_stop(ctx->thread);
|
||||
kfree(ctx);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static long bch2_ioctl_usage(struct bch_fs *c,
|
||||
struct bch_ioctl_usage __user *user_arg)
|
||||
{
|
||||
struct bch_ioctl_usage arg;
|
||||
struct bch_dev *ca;
|
||||
unsigned i, j;
|
||||
int ret;
|
||||
|
||||
if (!test_bit(BCH_FS_STARTED, &c->flags))
|
||||
return -EINVAL;
|
||||
|
||||
if (copy_from_user(&arg, user_arg, sizeof(arg)))
|
||||
return -EFAULT;
|
||||
|
||||
for (i = 0; i < arg.nr_devices; i++) {
|
||||
struct bch_ioctl_dev_usage dst = { .alive = 0 };
|
||||
|
||||
ret = copy_to_user(&user_arg->devs[i], &dst, sizeof(dst));
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
{
|
||||
struct bch_fs_usage src = bch2_fs_usage_read(c);
|
||||
struct bch_ioctl_fs_usage dst = {
|
||||
.capacity = c->capacity,
|
||||
.used = bch2_fs_sectors_used(c, src),
|
||||
.online_reserved = src.online_reserved,
|
||||
};
|
||||
|
||||
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
|
||||
dst.persistent_reserved[i] =
|
||||
src.s[i].persistent_reserved;
|
||||
|
||||
for (j = 0; j < S_ALLOC_NR; j++)
|
||||
dst.sectors[s_alloc_to_data_type(j)][i] =
|
||||
src.s[i].data[j];
|
||||
}
|
||||
|
||||
ret = copy_to_user(&user_arg->fs, &dst, sizeof(dst));
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
for_each_member_device(ca, c, i) {
|
||||
struct bch_dev_usage src = bch2_dev_usage_read(c, ca);
|
||||
struct bch_ioctl_dev_usage dst = {
|
||||
.alive = 1,
|
||||
.state = ca->mi.state,
|
||||
.bucket_size = ca->mi.bucket_size,
|
||||
.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket,
|
||||
};
|
||||
|
||||
if (ca->dev_idx >= arg.nr_devices) {
|
||||
percpu_ref_put(&ca->ref);
|
||||
return -ERANGE;
|
||||
}
|
||||
|
||||
if (percpu_ref_tryget(&ca->io_ref)) {
|
||||
dst.dev = huge_encode_dev(ca->disk_sb.bdev->bd_dev);
|
||||
percpu_ref_put(&ca->io_ref);
|
||||
}
|
||||
|
||||
for (j = 0; j < BCH_DATA_NR; j++) {
|
||||
dst.buckets[j] = src.buckets[j];
|
||||
dst.sectors[j] = src.sectors[j];
|
||||
}
|
||||
|
||||
ret = copy_to_user(&user_arg->devs[i], &dst, sizeof(dst));
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
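The two passes above first mark every slot of the caller's devs[] array as not alive and then fill in the live member devices, so userspace only has to size the array generously. A hedged sketch of the intended call pattern, assuming the UAPI layout of struct bch_ioctl_usage (an nr_devices count plus a trailing devs[] array):

	/* Hypothetical userspace caller; -ERANGE means nr_devices was too small: */
	unsigned nr = 64;
	struct bch_ioctl_usage *u = calloc(1, sizeof(*u) + nr * sizeof(u->devs[0]));

	u->nr_devices = nr;
	if (ioctl(ctl_fd, BCH_IOCTL_USAGE, u) == 0)
		printf("capacity %llu, used %llu sectors\n",
		       (unsigned long long) u->fs.capacity,
		       (unsigned long long) u->fs.used);
	free(u);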
|
||||
|
||||
static long bch2_ioctl_read_super(struct bch_fs *c,
|
||||
struct bch_ioctl_read_super arg)
|
||||
{
|
||||
struct bch_dev *ca = NULL;
|
||||
struct bch_sb *sb;
|
||||
int ret = 0;
|
||||
|
||||
if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) ||
|
||||
arg.pad)
|
||||
return -EINVAL;
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
|
||||
if (arg.flags & BCH_READ_DEV) {
|
||||
ca = bch2_device_lookup(c, arg.dev, arg.flags);
|
||||
|
||||
if (IS_ERR(ca)) {
|
||||
ret = PTR_ERR(ca);
|
||||
goto err;
|
||||
}
|
||||
|
||||
sb = ca->disk_sb.sb;
|
||||
} else {
|
||||
sb = c->disk_sb.sb;
|
||||
}
|
||||
|
||||
if (vstruct_bytes(sb) > arg.size) {
|
||||
ret = -ERANGE;
|
||||
goto err;
|
||||
}
|
||||
|
||||
ret = copy_to_user((void __user *)(unsigned long)arg.sb,
|
||||
sb, vstruct_bytes(sb));
|
||||
err:
|
||||
if (ca)
|
||||
percpu_ref_put(&ca->ref);
|
||||
mutex_unlock(&c->sb_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
|
||||
struct bch_ioctl_disk_get_idx arg)
|
||||
{
|
||||
dev_t dev = huge_decode_dev(arg.dev);
|
||||
struct bch_dev *ca;
|
||||
unsigned i;
|
||||
|
||||
for_each_online_member(ca, c, i)
|
||||
if (ca->disk_sb.bdev->bd_dev == dev) {
|
||||
percpu_ref_put(&ca->io_ref);
|
||||
return i;
|
||||
}
|
||||
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
static long bch2_ioctl_disk_resize(struct bch_fs *c,
|
||||
struct bch_ioctl_disk_resize arg)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
int ret;
|
||||
|
||||
if ((arg.flags & ~BCH_BY_INDEX) ||
|
||||
arg.pad)
|
||||
return -EINVAL;
|
||||
|
||||
ca = bch2_device_lookup(c, arg.dev, arg.flags);
|
||||
if (IS_ERR(ca))
|
||||
return PTR_ERR(ca);
|
||||
|
||||
ret = bch2_dev_resize(c, ca, arg.nbuckets);
|
||||
|
||||
percpu_ref_put(&ca->ref);
|
||||
return ret;
|
||||
}
|
||||
|
||||
#define BCH_IOCTL(_name, _argtype) \
|
||||
do { \
|
||||
_argtype i; \
|
||||
\
|
||||
if (copy_from_user(&i, arg, sizeof(i))) \
|
||||
return -EFAULT; \
|
||||
return bch2_ioctl_##_name(c, i); \
|
||||
} while (0)
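The BCH_IOCTL() helper above bundles the copy_from_user() of the ioctl argument with the call into the per-command handler. As a readability aid (a sketch, not additional code), this is approximately what a single case in the dispatcher below expands to after preprocessing:

	/* Sketch: expansion of
	 *   case BCH_IOCTL_DISK_RESIZE:
	 *       BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
	 */
	case BCH_IOCTL_DISK_RESIZE:
		do {
			struct bch_ioctl_disk_resize i;

			if (copy_from_user(&i, arg, sizeof(i)))
				return -EFAULT;
			return bch2_ioctl_disk_resize(c, i);
		} while (0);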
|
||||
|
||||
long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
|
||||
{
|
||||
/* ioctls that don't require admin cap: */
|
||||
switch (cmd) {
|
||||
case BCH_IOCTL_QUERY_UUID:
|
||||
return bch2_ioctl_query_uuid(c, arg);
|
||||
case BCH_IOCTL_USAGE:
|
||||
return bch2_ioctl_usage(c, arg);
|
||||
}
|
||||
|
||||
if (!capable(CAP_SYS_ADMIN))
|
||||
return -EPERM;
|
||||
|
||||
switch (cmd) {
|
||||
#if 0
|
||||
case BCH_IOCTL_START:
|
||||
BCH_IOCTL(start, struct bch_ioctl_start);
|
||||
case BCH_IOCTL_STOP:
|
||||
return bch2_ioctl_stop(c);
|
||||
#endif
|
||||
case BCH_IOCTL_READ_SUPER:
|
||||
BCH_IOCTL(read_super, struct bch_ioctl_read_super);
|
||||
case BCH_IOCTL_DISK_GET_IDX:
|
||||
BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx);
|
||||
}
|
||||
|
||||
if (!test_bit(BCH_FS_STARTED, &c->flags))
|
||||
return -EINVAL;
|
||||
|
||||
/* ioctls that do require admin cap: */
|
||||
switch (cmd) {
|
||||
case BCH_IOCTL_DISK_ADD:
|
||||
BCH_IOCTL(disk_add, struct bch_ioctl_disk);
|
||||
case BCH_IOCTL_DISK_REMOVE:
|
||||
BCH_IOCTL(disk_remove, struct bch_ioctl_disk);
|
||||
case BCH_IOCTL_DISK_ONLINE:
|
||||
BCH_IOCTL(disk_online, struct bch_ioctl_disk);
|
||||
case BCH_IOCTL_DISK_OFFLINE:
|
||||
BCH_IOCTL(disk_offline, struct bch_ioctl_disk);
|
||||
case BCH_IOCTL_DISK_SET_STATE:
|
||||
BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state);
|
||||
case BCH_IOCTL_DATA:
|
||||
BCH_IOCTL(data, struct bch_ioctl_data);
|
||||
case BCH_IOCTL_DISK_RESIZE:
|
||||
BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
|
||||
|
||||
default:
|
||||
return -ENOTTY;
|
||||
}
|
||||
}
|
||||
|
||||
static DEFINE_IDR(bch_chardev_minor);
|
||||
|
||||
static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v)
|
||||
{
|
||||
unsigned minor = iminor(file_inode(filp));
|
||||
struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL;
|
||||
void __user *arg = (void __user *) v;
|
||||
|
||||
return c
|
||||
? bch2_fs_ioctl(c, cmd, arg)
|
||||
: bch2_global_ioctl(cmd, arg);
|
||||
}
|
||||
|
||||
static const struct file_operations bch_chardev_fops = {
|
||||
.owner = THIS_MODULE,
|
||||
.unlocked_ioctl = bch2_chardev_ioctl,
|
||||
.open = nonseekable_open,
|
||||
};
|
||||
|
||||
static int bch_chardev_major;
|
||||
static struct class *bch_chardev_class;
|
||||
static struct device *bch_chardev;
|
||||
|
||||
void bch2_fs_chardev_exit(struct bch_fs *c)
|
||||
{
|
||||
if (!IS_ERR_OR_NULL(c->chardev))
|
||||
device_unregister(c->chardev);
|
||||
if (c->minor >= 0)
|
||||
idr_remove(&bch_chardev_minor, c->minor);
|
||||
}
|
||||
|
||||
int bch2_fs_chardev_init(struct bch_fs *c)
|
||||
{
|
||||
c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL);
|
||||
if (c->minor < 0)
|
||||
return c->minor;
|
||||
|
||||
c->chardev = device_create(bch_chardev_class, NULL,
|
||||
MKDEV(bch_chardev_major, c->minor), c,
|
||||
"bcachefs%u-ctl", c->minor);
|
||||
if (IS_ERR(c->chardev))
|
||||
return PTR_ERR(c->chardev);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void bch2_chardev_exit(void)
|
||||
{
|
||||
if (!IS_ERR_OR_NULL(bch_chardev_class))
|
||||
device_destroy(bch_chardev_class,
|
||||
MKDEV(bch_chardev_major, U8_MAX));
|
||||
if (!IS_ERR_OR_NULL(bch_chardev_class))
|
||||
class_destroy(bch_chardev_class);
|
||||
if (bch_chardev_major > 0)
|
||||
unregister_chrdev(bch_chardev_major, "bcachefs");
|
||||
}
|
||||
|
||||
int __init bch2_chardev_init(void)
|
||||
{
|
||||
bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops);
|
||||
if (bch_chardev_major < 0)
|
||||
return bch_chardev_major;
|
||||
|
||||
bch_chardev_class = class_create("bcachefs");
|
||||
if (IS_ERR(bch_chardev_class))
|
||||
return PTR_ERR(bch_chardev_class);
|
||||
|
||||
bch_chardev = device_create(bch_chardev_class, NULL,
|
||||
MKDEV(bch_chardev_major, U8_MAX),
|
||||
NULL, "bcachefs-ctl");
|
||||
if (IS_ERR(bch_chardev))
|
||||
return PTR_ERR(bch_chardev);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif /* NO_BCACHEFS_CHARDEV */
|
31
fs/bcachefs/chardev.h
Normal file
@ -0,0 +1,31 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_CHARDEV_H
|
||||
#define _BCACHEFS_CHARDEV_H
|
||||
|
||||
#ifndef NO_BCACHEFS_FS
|
||||
|
||||
long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *);
|
||||
|
||||
void bch2_fs_chardev_exit(struct bch_fs *);
|
||||
int bch2_fs_chardev_init(struct bch_fs *);
|
||||
|
||||
void bch2_chardev_exit(void);
|
||||
int __init bch2_chardev_init(void);
|
||||
|
||||
#else
|
||||
|
||||
static inline long bch2_fs_ioctl(struct bch_fs *c,
|
||||
unsigned cmd, void __user * arg)
|
||||
{
|
||||
return -ENOSYS;
|
||||
}
|
||||
|
||||
static inline void bch2_fs_chardev_exit(struct bch_fs *c) {}
|
||||
static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; }
|
||||
|
||||
static inline void bch2_chardev_exit(void) {}
|
||||
static inline int __init bch2_chardev_init(void) { return 0; }
|
||||
|
||||
#endif /* NO_BCACHEFS_FS */
|
||||
|
||||
#endif /* _BCACHEFS_CHARDEV_H */
|
753
fs/bcachefs/checksum.c
Normal file
@ -0,0 +1,753 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include "bcachefs.h"
|
||||
#include "checksum.h"
|
||||
#include "super.h"
|
||||
#include "super-io.h"
|
||||
|
||||
#include <linux/crc32c.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/key.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/scatterlist.h>
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/chacha.h>
|
||||
#include <crypto/hash.h>
|
||||
#include <crypto/poly1305.h>
|
||||
#include <crypto/skcipher.h>
|
||||
#include <keys/user-type.h>
|
||||
|
||||
/*
|
||||
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
|
||||
* use permitted, subject to terms of PostgreSQL license; see.)
|
||||
|
||||
* If we have a 64-bit integer type, then a 64-bit CRC looks just like the
|
||||
* usual sort of implementation. (See Ross Williams' excellent introduction
|
||||
* A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from
|
||||
* ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.)
|
||||
* If we have no working 64-bit type, then fake it with two 32-bit registers.
|
||||
*
|
||||
* The present implementation is a normal (not "reflected", in Williams'
|
||||
* terms) 64-bit CRC, using initial all-ones register contents and a final
|
||||
* bit inversion. The chosen polynomial is borrowed from the DLT1 spec
|
||||
* (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM):
|
||||
*
|
||||
* x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
|
||||
* x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
|
||||
* x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
|
||||
* x^7 + x^4 + x + 1
|
||||
*/
|
||||
|
||||
static const u64 crc_table[256] = {
|
||||
0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL,
|
||||
0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL,
|
||||
0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL,
|
||||
0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL,
|
||||
0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL,
|
||||
0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL,
|
||||
0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL,
|
||||
0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL,
|
||||
0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL,
|
||||
0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL,
|
||||
0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL,
|
||||
0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL,
|
||||
0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL,
|
||||
0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL,
|
||||
0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL,
|
||||
0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL,
|
||||
0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL,
|
||||
0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL,
|
||||
0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL,
|
||||
0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL,
|
||||
0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL,
|
||||
0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL,
|
||||
0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL,
|
||||
0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL,
|
||||
0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL,
|
||||
0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL,
|
||||
0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL,
|
||||
0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL,
|
||||
0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL,
|
||||
0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL,
|
||||
0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL,
|
||||
0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL,
|
||||
0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL,
|
||||
0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL,
|
||||
0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL,
|
||||
0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL,
|
||||
0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL,
|
||||
0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL,
|
||||
0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL,
|
||||
0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL,
|
||||
0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL,
|
||||
0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL,
|
||||
0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL,
|
||||
0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL,
|
||||
0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL,
|
||||
0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL,
|
||||
0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL,
|
||||
0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL,
|
||||
0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL,
|
||||
0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL,
|
||||
0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL,
|
||||
0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL,
|
||||
0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL,
|
||||
0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL,
|
||||
0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL,
|
||||
0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL,
|
||||
0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL,
|
||||
0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL,
|
||||
0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL,
|
||||
0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL,
|
||||
0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL,
|
||||
0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL,
|
||||
0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL,
|
||||
0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL,
|
||||
0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL,
|
||||
0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL,
|
||||
0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL,
|
||||
0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL,
|
||||
0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL,
|
||||
0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL,
|
||||
0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL,
|
||||
0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL,
|
||||
0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL,
|
||||
0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL,
|
||||
0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL,
|
||||
0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL,
|
||||
0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL,
|
||||
0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL,
|
||||
0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL,
|
||||
0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL,
|
||||
0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL,
|
||||
0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL,
|
||||
0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL,
|
||||
0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL,
|
||||
0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL,
|
||||
0x9AFCE626CE85B507ULL,
|
||||
};
|
||||
|
||||
u64 bch2_crc64_update(u64 crc, const void *_data, size_t len)
|
||||
{
|
||||
const unsigned char *data = _data;
|
||||
|
||||
while (len--) {
|
||||
int i = ((int) (crc >> 56) ^ *data++) & 0xFF;
|
||||
crc = crc_table[i] ^ (crc << 8);
|
||||
}
|
||||
|
||||
return crc;
|
||||
}
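bch2_crc64_update() is only the table-driven inner loop; the all-ones seed and final bit inversion described in the comment above live in bch2_checksum_init()/bch2_checksum_final() below. A minimal sketch of how the pieces compose for the BCH_CSUM_CRC64_NONZERO variant (illustrative helper, not part of the file):

static u64 crc64_nonzero_example(const void *data, size_t len)
{
	u64 crc = U64_MAX;			/* initial all-ones register */

	crc = bch2_crc64_update(crc, data, len);
	return crc ^ U64_MAX;			/* final bit inversion */
}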
|
||||
|
||||
static u64 bch2_checksum_init(unsigned type)
|
||||
{
|
||||
switch (type) {
|
||||
case BCH_CSUM_NONE:
|
||||
return 0;
|
||||
case BCH_CSUM_CRC32C_NONZERO:
|
||||
return U32_MAX;
|
||||
case BCH_CSUM_CRC64_NONZERO:
|
||||
return U64_MAX;
|
||||
case BCH_CSUM_CRC32C:
|
||||
return 0;
|
||||
case BCH_CSUM_CRC64:
|
||||
return 0;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
static u64 bch2_checksum_final(unsigned type, u64 crc)
|
||||
{
|
||||
switch (type) {
|
||||
case BCH_CSUM_NONE:
|
||||
return 0;
|
||||
case BCH_CSUM_CRC32C_NONZERO:
|
||||
return crc ^ U32_MAX;
|
||||
case BCH_CSUM_CRC64_NONZERO:
|
||||
return crc ^ U64_MAX;
|
||||
case BCH_CSUM_CRC32C:
|
||||
return crc;
|
||||
case BCH_CSUM_CRC64:
|
||||
return crc;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t len)
|
||||
{
|
||||
switch (type) {
|
||||
case BCH_CSUM_NONE:
|
||||
return 0;
|
||||
case BCH_CSUM_CRC32C_NONZERO:
|
||||
case BCH_CSUM_CRC32C:
|
||||
return crc32c(crc, data, len);
|
||||
case BCH_CSUM_CRC64_NONZERO:
|
||||
case BCH_CSUM_CRC64:
|
||||
return bch2_crc64_update(crc, data, len);
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm,
|
||||
struct nonce nonce,
|
||||
struct scatterlist *sg, size_t len)
|
||||
{
|
||||
SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
|
||||
int ret;
|
||||
|
||||
skcipher_request_set_sync_tfm(req, tfm);
|
||||
skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
|
||||
|
||||
ret = crypto_skcipher_encrypt(req);
|
||||
BUG_ON(ret);
|
||||
}
|
||||
|
||||
static inline void do_encrypt(struct crypto_sync_skcipher *tfm,
|
||||
struct nonce nonce,
|
||||
void *buf, size_t len)
|
||||
{
|
||||
struct scatterlist sg;
|
||||
|
||||
sg_init_one(&sg, buf, len);
|
||||
do_encrypt_sg(tfm, nonce, &sg, len);
|
||||
}
|
||||
|
||||
int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
|
||||
void *buf, size_t len)
|
||||
{
|
||||
struct crypto_sync_skcipher *chacha20 =
|
||||
crypto_alloc_sync_skcipher("chacha20", 0, 0);
|
||||
int ret;
|
||||
|
||||
if (IS_ERR(chacha20)) {
|
||||
pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20));
|
||||
return PTR_ERR(chacha20);
|
||||
}
|
||||
|
||||
ret = crypto_skcipher_setkey(&chacha20->base,
|
||||
(void *) key, sizeof(*key));
|
||||
if (ret) {
|
||||
pr_err("crypto_skcipher_setkey() error: %i", ret);
|
||||
goto err;
|
||||
}
|
||||
|
||||
do_encrypt(chacha20, nonce, buf, len);
|
||||
err:
|
||||
crypto_free_sync_skcipher(chacha20);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc,
|
||||
struct nonce nonce)
|
||||
{
|
||||
u8 key[POLY1305_KEY_SIZE];
|
||||
|
||||
nonce.d[3] ^= BCH_NONCE_POLY;
|
||||
|
||||
memset(key, 0, sizeof(key));
|
||||
do_encrypt(c->chacha20, nonce, key, sizeof(key));
|
||||
|
||||
desc->tfm = c->poly1305;
|
||||
crypto_shash_init(desc);
|
||||
crypto_shash_update(desc, key, sizeof(key));
|
||||
}
|
||||
|
||||
struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
|
||||
struct nonce nonce, const void *data, size_t len)
|
||||
{
|
||||
switch (type) {
|
||||
case BCH_CSUM_NONE:
|
||||
case BCH_CSUM_CRC32C_NONZERO:
|
||||
case BCH_CSUM_CRC64_NONZERO:
|
||||
case BCH_CSUM_CRC32C:
|
||||
case BCH_CSUM_CRC64: {
|
||||
u64 crc = bch2_checksum_init(type);
|
||||
|
||||
crc = bch2_checksum_update(type, crc, data, len);
|
||||
crc = bch2_checksum_final(type, crc);
|
||||
|
||||
return (struct bch_csum) { .lo = cpu_to_le64(crc) };
|
||||
}
|
||||
|
||||
case BCH_CSUM_CHACHA20_POLY1305_80:
|
||||
case BCH_CSUM_CHACHA20_POLY1305_128: {
|
||||
SHASH_DESC_ON_STACK(desc, c->poly1305);
|
||||
u8 digest[POLY1305_DIGEST_SIZE];
|
||||
struct bch_csum ret = { 0 };
|
||||
|
||||
gen_poly_key(c, desc, nonce);
|
||||
|
||||
crypto_shash_update(desc, data, len);
|
||||
crypto_shash_final(desc, digest);
|
||||
|
||||
memcpy(&ret, digest, bch_crc_bytes[type]);
|
||||
return ret;
|
||||
}
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
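A typical consumer computes a checksum with bch2_checksum() and compares it against the stored value with bch2_crc_cmp() from checksum.h (which, note, returns true when the checksums differ). A minimal sketch of that pattern, with hypothetical local variable names:

	struct bch_csum got = bch2_checksum(c, csum_type, nonce, data, len);

	if (bch2_crc_cmp(got, stored_csum))
		return -EIO;	/* checksum mismatch */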
|
||||
|
||||
void bch2_encrypt(struct bch_fs *c, unsigned type,
|
||||
struct nonce nonce, void *data, size_t len)
|
||||
{
|
||||
if (!bch2_csum_type_is_encryption(type))
|
||||
return;
|
||||
|
||||
do_encrypt(c->chacha20, nonce, data, len);
|
||||
}
|
||||
|
||||
static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
|
||||
struct nonce nonce, struct bio *bio,
|
||||
struct bvec_iter *iter)
|
||||
{
|
||||
struct bio_vec bv;
|
||||
|
||||
switch (type) {
|
||||
case BCH_CSUM_NONE:
|
||||
return (struct bch_csum) { 0 };
|
||||
case BCH_CSUM_CRC32C_NONZERO:
|
||||
case BCH_CSUM_CRC64_NONZERO:
|
||||
case BCH_CSUM_CRC32C:
|
||||
case BCH_CSUM_CRC64: {
|
||||
u64 crc = bch2_checksum_init(type);
|
||||
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
__bio_for_each_segment(bv, bio, *iter, *iter) {
|
||||
void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
|
||||
crc = bch2_checksum_update(type,
|
||||
crc, p, bv.bv_len);
|
||||
kunmap_atomic(p);
|
||||
}
|
||||
#else
|
||||
__bio_for_each_contig_segment(bv, bio, *iter, *iter)
|
||||
crc = bch2_checksum_update(type, crc,
|
||||
page_address(bv.bv_page) + bv.bv_offset,
|
||||
bv.bv_len);
|
||||
#endif
|
||||
crc = bch2_checksum_final(type, crc);
|
||||
return (struct bch_csum) { .lo = cpu_to_le64(crc) };
|
||||
}
|
||||
|
||||
case BCH_CSUM_CHACHA20_POLY1305_80:
|
||||
case BCH_CSUM_CHACHA20_POLY1305_128: {
|
||||
SHASH_DESC_ON_STACK(desc, c->poly1305);
|
||||
u8 digest[POLY1305_DIGEST_SIZE];
|
||||
struct bch_csum ret = { 0 };
|
||||
|
||||
gen_poly_key(c, desc, nonce);
|
||||
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
__bio_for_each_segment(bv, bio, *iter, *iter) {
|
||||
void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
|
||||
|
||||
crypto_shash_update(desc, p, bv.bv_len);
|
||||
kunmap_atomic(p);
|
||||
}
|
||||
#else
|
||||
__bio_for_each_contig_segment(bv, bio, *iter, *iter)
|
||||
crypto_shash_update(desc,
|
||||
page_address(bv.bv_page) + bv.bv_offset,
|
||||
bv.bv_len);
|
||||
#endif
|
||||
crypto_shash_final(desc, digest);
|
||||
|
||||
memcpy(&ret, digest, bch_crc_bytes[type]);
|
||||
return ret;
|
||||
}
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
|
||||
struct nonce nonce, struct bio *bio)
|
||||
{
|
||||
struct bvec_iter iter = bio->bi_iter;
|
||||
|
||||
return __bch2_checksum_bio(c, type, nonce, bio, &iter);
|
||||
}
|
||||
|
||||
void bch2_encrypt_bio(struct bch_fs *c, unsigned type,
|
||||
struct nonce nonce, struct bio *bio)
|
||||
{
|
||||
struct bio_vec bv;
|
||||
struct bvec_iter iter;
|
||||
struct scatterlist sgl[16], *sg = sgl;
|
||||
size_t bytes = 0;
|
||||
|
||||
if (!bch2_csum_type_is_encryption(type))
|
||||
return;
|
||||
|
||||
sg_init_table(sgl, ARRAY_SIZE(sgl));
|
||||
|
||||
bio_for_each_segment(bv, bio, iter) {
|
||||
if (sg == sgl + ARRAY_SIZE(sgl)) {
|
||||
sg_mark_end(sg - 1);
|
||||
do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
|
||||
|
||||
nonce = nonce_add(nonce, bytes);
|
||||
bytes = 0;
|
||||
|
||||
sg_init_table(sgl, ARRAY_SIZE(sgl));
|
||||
sg = sgl;
|
||||
}
|
||||
|
||||
sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset);
|
||||
bytes += bv.bv_len;
|
||||
}
|
||||
|
||||
sg_mark_end(sg - 1);
|
||||
do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
|
||||
}
|
||||
|
||||
static inline bool bch2_checksum_mergeable(unsigned type)
|
||||
{
|
||||
|
||||
switch (type) {
|
||||
case BCH_CSUM_NONE:
|
||||
case BCH_CSUM_CRC32C:
|
||||
case BCH_CSUM_CRC64:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static struct bch_csum bch2_checksum_merge(unsigned type,
|
||||
struct bch_csum a,
|
||||
struct bch_csum b, size_t b_len)
|
||||
{
|
||||
BUG_ON(!bch2_checksum_mergeable(type));
|
||||
|
||||
while (b_len) {
|
||||
unsigned b = min_t(unsigned, b_len, PAGE_SIZE);
|
||||
|
||||
a.lo = bch2_checksum_update(type, a.lo,
|
||||
page_address(ZERO_PAGE(0)), b);
|
||||
b_len -= b;
|
||||
}
|
||||
|
||||
a.lo ^= b.lo;
|
||||
a.hi ^= b.hi;
|
||||
return a;
|
||||
}
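bch2_checksum_merge() relies on the linearity of the plain (zero-seeded, no final inversion) CRC variants: the checksum of a concatenation can be derived by extending the first checksum over that many zero bytes and xoring in the second checksum, which is exactly what the loop above does a page at a time. A hedged sketch of the property it implements (hypothetical buffers a and b):

	struct bch_csum csum_a = bch2_checksum(c, BCH_CSUM_CRC64, nonce, a, a_len);
	struct bch_csum csum_b = bch2_checksum(c, BCH_CSUM_CRC64, nonce, b, b_len);
	/* equals bch2_checksum() over the concatenation of a and b: */
	struct bch_csum whole = bch2_checksum_merge(BCH_CSUM_CRC64, csum_a,
						    csum_b, b_len);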
|
||||
|
||||
int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
|
||||
struct bversion version,
|
||||
struct bch_extent_crc_unpacked crc_old,
|
||||
struct bch_extent_crc_unpacked *crc_a,
|
||||
struct bch_extent_crc_unpacked *crc_b,
|
||||
unsigned len_a, unsigned len_b,
|
||||
unsigned new_csum_type)
|
||||
{
|
||||
struct bvec_iter iter = bio->bi_iter;
|
||||
struct nonce nonce = extent_nonce(version, crc_old);
|
||||
struct bch_csum merged = { 0 };
|
||||
struct crc_split {
|
||||
struct bch_extent_crc_unpacked *crc;
|
||||
unsigned len;
|
||||
unsigned csum_type;
|
||||
struct bch_csum csum;
|
||||
} splits[3] = {
|
||||
{ crc_a, len_a, new_csum_type },
|
||||
{ crc_b, len_b, new_csum_type },
|
||||
{ NULL, bio_sectors(bio) - len_a - len_b, new_csum_type },
|
||||
}, *i;
|
||||
bool mergeable = crc_old.csum_type == new_csum_type &&
|
||||
bch2_checksum_mergeable(new_csum_type);
|
||||
unsigned crc_nonce = crc_old.nonce;
|
||||
|
||||
BUG_ON(len_a + len_b > bio_sectors(bio));
|
||||
BUG_ON(crc_old.uncompressed_size != bio_sectors(bio));
|
||||
BUG_ON(crc_old.compression_type);
|
||||
BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) !=
|
||||
bch2_csum_type_is_encryption(new_csum_type));
|
||||
|
||||
for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
|
||||
iter.bi_size = i->len << 9;
|
||||
if (mergeable || i->crc)
|
||||
i->csum = __bch2_checksum_bio(c, i->csum_type,
|
||||
nonce, bio, &iter);
|
||||
else
|
||||
bio_advance_iter(bio, &iter, i->len << 9);
|
||||
nonce = nonce_add(nonce, i->len << 9);
|
||||
}
|
||||
|
||||
if (mergeable)
|
||||
for (i = splits; i < splits + ARRAY_SIZE(splits); i++)
|
||||
merged = bch2_checksum_merge(new_csum_type, merged,
|
||||
i->csum, i->len << 9);
|
||||
else
|
||||
merged = bch2_checksum_bio(c, crc_old.csum_type,
|
||||
extent_nonce(version, crc_old), bio);
|
||||
|
||||
if (bch2_crc_cmp(merged, crc_old.csum))
|
||||
return -EIO;
|
||||
|
||||
for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
|
||||
if (i->crc)
|
||||
*i->crc = (struct bch_extent_crc_unpacked) {
|
||||
.csum_type = i->csum_type,
|
||||
.compressed_size = i->len,
|
||||
.uncompressed_size = i->len,
|
||||
.offset = 0,
|
||||
.live_size = i->len,
|
||||
.nonce = crc_nonce,
|
||||
.csum = i->csum,
|
||||
};
|
||||
|
||||
if (bch2_csum_type_is_encryption(new_csum_type))
|
||||
crc_nonce += i->len;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef __KERNEL__
|
||||
int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
|
||||
{
|
||||
char key_description[60];
|
||||
struct key *keyring_key;
|
||||
const struct user_key_payload *ukp;
|
||||
int ret;
|
||||
|
||||
snprintf(key_description, sizeof(key_description),
|
||||
"bcachefs:%pUb", &sb->user_uuid);
|
||||
|
||||
keyring_key = request_key(&key_type_logon, key_description, NULL);
|
||||
if (IS_ERR(keyring_key))
|
||||
return PTR_ERR(keyring_key);
|
||||
|
||||
down_read(&keyring_key->sem);
|
||||
ukp = dereference_key_locked(keyring_key);
|
||||
if (ukp->datalen == sizeof(*key)) {
|
||||
memcpy(key, ukp->data, ukp->datalen);
|
||||
ret = 0;
|
||||
} else {
|
||||
ret = -EINVAL;
|
||||
}
|
||||
up_read(&keyring_key->sem);
|
||||
key_put(keyring_key);
|
||||
|
||||
return ret;
|
||||
}
|
||||
#else
|
||||
#include <keyutils.h>
|
||||
#include <uuid/uuid.h>
|
||||
|
||||
int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
|
||||
{
|
||||
key_serial_t key_id;
|
||||
char key_description[60];
|
||||
char uuid[40];
|
||||
|
||||
uuid_unparse_lower(sb->user_uuid.b, uuid);
|
||||
sprintf(key_description, "bcachefs:%s", uuid);
|
||||
|
||||
key_id = request_key("user", key_description, NULL,
|
||||
KEY_SPEC_USER_KEYRING);
|
||||
if (key_id < 0)
|
||||
return -errno;
|
||||
|
||||
if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key))
|
||||
return -1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
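The userspace (#else) branch above expects the unlock key to already be present in the user keyring under a "bcachefs:<uuid>" description. A hedged sketch of the counterpart that loads it there with libkeyutils; the key-derivation step is out of scope and the helper name is made up:

#include <keyutils.h>
#include <stdio.h>

static key_serial_t example_load_key(const char *uuid_str,
				     const struct bch_key *key)
{
	char desc[60];

	snprintf(desc, sizeof(desc), "bcachefs:%s", uuid_str);
	return add_key("user", desc, key, sizeof(*key),
		       KEY_SPEC_USER_KEYRING);
}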
|
||||
|
||||
int bch2_decrypt_sb_key(struct bch_fs *c,
|
||||
struct bch_sb_field_crypt *crypt,
|
||||
struct bch_key *key)
|
||||
{
|
||||
struct bch_encrypted_key sb_key = crypt->key;
|
||||
struct bch_key user_key;
|
||||
int ret = 0;
|
||||
|
||||
/* is key encrypted? */
|
||||
if (!bch2_key_is_encrypted(&sb_key))
|
||||
goto out;
|
||||
|
||||
ret = bch2_request_key(c->disk_sb.sb, &user_key);
|
||||
if (ret) {
|
||||
bch_err(c, "error requesting encryption key: %i", ret);
|
||||
goto err;
|
||||
}
|
||||
|
||||
/* decrypt real key: */
|
||||
ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
|
||||
&sb_key, sizeof(sb_key));
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
if (bch2_key_is_encrypted(&sb_key)) {
|
||||
bch_err(c, "incorrect encryption key");
|
||||
ret = -EINVAL;
|
||||
goto err;
|
||||
}
|
||||
out:
|
||||
*key = sb_key.key;
|
||||
err:
|
||||
memzero_explicit(&sb_key, sizeof(sb_key));
|
||||
memzero_explicit(&user_key, sizeof(user_key));
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int bch2_alloc_ciphers(struct bch_fs *c)
|
||||
{
|
||||
if (!c->chacha20)
|
||||
c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
|
||||
if (IS_ERR(c->chacha20)) {
|
||||
bch_err(c, "error requesting chacha20 module: %li",
|
||||
PTR_ERR(c->chacha20));
|
||||
return PTR_ERR(c->chacha20);
|
||||
}
|
||||
|
||||
if (!c->poly1305)
|
||||
c->poly1305 = crypto_alloc_shash("poly1305", 0, 0);
|
||||
if (IS_ERR(c->poly1305)) {
|
||||
bch_err(c, "error requesting poly1305 module: %li",
|
||||
PTR_ERR(c->poly1305));
|
||||
return PTR_ERR(c->poly1305);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bch2_disable_encryption(struct bch_fs *c)
|
||||
{
|
||||
struct bch_sb_field_crypt *crypt;
|
||||
struct bch_key key;
|
||||
int ret = -EINVAL;
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
|
||||
crypt = bch2_sb_get_crypt(c->disk_sb.sb);
|
||||
if (!crypt)
|
||||
goto out;
|
||||
|
||||
/* is key encrypted? */
|
||||
ret = 0;
|
||||
if (bch2_key_is_encrypted(&crypt->key))
|
||||
goto out;
|
||||
|
||||
ret = bch2_decrypt_sb_key(c, crypt, &key);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
crypt->key.magic = BCH_KEY_MAGIC;
|
||||
crypt->key.key = key;
|
||||
|
||||
SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0);
|
||||
bch2_write_super(c);
|
||||
out:
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int bch2_enable_encryption(struct bch_fs *c, bool keyed)
|
||||
{
|
||||
struct bch_encrypted_key key;
|
||||
struct bch_key user_key;
|
||||
struct bch_sb_field_crypt *crypt;
|
||||
int ret = -EINVAL;
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
|
||||
/* Do we already have an encryption key? */
|
||||
if (bch2_sb_get_crypt(c->disk_sb.sb))
|
||||
goto err;
|
||||
|
||||
ret = bch2_alloc_ciphers(c);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
key.magic = BCH_KEY_MAGIC;
|
||||
get_random_bytes(&key.key, sizeof(key.key));
|
||||
|
||||
if (keyed) {
|
||||
ret = bch2_request_key(c->disk_sb.sb, &user_key);
|
||||
if (ret) {
|
||||
bch_err(c, "error requesting encryption key: %i", ret);
|
||||
goto err;
|
||||
}
|
||||
|
||||
ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
|
||||
&key, sizeof(key));
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
|
||||
ret = crypto_skcipher_setkey(&c->chacha20->base,
|
||||
(void *) &key.key, sizeof(key.key));
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64));
|
||||
if (!crypt) {
|
||||
ret = -ENOMEM; /* XXX this technically could be -ENOSPC */
|
||||
goto err;
|
||||
}
|
||||
|
||||
crypt->key = key;
|
||||
|
||||
/* write superblock */
|
||||
SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1);
|
||||
bch2_write_super(c);
|
||||
err:
|
||||
mutex_unlock(&c->sb_lock);
|
||||
memzero_explicit(&user_key, sizeof(user_key));
|
||||
memzero_explicit(&key, sizeof(key));
|
||||
return ret;
|
||||
}
|
||||
|
||||
void bch2_fs_encryption_exit(struct bch_fs *c)
|
||||
{
|
||||
if (!IS_ERR_OR_NULL(c->poly1305))
|
||||
crypto_free_shash(c->poly1305);
|
||||
if (!IS_ERR_OR_NULL(c->chacha20))
|
||||
crypto_free_sync_skcipher(c->chacha20);
|
||||
if (!IS_ERR_OR_NULL(c->sha256))
|
||||
crypto_free_shash(c->sha256);
|
||||
}
|
||||
|
||||
int bch2_fs_encryption_init(struct bch_fs *c)
|
||||
{
|
||||
struct bch_sb_field_crypt *crypt;
|
||||
struct bch_key key;
|
||||
int ret = 0;
|
||||
|
||||
pr_verbose_init(c->opts, "");
|
||||
|
||||
c->sha256 = crypto_alloc_shash("sha256", 0, 0);
|
||||
if (IS_ERR(c->sha256)) {
|
||||
bch_err(c, "error requesting sha256 module");
|
||||
ret = PTR_ERR(c->sha256);
|
||||
goto out;
|
||||
}
|
||||
|
||||
crypt = bch2_sb_get_crypt(c->disk_sb.sb);
|
||||
if (!crypt)
|
||||
goto out;
|
||||
|
||||
ret = bch2_alloc_ciphers(c);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
ret = bch2_decrypt_sb_key(c, crypt, &key);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
ret = crypto_skcipher_setkey(&c->chacha20->base,
|
||||
(void *) &key.key, sizeof(key.key));
|
||||
if (ret)
|
||||
goto out;
|
||||
out:
|
||||
memzero_explicit(&key, sizeof(key));
|
||||
pr_verbose_init(c->opts, "ret %i", ret);
|
||||
return ret;
|
||||
}
|
184
fs/bcachefs/checksum.h
Normal file
@ -0,0 +1,184 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_CHECKSUM_H
|
||||
#define _BCACHEFS_CHECKSUM_H
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "extents_types.h"
|
||||
#include "super-io.h"
|
||||
|
||||
#include <crypto/chacha.h>
|
||||
|
||||
u64 bch2_crc64_update(u64, const void *, size_t);
|
||||
|
||||
#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28)
|
||||
#define BCH_NONCE_BTREE cpu_to_le32(2 << 28)
|
||||
#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28)
|
||||
#define BCH_NONCE_PRIO cpu_to_le32(4 << 28)
|
||||
#define BCH_NONCE_POLY cpu_to_le32(1 << 31)
|
||||
|
||||
struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce,
|
||||
const void *, size_t);
|
||||
|
||||
/*
|
||||
* This is used for various on disk data structures - bch_sb, prio_set, bset,
|
||||
* jset: The checksum is _always_ the first field of these structs
|
||||
*/
|
||||
#define csum_vstruct(_c, _type, _nonce, _i) \
|
||||
({ \
|
||||
const void *start = ((const void *) (_i)) + sizeof((_i)->csum); \
|
||||
const void *end = vstruct_end(_i); \
|
||||
\
|
||||
bch2_checksum(_c, _type, _nonce, start, end - start); \
|
||||
})
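Because the checksum is always the first field of these structs, csum_vstruct() covers everything from just past that field up to vstruct_end(). A hedged example of the intended call shape, modelled on how the journal code uses it (the JSET_CSUM_TYPE() and journal_nonce() helpers are assumed here, not defined in this header):

	j->csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);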
|
||||
|
||||
int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
|
||||
int bch2_request_key(struct bch_sb *, struct bch_key *);
|
||||
|
||||
void bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
|
||||
void *data, size_t);
|
||||
|
||||
struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned,
|
||||
struct nonce, struct bio *);
|
||||
|
||||
int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion,
|
||||
struct bch_extent_crc_unpacked,
|
||||
struct bch_extent_crc_unpacked *,
|
||||
struct bch_extent_crc_unpacked *,
|
||||
unsigned, unsigned, unsigned);
|
||||
|
||||
void bch2_encrypt_bio(struct bch_fs *, unsigned,
|
||||
struct nonce, struct bio *);
|
||||
|
||||
int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
|
||||
struct bch_key *);
|
||||
|
||||
int bch2_disable_encryption(struct bch_fs *);
|
||||
int bch2_enable_encryption(struct bch_fs *, bool);
|
||||
|
||||
void bch2_fs_encryption_exit(struct bch_fs *);
|
||||
int bch2_fs_encryption_init(struct bch_fs *);
|
||||
|
||||
static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
|
||||
bool data)
|
||||
{
|
||||
switch (type) {
|
||||
case BCH_CSUM_OPT_NONE:
|
||||
return BCH_CSUM_NONE;
|
||||
case BCH_CSUM_OPT_CRC32C:
|
||||
return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO;
|
||||
case BCH_CSUM_OPT_CRC64:
|
||||
return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
|
||||
unsigned opt)
|
||||
{
|
||||
if (c->sb.encryption_type)
|
||||
return c->opts.wide_macs
|
||||
? BCH_CSUM_CHACHA20_POLY1305_128
|
||||
: BCH_CSUM_CHACHA20_POLY1305_80;
|
||||
|
||||
return bch2_csum_opt_to_type(opt, true);
|
||||
}
|
||||
|
||||
static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
|
||||
{
|
||||
if (c->sb.encryption_type)
|
||||
return BCH_CSUM_CHACHA20_POLY1305_128;
|
||||
|
||||
return bch2_csum_opt_to_type(c->opts.metadata_checksum, false);
|
||||
}
|
||||
|
||||
static const unsigned bch2_compression_opt_to_type[] = {
|
||||
#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_##t,
|
||||
BCH_COMPRESSION_TYPES()
|
||||
#undef x
|
||||
};
|
||||
|
||||
static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
|
||||
unsigned type)
|
||||
{
|
||||
if (type >= BCH_CSUM_NR)
|
||||
return false;
|
||||
|
||||
if (bch2_csum_type_is_encryption(type) && !c->chacha20)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/* returns true if not equal */
|
||||
static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
|
||||
{
|
||||
/*
|
||||
* XXX: need some way of preventing the compiler from optimizing this
|
||||
* into a form that isn't constant time..
|
||||
*/
|
||||
return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0;
|
||||
}
|
||||
|
||||
/* for skipping ahead and encrypting/decrypting at an offset: */
|
||||
static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
|
||||
{
|
||||
EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
|
||||
|
||||
le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE);
|
||||
return nonce;
|
||||
}
|
||||
|
||||
static inline struct nonce null_nonce(void)
|
||||
{
|
||||
struct nonce ret;
|
||||
|
||||
memset(&ret, 0, sizeof(ret));
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline struct nonce extent_nonce(struct bversion version,
|
||||
struct bch_extent_crc_unpacked crc)
|
||||
{
|
||||
unsigned size = crc.compression_type ? crc.uncompressed_size : 0;
|
||||
struct nonce nonce = (struct nonce) {{
|
||||
[0] = cpu_to_le32(size << 22),
|
||||
[1] = cpu_to_le32(version.lo),
|
||||
[2] = cpu_to_le32(version.lo >> 32),
|
||||
[3] = cpu_to_le32(version.hi|
|
||||
(crc.compression_type << 24))^BCH_NONCE_EXTENT,
|
||||
}};
|
||||
|
||||
return nonce_add(nonce, crc.nonce << 9);
|
||||
}
|
||||
|
||||
static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key)
|
||||
{
|
||||
return le64_to_cpu(key->magic) != BCH_KEY_MAGIC;
|
||||
}
|
||||
|
||||
static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb)
|
||||
{
|
||||
__le64 magic = __bch2_sb_magic(sb);
|
||||
|
||||
return (struct nonce) {{
|
||||
[0] = 0,
|
||||
[1] = 0,
|
||||
[2] = ((__le32 *) &magic)[0],
|
||||
[3] = ((__le32 *) &magic)[1],
|
||||
}};
|
||||
}
|
||||
|
||||
static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c)
|
||||
{
|
||||
__le64 magic = bch2_sb_magic(c);
|
||||
|
||||
return (struct nonce) {{
|
||||
[0] = 0,
|
||||
[1] = 0,
|
||||
[2] = ((__le32 *) &magic)[0],
|
||||
[3] = ((__le32 *) &magic)[1],
|
||||
}};
|
||||
}
|
||||
|
||||
#endif /* _BCACHEFS_CHECKSUM_H */
|
180
fs/bcachefs/clock.c
Normal file
@ -0,0 +1,180 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include "bcachefs.h"
|
||||
#include "clock.h"
|
||||
|
||||
#include <linux/freezer.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/preempt.h>
|
||||
|
||||
static inline long io_timer_cmp(io_timer_heap *h,
|
||||
struct io_timer *l,
|
||||
struct io_timer *r)
|
||||
{
|
||||
return l->expire - r->expire;
|
||||
}
|
||||
|
||||
void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
spin_lock(&clock->timer_lock);
|
||||
for (i = 0; i < clock->timers.used; i++)
|
||||
if (clock->timers.data[i] == timer)
|
||||
goto out;
|
||||
|
||||
BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp));
|
||||
out:
|
||||
spin_unlock(&clock->timer_lock);
|
||||
}
|
||||
|
||||
void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
spin_lock(&clock->timer_lock);
|
||||
|
||||
for (i = 0; i < clock->timers.used; i++)
|
||||
if (clock->timers.data[i] == timer) {
|
||||
heap_del(&clock->timers, i, io_timer_cmp);
|
||||
break;
|
||||
}
|
||||
|
||||
spin_unlock(&clock->timer_lock);
|
||||
}
|
||||
|
||||
struct io_clock_wait {
|
||||
struct io_timer io_timer;
|
||||
struct timer_list cpu_timer;
|
||||
struct task_struct *task;
|
||||
int expired;
|
||||
};
|
||||
|
||||
static void io_clock_wait_fn(struct io_timer *timer)
|
||||
{
|
||||
struct io_clock_wait *wait = container_of(timer,
|
||||
struct io_clock_wait, io_timer);
|
||||
|
||||
wait->expired = 1;
|
||||
wake_up_process(wait->task);
|
||||
}
|
||||
|
||||
static void io_clock_cpu_timeout(struct timer_list *timer)
|
||||
{
|
||||
struct io_clock_wait *wait = container_of(timer,
|
||||
struct io_clock_wait, cpu_timer);
|
||||
|
||||
wait->expired = 1;
|
||||
wake_up_process(wait->task);
|
||||
}
|
||||
|
||||
void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until)
|
||||
{
|
||||
struct io_clock_wait wait;
|
||||
|
||||
/* XXX: calculate sleep time rigorously */
|
||||
wait.io_timer.expire = until;
|
||||
wait.io_timer.fn = io_clock_wait_fn;
|
||||
wait.task = current;
|
||||
wait.expired = 0;
|
||||
bch2_io_timer_add(clock, &wait.io_timer);
|
||||
|
||||
schedule();
|
||||
|
||||
bch2_io_timer_del(clock, &wait.io_timer);
|
||||
}
|
||||
|
||||
void bch2_kthread_io_clock_wait(struct io_clock *clock,
|
||||
unsigned long io_until,
|
||||
unsigned long cpu_timeout)
|
||||
{
|
||||
bool kthread = (current->flags & PF_KTHREAD) != 0;
|
||||
struct io_clock_wait wait;
|
||||
|
||||
wait.io_timer.expire = io_until;
|
||||
wait.io_timer.fn = io_clock_wait_fn;
|
||||
wait.task = current;
|
||||
wait.expired = 0;
|
||||
bch2_io_timer_add(clock, &wait.io_timer);
|
||||
|
||||
timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0);
|
||||
|
||||
if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)
|
||||
mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
|
||||
|
||||
while (1) {
|
||||
set_current_state(TASK_INTERRUPTIBLE);
|
||||
if (kthread && kthread_should_stop())
|
||||
break;
|
||||
|
||||
if (wait.expired)
|
||||
break;
|
||||
|
||||
schedule();
|
||||
try_to_freeze();
|
||||
}
|
||||
|
||||
__set_current_state(TASK_RUNNING);
|
||||
del_timer_sync(&wait.cpu_timer);
|
||||
destroy_timer_on_stack(&wait.cpu_timer);
|
||||
bch2_io_timer_del(clock, &wait.io_timer);
|
||||
}
|
||||
|
||||
static struct io_timer *get_expired_timer(struct io_clock *clock,
|
||||
unsigned long now)
|
||||
{
|
||||
struct io_timer *ret = NULL;
|
||||
|
||||
spin_lock(&clock->timer_lock);
|
||||
|
||||
if (clock->timers.used &&
|
||||
time_after_eq(now, clock->timers.data[0]->expire))
|
||||
heap_pop(&clock->timers, ret, io_timer_cmp);
|
||||
|
||||
spin_unlock(&clock->timer_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void bch2_increment_clock(struct bch_fs *c, unsigned sectors, int rw)
|
||||
{
|
||||
struct io_clock *clock = &c->io_clock[rw];
|
||||
struct io_timer *timer;
|
||||
unsigned long now;
|
||||
|
||||
/* Buffer up one megabyte worth of IO in the percpu counter */
|
||||
preempt_disable();
|
||||
|
||||
if (likely(this_cpu_add_return(*clock->pcpu_buf, sectors) <
|
||||
IO_CLOCK_PCPU_SECTORS)) {
|
||||
preempt_enable();
|
||||
return;
|
||||
}
|
||||
|
||||
sectors = this_cpu_xchg(*clock->pcpu_buf, 0);
|
||||
preempt_enable();
|
||||
now = atomic_long_add_return(sectors, &clock->now);
|
||||
|
||||
while ((timer = get_expired_timer(clock, now)))
|
||||
timer->fn(timer);
|
||||
}
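Putting the pieces together: a background task arms an io_timer at now + N sectors, and bch2_increment_clock() fires it once that much IO has been accounted through the percpu buffer. A minimal hypothetical user of this API (the helper names are made up for illustration):

static void example_timer_fn(struct io_timer *timer)
{
	/* kick whatever background work was waiting on IO progress */
}

static void example_schedule_after_io(struct bch_fs *c, unsigned long sectors)
{
	static struct io_timer timer = { .fn = example_timer_fn };

	timer.expire = atomic_long_read(&c->io_clock[READ].now) + sectors;
	bch2_io_timer_add(&c->io_clock[READ], &timer);
}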
|
||||
|
||||
void bch2_io_clock_exit(struct io_clock *clock)
|
||||
{
|
||||
free_heap(&clock->timers);
|
||||
free_percpu(clock->pcpu_buf);
|
||||
}
|
||||
|
||||
int bch2_io_clock_init(struct io_clock *clock)
|
||||
{
|
||||
atomic_long_set(&clock->now, 0);
|
||||
spin_lock_init(&clock->timer_lock);
|
||||
|
||||
clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf);
|
||||
if (!clock->pcpu_buf)
|
||||
return -ENOMEM;
|
||||
|
||||
if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL))
|
||||
return -ENOMEM;
|
||||
|
||||
return 0;
|
||||
}
|
25
fs/bcachefs/clock.h
Normal file
@ -0,0 +1,25 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_CLOCK_H
|
||||
#define _BCACHEFS_CLOCK_H
|
||||
|
||||
void bch2_io_timer_add(struct io_clock *, struct io_timer *);
|
||||
void bch2_io_timer_del(struct io_clock *, struct io_timer *);
|
||||
void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long,
|
||||
unsigned long);
|
||||
void bch2_increment_clock(struct bch_fs *, unsigned, int);
|
||||
|
||||
void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long);
|
||||
|
||||
#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\
|
||||
({ \
|
||||
long __ret = timeout; \
|
||||
might_sleep(); \
|
||||
if (!___wait_cond_timeout(condition)) \
|
||||
__ret = __wait_event_timeout(wq, condition, timeout); \
|
||||
__ret; \
|
||||
})
|
||||
|
||||
void bch2_io_clock_exit(struct io_clock *);
|
||||
int bch2_io_clock_init(struct io_clock *);
|
||||
|
||||
#endif /* _BCACHEFS_CLOCK_H */
|
36
fs/bcachefs/clock_types.h
Normal file
@ -0,0 +1,36 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_CLOCK_TYPES_H
|
||||
#define _BCACHEFS_CLOCK_TYPES_H
|
||||
|
||||
#include "util.h"
|
||||
|
||||
#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3)
|
||||
|
||||
/*
|
||||
* Clocks/timers in units of sectors of IO:
|
||||
*
|
||||
* Note - they use percpu batching, so they're only approximate.
|
||||
*/
|
||||
|
||||
struct io_timer;
|
||||
typedef void (*io_timer_fn)(struct io_timer *);
|
||||
|
||||
struct io_timer {
|
||||
io_timer_fn fn;
|
||||
unsigned long expire;
|
||||
};
|
||||
|
||||
/* Amount to buffer up on a percpu counter */
|
||||
#define IO_CLOCK_PCPU_SECTORS 128
|
||||
|
||||
typedef HEAP(struct io_timer *) io_timer_heap;
|
||||
|
||||
struct io_clock {
|
||||
atomic_long_t now;
|
||||
u16 __percpu *pcpu_buf;
|
||||
|
||||
spinlock_t timer_lock;
|
||||
io_timer_heap timers;
|
||||
};
|
||||
|
||||
#endif /* _BCACHEFS_CLOCK_TYPES_H */
|
621
fs/bcachefs/compress.c
Normal file
@ -0,0 +1,621 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include "bcachefs.h"
|
||||
#include "checksum.h"
|
||||
#include "compress.h"
|
||||
#include "extents.h"
|
||||
#include "io.h"
|
||||
#include "super-io.h"
|
||||
|
||||
#include <linux/lz4.h>
|
||||
#include <linux/zlib.h>
|
||||
#include <linux/zstd.h>
|
||||
|
||||
/* Bounce buffer: */
|
||||
struct bbuf {
|
||||
void *b;
|
||||
enum {
|
||||
BB_NONE,
|
||||
BB_VMAP,
|
||||
BB_KMALLOC,
|
||||
BB_VMALLOC,
|
||||
BB_MEMPOOL,
|
||||
} type;
|
||||
int rw;
|
||||
};
|
||||
|
||||
static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw)
|
||||
{
|
||||
void *b;
|
||||
|
||||
BUG_ON(size > c->sb.encoded_extent_max << 9);
|
||||
|
||||
b = kmalloc(size, GFP_NOIO|__GFP_NOWARN);
|
||||
if (b)
|
||||
return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw };
|
||||
|
||||
b = mempool_alloc(&c->compression_bounce[rw], GFP_NOWAIT);
|
||||
b = b ? page_address(b) : NULL;
|
||||
if (b)
|
||||
return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw };
|
||||
|
||||
b = vmalloc(size);
|
||||
if (b)
|
||||
return (struct bbuf) { .b = b, .type = BB_VMALLOC, .rw = rw };
|
||||
|
||||
b = mempool_alloc(&c->compression_bounce[rw], GFP_NOIO);
|
||||
b = b ? page_address(b) : NULL;
|
||||
if (b)
|
||||
return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw };
|
||||
|
||||
BUG();
|
||||
}
|
||||
|
||||
static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
|
||||
struct bvec_iter start, int rw)
|
||||
{
|
||||
struct bbuf ret;
|
||||
struct bio_vec bv;
|
||||
struct bvec_iter iter;
|
||||
unsigned nr_pages = 0;
|
||||
struct page *stack_pages[16];
|
||||
struct page **pages = NULL;
|
||||
bool first = true;
|
||||
unsigned prev_end = PAGE_SIZE;
|
||||
void *data;
|
||||
|
||||
BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max);
|
||||
|
||||
#ifndef CONFIG_HIGHMEM
|
||||
__bio_for_each_contig_segment(bv, bio, iter, start) {
|
||||
if (bv.bv_len == start.bi_size)
|
||||
return (struct bbuf) {
|
||||
.b = page_address(bv.bv_page) + bv.bv_offset,
|
||||
.type = BB_NONE, .rw = rw
|
||||
};
|
||||
}
|
||||
#endif
|
||||
__bio_for_each_segment(bv, bio, iter, start) {
|
||||
if ((!first && bv.bv_offset) ||
|
||||
prev_end != PAGE_SIZE)
|
||||
goto bounce;
|
||||
|
||||
prev_end = bv.bv_offset + bv.bv_len;
|
||||
nr_pages++;
|
||||
}
|
||||
|
||||
BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages);
|
||||
|
||||
pages = nr_pages > ARRAY_SIZE(stack_pages)
|
||||
? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOIO)
|
||||
: stack_pages;
|
||||
if (!pages)
|
||||
goto bounce;
|
||||
|
||||
nr_pages = 0;
|
||||
__bio_for_each_segment(bv, bio, iter, start)
|
||||
pages[nr_pages++] = bv.bv_page;
|
||||
|
||||
data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
|
||||
if (pages != stack_pages)
|
||||
kfree(pages);
|
||||
|
||||
if (data)
|
||||
return (struct bbuf) {
|
||||
.b = data + bio_iter_offset(bio, start),
|
||||
.type = BB_VMAP, .rw = rw
|
||||
};
|
||||
bounce:
|
||||
ret = __bounce_alloc(c, start.bi_size, rw);
|
||||
|
||||
if (rw == READ)
|
||||
memcpy_from_bio(ret.b, bio, start);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw)
|
||||
{
|
||||
return __bio_map_or_bounce(c, bio, bio->bi_iter, rw);
|
||||
}
|
||||
|
||||
static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf)
|
||||
{
|
||||
switch (buf.type) {
|
||||
case BB_NONE:
|
||||
break;
|
||||
case BB_VMAP:
|
||||
vunmap((void *) ((unsigned long) buf.b & PAGE_MASK));
|
||||
break;
|
||||
case BB_KMALLOC:
|
||||
kfree(buf.b);
|
||||
break;
|
||||
case BB_VMALLOC:
|
||||
vfree(buf.b);
|
||||
break;
|
||||
case BB_MEMPOOL:
|
||||
mempool_free(virt_to_page(buf.b),
|
||||
&c->compression_bounce[buf.rw]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void zlib_set_workspace(z_stream *strm, void *workspace)
|
||||
{
|
||||
#ifdef __KERNEL__
|
||||
strm->workspace = workspace;
|
||||
#endif
|
||||
}
|
||||
|
||||
static int __bio_uncompress(struct bch_fs *c, struct bio *src,
|
||||
void *dst_data, struct bch_extent_crc_unpacked crc)
|
||||
{
|
||||
struct bbuf src_data = { NULL };
|
||||
size_t src_len = src->bi_iter.bi_size;
|
||||
size_t dst_len = crc.uncompressed_size << 9;
|
||||
void *workspace;
|
||||
int ret;
|
||||
|
||||
src_data = bio_map_or_bounce(c, src, READ);
|
||||
|
||||
switch (crc.compression_type) {
|
||||
case BCH_COMPRESSION_LZ4_OLD:
|
||||
case BCH_COMPRESSION_LZ4:
|
||||
ret = LZ4_decompress_safe_partial(src_data.b, dst_data,
|
||||
src_len, dst_len, dst_len);
|
||||
if (ret != dst_len)
|
||||
goto err;
|
||||
break;
|
||||
case BCH_COMPRESSION_GZIP: {
|
||||
z_stream strm = {
|
||||
.next_in = src_data.b,
|
||||
.avail_in = src_len,
|
||||
.next_out = dst_data,
|
||||
.avail_out = dst_len,
|
||||
};
|
||||
|
||||
workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
|
||||
|
||||
zlib_set_workspace(&strm, workspace);
|
||||
zlib_inflateInit2(&strm, -MAX_WBITS);
|
||||
ret = zlib_inflate(&strm, Z_FINISH);
|
||||
|
||||
mempool_free(workspace, &c->decompress_workspace);
|
||||
|
||||
if (ret != Z_STREAM_END)
|
||||
goto err;
|
||||
break;
|
||||
}
|
||||
case BCH_COMPRESSION_ZSTD: {
|
||||
ZSTD_DCtx *ctx;
|
||||
size_t len;
|
||||
|
||||
workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
|
||||
ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound());
|
||||
|
||||
		src_len = le32_to_cpup(src_data.b);

		len = zstd_decompress_dctx(ctx,
				dst_data, dst_len,
				src_data.b + 4, src_len);

		mempool_free(workspace, &c->decompress_workspace);

		if (len != dst_len)
			goto err;
		break;
	}
	default:
		BUG();
	}
	ret = 0;
out:
	bio_unmap_or_unbounce(c, src_data);
	return ret;
err:
	ret = -EIO;
	goto out;
}

int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
				struct bch_extent_crc_unpacked *crc)
{
	struct bbuf data = { NULL };
	size_t dst_len = crc->uncompressed_size << 9;

	/* bio must own its pages: */
	BUG_ON(!bio->bi_vcnt);
	BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs);

	if (crc->uncompressed_size > c->sb.encoded_extent_max ||
	    crc->compressed_size > c->sb.encoded_extent_max) {
		bch_err(c, "error rewriting existing data: extent too big");
		return -EIO;
	}

	data = __bounce_alloc(c, dst_len, WRITE);

	if (__bio_uncompress(c, bio, data.b, *crc)) {
		bch_err(c, "error rewriting existing data: decompression error");
		bio_unmap_or_unbounce(c, data);
		return -EIO;
	}

	/*
	 * might have to free existing pages and retry allocation from mempool -
	 * do this _after_ decompressing:
	 */
	bch2_bio_alloc_more_pages_pool(c, bio, crc->live_size << 9);

	memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9));

	crc->csum_type = 0;
	crc->compression_type = 0;
	crc->compressed_size = crc->live_size;
	crc->uncompressed_size = crc->live_size;
	crc->offset = 0;
	crc->csum = (struct bch_csum) { 0, 0 };

	bio_unmap_or_unbounce(c, data);
	return 0;
}

int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
			struct bio *dst, struct bvec_iter dst_iter,
			struct bch_extent_crc_unpacked crc)
{
	struct bbuf dst_data = { NULL };
	size_t dst_len = crc.uncompressed_size << 9;
	int ret = -ENOMEM;

	if (crc.uncompressed_size > c->sb.encoded_extent_max ||
	    crc.compressed_size > c->sb.encoded_extent_max)
		return -EIO;

	dst_data = dst_len == dst_iter.bi_size
		? __bio_map_or_bounce(c, dst, dst_iter, WRITE)
		: __bounce_alloc(c, dst_len, WRITE);

	ret = __bio_uncompress(c, src, dst_data.b, crc);
	if (ret)
		goto err;

	if (dst_data.type != BB_NONE)
		memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9));
err:
	bio_unmap_or_unbounce(c, dst_data);
	return ret;
}

static int attempt_compress(struct bch_fs *c,
			    void *workspace,
			    void *dst, size_t dst_len,
			    void *src, size_t src_len,
			    unsigned compression_type)
{
	switch (compression_type) {
	case BCH_COMPRESSION_LZ4: {
		int len = src_len;
		int ret = LZ4_compress_destSize(
				src, dst,
				&len, dst_len,
				workspace);

		if (len < src_len)
			return -len;

		return ret;
	}
	case BCH_COMPRESSION_GZIP: {
		z_stream strm = {
			.next_in = src,
			.avail_in = src_len,
			.next_out = dst,
			.avail_out = dst_len,
		};

		zlib_set_workspace(&strm, workspace);
		zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
				  Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
				  Z_DEFAULT_STRATEGY);

		if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END)
			return 0;

		if (zlib_deflateEnd(&strm) != Z_OK)
			return 0;

		return strm.total_out;
	}
	case BCH_COMPRESSION_ZSTD: {
		ZSTD_CCtx *ctx = zstd_init_cctx(workspace,
			zstd_cctx_workspace_bound(&c->zstd_params.cParams));

		size_t len = zstd_compress_cctx(ctx,
				dst + 4, dst_len - 4,
				src, src_len,
				&c->zstd_params);
		if (zstd_is_error(len))
			return 0;

		*((__le32 *) dst) = cpu_to_le32(len);
		return len + 4;
	}
	default:
		BUG();
	}
}

static unsigned __bio_compress(struct bch_fs *c,
			       struct bio *dst, size_t *dst_len,
			       struct bio *src, size_t *src_len,
			       unsigned compression_type)
{
	struct bbuf src_data = { NULL }, dst_data = { NULL };
	void *workspace;
	unsigned pad;
	int ret = 0;

	BUG_ON(compression_type >= BCH_COMPRESSION_NR);
	BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type]));

	/* If it's only one block, don't bother trying to compress: */
	if (bio_sectors(src) <= c->opts.block_size)
		return 0;

	dst_data = bio_map_or_bounce(c, dst, WRITE);
	src_data = bio_map_or_bounce(c, src, READ);

	workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOIO);

	*src_len = src->bi_iter.bi_size;
	*dst_len = dst->bi_iter.bi_size;

	/*
	 * XXX: this algorithm sucks when the compression code doesn't tell us
	 * how much would fit, like LZ4 does:
	 */
	while (1) {
		if (*src_len <= block_bytes(c)) {
			ret = -1;
			break;
		}

		ret = attempt_compress(c, workspace,
				       dst_data.b, *dst_len,
				       src_data.b, *src_len,
				       compression_type);
		if (ret > 0) {
			*dst_len = ret;
			ret = 0;
			break;
		}

		/* Didn't fit: should we retry with a smaller amount? */
		if (*src_len <= *dst_len) {
			ret = -1;
			break;
		}

		/*
		 * If ret is negative, it's a hint as to how much data would fit
		 */
		BUG_ON(-ret >= *src_len);

		if (ret < 0)
			*src_len = -ret;
		else
			*src_len -= (*src_len - *dst_len) / 2;
		*src_len = round_down(*src_len, block_bytes(c));
	}

	mempool_free(workspace, &c->compress_workspace[compression_type]);

	if (ret)
		goto err;

	/* Didn't get smaller: */
	if (round_up(*dst_len, block_bytes(c)) >= *src_len)
		goto err;

	pad = round_up(*dst_len, block_bytes(c)) - *dst_len;

	memset(dst_data.b + *dst_len, 0, pad);
	*dst_len += pad;

	if (dst_data.type != BB_NONE)
		memcpy_to_bio(dst, dst->bi_iter, dst_data.b);

	BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size);
	BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size);
	BUG_ON(*dst_len & (block_bytes(c) - 1));
	BUG_ON(*src_len & (block_bytes(c) - 1));
out:
	bio_unmap_or_unbounce(c, src_data);
	bio_unmap_or_unbounce(c, dst_data);
	return compression_type;
err:
	compression_type = 0;
	goto out;
}

unsigned bch2_bio_compress(struct bch_fs *c,
			   struct bio *dst, size_t *dst_len,
			   struct bio *src, size_t *src_len,
			   unsigned compression_type)
{
	unsigned orig_dst = dst->bi_iter.bi_size;
	unsigned orig_src = src->bi_iter.bi_size;

	/* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */
	src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size,
				     c->sb.encoded_extent_max << 9);
	/* Don't generate a bigger output than input: */
	dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);

	if (compression_type == BCH_COMPRESSION_LZ4_OLD)
		compression_type = BCH_COMPRESSION_LZ4;

	compression_type =
		__bio_compress(c, dst, dst_len, src, src_len, compression_type);

	dst->bi_iter.bi_size = orig_dst;
	src->bi_iter.bi_size = orig_src;
	return compression_type;
}

static int __bch2_fs_compress_init(struct bch_fs *, u64);

#define BCH_FEATURE_NONE 0

static const unsigned bch2_compression_opt_to_feature[] = {
#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t,
	BCH_COMPRESSION_TYPES()
#undef x
};

#undef BCH_FEATURE_NONE

static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
{
	int ret = 0;

	if ((c->sb.features & f) == f)
		return 0;

	mutex_lock(&c->sb_lock);

	if ((c->sb.features & f) == f) {
		mutex_unlock(&c->sb_lock);
		return 0;
	}

	ret = __bch2_fs_compress_init(c, c->sb.features|f);
	if (ret) {
		mutex_unlock(&c->sb_lock);
		return ret;
	}

	c->disk_sb.sb->features[0] |= cpu_to_le64(f);
	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	return 0;
}

int bch2_check_set_has_compressed_data(struct bch_fs *c,
				       unsigned compression_type)
{
	BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature));

	return compression_type
		? __bch2_check_set_has_compressed_data(c,
				1ULL << bch2_compression_opt_to_feature[compression_type])
		: 0;
}

void bch2_fs_compress_exit(struct bch_fs *c)
{
	unsigned i;

	mempool_exit(&c->decompress_workspace);
	for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++)
		mempool_exit(&c->compress_workspace[i]);
	mempool_exit(&c->compression_bounce[WRITE]);
	mempool_exit(&c->compression_bounce[READ]);
}

static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
{
	size_t max_extent = c->sb.encoded_extent_max << 9;
	size_t order = get_order(max_extent);
	size_t decompress_workspace_size = 0;
	bool decompress_workspace_needed;
	ZSTD_parameters params = zstd_get_params(0, max_extent);
	struct {
		unsigned feature;
		unsigned type;
		size_t compress_workspace;
		size_t decompress_workspace;
	} compression_types[] = {
		{ BCH_FEATURE_LZ4, BCH_COMPRESSION_LZ4, LZ4_MEM_COMPRESS, 0 },
		{ BCH_FEATURE_GZIP, BCH_COMPRESSION_GZIP,
			zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
			zlib_inflate_workspacesize(), },
		{ BCH_FEATURE_ZSTD, BCH_COMPRESSION_ZSTD,
			zstd_cctx_workspace_bound(&params.cParams),
			zstd_dctx_workspace_bound() },
	}, *i;
	int ret = 0;

	pr_verbose_init(c->opts, "");

	c->zstd_params = params;

	for (i = compression_types;
	     i < compression_types + ARRAY_SIZE(compression_types);
	     i++)
		if (features & (1 << i->feature))
			goto have_compressed;

	goto out;
have_compressed:

	if (!mempool_initialized(&c->compression_bounce[READ])) {
		ret = mempool_init_page_pool(&c->compression_bounce[READ],
					     1, order);
		if (ret)
			goto out;
	}

	if (!mempool_initialized(&c->compression_bounce[WRITE])) {
		ret = mempool_init_page_pool(&c->compression_bounce[WRITE],
					     1, order);
		if (ret)
			goto out;
	}

	for (i = compression_types;
	     i < compression_types + ARRAY_SIZE(compression_types);
	     i++) {
		decompress_workspace_size =
			max(decompress_workspace_size, i->decompress_workspace);

		if (!(features & (1 << i->feature)))
			continue;

		if (i->decompress_workspace)
			decompress_workspace_needed = true;

		if (mempool_initialized(&c->compress_workspace[i->type]))
			continue;

		ret = mempool_init_kvpmalloc_pool(
			&c->compress_workspace[i->type],
			1, i->compress_workspace);
		if (ret)
			goto out;
	}

	ret = mempool_init_kmalloc_pool(
		&c->decompress_workspace,
		1, decompress_workspace_size);
	if (ret)
		goto out;
out:
	pr_verbose_init(c->opts, "ret %i", ret);
	return ret;
}

int bch2_fs_compress_init(struct bch_fs *c)
{
	u64 f = c->sb.features;

	if (c->opts.compression)
		f |= 1ULL << bch2_compression_opt_to_feature[c->opts.compression];

	if (c->opts.background_compression)
		f |= 1ULL << bch2_compression_opt_to_feature[c->opts.background_compression];

	return __bch2_fs_compress_init(c, f);

}

18
fs/bcachefs/compress.h
Normal file
@@ -0,0 +1,18 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_COMPRESS_H
#define _BCACHEFS_COMPRESS_H

#include "extents_types.h"

int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
				struct bch_extent_crc_unpacked *);
int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
			struct bvec_iter, struct bch_extent_crc_unpacked);
unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
			   struct bio *, size_t *, unsigned);

int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
void bch2_fs_compress_exit(struct bch_fs *);
int bch2_fs_compress_init(struct bch_fs *);

#endif /* _BCACHEFS_COMPRESS_H */

425
fs/bcachefs/debug.c
Normal file
@@ -0,0 +1,425 @@
// SPDX-License-Identifier: GPL-2.0
/*
 * Assorted bcachefs debug code
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "buckets.h"
#include "debug.h"
#include "error.h"
#include "extents.h"
#include "fsck.h"
#include "inode.h"
#include "io.h"
#include "super.h"

#include <linux/console.h>
#include <linux/debugfs.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/seq_file.h>

static struct dentry *bch_debug;

#ifdef CONFIG_BCACHEFS_DEBUG

void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
{
	struct btree *v = c->verify_data;
	struct btree_node *n_ondisk, *n_sorted, *n_inmemory;
	struct bset *sorted, *inmemory;
	struct extent_pick_ptr pick;
	struct bch_dev *ca;
	struct bio *bio;

	if (c->opts.nochanges)
		return;

	btree_node_io_lock(b);
	mutex_lock(&c->verify_lock);

	n_ondisk = c->verify_ondisk;
	n_sorted = c->verify_data->data;
	n_inmemory = b->data;

	bkey_copy(&v->key, &b->key);
	v->written = 0;
	v->level = b->level;
	v->btree_id = b->btree_id;
	bch2_btree_keys_init(v, &c->expensive_debug_checks);

	if (bch2_btree_pick_ptr(c, b, NULL, &pick) <= 0)
		return;

	ca = bch_dev_bkey_exists(c, pick.ptr.dev);
	if (!bch2_dev_get_ioref(ca, READ))
		return;

	bio = bio_alloc_bioset(ca->disk_sb.bdev,
			       buf_pages(n_sorted, btree_bytes(c)),
			       REQ_OP_READ|REQ_META,
			       GFP_NOIO,
			       &c->btree_bio);
	bio->bi_iter.bi_sector = pick.ptr.offset;
	bio->bi_iter.bi_size = btree_bytes(c);
	bch2_bio_map(bio, n_sorted);

	submit_bio_wait(bio);

	bio_put(bio);
	percpu_ref_put(&ca->io_ref);

	memcpy(n_ondisk, n_sorted, btree_bytes(c));

	if (bch2_btree_node_read_done(c, v, false))
		goto out;

	n_sorted = c->verify_data->data;
	sorted = &n_sorted->keys;
	inmemory = &n_inmemory->keys;

	if (inmemory->u64s != sorted->u64s ||
	    memcmp(inmemory->start,
		   sorted->start,
		   vstruct_end(inmemory) - (void *) inmemory->start)) {
		unsigned offset = 0, sectors;
		struct bset *i;
		unsigned j;

		console_lock();

		printk(KERN_ERR "*** in memory:\n");
		bch2_dump_bset(b, inmemory, 0);

		printk(KERN_ERR "*** read back in:\n");
		bch2_dump_bset(v, sorted, 0);

		while (offset < b->written) {
			if (!offset) {
				i = &n_ondisk->keys;
				sectors = vstruct_blocks(n_ondisk, c->block_bits) <<
					c->block_bits;
			} else {
				struct btree_node_entry *bne =
					(void *) n_ondisk + (offset << 9);
				i = &bne->keys;

				sectors = vstruct_blocks(bne, c->block_bits) <<
					c->block_bits;
			}

			printk(KERN_ERR "*** on disk block %u:\n", offset);
			bch2_dump_bset(b, i, offset);

			offset += sectors;
		}

		printk(KERN_ERR "*** block %u/%u not written\n",
		       offset >> c->block_bits, btree_blocks(c));

		for (j = 0; j < le16_to_cpu(inmemory->u64s); j++)
			if (inmemory->_data[j] != sorted->_data[j])
				break;

		printk(KERN_ERR "b->written %u\n", b->written);

		console_unlock();
		panic("verify failed at %u\n", j);
	}
out:
	mutex_unlock(&c->verify_lock);
	btree_node_io_unlock(b);
}

#endif

#ifdef CONFIG_DEBUG_FS

/* XXX: bch_fs refcounting */

struct dump_iter {
	struct bpos from;
	struct bch_fs *c;
	enum btree_id id;

	char buf[PAGE_SIZE];
	size_t bytes; /* what's currently in buf */

	char __user *ubuf; /* destination user buffer */
	size_t size; /* size of requested read */
	ssize_t ret; /* bytes read so far */
};

static int flush_buf(struct dump_iter *i)
{
	if (i->bytes) {
		size_t bytes = min(i->bytes, i->size);
		int err = copy_to_user(i->ubuf, i->buf, bytes);

		if (err)
			return err;

		i->ret += bytes;
		i->ubuf += bytes;
		i->size -= bytes;
		i->bytes -= bytes;
		memmove(i->buf, i->buf + bytes, i->bytes);
	}

	return 0;
}

static int bch2_dump_open(struct inode *inode, struct file *file)
{
	struct btree_debug *bd = inode->i_private;
	struct dump_iter *i;

	i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
	if (!i)
		return -ENOMEM;

	file->private_data = i;
	i->from = POS_MIN;
	i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]);
	i->id = bd->id;

	return 0;
}

static int bch2_dump_release(struct inode *inode, struct file *file)
{
	kfree(file->private_data);
	return 0;
}

static ssize_t bch2_read_btree(struct file *file, char __user *buf,
			       size_t size, loff_t *ppos)
{
	struct dump_iter *i = file->private_data;
	struct btree_iter iter;
	struct bkey_s_c k;
	int err;

	i->ubuf = buf;
	i->size = size;
	i->ret = 0;

	err = flush_buf(i);
	if (err)
		return err;

	if (!i->size)
		return i->ret;

	bch2_btree_iter_init(&iter, i->c, i->id, i->from, BTREE_ITER_PREFETCH);
	k = bch2_btree_iter_peek(&iter);

	while (k.k && !(err = btree_iter_err(k))) {
		bch2_bkey_val_to_text(i->c, bkey_type(0, i->id),
				      i->buf, sizeof(i->buf), k);
		i->bytes = strlen(i->buf);
		BUG_ON(i->bytes >= PAGE_SIZE);
		i->buf[i->bytes] = '\n';
		i->bytes++;

		k = bch2_btree_iter_next(&iter);
		i->from = iter.pos;

		err = flush_buf(i);
		if (err)
			break;

		if (!i->size)
			break;
	}
	bch2_btree_iter_unlock(&iter);

	return err < 0 ? err : i->ret;
}

static const struct file_operations btree_debug_ops = {
	.owner = THIS_MODULE,
	.open = bch2_dump_open,
	.release = bch2_dump_release,
	.read = bch2_read_btree,
};

static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
				       size_t size, loff_t *ppos)
{
	struct dump_iter *i = file->private_data;
	struct btree_iter iter;
	struct btree *b;
	int err;

	i->ubuf = buf;
	i->size = size;
	i->ret = 0;

	err = flush_buf(i);
	if (err)
		return err;

	if (!i->size || !bkey_cmp(POS_MAX, i->from))
		return i->ret;

	for_each_btree_node(&iter, i->c, i->id, i->from, 0, b) {
		i->bytes = bch2_print_btree_node(i->c, b, i->buf,
						 sizeof(i->buf));
		err = flush_buf(i);
		if (err)
			break;

		/*
		 * can't easily correctly restart a btree node traversal across
		 * all nodes, meh
		 */
		i->from = bkey_cmp(POS_MAX, b->key.k.p)
			? bkey_successor(b->key.k.p)
			: b->key.k.p;

		if (!i->size)
			break;
	}
	bch2_btree_iter_unlock(&iter);

	return err < 0 ? err : i->ret;
}

static const struct file_operations btree_format_debug_ops = {
	.owner = THIS_MODULE,
	.open = bch2_dump_open,
	.release = bch2_dump_release,
	.read = bch2_read_btree_formats,
};

static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
				       size_t size, loff_t *ppos)
{
	struct dump_iter *i = file->private_data;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct btree *prev_node = NULL;
	int err;

	i->ubuf = buf;
	i->size = size;
	i->ret = 0;

	err = flush_buf(i);
	if (err)
		return err;

	if (!i->size)
		return i->ret;

	bch2_btree_iter_init(&iter, i->c, i->id, i->from, BTREE_ITER_PREFETCH);

	while ((k = bch2_btree_iter_peek(&iter)).k &&
	       !(err = btree_iter_err(k))) {
		struct btree_iter_level *l = &iter.l[0];
		struct bkey_packed *_k =
			bch2_btree_node_iter_peek(&l->iter, l->b);

		if (l->b != prev_node) {
			i->bytes = bch2_print_btree_node(i->c, l->b, i->buf,
							 sizeof(i->buf));
			err = flush_buf(i);
			if (err)
				break;
		}
		prev_node = l->b;

		i->bytes = bch2_bkey_print_bfloat(l->b, _k, i->buf,
						  sizeof(i->buf));

		err = flush_buf(i);
		if (err)
			break;

		bch2_btree_iter_next(&iter);
		i->from = iter.pos;

		err = flush_buf(i);
		if (err)
			break;

		if (!i->size)
			break;
	}
	bch2_btree_iter_unlock(&iter);

	return err < 0 ? err : i->ret;
}

static const struct file_operations bfloat_failed_debug_ops = {
	.owner = THIS_MODULE,
	.open = bch2_dump_open,
	.release = bch2_dump_release,
	.read = bch2_read_bfloat_failed,
};

void bch2_fs_debug_exit(struct bch_fs *c)
{
	if (!IS_ERR_OR_NULL(c->debug))
		debugfs_remove_recursive(c->debug);
}

void bch2_fs_debug_init(struct bch_fs *c)
{
	struct btree_debug *bd;
	char name[100];

	if (IS_ERR_OR_NULL(bch_debug))
		return;

	snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
	c->debug = debugfs_create_dir(name, bch_debug);
	if (IS_ERR_OR_NULL(c->debug))
		return;

	for (bd = c->btree_debug;
	     bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
	     bd++) {
		bd->id = bd - c->btree_debug;
		bd->btree = debugfs_create_file(bch2_btree_ids[bd->id],
						0400, c->debug, bd,
						&btree_debug_ops);

		snprintf(name, sizeof(name), "%s-formats",
			 bch2_btree_ids[bd->id]);

		bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd,
						       &btree_format_debug_ops);

		snprintf(name, sizeof(name), "%s-bfloat-failed",
			 bch2_btree_ids[bd->id]);

		bd->failed = debugfs_create_file(name, 0400, c->debug, bd,
						 &bfloat_failed_debug_ops);
	}
}

#endif

void bch2_debug_exit(void)
{
	if (!IS_ERR_OR_NULL(bch_debug))
		debugfs_remove_recursive(bch_debug);
}

int __init bch2_debug_init(void)
{
	int ret = 0;

	bch_debug = debugfs_create_dir("bcachefs", NULL);
	return ret;
}

63
fs/bcachefs/debug.h
Normal file
@@ -0,0 +1,63 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_DEBUG_H
#define _BCACHEFS_DEBUG_H

#include "bcachefs.h"

struct bio;
struct btree;
struct bch_fs;

#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM

#define BCH_DEBUG_PARAM(name, description) \
	static inline bool name(struct bch_fs *c) \
	{ return bch2_##name || c->name; }
BCH_DEBUG_PARAMS_ALWAYS()
#undef BCH_DEBUG_PARAM

#ifdef CONFIG_BCACHEFS_DEBUG

#define BCH_DEBUG_PARAM(name, description) \
	static inline bool name(struct bch_fs *c) \
	{ return bch2_##name || c->name; }
BCH_DEBUG_PARAMS_DEBUG()
#undef BCH_DEBUG_PARAM

void __bch2_btree_verify(struct bch_fs *, struct btree *);

#define bypass_torture_test(d) ((d)->bypass_torture_test)

#else /* DEBUG */

#define BCH_DEBUG_PARAM(name, description) \
	static inline bool name(struct bch_fs *c) { return false; }
BCH_DEBUG_PARAMS_DEBUG()
#undef BCH_DEBUG_PARAM

static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {}

#define bypass_torture_test(d) 0

#endif

static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b)
{
	if (verify_btree_ondisk(c))
		__bch2_btree_verify(c, b);
}

#ifdef CONFIG_DEBUG_FS
void bch2_fs_debug_exit(struct bch_fs *);
void bch2_fs_debug_init(struct bch_fs *);
#else
static inline void bch2_fs_debug_exit(struct bch_fs *c) {}
static inline void bch2_fs_debug_init(struct bch_fs *c) {}
#endif

void bch2_debug_exit(void);
int bch2_debug_init(void);

#endif /* _BCACHEFS_DEBUG_H */

426
fs/bcachefs/dirent.c
Normal file
@@ -0,0 +1,426 @@
// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_update.h"
#include "extents.h"
#include "dirent.h"
#include "fs.h"
#include "keylist.h"
#include "str_hash.h"

#include <linux/dcache.h>

unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
{
	unsigned len = bkey_val_bytes(d.k) -
		offsetof(struct bch_dirent, d_name);

	while (len && !d.v->d_name[len - 1])
		--len;

	return len;
}

static unsigned dirent_val_u64s(unsigned len)
{
	return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len,
			    sizeof(u64));
}

static u64 bch2_dirent_hash(const struct bch_hash_info *info,
			    const struct qstr *name)
{
	struct bch_str_hash_ctx ctx;

	bch2_str_hash_init(&ctx, info);
	bch2_str_hash_update(&ctx, info, name->name, name->len);

	/* [0,2) reserved for dots */
	return max_t(u64, bch2_str_hash_end(&ctx, info), 2);
}

static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
{
	return bch2_dirent_hash(info, key);
}

static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
{
	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
	struct qstr name = QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));

	return bch2_dirent_hash(info, &name);
}

static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
{
	struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
	int len = bch2_dirent_name_bytes(l);
	const struct qstr *r = _r;

	return len - r->len ?: memcmp(l.v->d_name, r->name, len);
}

static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
{
	struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
	struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r);
	int l_len = bch2_dirent_name_bytes(l);
	int r_len = bch2_dirent_name_bytes(r);

	return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len);
}

const struct bch_hash_desc bch2_dirent_hash_desc = {
	.btree_id = BTREE_ID_DIRENTS,
	.key_type = BCH_DIRENT,
	.whiteout_type = BCH_DIRENT_WHITEOUT,
	.hash_key = dirent_hash_key,
	.hash_bkey = dirent_hash_bkey,
	.cmp_key = dirent_cmp_key,
	.cmp_bkey = dirent_cmp_bkey,
};

const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_s_c_dirent d;
	unsigned len;

	switch (k.k->type) {
	case BCH_DIRENT:
		if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent))
			return "value too small";

		d = bkey_s_c_to_dirent(k);
		len = bch2_dirent_name_bytes(d);

		if (!len)
			return "empty name";

		/*
		 * older versions of bcachefs were buggy and creating dirent
		 * keys that were bigger than necessary:
		 */
		if (bkey_val_u64s(k.k) > dirent_val_u64s(len + 7))
			return "value too big";

		if (len > BCH_NAME_MAX)
			return "dirent name too big";

		if (memchr(d.v->d_name, '/', len))
			return "dirent name has invalid characters";

		return NULL;
	case BCH_DIRENT_WHITEOUT:
		return bkey_val_bytes(k.k) != 0
			? "value size should be zero"
			: NULL;

	default:
		return "invalid type";
	}
}

void bch2_dirent_to_text(struct bch_fs *c, char *buf,
			 size_t size, struct bkey_s_c k)
{
	struct bkey_s_c_dirent d;
	size_t n = 0;

	switch (k.k->type) {
	case BCH_DIRENT:
		d = bkey_s_c_to_dirent(k);

		n += bch_scnmemcpy(buf + n, size - n, d.v->d_name,
				   bch2_dirent_name_bytes(d));
		n += scnprintf(buf + n, size - n, " -> %llu", d.v->d_inum);
		break;
	case BCH_DIRENT_WHITEOUT:
		scnprintf(buf, size, "whiteout");
		break;
	}
}

static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
				u8 type, const struct qstr *name, u64 dst)
{
	struct bkey_i_dirent *dirent;
	unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len);

	if (name->len > BCH_NAME_MAX)
		return ERR_PTR(-ENAMETOOLONG);

	BUG_ON(u64s > U8_MAX);

	dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
	if (IS_ERR(dirent))
		return dirent;

	bkey_dirent_init(&dirent->k_i);
	dirent->k.u64s = u64s;
	dirent->v.d_inum = cpu_to_le64(dst);
	dirent->v.d_type = type;

	memcpy(dirent->v.d_name, name->name, name->len);
	memset(dirent->v.d_name + name->len, 0,
	       bkey_val_bytes(&dirent->k) -
	       offsetof(struct bch_dirent, d_name) -
	       name->len);

	EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len);

	return dirent;
}

int __bch2_dirent_create(struct btree_trans *trans,
			 u64 dir_inum, const struct bch_hash_info *hash_info,
			 u8 type, const struct qstr *name, u64 dst_inum,
			 int flags)
{
	struct bkey_i_dirent *dirent;
	int ret;

	dirent = dirent_create_key(trans, type, name, dst_inum);
	ret = PTR_ERR_OR_ZERO(dirent);
	if (ret)
		return ret;

	return __bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
			       dir_inum, &dirent->k_i, flags);
}

int bch2_dirent_create(struct bch_fs *c, u64 dir_inum,
		       const struct bch_hash_info *hash_info,
		       u8 type, const struct qstr *name, u64 dst_inum,
		       u64 *journal_seq, int flags)
{
	return bch2_trans_do(c, journal_seq, flags,
		__bch2_dirent_create(&trans, dir_inum, hash_info,
				     type, name, dst_inum, flags));
}

static void dirent_copy_target(struct bkey_i_dirent *dst,
			       struct bkey_s_c_dirent src)
{
	dst->v.d_inum = src.v->d_inum;
	dst->v.d_type = src.v->d_type;
}

static struct bpos bch2_dirent_pos(struct bch_inode_info *inode,
				   const struct qstr *name)
{
	return POS(inode->v.i_ino, bch2_dirent_hash(&inode->ei_str_hash, name));
}

int bch2_dirent_rename(struct btree_trans *trans,
		struct bch_inode_info *src_dir, const struct qstr *src_name,
		struct bch_inode_info *dst_dir, const struct qstr *dst_name,
		enum bch_rename_mode mode)
{
	struct btree_iter *src_iter, *dst_iter;
	struct bkey_s_c old_src, old_dst;
	struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
	struct bpos dst_pos = bch2_dirent_pos(dst_dir, dst_name);
	int ret;

	/*
	 * Lookup dst:
	 *
	 * Note that in BCH_RENAME mode, we're _not_ checking if
	 * the target already exists - we're relying on the VFS
	 * to do that check for us for correctness:
	 */
	dst_iter = mode == BCH_RENAME
		? bch2_hash_hole(trans, bch2_dirent_hash_desc,
				 &dst_dir->ei_str_hash,
				 dst_dir->v.i_ino, dst_name)
		: bch2_hash_lookup(trans, bch2_dirent_hash_desc,
				   &dst_dir->ei_str_hash,
				   dst_dir->v.i_ino, dst_name,
				   BTREE_ITER_INTENT);
	if (IS_ERR(dst_iter))
		return PTR_ERR(dst_iter);
	old_dst = bch2_btree_iter_peek_slot(dst_iter);

	/* Lookup src: */
	src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc,
				    &src_dir->ei_str_hash,
				    src_dir->v.i_ino, src_name,
				    BTREE_ITER_INTENT);
	if (IS_ERR(src_iter))
		return PTR_ERR(src_iter);
	old_src = bch2_btree_iter_peek_slot(src_iter);

	/* Create new dst key: */
	new_dst = dirent_create_key(trans, 0, dst_name, 0);
	if (IS_ERR(new_dst))
		return PTR_ERR(new_dst);

	dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
	new_dst->k.p = dst_iter->pos;

	/* Create new src key: */
	if (mode == BCH_RENAME_EXCHANGE) {
		new_src = dirent_create_key(trans, 0, src_name, 0);
		if (IS_ERR(new_src))
			return PTR_ERR(new_src);

		dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
		new_src->k.p = src_iter->pos;
	} else {
		new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
		if (IS_ERR(new_src))
			return PTR_ERR(new_src);
		bkey_init(&new_src->k);
		new_src->k.p = src_iter->pos;

		if (bkey_cmp(dst_pos, src_iter->pos) <= 0 &&
		    bkey_cmp(src_iter->pos, dst_iter->pos) < 0) {
			/*
			 * We have a hash collision for the new dst key,
			 * and new_src - the key we're deleting - is between
			 * new_dst's hashed slot and the slot we're going to be
			 * inserting it into - oops. This will break the hash
			 * table if we don't deal with it:
			 */
			if (mode == BCH_RENAME) {
				/*
				 * If we're not overwriting, we can just insert
				 * new_dst at the src position:
				 */
				new_dst->k.p = src_iter->pos;
				bch2_trans_update(trans, src_iter, &new_dst->k_i, 0);
				return 0;
			} else {
				/* If we're overwriting, we can't insert new_dst
				 * at a different slot because it has to
				 * overwrite old_dst - just make sure to use a
				 * whiteout when deleting src:
				 */
				new_src->k.type = BCH_DIRENT_WHITEOUT;
			}
		} else {
			/* Check if we need a whiteout to delete src: */
			ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc,
						       &src_dir->ei_str_hash,
						       src_iter);
			if (ret < 0)
				return ret;

			if (ret)
				new_src->k.type = BCH_DIRENT_WHITEOUT;
		}
	}

	bch2_trans_update(trans, src_iter, &new_src->k_i, 0);
	bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0);
	return 0;
}

int __bch2_dirent_delete(struct btree_trans *trans, u64 dir_inum,
			 const struct bch_hash_info *hash_info,
			 const struct qstr *name)
{
	return bch2_hash_delete(trans, bch2_dirent_hash_desc, hash_info,
				dir_inum, name);
}

int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum,
		       const struct bch_hash_info *hash_info,
		       const struct qstr *name,
		       u64 *journal_seq)
{
	return bch2_trans_do(c, journal_seq,
			     BTREE_INSERT_ATOMIC|
			     BTREE_INSERT_NOFAIL,
		__bch2_dirent_delete(&trans, dir_inum, hash_info, name));
}

u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum,
		       const struct bch_hash_info *hash_info,
		       const struct qstr *name)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	u64 inum = 0;

	bch2_trans_init(&trans, c);

	iter = bch2_hash_lookup(&trans, bch2_dirent_hash_desc,
				hash_info, dir_inum, name, 0);
	if (IS_ERR(iter)) {
		BUG_ON(PTR_ERR(iter) == -EINTR);
		goto out;
	}

	k = bch2_btree_iter_peek_slot(iter);
	inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
out:
	bch2_trans_exit(&trans);
	return inum;
}

int bch2_empty_dir(struct bch_fs *c, u64 dir_inum)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(dir_inum, 0), 0, k) {
		if (k.k->p.inode > dir_inum)
			break;

		if (k.k->type == BCH_DIRENT) {
			ret = -ENOTEMPTY;
			break;
		}
	}
	bch2_btree_iter_unlock(&iter);

	return ret;
}

int bch2_readdir(struct bch_fs *c, struct file *file,
		 struct dir_context *ctx)
{
	struct bch_inode_info *inode = file_bch_inode(file);
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_s_c_dirent dirent;
	unsigned len;

	if (!dir_emit_dots(file, ctx))
		return 0;

	for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
			   POS(inode->v.i_ino, ctx->pos), 0, k) {
		if (k.k->type != BCH_DIRENT)
			continue;

		dirent = bkey_s_c_to_dirent(k);

		if (bkey_cmp(k.k->p, POS(inode->v.i_ino, ctx->pos)) < 0)
			continue;

		if (k.k->p.inode > inode->v.i_ino)
			break;

		len = bch2_dirent_name_bytes(dirent);

		/*
		 * XXX: dir_emit() can fault and block, while we're holding
		 * locks
		 */
		if (!dir_emit(ctx, dirent.v->d_name, len,
			      le64_to_cpu(dirent.v->d_inum),
			      dirent.v->d_type))
			break;

		ctx->pos = k.k->p.offset + 1;
	}
	bch2_btree_iter_unlock(&iter);

	return 0;
}

55
fs/bcachefs/dirent.h
Normal file
@@ -0,0 +1,55 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_DIRENT_H
#define _BCACHEFS_DIRENT_H

#include "str_hash.h"

extern const struct bch_hash_desc bch2_dirent_hash_desc;

const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_dirent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);

#define bch2_bkey_dirent_ops (struct bkey_ops) { \
	.key_invalid = bch2_dirent_invalid, \
	.val_to_text = bch2_dirent_to_text, \
}

struct qstr;
struct file;
struct dir_context;
struct bch_fs;
struct bch_hash_info;
struct bch_inode_info;

unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent);

int __bch2_dirent_create(struct btree_trans *, u64,
			 const struct bch_hash_info *, u8,
			 const struct qstr *, u64, int);
int bch2_dirent_create(struct bch_fs *c, u64, const struct bch_hash_info *,
		       u8, const struct qstr *, u64, u64 *, int);

int __bch2_dirent_delete(struct btree_trans *, u64,
			 const struct bch_hash_info *,
			 const struct qstr *);
int bch2_dirent_delete(struct bch_fs *, u64, const struct bch_hash_info *,
		       const struct qstr *, u64 *);

enum bch_rename_mode {
	BCH_RENAME,
	BCH_RENAME_OVERWRITE,
	BCH_RENAME_EXCHANGE,
};

int bch2_dirent_rename(struct btree_trans *,
		       struct bch_inode_info *, const struct qstr *,
		       struct bch_inode_info *, const struct qstr *,
		       enum bch_rename_mode);

u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *,
		       const struct qstr *);

int bch2_empty_dir(struct bch_fs *, u64);
int bch2_readdir(struct bch_fs *, struct file *, struct dir_context *);

#endif /* _BCACHEFS_DIRENT_H */

494
fs/bcachefs/disk_groups.c
Normal file
@@ -0,0 +1,494 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "disk_groups.h"
#include "super-io.h"

#include <linux/sort.h>

static int group_cmp(const void *_l, const void *_r)
{
	const struct bch_disk_group *l = _l;
	const struct bch_disk_group *r = _r;

	return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) -
		(BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?:
		((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) -
		 (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?:
		strncmp(l->label, r->label, sizeof(l->label));
}

static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb,
						struct bch_sb_field *f)
{
	struct bch_sb_field_disk_groups *groups =
		field_to_type(f, disk_groups);
	struct bch_disk_group *g, *sorted = NULL;
	struct bch_sb_field_members *mi;
	struct bch_member *m;
	unsigned i, nr_groups, len;
	const char *err = NULL;

	mi = bch2_sb_get_members(sb);
	groups = bch2_sb_get_disk_groups(sb);
	nr_groups = disk_groups_nr(groups);

	for (m = mi->members;
	     m < mi->members + sb->nr_devices;
	     m++) {
		unsigned g;

		if (!BCH_MEMBER_GROUP(m))
			continue;

		g = BCH_MEMBER_GROUP(m) - 1;

		if (g >= nr_groups ||
		    BCH_GROUP_DELETED(&groups->entries[g]))
			return "disk has invalid group";
	}

	if (!nr_groups)
		return NULL;

	for (g = groups->entries;
	     g < groups->entries + nr_groups;
	     g++) {
		if (BCH_GROUP_DELETED(g))
			continue;

		len = strnlen(g->label, sizeof(g->label));
		if (!len) {
			err = "group with empty label";
			goto err;
		}
	}

	sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL);
	if (!sorted)
		return "cannot allocate memory";

	memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted));
	sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL);

	for (i = 0; i + 1 < nr_groups; i++)
		if (!BCH_GROUP_DELETED(sorted + i) &&
		    !group_cmp(sorted + i, sorted + i + 1)) {
			err = "duplicate groups";
			goto err;
		}

	err = NULL;
err:
	kfree(sorted);
	return err;
}

static size_t bch2_sb_disk_groups_to_text(char *buf, size_t size,
					  struct bch_sb *sb,
					  struct bch_sb_field *f)
{
	char *out = buf, *end = buf + size;
	struct bch_sb_field_disk_groups *groups =
		field_to_type(f, disk_groups);
	struct bch_disk_group *g;
	unsigned nr_groups = disk_groups_nr(groups);

	for (g = groups->entries;
	     g < groups->entries + nr_groups;
	     g++) {
		if (g != groups->entries)
			out += scnprintf(out, end - out, " ");

		if (BCH_GROUP_DELETED(g))
			out += scnprintf(out, end - out, "[deleted]");
		else
			out += scnprintf(out, end - out,
					 "[parent %llu name %s]",
					 BCH_GROUP_PARENT(g),
					 g->label);
	}

	return out - buf;
}

const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = {
	.validate = bch2_sb_disk_groups_validate,
	.to_text = bch2_sb_disk_groups_to_text
};

int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
{
	struct bch_sb_field_members *mi;
	struct bch_sb_field_disk_groups *groups;
	struct bch_disk_groups_cpu *cpu_g, *old_g;
	unsigned i, g, nr_groups;

	lockdep_assert_held(&c->sb_lock);

	mi = bch2_sb_get_members(c->disk_sb.sb);
	groups = bch2_sb_get_disk_groups(c->disk_sb.sb);
	nr_groups = disk_groups_nr(groups);

	if (!groups)
		return 0;

	cpu_g = kzalloc(sizeof(*cpu_g) +
			sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL);
	if (!cpu_g)
		return -ENOMEM;

	cpu_g->nr = nr_groups;

	for (i = 0; i < nr_groups; i++) {
		struct bch_disk_group *src = &groups->entries[i];
		struct bch_disk_group_cpu *dst = &cpu_g->entries[i];

		dst->deleted = BCH_GROUP_DELETED(src);
		dst->parent = BCH_GROUP_PARENT(src);
	}

	for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
		struct bch_member *m = mi->members + i;
		struct bch_disk_group_cpu *dst =
			&cpu_g->entries[BCH_MEMBER_GROUP(m)];

		if (!bch2_member_exists(m))
			continue;

		g = BCH_MEMBER_GROUP(m);
		while (g) {
			dst = &cpu_g->entries[g - 1];
			__set_bit(i, dst->devs.d);
			g = dst->parent;
		}
	}

	old_g = rcu_dereference_protected(c->disk_groups,
				lockdep_is_held(&c->sb_lock));
	rcu_assign_pointer(c->disk_groups, cpu_g);
	if (old_g)
		kfree_rcu(old_g, rcu);

	return 0;
}

const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
{
	struct target t = target_decode(target);

	switch (t.type) {
	case TARGET_NULL:
		return NULL;
	case TARGET_DEV: {
		struct bch_dev *ca = t.dev < c->sb.nr_devices
			? rcu_dereference(c->devs[t.dev])
			: NULL;
		return ca ? &ca->self : NULL;
	}
	case TARGET_GROUP: {
		struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);

		return t.group < g->nr && !g->entries[t.group].deleted
			? &g->entries[t.group].devs
			: NULL;
	}
	default:
		BUG();
	}
}

bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
{
	struct target t = target_decode(target);

	switch (t.type) {
	case TARGET_NULL:
		return false;
	case TARGET_DEV:
		return dev == t.dev;
	case TARGET_GROUP: {
		struct bch_disk_groups_cpu *g;
		const struct bch_devs_mask *m;
		bool ret;

		rcu_read_lock();
		g = rcu_dereference(c->disk_groups);
		m = t.group < g->nr && !g->entries[t.group].deleted
			? &g->entries[t.group].devs
			: NULL;

		ret = m ? test_bit(dev, m->d) : false;
		rcu_read_unlock();

		return ret;
	}
	default:
		BUG();
	}
}

static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups,
				  unsigned parent,
				  const char *name, unsigned namelen)
{
	unsigned i, nr_groups = disk_groups_nr(groups);

	if (!namelen || namelen > BCH_SB_LABEL_SIZE)
		return -EINVAL;

	for (i = 0; i < nr_groups; i++) {
		struct bch_disk_group *g = groups->entries + i;

		if (BCH_GROUP_DELETED(g))
			continue;

		if (!BCH_GROUP_DELETED(g) &&
		    BCH_GROUP_PARENT(g) == parent &&
		    strnlen(g->label, sizeof(g->label)) == namelen &&
		    !memcmp(name, g->label, namelen))
			return i;
	}

	return -1;
}

static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent,
				 const char *name, unsigned namelen)
{
	struct bch_sb_field_disk_groups *groups =
		bch2_sb_get_disk_groups(sb->sb);
	unsigned i, nr_groups = disk_groups_nr(groups);
	struct bch_disk_group *g;

	if (!namelen || namelen > BCH_SB_LABEL_SIZE)
		return -EINVAL;

	for (i = 0;
	     i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]);
	     i++)
		;

	if (i == nr_groups) {
		unsigned u64s =
			(sizeof(struct bch_sb_field_disk_groups) +
			 sizeof(struct bch_disk_group) * (nr_groups + 1)) /
			sizeof(u64);

		groups = bch2_sb_resize_disk_groups(sb, u64s);
		if (!groups)
			return -ENOSPC;

		nr_groups = disk_groups_nr(groups);
	}

	BUG_ON(i >= nr_groups);

	g = &groups->entries[i];

	memcpy(g->label, name, namelen);
	if (namelen < sizeof(g->label))
		g->label[namelen] = '\0';
	SET_BCH_GROUP_DELETED(g, 0);
	SET_BCH_GROUP_PARENT(g, parent);
	SET_BCH_GROUP_DATA_ALLOWED(g, ~0);

	return i;
}

int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name)
{
	struct bch_sb_field_disk_groups *groups =
		bch2_sb_get_disk_groups(sb->sb);
	int v = -1;

	do {
		const char *next = strchrnul(name, '.');
		unsigned len = next - name;

		if (*next == '.')
			next++;

		v = __bch2_disk_group_find(groups, v + 1, name, len);
		name = next;
	} while (*name && v >= 0);

	return v;
}

int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
{
	struct bch_sb_field_disk_groups *groups;
	unsigned parent = 0;
	int v = -1;

	do {
		const char *next = strchrnul(name, '.');
		unsigned len = next - name;

		if (*next == '.')
			next++;

		groups = bch2_sb_get_disk_groups(sb->sb);

		v = __bch2_disk_group_find(groups, parent, name, len);
		if (v < 0)
			v = __bch2_disk_group_add(sb, parent, name, len);
		if (v < 0)
			return v;

		parent = v + 1;
		name = next;
	} while (*name && v >= 0);

	return v;
}

int bch2_disk_path_print(struct bch_sb_handle *sb,
			 char *buf, size_t len, unsigned v)
{
	char *out = buf, *end = out + len;
	struct bch_sb_field_disk_groups *groups =
		bch2_sb_get_disk_groups(sb->sb);
	struct bch_disk_group *g;
	unsigned nr = 0;
	u16 path[32];

	while (1) {
		if (nr == ARRAY_SIZE(path))
			goto inval;

		if (v >= disk_groups_nr(groups))
			goto inval;

		g = groups->entries + v;

		if (BCH_GROUP_DELETED(g))
			goto inval;

		path[nr++] = v;

		if (!BCH_GROUP_PARENT(g))
			break;

		v = BCH_GROUP_PARENT(g) - 1;
	}

	while (nr) {
		unsigned b = 0;

		v = path[--nr];
		g = groups->entries + v;

		if (end != out)
			b = min_t(size_t, end - out,
				  strnlen(g->label, sizeof(g->label)));
		memcpy(out, g->label, b);
		if (b < end - out)
			out[b] = '\0';
		out += b;

		if (nr)
			out += scnprintf(out, end - out, ".");
	}

	return out - buf;
inval:
	return scnprintf(buf, len, "invalid group %u", v);
}

int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
{
	struct bch_member *mi;
	int v = -1;

	mutex_lock(&c->sb_lock);

	if (!strlen(name) || !strcmp(name, "none"))
		goto write_sb;

	v = bch2_disk_path_find_or_create(&c->disk_sb, name);
	if (v < 0) {
		mutex_unlock(&c->sb_lock);
		return v;
	}

write_sb:
	mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
	SET_BCH_MEMBER_GROUP(mi, v + 1);

	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	return 0;
}

int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v)
{
	struct bch_dev *ca;
	int g;

	if (!strlen(buf) || !strcmp(buf, "none")) {
		*v = 0;
		return 0;
	}

	/* Is it a device? */
	ca = bch2_dev_lookup(c, buf);
	if (!IS_ERR(ca)) {
		*v = dev_to_target(ca->dev_idx);
		percpu_ref_put(&ca->ref);
		return 0;
	}

	mutex_lock(&c->sb_lock);
	g = bch2_disk_path_find(&c->disk_sb, buf);
	mutex_unlock(&c->sb_lock);

	if (g >= 0) {
		*v = group_to_target(g);
		return 0;
	}

	return -EINVAL;
}

int bch2_opt_target_print(struct bch_fs *c, char *buf, size_t len, u64 v)
{
	struct target t = target_decode(v);
	int ret;

	switch (t.type) {
	case TARGET_NULL:
		return scnprintf(buf, len, "none");
	case TARGET_DEV: {
		struct bch_dev *ca;

		rcu_read_lock();
		ca = t.dev < c->sb.nr_devices
			? rcu_dereference(c->devs[t.dev])
			: NULL;

		if (ca && percpu_ref_tryget(&ca->io_ref)) {
			ret = scnprintf(buf, len, "/dev/%pg",
					ca->disk_sb.bdev);
			percpu_ref_put(&ca->io_ref);
		} else if (ca) {
			ret = scnprintf(buf, len, "offline device %u", t.dev);
		} else {
			ret = scnprintf(buf, len, "invalid device %u", t.dev);
		}

		rcu_read_unlock();
		break;
	}
	case TARGET_GROUP:
		mutex_lock(&c->sb_lock);
		ret = bch2_disk_path_print(&c->disk_sb, buf, len, t.group);
		mutex_unlock(&c->sb_lock);
		break;
	default:
		BUG();
	}

	return ret;
}

74
fs/bcachefs/disk_groups.h
Normal file
@@ -0,0 +1,74 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_DISK_GROUPS_H
#define _BCACHEFS_DISK_GROUPS_H

extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups;

static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
{
	return groups
		? (vstruct_end(&groups->field) -
		   (void *) &groups->entries[0]) / sizeof(struct bch_disk_group)
		: 0;
}

struct target {
	enum {
		TARGET_NULL,
		TARGET_DEV,
		TARGET_GROUP,
	} type;
	union {
		unsigned dev;
		unsigned group;
	};
};

#define TARGET_DEV_START 1
#define TARGET_GROUP_START (256 + TARGET_DEV_START)

static inline u16 dev_to_target(unsigned dev)
{
	return TARGET_DEV_START + dev;
}

static inline u16 group_to_target(unsigned group)
{
	return TARGET_GROUP_START + group;
}

static inline struct target target_decode(unsigned target)
{
	if (target >= TARGET_GROUP_START)
		return (struct target) {
			.type = TARGET_GROUP,
			.group = target - TARGET_GROUP_START
		};

	if (target >= TARGET_DEV_START)
		return (struct target) {
			.type = TARGET_DEV,
			.group = target - TARGET_DEV_START
		};

	return (struct target) { .type = TARGET_NULL };
}

const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned);

int bch2_disk_path_find(struct bch_sb_handle *, const char *);
int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
int bch2_disk_path_print(struct bch_sb_handle *, char *, size_t, unsigned);

int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *);
int bch2_opt_target_print(struct bch_fs *, char *, size_t, u64);

int bch2_sb_disk_groups_to_cpu(struct bch_fs *);

int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);

const char *bch2_sb_validate_disk_groups(struct bch_sb *,
					 struct bch_sb_field *);

#endif /* _BCACHEFS_DISK_GROUPS_H */

159
fs/bcachefs/error.c
Normal file
@@ -0,0 +1,159 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "error.h"
#include "io.h"
#include "super.h"

bool bch2_inconsistent_error(struct bch_fs *c)
{
	set_bit(BCH_FS_ERROR, &c->flags);

	switch (c->opts.errors) {
	case BCH_ON_ERROR_CONTINUE:
		return false;
	case BCH_ON_ERROR_RO:
		if (bch2_fs_emergency_read_only(c))
			bch_err(c, "emergency read only");
		return true;
	case BCH_ON_ERROR_PANIC:
		panic(bch2_fmt(c, "panic after error"));
		return true;
	default:
		BUG();
	}
}

void bch2_fatal_error(struct bch_fs *c)
{
	if (bch2_fs_emergency_read_only(c))
		bch_err(c, "emergency read only");
}

void bch2_io_error_work(struct work_struct *work)
{
	struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
	struct bch_fs *c = ca->fs;
	bool dev;

	mutex_lock(&c->state_lock);
	dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_RO,
				     BCH_FORCE_IF_DEGRADED);
	if (dev
	    ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_RO,
				   BCH_FORCE_IF_DEGRADED)
	    : bch2_fs_emergency_read_only(c))
		bch_err(ca,
			"too many IO errors, setting %s RO",
			dev ? "device" : "filesystem");
	mutex_unlock(&c->state_lock);
}

void bch2_io_error(struct bch_dev *ca)
{
	//queue_work(system_long_wq, &ca->io_error_work);
}

#ifdef __KERNEL__
#define ask_yn() false
#else
#include "tools-util.h"
#endif

enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags,
				const char *fmt, ...)
{
	struct fsck_err_state *s;
	va_list args;
	bool fix = false, print = true, suppressing = false;
	char _buf[sizeof(s->buf)], *buf = _buf;

	mutex_lock(&c->fsck_error_lock);

	if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
		goto print;

	list_for_each_entry(s, &c->fsck_errors, list)
		if (s->fmt == fmt)
			goto found;

	s = kzalloc(sizeof(*s), GFP_KERNEL);
	if (!s) {
		if (!c->fsck_alloc_err)
			bch_err(c, "kmalloc err, cannot ratelimit fsck errs");
		c->fsck_alloc_err = true;
		buf = _buf;
		goto print;
	}

	INIT_LIST_HEAD(&s->list);
	s->fmt = fmt;
found:
	list_move(&s->list, &c->fsck_errors);
	s->nr++;
	suppressing = s->nr == 10;
	print = s->nr <= 10;
	buf = s->buf;
print:
	va_start(args, fmt);
	vscnprintf(buf, sizeof(_buf), fmt, args);
	va_end(args);

	if (c->opts.fix_errors == FSCK_OPT_EXIT) {
		bch_err(c, "%s, exiting", buf);
		mutex_unlock(&c->fsck_error_lock);
		return FSCK_ERR_EXIT;
	}

	if (flags & FSCK_CAN_FIX) {
		if (c->opts.fix_errors == FSCK_OPT_ASK) {
			printk(KERN_ERR "%s: fix?", buf);
			fix = ask_yn();
		} else if (c->opts.fix_errors == FSCK_OPT_YES ||
			   (c->opts.nochanges &&
			    !(flags & FSCK_CAN_IGNORE))) {
			if (print)
				bch_err(c, "%s, fixing", buf);
			fix = true;
		} else {
			if (print)
				bch_err(c, "%s, not fixing", buf);
			fix = false;
		}
	} else if (flags & FSCK_NEED_FSCK) {
		if (print)
			bch_err(c, "%s (run fsck to correct)", buf);
	} else {
		if (print)
			bch_err(c, "%s (repair unimplemented)", buf);
	}

	if (suppressing)
		bch_err(c, "Ratelimiting new instances of previous error");

	mutex_unlock(&c->fsck_error_lock);

	if (fix)
		set_bit(BCH_FS_FSCK_FIXED_ERRORS, &c->flags);

	return fix ? FSCK_ERR_FIX
		: flags & FSCK_CAN_IGNORE ? FSCK_ERR_IGNORE
		: FSCK_ERR_EXIT;
}

void bch2_flush_fsck_errs(struct bch_fs *c)
{
	struct fsck_err_state *s, *n;

	mutex_lock(&c->fsck_error_lock);
	set_bit(BCH_FS_FSCK_DONE, &c->flags);

	list_for_each_entry_safe(s, n, &c->fsck_errors, list) {
		if (s->nr > 10)
			bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf);

		list_del(&s->list);
		kfree(s);
	}

	mutex_unlock(&c->fsck_error_lock);
}
229
fs/bcachefs/error.h
Normal file
@ -0,0 +1,229 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_ERROR_H
#define _BCACHEFS_ERROR_H

#include <linux/list.h>
#include <linux/printk.h>

struct bch_dev;
struct bch_fs;
struct work_struct;

/*
 * XXX: separate out errors that indicate on disk data is inconsistent, and flag
 * superblock as such
 */

/* Error messages: */

/*
 * Very fatal logic/inconsistency errors: these indicate that we've majorly
 * screwed up at runtime, i.e. it's not likely that it was just caused by the
 * data on disk being inconsistent. These BUG():
 *
 * XXX: audit and convert to inconsistent() checks
 */

#define bch2_fs_bug(c, ...)                                     \
do {                                                            \
        bch_err(c, __VA_ARGS__);                                \
        BUG();                                                  \
} while (0)

#define bch2_fs_bug_on(cond, c, ...)                            \
do {                                                            \
        if (cond)                                               \
                bch2_fs_bug(c, __VA_ARGS__);                    \
} while (0)

/*
 * Inconsistency errors: The on disk data is inconsistent. If these occur during
 * initial recovery, they don't indicate a bug in the running code - we walk all
 * the metadata before modifying anything. If they occur at runtime, they
 * indicate either a bug in the running code or (less likely) data is being
 * silently corrupted under us.
 *
 * XXX: audit all inconsistent errors and make sure they're all recoverable, in
 * BCH_ON_ERROR_CONTINUE mode
 */

bool bch2_inconsistent_error(struct bch_fs *);

#define bch2_fs_inconsistent(c, ...)                            \
({                                                              \
        bch_err(c, __VA_ARGS__);                                \
        bch2_inconsistent_error(c);                             \
})

#define bch2_fs_inconsistent_on(cond, c, ...)                   \
({                                                              \
        int _ret = !!(cond);                                    \
                                                                \
        if (_ret)                                               \
                bch2_fs_inconsistent(c, __VA_ARGS__);           \
        _ret;                                                   \
})

/*
 * Later we might want to mark only the particular device inconsistent, not the
 * entire filesystem:
 */

#define bch2_dev_inconsistent(ca, ...)                          \
do {                                                            \
        bch_err(ca, __VA_ARGS__);                               \
        bch2_inconsistent_error((ca)->fs);                      \
} while (0)

#define bch2_dev_inconsistent_on(cond, ca, ...)                 \
({                                                              \
        int _ret = !!(cond);                                    \
                                                                \
        if (_ret)                                               \
                bch2_dev_inconsistent(ca, __VA_ARGS__);         \
        _ret;                                                   \
})

/*
 * Fsck errors: inconsistency errors we detect at mount time, and should ideally
 * be able to repair:
 */

enum {
        BCH_FSCK_OK                     = 0,
        BCH_FSCK_ERRORS_NOT_FIXED       = 1,
        BCH_FSCK_REPAIR_UNIMPLEMENTED   = 2,
        BCH_FSCK_REPAIR_IMPOSSIBLE      = 3,
        BCH_FSCK_UNKNOWN_VERSION        = 4,
};

enum fsck_err_opts {
        FSCK_OPT_EXIT,
        FSCK_OPT_YES,
        FSCK_OPT_NO,
        FSCK_OPT_ASK,
};

enum fsck_err_ret {
        FSCK_ERR_IGNORE = 0,
        FSCK_ERR_FIX    = 1,
        FSCK_ERR_EXIT   = 2,
};

struct fsck_err_state {
        struct list_head        list;
        const char              *fmt;
        u64                     nr;
        char                    buf[512];
};

#define FSCK_CAN_FIX            (1 << 0)
#define FSCK_CAN_IGNORE         (1 << 1)
#define FSCK_NEED_FSCK          (1 << 2)

enum fsck_err_ret bch2_fsck_err(struct bch_fs *,
                                unsigned, const char *, ...);
void bch2_flush_fsck_errs(struct bch_fs *);

#define __fsck_err(c, _flags, msg, ...)                         \
({                                                              \
        int _fix = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);\
                                                                \
        if (_fix == FSCK_ERR_EXIT) {                            \
                bch_err(c, "Unable to continue, halting");      \
                ret = BCH_FSCK_ERRORS_NOT_FIXED;                \
                goto fsck_err;                                  \
        }                                                       \
                                                                \
        _fix;                                                   \
})

/* These macros return true if error should be fixed: */

/* XXX: mark in superblock that filesystem contains errors, if we ignore: */

#define __fsck_err_on(cond, c, _flags, ...)                     \
        ((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false)

#define need_fsck_err_on(cond, c, ...)                          \
        __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)

#define need_fsck_err(c, ...)                                   \
        __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)

#define mustfix_fsck_err(c, ...)                                \
        __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__)

#define mustfix_fsck_err_on(cond, c, ...)                       \
        __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__)

#define fsck_err(c, ...)                                        \
        __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)

#define fsck_err_on(cond, c, ...)                               \
        __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)

/*
 * Fatal errors: these don't indicate a bug, but we can't continue running in RW
 * mode - pretty much just due to metadata IO errors:
 */

void bch2_fatal_error(struct bch_fs *);

#define bch2_fs_fatal_error(c, ...)                             \
do {                                                            \
        bch_err(c, __VA_ARGS__);                                \
        bch2_fatal_error(c);                                    \
} while (0)

#define bch2_fs_fatal_err_on(cond, c, ...)                      \
({                                                              \
        int _ret = !!(cond);                                    \
                                                                \
        if (_ret)                                               \
                bch2_fs_fatal_error(c, __VA_ARGS__);            \
        _ret;                                                   \
})

/*
 * IO errors: either recoverable metadata IO (because we have replicas), or data
 * IO - we need to log it and print out a message, but we don't (necessarily)
 * want to shut down the fs:
 */

void bch2_io_error_work(struct work_struct *);

/* Does the error handling without logging a message */
void bch2_io_error(struct bch_dev *);

/* Logs message and handles the error: */
#define bch2_dev_io_error(ca, fmt, ...)                         \
do {                                                            \
        printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs,          \
                "IO error on %s for " fmt),                     \
                (ca)->name, ##__VA_ARGS__);                     \
        bch2_io_error(ca);                                      \
} while (0)

#define bch2_dev_io_err_on(cond, ca, ...)                       \
({                                                              \
        bool _ret = (cond);                                     \
                                                                \
        if (_ret)                                               \
                bch2_dev_io_error(ca, __VA_ARGS__);             \
        _ret;                                                   \
})

/* kill? */

#define __bcache_io_error(c, fmt, ...)                          \
        printk_ratelimited(KERN_ERR bch2_fmt(c,                 \
                        "IO error: " fmt), ##__VA_ARGS__)

#define bcache_io_error(c, bio, fmt, ...)                       \
do {                                                            \
        __bcache_io_error(c, fmt, ##__VA_ARGS__);               \
        (bio)->bi_status = BLK_STS_IOERR;                       \
} while (0)

#endif /* _BCACHEFS_ERROR_H */

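The __fsck_err() wrappers above only work in a caller that declares an int ret and a fsck_err: label, since an unrecoverable error is turned into a goto. A minimal sketch of such a caller, not part of this commit; the function name and the checked condition are illustrative assumptions only:

/* illustrative sketch, not part of the commit */
static int check_something(struct bch_fs *c, u64 nlink, u64 expected)
{
        int ret = 0;

        if (fsck_err_on(nlink != expected, c,
                        "wrong link count (got %llu, should be %llu)",
                        nlink, expected)) {
                /* caller repairs the inconsistency here */
        }
fsck_err:
        return ret;
}
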
2395
fs/bcachefs/extents.c
Normal file
File diff suppressed because it is too large

539
fs/bcachefs/extents.h
Normal file
@ -0,0 +1,539 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_EXTENTS_H
#define _BCACHEFS_EXTENTS_H

#include "bcachefs.h"
#include "bkey.h"
#include "extents_types.h"

struct bch_fs;
struct journal_res;
struct btree_node_iter;
struct btree_node_iter_large;
struct btree_insert;
struct btree_insert_entry;
struct extent_insert_hook;
struct bch_devs_mask;
union bch_extent_crc;

const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_btree_ptr_debugcheck(struct bch_fs *, struct btree *,
                               struct bkey_s_c);
void bch2_btree_ptr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *);

#define bch2_bkey_btree_ops (struct bkey_ops) {                 \
        .key_invalid    = bch2_btree_ptr_invalid,               \
        .key_debugcheck = bch2_btree_ptr_debugcheck,            \
        .val_to_text    = bch2_btree_ptr_to_text,               \
        .swab           = bch2_ptr_swab,                        \
}

const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_extent_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
void bch2_extent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
bool bch2_ptr_normalize(struct bch_fs *, struct btree *, struct bkey_s);
enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *,
                                    struct bkey_i *, struct bkey_i *);

#define bch2_bkey_extent_ops (struct bkey_ops) {                \
        .key_invalid    = bch2_extent_invalid,                  \
        .key_debugcheck = bch2_extent_debugcheck,               \
        .val_to_text    = bch2_extent_to_text,                  \
        .swab           = bch2_ptr_swab,                        \
        .key_normalize  = bch2_ptr_normalize,                   \
        .key_merge      = bch2_extent_merge,                    \
        .is_extents     = true,                                 \
}

struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *,
                                        struct btree *,
                                        struct btree_node_iter_large *);
struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
                                        struct bset *,
                                        struct btree *,
                                        struct btree_node_iter_large *);

int bch2_btree_pick_ptr(struct bch_fs *, const struct btree *,
                        struct bch_devs_mask *avoid,
                        struct extent_pick_ptr *);

int bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c,
                         struct bch_devs_mask *,
                         struct extent_pick_ptr *);

enum btree_insert_ret
bch2_insert_fixup_extent(struct btree_insert *,
                         struct btree_insert_entry *);

bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent,
                                      unsigned, unsigned);

const struct bch_extent_ptr *
bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
bool bch2_extent_drop_device(struct bkey_s_extent, unsigned);
const struct bch_extent_ptr *
bch2_extent_has_group(struct bch_fs *, struct bkey_s_c_extent, unsigned);
const struct bch_extent_ptr *
bch2_extent_has_target(struct bch_fs *, struct bkey_s_c_extent, unsigned);

unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
unsigned bch2_extent_is_compressed(struct bkey_s_c);

unsigned bch2_extent_ptr_durability(struct bch_fs *,
                                    const struct bch_extent_ptr *);
unsigned bch2_extent_durability(struct bch_fs *, struct bkey_s_c_extent);

bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
                             struct bch_extent_ptr, u64);

static inline bool bkey_extent_is_data(const struct bkey *k)
{
        switch (k->type) {
        case BCH_EXTENT:
        case BCH_EXTENT_CACHED:
                return true;
        default:
                return false;
        }
}

static inline bool bkey_extent_is_allocation(const struct bkey *k)
{
        switch (k->type) {
        case BCH_EXTENT:
        case BCH_EXTENT_CACHED:
        case BCH_RESERVATION:
                return true;
        default:
                return false;
        }
}

static inline bool bch2_extent_is_fully_allocated(struct bkey_s_c k)
{
        return bkey_extent_is_allocation(k.k) &&
                !bch2_extent_is_compressed(k);
}

static inline bool bkey_extent_is_cached(const struct bkey *k)
{
        return k->type == BCH_EXTENT_CACHED;
}

static inline void bkey_extent_set_cached(struct bkey *k, bool cached)
{
        EBUG_ON(k->type != BCH_EXTENT &&
                k->type != BCH_EXTENT_CACHED);

        k->type = cached ? BCH_EXTENT_CACHED : BCH_EXTENT;
}

static inline unsigned
__extent_entry_type(const union bch_extent_entry *e)
{
        return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX;
}

static inline enum bch_extent_entry_type
extent_entry_type(const union bch_extent_entry *e)
{
        int ret = __ffs(e->type);

        EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX);

        return ret;
}

static inline size_t extent_entry_bytes(const union bch_extent_entry *entry)
{
        switch (extent_entry_type(entry)) {
        case BCH_EXTENT_ENTRY_crc32:
                return sizeof(struct bch_extent_crc32);
        case BCH_EXTENT_ENTRY_crc64:
                return sizeof(struct bch_extent_crc64);
        case BCH_EXTENT_ENTRY_crc128:
                return sizeof(struct bch_extent_crc128);
        case BCH_EXTENT_ENTRY_ptr:
                return sizeof(struct bch_extent_ptr);
        default:
                BUG();
        }
}

static inline size_t extent_entry_u64s(const union bch_extent_entry *entry)
{
        return extent_entry_bytes(entry) / sizeof(u64);
}

static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
{
        return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
}

static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
{
        return !extent_entry_is_ptr(e);
}

union bch_extent_crc {
        u8                      type;
        struct bch_extent_crc32 crc32;
        struct bch_extent_crc64 crc64;
        struct bch_extent_crc128 crc128;
};

/* downcast, preserves const */
#define to_entry(_entry)                                                \
({                                                                      \
        BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) &&       \
                     !type_is(_entry, struct bch_extent_ptr *));        \
                                                                        \
        __builtin_choose_expr(                                          \
                (type_is_exact(_entry, const union bch_extent_crc *) ||        \
                 type_is_exact(_entry, const struct bch_extent_ptr *)),        \
                (const union bch_extent_entry *) (_entry),              \
                (union bch_extent_entry *) (_entry));                   \
})

#define __entry_to_crc(_entry)                                          \
        __builtin_choose_expr(                                          \
                type_is_exact(_entry, const union bch_extent_entry *),  \
                (const union bch_extent_crc *) (_entry),                \
                (union bch_extent_crc *) (_entry))

#define entry_to_crc(_entry)                                            \
({                                                                      \
        EBUG_ON((_entry) && !extent_entry_is_crc(_entry));              \
                                                                        \
        __entry_to_crc(_entry);                                         \
})

#define entry_to_ptr(_entry)                                            \
({                                                                      \
        EBUG_ON((_entry) && !extent_entry_is_ptr(_entry));              \
                                                                        \
        __builtin_choose_expr(                                          \
                type_is_exact(_entry, const union bch_extent_entry *),  \
                (const struct bch_extent_ptr *) (_entry),               \
                (struct bch_extent_ptr *) (_entry));                    \
})

/* checksum entries: */

enum bch_extent_crc_type {
        BCH_EXTENT_CRC_NONE,
        BCH_EXTENT_CRC32,
        BCH_EXTENT_CRC64,
        BCH_EXTENT_CRC128,
};

static inline enum bch_extent_crc_type
__extent_crc_type(const union bch_extent_crc *crc)
{
        if (!crc)
                return BCH_EXTENT_CRC_NONE;

        switch (extent_entry_type(to_entry(crc))) {
        case BCH_EXTENT_ENTRY_crc32:
                return BCH_EXTENT_CRC32;
        case BCH_EXTENT_ENTRY_crc64:
                return BCH_EXTENT_CRC64;
        case BCH_EXTENT_ENTRY_crc128:
                return BCH_EXTENT_CRC128;
        default:
                BUG();
        }
}

#define extent_crc_type(_crc)                                           \
({                                                                      \
        BUILD_BUG_ON(!type_is(_crc, struct bch_extent_crc32 *) &&       \
                     !type_is(_crc, struct bch_extent_crc64 *) &&       \
                     !type_is(_crc, struct bch_extent_crc128 *) &&      \
                     !type_is(_crc, union bch_extent_crc *));           \
                                                                        \
          type_is(_crc, struct bch_extent_crc32 *)  ? BCH_EXTENT_CRC32  \
        : type_is(_crc, struct bch_extent_crc64 *)  ? BCH_EXTENT_CRC64  \
        : type_is(_crc, struct bch_extent_crc128 *) ? BCH_EXTENT_CRC128 \
        : __extent_crc_type((union bch_extent_crc *) _crc);             \
})

static inline struct bch_extent_crc_unpacked
bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
{
#define common_fields(_crc)                                             \
                .csum_type              = _crc.csum_type,               \
                .compression_type       = _crc.compression_type,        \
                .compressed_size        = _crc._compressed_size + 1,    \
                .uncompressed_size      = _crc._uncompressed_size + 1,  \
                .offset                 = _crc.offset,                  \
                .live_size              = k->size

        switch (extent_crc_type(crc)) {
        case BCH_EXTENT_CRC_NONE:
                return (struct bch_extent_crc_unpacked) {
                        .compressed_size        = k->size,
                        .uncompressed_size      = k->size,
                        .live_size              = k->size,
                };
        case BCH_EXTENT_CRC32: {
                struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
                        common_fields(crc->crc32),
                };

                *((__le32 *) &ret.csum.lo) = crc->crc32.csum;

                memcpy(&ret.csum.lo, &crc->crc32.csum,
                       sizeof(crc->crc32.csum));

                return ret;
        }
        case BCH_EXTENT_CRC64: {
                struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
                        common_fields(crc->crc64),
                        .nonce          = crc->crc64.nonce,
                        .csum.lo        = (__force __le64) crc->crc64.csum_lo,
                };

                *((__le16 *) &ret.csum.hi) = crc->crc64.csum_hi;

                return ret;
        }
        case BCH_EXTENT_CRC128: {
                struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
                        common_fields(crc->crc128),
                        .nonce          = crc->crc128.nonce,
                        .csum           = crc->crc128.csum,
                };

                return ret;
        }
        default:
                BUG();
        }
#undef common_fields
}

/* Extent entry iteration: */

#define extent_entry_next(_entry)                                       \
        ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))

#define extent_entry_last(_e)                                           \
        vstruct_idx((_e).v, bkey_val_u64s((_e).k))

/* Iterate over all entries: */

#define extent_for_each_entry_from(_e, _entry, _start)                  \
        for ((_entry) = _start;                                         \
             (_entry) < extent_entry_last(_e);                          \
             (_entry) = extent_entry_next(_entry))

#define extent_for_each_entry(_e, _entry)                               \
        extent_for_each_entry_from(_e, _entry, (_e).v->start)

/* Iterate over crcs only: */

#define __extent_crc_next(_e, _p)                                       \
({                                                                      \
        typeof(&(_e).v->start[0]) _entry = _p;                          \
                                                                        \
        while ((_entry) < extent_entry_last(_e) &&                      \
               !extent_entry_is_crc(_entry))                            \
                (_entry) = extent_entry_next(_entry);                   \
                                                                        \
        entry_to_crc(_entry < extent_entry_last(_e) ? _entry : NULL);   \
})

#define __extent_for_each_crc(_e, _crc)                                 \
        for ((_crc) = __extent_crc_next(_e, (_e).v->start);             \
             (_crc);                                                    \
             (_crc) = __extent_crc_next(_e, extent_entry_next(to_entry(_crc))))

#define extent_crc_next(_e, _crc, _iter)                                \
({                                                                      \
        extent_for_each_entry_from(_e, _iter, _iter)                    \
                if (extent_entry_is_crc(_iter)) {                       \
                        (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_iter));\
                        break;                                          \
                }                                                       \
                                                                        \
        (_iter) < extent_entry_last(_e);                                \
})

#define extent_for_each_crc(_e, _crc, _iter)                            \
        for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL),             \
             (_iter) = (_e).v->start;                                   \
             extent_crc_next(_e, _crc, _iter);                          \
             (_iter) = extent_entry_next(_iter))

/* Iterate over pointers, with crcs: */

#define extent_ptr_crc_next(_e, _ptr, _crc)                             \
({                                                                      \
        __label__ out;                                                  \
        typeof(&(_e).v->start[0]) _entry;                               \
                                                                        \
        extent_for_each_entry_from(_e, _entry, to_entry(_ptr))          \
                if (extent_entry_is_crc(_entry)) {                      \
                        (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_entry));\
                } else {                                                \
                        _ptr = entry_to_ptr(_entry);                    \
                        goto out;                                       \
                }                                                       \
                                                                        \
        _ptr = NULL;                                                    \
out:                                                                    \
        _ptr;                                                           \
})

#define extent_for_each_ptr_crc(_e, _ptr, _crc)                         \
        for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL),             \
             (_ptr) = &(_e).v->start->ptr;                              \
             ((_ptr) = extent_ptr_crc_next(_e, _ptr, _crc));            \
             (_ptr)++)

/* Iterate over pointers only, and from a given position: */

#define extent_ptr_next(_e, _ptr)                                       \
({                                                                      \
        struct bch_extent_crc_unpacked _crc;                            \
                                                                        \
        extent_ptr_crc_next(_e, _ptr, _crc);                            \
})

#define extent_for_each_ptr(_e, _ptr)                                   \
        for ((_ptr) = &(_e).v->start->ptr;                              \
             ((_ptr) = extent_ptr_next(_e, _ptr));                      \
             (_ptr)++)

#define extent_ptr_prev(_e, _ptr)                                       \
({                                                                      \
        typeof(&(_e).v->start->ptr) _p;                                 \
        typeof(&(_e).v->start->ptr) _prev = NULL;                       \
                                                                        \
        extent_for_each_ptr(_e, _p) {                                   \
                if (_p == (_ptr))                                       \
                        break;                                          \
                _prev = _p;                                             \
        }                                                               \
                                                                        \
        _prev;                                                          \
})

/*
 * Use this when you'll be dropping pointers as you iterate. Quadratic,
 * unfortunately:
 */
#define extent_for_each_ptr_backwards(_e, _ptr)                         \
        for ((_ptr) = extent_ptr_prev(_e, NULL);                        \
             (_ptr);                                                    \
             (_ptr) = extent_ptr_prev(_e, _ptr))

void bch2_extent_crc_append(struct bkey_i_extent *,
                            struct bch_extent_crc_unpacked);

static inline void __extent_entry_push(struct bkey_i_extent *e)
{
        union bch_extent_entry *entry = extent_entry_last(extent_i_to_s(e));

        EBUG_ON(bkey_val_u64s(&e->k) + extent_entry_u64s(entry) >
                BKEY_EXTENT_VAL_U64s_MAX);

        e->k.u64s += extent_entry_u64s(entry);
}

static inline void extent_ptr_append(struct bkey_i_extent *e,
                                     struct bch_extent_ptr ptr)
{
        ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
        extent_entry_last(extent_i_to_s(e))->ptr = ptr;
        __extent_entry_push(e);
}

static inline struct bch_devs_list bch2_extent_devs(struct bkey_s_c_extent e)
{
        struct bch_devs_list ret = (struct bch_devs_list) { 0 };
        const struct bch_extent_ptr *ptr;

        extent_for_each_ptr(e, ptr)
                ret.devs[ret.nr++] = ptr->dev;

        return ret;
}

static inline struct bch_devs_list bch2_extent_dirty_devs(struct bkey_s_c_extent e)
{
        struct bch_devs_list ret = (struct bch_devs_list) { 0 };
        const struct bch_extent_ptr *ptr;

        extent_for_each_ptr(e, ptr)
                if (!ptr->cached)
                        ret.devs[ret.nr++] = ptr->dev;

        return ret;
}

static inline struct bch_devs_list bch2_extent_cached_devs(struct bkey_s_c_extent e)
{
        struct bch_devs_list ret = (struct bch_devs_list) { 0 };
        const struct bch_extent_ptr *ptr;

        extent_for_each_ptr(e, ptr)
                if (ptr->cached)
                        ret.devs[ret.nr++] = ptr->dev;

        return ret;
}

static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
{
        switch (k.k->type) {
        case BCH_EXTENT:
        case BCH_EXTENT_CACHED:
                return bch2_extent_devs(bkey_s_c_to_extent(k));
        default:
                return (struct bch_devs_list) { .nr = 0 };
        }
}

static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
{
        switch (k.k->type) {
        case BCH_EXTENT:
        case BCH_EXTENT_CACHED:
                return bch2_extent_dirty_devs(bkey_s_c_to_extent(k));
        default:
                return (struct bch_devs_list) { .nr = 0 };
        }
}

static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
{
        switch (k.k->type) {
        case BCH_EXTENT:
        case BCH_EXTENT_CACHED:
                return bch2_extent_cached_devs(bkey_s_c_to_extent(k));
        default:
                return (struct bch_devs_list) { .nr = 0 };
        }
}

bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent,
                                 struct bch_extent_crc_unpacked);
bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked);
void bch2_extent_drop_redundant_crcs(struct bkey_s_extent);

void __bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *);
void bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *);

bool bch2_cut_front(struct bpos, struct bkey_i *);
bool bch2_cut_back(struct bpos, struct bkey *);
void bch2_key_resize(struct bkey *, unsigned);

int bch2_check_range_allocated(struct bch_fs *, struct bpos, u64);

#endif /* _BCACHEFS_EXTENTS_H */

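A minimal sketch, not part of this commit, of how the pointer/crc iterators declared above are meant to be used from a read path; the helper name and the csum_type test are illustrative assumptions:

/* illustrative sketch, not part of the commit */
static unsigned count_checksummed_ptrs(struct bkey_s_c_extent e)
{
        const struct bch_extent_ptr *ptr;
        struct bch_extent_crc_unpacked crc;
        unsigned nr = 0;

        /* walks each pointer with the crc entry that currently applies to it */
        extent_for_each_ptr_crc(e, ptr, crc)
                if (crc.csum_type)
                        nr++;

        return nr;
}
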
27
fs/bcachefs/extents_types.h
Normal file
@ -0,0 +1,27 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_EXTENTS_TYPES_H
#define _BCACHEFS_EXTENTS_TYPES_H

#include "bcachefs_format.h"

struct bch_extent_crc_unpacked {
        u8                      csum_type;
        u8                      compression_type;

        u16                     compressed_size;
        u16                     uncompressed_size;

        u16                     offset;
        u16                     live_size;

        u16                     nonce;

        struct bch_csum         csum;
};

struct extent_pick_ptr {
        struct bch_extent_ptr           ptr;
        struct bch_extent_crc_unpacked  crc;
};

#endif /* _BCACHEFS_EXTENTS_TYPES_H */

283
fs/bcachefs/eytzinger.h
Normal file
@ -0,0 +1,283 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _EYTZINGER_H
#define _EYTZINGER_H

#include <linux/bitops.h>
#include <linux/log2.h>

#include "util.h"

/*
 * Traversal for trees in eytzinger layout - a full binary tree laid out in an
 * array
 */

/*
 * One based indexing version:
 *
 * With one based indexing each level of the tree starts at a power of two -
 * good for cacheline alignment:
 *
 * Size parameter is treated as if we were using 0 based indexing, however:
 * valid nodes, and inorder indices, are in the range [1..size) - that is, there
 * are actually size - 1 elements
 */

static inline unsigned eytzinger1_child(unsigned i, unsigned child)
{
        EBUG_ON(child > 1);

        return (i << 1) + child;
}

static inline unsigned eytzinger1_left_child(unsigned i)
{
        return eytzinger1_child(i, 0);
}

static inline unsigned eytzinger1_right_child(unsigned i)
{
        return eytzinger1_child(i, 1);
}

static inline unsigned eytzinger1_first(unsigned size)
{
        return rounddown_pow_of_two(size - 1);
}

static inline unsigned eytzinger1_last(unsigned size)
{
        return rounddown_pow_of_two(size) - 1;
}

/*
 * eytzinger1_next() and eytzinger1_prev() have the nice properties that
 *
 * eytzinger1_next(0) == eytzinger1_first())
 * eytzinger1_prev(0) == eytzinger1_last())
 *
 * eytzinger1_prev(eytzinger1_first()) == 0
 * eytzinger1_next(eytzinger1_last()) == 0
 */

static inline unsigned eytzinger1_next(unsigned i, unsigned size)
{
        EBUG_ON(i >= size);

        if (eytzinger1_right_child(i) < size) {
                i = eytzinger1_right_child(i);

                i <<= __fls(size) - __fls(i);
                i >>= i >= size;
        } else {
                i >>= ffz(i) + 1;
        }

        return i;
}

static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
{
        EBUG_ON(i >= size);

        if (eytzinger1_left_child(i) < size) {
                i = eytzinger1_left_child(i) + 1;

                i <<= __fls(size) - __fls(i);
                i -= 1;
                i >>= i >= size;
        } else {
                i >>= __ffs(i) + 1;
        }

        return i;
}

static inline unsigned eytzinger1_extra(unsigned size)
{
        return (size - rounddown_pow_of_two(size - 1)) << 1;
}

static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
                                               unsigned extra)
{
        unsigned b = __fls(i);
        unsigned shift = __fls(size - 1) - b;
        int s;

        EBUG_ON(!i || i >= size);

        i  ^= 1U << b;
        i <<= 1;
        i  |= 1;
        i <<= shift;

        /*
         * sign bit trick:
         *
         * if (i > extra)
         *      i -= (i - extra) >> 1;
         */
        s = extra - i;
        i += (s >> 1) & (s >> 31);

        return i;
}

static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
                                               unsigned extra)
{
        unsigned shift;
        int s;

        EBUG_ON(!i || i >= size);

        /*
         * sign bit trick:
         *
         * if (i > extra)
         *      i += i - extra;
         */
        s = extra - i;
        i -= s & (s >> 31);

        shift = __ffs(i);

        i >>= shift + 1;
        i  |= 1U << (__fls(size - 1) - shift);

        return i;
}

static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size)
{
        return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size));
}

static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
{
        return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size));
}

#define eytzinger1_for_each(_i, _size)                  \
        for ((_i) = eytzinger1_first((_size));          \
             (_i) != 0;                                 \
             (_i) = eytzinger1_next((_i), (_size)))

/* Zero based indexing version: */

static inline unsigned eytzinger0_child(unsigned i, unsigned child)
{
        EBUG_ON(child > 1);

        return (i << 1) + 1 + child;
}

static inline unsigned eytzinger0_left_child(unsigned i)
{
        return eytzinger0_child(i, 0);
}

static inline unsigned eytzinger0_right_child(unsigned i)
{
        return eytzinger0_child(i, 1);
}

static inline unsigned eytzinger0_first(unsigned size)
{
        return eytzinger1_first(size + 1) - 1;
}

static inline unsigned eytzinger0_last(unsigned size)
{
        return eytzinger1_last(size + 1) - 1;
}

static inline unsigned eytzinger0_next(unsigned i, unsigned size)
{
        return eytzinger1_next(i + 1, size + 1) - 1;
}

static inline unsigned eytzinger0_prev(unsigned i, unsigned size)
{
        return eytzinger1_prev(i + 1, size + 1) - 1;
}

static inline unsigned eytzinger0_extra(unsigned size)
{
        return eytzinger1_extra(size + 1);
}

static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size,
                                               unsigned extra)
{
        return __eytzinger1_to_inorder(i + 1, size + 1, extra) - 1;
}

static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size,
                                               unsigned extra)
{
        return __inorder_to_eytzinger1(i + 1, size + 1, extra) - 1;
}

static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size)
{
        return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size));
}

static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
{
        return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size));
}

#define eytzinger0_for_each(_i, _size)                  \
        for ((_i) = eytzinger0_first((_size));          \
             (_i) != -1;                                \
             (_i) = eytzinger0_next((_i), (_size)))

typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);

/* return greatest node <= @search, or -1 if not found */
static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
                                         eytzinger_cmp_fn cmp, const void *search)
{
        unsigned i, n = 0;

        if (!nr)
                return -1;

        do {
                i = n;
                n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
        } while (n < nr);

        if (n & 1) {
                /* @i was greater than @search, return previous node: */

                if (i == eytzinger0_first(nr))
                        return -1;

                return eytzinger0_prev(i, nr);
        } else {
                return i;
        }
}

static inline size_t eytzinger0_find(void *base, size_t nr, size_t size,
                                     eytzinger_cmp_fn cmp, const void *search)
{
        size_t i = 0;
        int res;

        while (i < nr &&
               (res = cmp(search, base + i * size, size)))
                i = eytzinger0_child(i, res > 0);

        return i;
}

void eytzinger0_sort(void *, size_t, size_t,
                     int (*cmp_func)(const void *, const void *, size_t),
                     void (*swap_func)(void *, void *, size_t));

#endif /* _EYTZINGER_H */

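A minimal sketch, not part of this commit, of searching an eytzinger-ordered array with the zero-based helpers above; the element type and comparison callback are assumptions for the example:

/* illustrative sketch, not part of the commit */
static int cmp_u64(const void *l, const void *r, size_t size)
{
        return *(const u64 *) l < *(const u64 *) r ? -1
                : *(const u64 *) l > *(const u64 *) r ? 1 : 0;
}

static ssize_t lookup_le(u64 *tree, size_t nr, u64 search)
{
        /* tree[] must already be in eytzinger order, e.g. via eytzinger0_sort() */
        return eytzinger0_find_le(tree, nr, sizeof(u64), cmp_u64, &search);
}
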
125
fs/bcachefs/fifo.h
Normal file
@ -0,0 +1,125 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FIFO_H
#define _BCACHEFS_FIFO_H

#include "util.h"

#define FIFO(type)                                      \
struct {                                                \
        size_t front, back, size, mask;                 \
        type *data;                                     \
}

#define DECLARE_FIFO(type, name)        FIFO(type) name

#define fifo_buf_size(fifo)                                             \
        (roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]))

#define init_fifo(fifo, _size, _gfp)                                    \
({                                                                      \
        (fifo)->front   = (fifo)->back = 0;                             \
        (fifo)->size    = (_size);                                      \
        (fifo)->mask    = (fifo)->size                                  \
                ? roundup_pow_of_two((fifo)->size) - 1                  \
                : 0;                                                    \
        (fifo)->data    = kvpmalloc(fifo_buf_size(fifo), (_gfp));       \
})

#define free_fifo(fifo)                                                 \
do {                                                                    \
        kvpfree((fifo)->data, fifo_buf_size(fifo));                     \
        (fifo)->data = NULL;                                            \
} while (0)

#define fifo_swap(l, r)                                                 \
do {                                                                    \
        swap((l)->front, (r)->front);                                   \
        swap((l)->back, (r)->back);                                     \
        swap((l)->size, (r)->size);                                     \
        swap((l)->mask, (r)->mask);                                     \
        swap((l)->data, (r)->data);                                     \
} while (0)

#define fifo_move(dest, src)                                            \
do {                                                                    \
        typeof(*((dest)->data)) _t;                                     \
        while (!fifo_full(dest) &&                                      \
               fifo_pop(src, _t))                                       \
                fifo_push(dest, _t);                                    \
} while (0)

#define fifo_used(fifo)         (((fifo)->back - (fifo)->front))
#define fifo_free(fifo)         ((fifo)->size - fifo_used(fifo))

#define fifo_empty(fifo)        ((fifo)->front == (fifo)->back)
#define fifo_full(fifo)         (fifo_used(fifo) == (fifo)->size)

#define fifo_peek_front(fifo)   ((fifo)->data[(fifo)->front & (fifo)->mask])
#define fifo_peek_back(fifo)    ((fifo)->data[((fifo)->back - 1) & (fifo)->mask])

#define fifo_entry_idx_abs(fifo, p)                                     \
        ((((p) >= &fifo_peek_front(fifo)                                \
           ? (fifo)->front : (fifo)->back) & ~(fifo)->mask) +           \
           (((p) - (fifo)->data)))

#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask)
#define fifo_idx_entry(fifo, i) (fifo)->data[((fifo)->front + (i)) & (fifo)->mask]

#define fifo_push_back_ref(f)                                           \
        (fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask])

#define fifo_push_front_ref(f)                                          \
        (fifo_full((f)) ? NULL : &(f)->data[--(f)->front & (f)->mask])

#define fifo_push_back(fifo, new)                                       \
({                                                                      \
        typeof((fifo)->data) _r = fifo_push_back_ref(fifo);             \
        if (_r)                                                         \
                *_r = (new);                                            \
        _r != NULL;                                                     \
})

#define fifo_push_front(fifo, new)                                      \
({                                                                      \
        typeof((fifo)->data) _r = fifo_push_front_ref(fifo);            \
        if (_r)                                                         \
                *_r = (new);                                            \
        _r != NULL;                                                     \
})

#define fifo_pop_front(fifo, i)                                         \
({                                                                      \
        bool _r = !fifo_empty((fifo));                                  \
        if (_r)                                                         \
                (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask];     \
        _r;                                                             \
})

#define fifo_pop_back(fifo, i)                                          \
({                                                                      \
        bool _r = !fifo_empty((fifo));                                  \
        if (_r)                                                         \
                (i) = (fifo)->data[--(fifo)->back & (fifo)->mask];      \
        _r;                                                             \
})

#define fifo_push_ref(fifo)     fifo_push_back_ref(fifo)
#define fifo_push(fifo, i)      fifo_push_back(fifo, (i))
#define fifo_pop(fifo, i)       fifo_pop_front(fifo, (i))
#define fifo_peek(fifo)         fifo_peek_front(fifo)

#define fifo_for_each_entry(_entry, _fifo, _iter)                       \
        for (((void) (&(_iter) == &(_fifo)->front)),                    \
             _iter = (_fifo)->front;                                    \
             ((_iter != (_fifo)->back) &&                               \
              (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \
             _iter++)

#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter)                     \
        for (((void) (&(_iter) == &(_fifo)->front)),                    \
             _iter = (_fifo)->front;                                    \
             ((_iter != (_fifo)->back) &&                               \
              (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true));  \
             _iter++)

#endif /* _BCACHEFS_FIFO_H */

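A minimal sketch, not part of this commit, showing the intended FIFO lifecycle with the macros above; the variable names are illustrative and allocation-failure handling is elided:

/* illustrative sketch, not part of the commit */
static void fifo_example(void)
{
        DECLARE_FIFO(u64, journal_seqs);
        u64 seq;

        init_fifo(&journal_seqs, 8, GFP_KERNEL);

        fifo_push(&journal_seqs, 42);

        while (fifo_pop(&journal_seqs, seq))
                ; /* consume seq */

        free_fifo(&journal_seqs);
}
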
2862
fs/bcachefs/fs-io.c
Normal file
File diff suppressed because it is too large

47
fs/bcachefs/fs-io.h
Normal file
@ -0,0 +1,47 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FS_IO_H
#define _BCACHEFS_FS_IO_H

#ifndef NO_BCACHEFS_FS

#include "buckets.h"
#include "io_types.h"

#include <linux/uio.h>

bool bch2_dirty_folio(struct address_space *, struct folio *);

int bch2_writepage(struct page *, struct writeback_control *);
int bch2_read_folio(struct file *, struct folio *);

int bch2_writepages(struct address_space *, struct writeback_control *);
void bch2_readahead(struct readahead_control *);

int bch2_write_begin(struct file *, struct address_space *, loff_t,
                     unsigned, struct page **, void **);
int bch2_write_end(struct file *, struct address_space *, loff_t,
                   unsigned, unsigned, struct page *, void *);

ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *);
ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);

int bch2_fsync(struct file *, loff_t, loff_t, int);

int bch2_truncate(struct bch_inode_info *, struct iattr *);
long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);

loff_t bch2_llseek(struct file *, loff_t, int);

vm_fault_t bch2_page_fault(struct vm_fault *);
vm_fault_t bch2_page_mkwrite(struct vm_fault *);
void bch2_invalidate_folio(struct folio *, size_t, size_t);
bool bch2_release_folio(struct folio *, gfp_t);

void bch2_fs_fsio_exit(struct bch_fs *);
int bch2_fs_fsio_init(struct bch_fs *);
#else
static inline void bch2_fs_fsio_exit(struct bch_fs *c) {}
static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; }
#endif

#endif /* _BCACHEFS_FS_IO_H */

312
fs/bcachefs/fs-ioctl.c
Normal file
@ -0,0 +1,312 @@
// SPDX-License-Identifier: GPL-2.0
#ifndef NO_BCACHEFS_FS

#include "bcachefs.h"
#include "chardev.h"
#include "fs.h"
#include "fs-ioctl.h"
#include "quota.h"

#include <linux/compat.h>
#include <linux/mount.h>

#define FS_IOC_GOINGDOWN        _IOR('X', 125, __u32)

/* Inode flags: */

/* bcachefs inode flags -> vfs inode flags: */
static const unsigned bch_flags_to_vfs[] = {
        [__BCH_INODE_SYNC]      = S_SYNC,
        [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE,
        [__BCH_INODE_APPEND]    = S_APPEND,
        [__BCH_INODE_NOATIME]   = S_NOATIME,
};

/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
static const unsigned bch_flags_to_uflags[] = {
        [__BCH_INODE_SYNC]      = FS_SYNC_FL,
        [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL,
        [__BCH_INODE_APPEND]    = FS_APPEND_FL,
        [__BCH_INODE_NODUMP]    = FS_NODUMP_FL,
        [__BCH_INODE_NOATIME]   = FS_NOATIME_FL,
};

/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
static const unsigned bch_flags_to_xflags[] = {
        [__BCH_INODE_SYNC]      = FS_XFLAG_SYNC,
        [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE,
        [__BCH_INODE_APPEND]    = FS_XFLAG_APPEND,
        [__BCH_INODE_NODUMP]    = FS_XFLAG_NODUMP,
        [__BCH_INODE_NOATIME]   = FS_XFLAG_NOATIME,
        //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT;
};

#define set_flags(_map, _in, _out)                                      \
do {                                                                    \
        unsigned _i;                                                    \
                                                                        \
        for (_i = 0; _i < ARRAY_SIZE(_map); _i++)                       \
                if ((_in) & (1 << _i))                                  \
                        (_out) |= _map[_i];                             \
                else                                                    \
                        (_out) &= ~_map[_i];                            \
} while (0)

#define map_flags(_map, _in)                                            \
({                                                                      \
        unsigned _out = 0;                                              \
                                                                        \
        set_flags(_map, _in, _out);                                     \
        _out;                                                           \
})

#define map_flags_rev(_map, _in)                                        \
({                                                                      \
        unsigned _i, _out = 0;                                          \
                                                                        \
        for (_i = 0; _i < ARRAY_SIZE(_map); _i++)                       \
                if ((_in) & _map[_i]) {                                 \
                        (_out) |= 1 << _i;                              \
                        (_in) &= ~_map[_i];                             \
                }                                                       \
        (_out);                                                         \
})

#define map_defined(_map)                                               \
({                                                                      \
        unsigned _in = ~0;                                              \
                                                                        \
        map_flags_rev(_map, _in);                                       \
})

/* Set VFS inode flags from bcachefs inode: */
void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
{
        set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
}

struct flags_set {
        unsigned        mask;
        unsigned        flags;

        unsigned        projid;
};

static int bch2_inode_flags_set(struct bch_inode_info *inode,
                                struct bch_inode_unpacked *bi,
                                void *p)
{
        /*
         * We're relying on btree locking here for exclusion with other ioctl
         * calls - use the flags in the btree (@bi), not inode->i_flags:
         */
        struct flags_set *s = p;
        unsigned newflags = s->flags;
        unsigned oldflags = bi->bi_flags & s->mask;

        if (((newflags ^ oldflags) & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) &&
            !capable(CAP_LINUX_IMMUTABLE))
                return -EPERM;

        if (!S_ISREG(inode->v.i_mode) &&
            !S_ISDIR(inode->v.i_mode) &&
            (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags)
                return -EINVAL;

        bi->bi_flags &= ~s->mask;
        bi->bi_flags |= newflags;
        inode_set_ctime_current(&inode->v);
        return 0;
}

static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg)
{
        unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags);

        return put_user(flags, arg);
}

static int bch2_ioc_setflags(struct bch_fs *c,
                             struct file *file,
                             struct bch_inode_info *inode,
                             void __user *arg)
{
        struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) };
        unsigned uflags;
        int ret;

        if (get_user(uflags, (int __user *) arg))
                return -EFAULT;

        s.flags = map_flags_rev(bch_flags_to_uflags, uflags);
        if (uflags)
                return -EOPNOTSUPP;

        ret = mnt_want_write_file(file);
        if (ret)
                return ret;

        inode_lock(&inode->v);
        if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
                ret = -EACCES;
                goto setflags_out;
        }

        mutex_lock(&inode->ei_update_lock);
        ret = __bch2_write_inode(c, inode, bch2_inode_flags_set, &s, 0);

        if (!ret)
                bch2_inode_flags_to_vfs(inode);
        mutex_unlock(&inode->ei_update_lock);

setflags_out:
        inode_unlock(&inode->v);
        mnt_drop_write_file(file);
        return ret;
}

static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
                               struct fsxattr __user *arg)
{
        struct fsxattr fa = { 0 };

        fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
        fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ];

        return copy_to_user(arg, &fa, sizeof(fa));
}

static int bch2_set_projid(struct bch_fs *c,
                           struct bch_inode_info *inode,
                           u32 projid)
{
        struct bch_qid qid = inode->ei_qid;
        int ret;

        if (projid == inode->ei_qid.q[QTYP_PRJ])
                return 0;

        qid.q[QTYP_PRJ] = projid;

        ret = bch2_quota_transfer(c, 1 << QTYP_PRJ, qid, inode->ei_qid,
                                  inode->v.i_blocks +
                                  inode->ei_quota_reserved);
        if (ret)
                return ret;

        inode->ei_qid.q[QTYP_PRJ] = projid;
        return 0;
}

static int fssetxattr_inode_update_fn(struct bch_inode_info *inode,
                                      struct bch_inode_unpacked *bi,
                                      void *p)
{
        struct flags_set *s = p;

        bi->bi_project = s->projid;

        return bch2_inode_flags_set(inode, bi, p);
}

static int bch2_ioc_fssetxattr(struct bch_fs *c,
                               struct file *file,
                               struct bch_inode_info *inode,
                               struct fsxattr __user *arg)
{
        struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) };
        struct fsxattr fa;
        int ret;

        if (copy_from_user(&fa, arg, sizeof(fa)))
                return -EFAULT;

        s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags);
        if (fa.fsx_xflags)
                return -EOPNOTSUPP;

        s.projid = fa.fsx_projid;

        ret = mnt_want_write_file(file);
        if (ret)
                return ret;

        inode_lock(&inode->v);
        if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
                ret = -EACCES;
                goto err;
        }

        mutex_lock(&inode->ei_update_lock);
        ret = bch2_set_projid(c, inode, fa.fsx_projid);
        if (ret)
                goto err_unlock;

        ret = __bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, 0);
        if (!ret)
                bch2_inode_flags_to_vfs(inode);
err_unlock:
        mutex_unlock(&inode->ei_update_lock);
err:
        inode_unlock(&inode->v);
        mnt_drop_write_file(file);
        return ret;
}

long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
        struct bch_inode_info *inode = file_bch_inode(file);
        struct super_block *sb = inode->v.i_sb;
        struct bch_fs *c = sb->s_fs_info;

        switch (cmd) {
        case FS_IOC_GETFLAGS:
                return bch2_ioc_getflags(inode, (int __user *) arg);

        case FS_IOC_SETFLAGS:
                return bch2_ioc_setflags(c, file, inode, (int __user *) arg);

        case FS_IOC_FSGETXATTR:
                return bch2_ioc_fsgetxattr(inode, (void __user *) arg);
        case FS_IOC_FSSETXATTR:
                return bch2_ioc_fssetxattr(c, file, inode, (void __user *) arg);

        case FS_IOC_GETVERSION:
                return -ENOTTY;
        case FS_IOC_SETVERSION:
                return -ENOTTY;

        case FS_IOC_GOINGDOWN:
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;

                down_write(&sb->s_umount);
                sb->s_flags |= SB_RDONLY;
                bch2_fs_emergency_read_only(c);
                up_write(&sb->s_umount);
                return 0;

        default:
                return bch2_fs_ioctl(c, cmd, (void __user *) arg);
        }
}

#ifdef CONFIG_COMPAT
long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
        /* These are just misnamed, they actually get/put from/to user an int */
        switch (cmd) {
        case FS_IOC_GETFLAGS:
                cmd = FS_IOC_GETFLAGS;
                break;
        case FS_IOC32_SETFLAGS:
                cmd = FS_IOC_SETFLAGS;
                break;
        default:
                return -ENOIOCTLCMD;
        }
        return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
}
#endif

#endif /* NO_BCACHEFS_FS */

10
fs/bcachefs/fs-ioctl.h
Normal file
@ -0,0 +1,10 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FS_IOCTL_H
#define _BCACHEFS_FS_IOCTL_H

void bch2_inode_flags_to_vfs(struct bch_inode_info *);

long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long);
long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long);

#endif /* _BCACHEFS_FS_IOCTL_H */

1773
fs/bcachefs/fs.c
Normal file
File diff suppressed because it is too large

99
fs/bcachefs/fs.h
Normal file
@ -0,0 +1,99 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FS_H
#define _BCACHEFS_FS_H

#include "opts.h"
#include "str_hash.h"
#include "quota_types.h"

#include <linux/seqlock.h>
#include <linux/stat.h>

/*
 * Two-state lock - can be taken for add or block - both states are shared,
 * like read side of rwsem, but conflict with other state:
 */
struct pagecache_lock {
        atomic_long_t           v;
        wait_queue_head_t       wait;
};

static inline void pagecache_lock_init(struct pagecache_lock *lock)
{
        atomic_long_set(&lock->v, 0);
        init_waitqueue_head(&lock->wait);
}

void bch2_pagecache_add_put(struct pagecache_lock *);
void bch2_pagecache_add_get(struct pagecache_lock *);
void bch2_pagecache_block_put(struct pagecache_lock *);
void bch2_pagecache_block_get(struct pagecache_lock *);

struct bch_inode_info {
        struct inode            v;

        struct mutex            ei_update_lock;
        u64                     ei_journal_seq;
        u64                     ei_quota_reserved;
        unsigned long           ei_last_dirtied;
        struct pagecache_lock   ei_pagecache_lock;

        struct mutex            ei_quota_lock;
        struct bch_qid          ei_qid;

        struct bch_hash_info    ei_str_hash;

        /* copy of inode in btree: */
        struct bch_inode_unpacked ei_inode;
};

#define to_bch_ei(_inode)                                       \
        container_of_or_null(_inode, struct bch_inode_info, v)

static inline struct bch_inode_info *file_bch_inode(struct file *file)
{
        return to_bch_ei(file_inode(file));
}

static inline u8 mode_to_type(umode_t mode)
{
        return (mode >> 12) & 15;
}

static inline unsigned nlink_bias(umode_t mode)
{
        return S_ISDIR(mode) ? 2 : 1;
}

struct bch_inode_unpacked;

#ifndef NO_BCACHEFS_FS

/* returns 0 if we want to do the update, or error is passed up */
typedef int (*inode_set_fn)(struct bch_inode_info *,
                            struct bch_inode_unpacked *, void *);

void bch2_inode_update_after_write(struct bch_fs *,
                                   struct bch_inode_info *,
                                   struct bch_inode_unpacked *,
                                   unsigned);
int __must_check bch2_write_inode_trans(struct btree_trans *,
                                struct bch_inode_info *,
                                struct bch_inode_unpacked *,
                                inode_set_fn, void *);
int __must_check __bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
                                    inode_set_fn, void *, unsigned);
int __must_check bch2_write_inode(struct bch_fs *,
                                  struct bch_inode_info *);

void bch2_vfs_exit(void);
int bch2_vfs_init(void);

#else

static inline void bch2_vfs_exit(void) {}
static inline int bch2_vfs_init(void) { return 0; }

#endif /* NO_BCACHEFS_FS */

#endif /* _BCACHEFS_FS_H */

1306
fs/bcachefs/fsck.c
Normal file
File diff suppressed because it is too large

8
fs/bcachefs/fsck.h
Normal file
@ -0,0 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FSCK_H
#define _BCACHEFS_FSCK_H

s64 bch2_count_inode_sectors(struct bch_fs *, u64);
int bch2_fsck(struct bch_fs *);

#endif /* _BCACHEFS_FSCK_H */

517
fs/bcachefs/inode.c
Normal file
@ -0,0 +1,517 @@
// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_update.h"
#include "error.h"
#include "extents.h"
#include "inode.h"
#include "io.h"
#include "keylist.h"

#include <linux/random.h>

#include <asm/unaligned.h>

#define FIELD_BYTES()                                           \

static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
static const u8 bits_table[8] = {
        1  * 8 - 1,
        2  * 8 - 2,
        3  * 8 - 3,
        4  * 8 - 4,
        6  * 8 - 5,
        8  * 8 - 6,
        10 * 8 - 7,
        13 * 8 - 8,
};

static int inode_encode_field(u8 *out, u8 *end, u64 hi, u64 lo)
{
        __be64 in[2] = { cpu_to_be64(hi), cpu_to_be64(lo), };
        unsigned shift, bytes, bits = likely(!hi)
                ? fls64(lo)
                : fls64(hi) + 64;

        for (shift = 1; shift <= 8; shift++)
                if (bits < bits_table[shift - 1])
                        goto got_shift;

        BUG();
got_shift:
        bytes = byte_table[shift - 1];

        BUG_ON(out + bytes > end);

        memcpy(out, (u8 *) in + 16 - bytes, bytes);
        *out |= (1 << 8) >> shift;

        return bytes;
}

static int inode_decode_field(const u8 *in, const u8 *end,
                              u64 out[2], unsigned *out_bits)
{
        __be64 be[2] = { 0, 0 };
        unsigned bytes, shift;
        u8 *p;

        if (in >= end)
                return -1;

        if (!*in)
                return -1;

        /*
         * position of highest set bit indicates number of bytes:
         * shift = number of bits to remove in high byte:
         */
        shift   = 8 - __fls(*in); /* 1 <= shift <= 8 */
        bytes   = byte_table[shift - 1];

        if (in + bytes > end)
                return -1;

        p = (u8 *) be + 16 - bytes;
        memcpy(p, in, bytes);
        *p ^= (1 << 8) >> shift;

        out[0] = be64_to_cpu(be[0]);
        out[1] = be64_to_cpu(be[1]);
        *out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]);

        return bytes;
}
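
/*
 * Worked example of the variable length encoding above (added for clarity,
 * not part of the original commit): encoding the value 1000 (hi = 0,
 * lo = 1000), fls64(1000) = 10 bits, and the smallest bits_table[] entry
 * that can hold 10 bits is 14 (shift = 2), so byte_table[] selects a 2 byte
 * field. The two low big-endian bytes of 1000 are 0x03 0xe8, and
 * (1 << 8) >> 2 = 0x40 is or'd into the first byte as the length marker,
 * giving 0x43 0xe8 on disk. inode_decode_field() reads __fls(0x43) = 6,
 * so shift = 2 and bytes = 2, strips the 0x40 marker and recovers 1000.
 */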
|
||||
void bch2_inode_pack(struct bkey_inode_buf *packed,
|
||||
const struct bch_inode_unpacked *inode)
|
||||
{
|
||||
u8 *out = packed->inode.v.fields;
|
||||
u8 *end = (void *) &packed[1];
|
||||
u8 *last_nonzero_field = out;
|
||||
unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
|
||||
|
||||
bkey_inode_init(&packed->inode.k_i);
|
||||
packed->inode.k.p.inode = inode->bi_inum;
|
||||
packed->inode.v.bi_hash_seed = inode->bi_hash_seed;
|
||||
packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags);
|
||||
packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode);
|
||||
|
||||
#define BCH_INODE_FIELD(_name, _bits) \
|
||||
out += inode_encode_field(out, end, 0, inode->_name); \
|
||||
nr_fields++; \
|
||||
\
|
||||
if (inode->_name) { \
|
||||
last_nonzero_field = out; \
|
||||
last_nonzero_fieldnr = nr_fields; \
|
||||
}
|
||||
|
||||
BCH_INODE_FIELDS()
|
||||
#undef BCH_INODE_FIELD
|
||||
|
||||
out = last_nonzero_field;
|
||||
nr_fields = last_nonzero_fieldnr;
|
||||
|
||||
set_bkey_val_bytes(&packed->inode.k, out - (u8 *) &packed->inode.v);
|
||||
memset(out, 0,
|
||||
(u8 *) &packed->inode.v +
|
||||
bkey_val_bytes(&packed->inode.k) - out);
|
||||
|
||||
SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields);
|
||||
|
||||
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
|
||||
struct bch_inode_unpacked unpacked;
|
||||
|
||||
int ret = bch2_inode_unpack(inode_i_to_s_c(&packed->inode),
|
||||
&unpacked);
|
||||
BUG_ON(ret);
|
||||
BUG_ON(unpacked.bi_inum != inode->bi_inum);
|
||||
BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed);
|
||||
BUG_ON(unpacked.bi_mode != inode->bi_mode);
|
||||
|
||||
#define BCH_INODE_FIELD(_name, _bits) BUG_ON(unpacked._name != inode->_name);
|
||||
BCH_INODE_FIELDS()
|
||||
#undef BCH_INODE_FIELD
|
||||
}
|
||||
}
|
||||
|
||||
int bch2_inode_unpack(struct bkey_s_c_inode inode,
|
||||
struct bch_inode_unpacked *unpacked)
|
||||
{
|
||||
const u8 *in = inode.v->fields;
|
||||
const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k);
|
||||
u64 field[2];
|
||||
unsigned fieldnr = 0, field_bits;
|
||||
int ret;
|
||||
|
||||
unpacked->bi_inum = inode.k->p.inode;
|
||||
unpacked->bi_hash_seed = inode.v->bi_hash_seed;
|
||||
unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
|
||||
unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
|
||||
|
||||
#define BCH_INODE_FIELD(_name, _bits) \
|
||||
if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \
|
||||
unsigned offset = offsetof(struct bch_inode_unpacked, _name);\
|
||||
memset((void *) unpacked + offset, 0, \
|
||||
sizeof(*unpacked) - offset); \
|
||||
return 0; \
|
||||
} \
|
||||
\
|
||||
ret = inode_decode_field(in, end, field, &field_bits); \
|
||||
if (ret < 0) \
|
||||
return ret; \
|
||||
\
|
||||
if (field_bits > sizeof(unpacked->_name) * 8) \
|
||||
return -1; \
|
||||
\
|
||||
unpacked->_name = field[1]; \
|
||||
in += ret;
|
||||
|
||||
BCH_INODE_FIELDS()
|
||||
#undef BCH_INODE_FIELD
|
||||
|
||||
/* XXX: signal if there were more fields than expected? */
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
|
||||
{
|
||||
if (k.k->p.offset)
|
||||
return "nonzero offset";
|
||||
|
||||
switch (k.k->type) {
|
||||
case BCH_INODE_FS: {
|
||||
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
|
||||
struct bch_inode_unpacked unpacked;
|
||||
|
||||
if (bkey_val_bytes(k.k) < sizeof(struct bch_inode))
|
||||
return "incorrect value size";
|
||||
|
||||
if (k.k->p.inode < BLOCKDEV_INODE_MAX)
|
||||
return "fs inode in blockdev range";
|
||||
|
||||
if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR)
|
||||
return "invalid str hash type";
|
||||
|
||||
if (bch2_inode_unpack(inode, &unpacked))
|
||||
return "invalid variable length fields";
|
||||
|
||||
if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
|
||||
return "invalid data checksum type";
|
||||
|
||||
if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
|
||||
return "invalid data checksum type";
|
||||
|
||||
if ((unpacked.bi_flags & BCH_INODE_UNLINKED) &&
|
||||
unpacked.bi_nlink != 0)
|
||||
return "flagged as unlinked but bi_nlink != 0";
|
||||
|
||||
return NULL;
|
||||
}
|
||||
case BCH_INODE_BLOCKDEV:
|
||||
if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_blockdev))
|
||||
return "incorrect value size";
|
||||
|
||||
if (k.k->p.inode >= BLOCKDEV_INODE_MAX)
|
||||
return "blockdev inode in fs range";
|
||||
|
||||
return NULL;
|
||||
case BCH_INODE_GENERATION:
|
||||
if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation))
|
||||
return "incorrect value size";
|
||||
|
||||
return NULL;
|
||||
default:
|
||||
return "invalid type";
|
||||
}
|
||||
}
|
||||
|
||||
void bch2_inode_to_text(struct bch_fs *c, char *buf,
|
||||
size_t size, struct bkey_s_c k)
|
||||
{
|
||||
char *out = buf, *end = out + size;
|
||||
struct bkey_s_c_inode inode;
|
||||
struct bch_inode_unpacked unpacked;
|
||||
|
||||
switch (k.k->type) {
|
||||
case BCH_INODE_FS:
|
||||
inode = bkey_s_c_to_inode(k);
|
||||
if (bch2_inode_unpack(inode, &unpacked)) {
|
||||
out += scnprintf(out, end - out, "(unpack error)");
|
||||
break;
|
||||
}
|
||||
|
||||
#define BCH_INODE_FIELD(_name, _bits) \
|
||||
out += scnprintf(out, end - out, #_name ": %llu ", (u64) unpacked._name);
|
||||
BCH_INODE_FIELDS()
|
||||
#undef BCH_INODE_FIELD
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
|
||||
uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
|
||||
struct bch_inode_unpacked *parent)
|
||||
{
|
||||
s64 now = bch2_current_time(c);
|
||||
|
||||
memset(inode_u, 0, sizeof(*inode_u));
|
||||
|
||||
/* ick */
|
||||
inode_u->bi_flags |= c->opts.str_hash << INODE_STR_HASH_OFFSET;
|
||||
get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed));
|
||||
|
||||
inode_u->bi_mode = mode;
|
||||
inode_u->bi_uid = uid;
|
||||
inode_u->bi_gid = gid;
|
||||
inode_u->bi_dev = rdev;
|
||||
inode_u->bi_atime = now;
|
||||
inode_u->bi_mtime = now;
|
||||
inode_u->bi_ctime = now;
|
||||
inode_u->bi_otime = now;
|
||||
|
||||
if (parent) {
|
||||
#define BCH_INODE_FIELD(_name) inode_u->_name = parent->_name;
|
||||
BCH_INODE_FIELDS_INHERIT()
|
||||
#undef BCH_INODE_FIELD
|
||||
}
|
||||
}
|
||||
|
||||
static inline u32 bkey_generation(struct bkey_s_c k)
|
||||
{
|
||||
switch (k.k->type) {
|
||||
case BCH_INODE_BLOCKDEV:
|
||||
case BCH_INODE_FS:
|
||||
BUG();
|
||||
case BCH_INODE_GENERATION:
|
||||
return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
}
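/*
 * When an inode is deleted, its slot in the inodes btree is replaced with a
 * BCH_INODE_GENERATION key (see bch2_inode_rm() below) so that if the inode
 * number is later reused, the new inode starts from a higher bi_generation -
 * letting stale references to the old inode (e.g. exported file handles) be
 * detected.
 */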
|
||||
|
||||
int __bch2_inode_create(struct btree_trans *trans,
|
||||
struct bch_inode_unpacked *inode_u,
|
||||
u64 min, u64 max, u64 *hint)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct bkey_inode_buf *inode_p;
|
||||
struct btree_iter *iter;
|
||||
u64 start;
|
||||
int ret;
|
||||
|
||||
if (!max)
|
||||
max = ULLONG_MAX;
|
||||
|
||||
if (c->opts.inodes_32bit)
|
||||
max = min_t(u64, max, U32_MAX);
|
||||
|
||||
start = READ_ONCE(*hint);
|
||||
|
||||
if (start >= max || start < min)
|
||||
start = min;
|
||||
|
||||
inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
|
||||
if (IS_ERR(inode_p))
|
||||
return PTR_ERR(inode_p);
|
||||
|
||||
iter = bch2_trans_get_iter(trans,
|
||||
BTREE_ID_INODES, POS(start, 0),
|
||||
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
|
||||
if (IS_ERR(iter))
|
||||
return PTR_ERR(iter);
|
||||
again:
|
||||
while (1) {
|
||||
struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
|
||||
|
||||
ret = btree_iter_err(k);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
switch (k.k->type) {
|
||||
case BCH_INODE_BLOCKDEV:
|
||||
case BCH_INODE_FS:
|
||||
/* slot used */
|
||||
if (iter->pos.inode >= max)
|
||||
goto out;
|
||||
|
||||
bch2_btree_iter_next_slot(iter);
|
||||
break;
|
||||
|
||||
default:
|
||||
*hint = k.k->p.inode;
|
||||
inode_u->bi_inum = k.k->p.inode;
|
||||
inode_u->bi_generation = bkey_generation(k);
|
||||
|
||||
bch2_inode_pack(inode_p, inode_u);
|
||||
bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
out:
|
||||
if (start != min) {
|
||||
/* Retry from start */
|
||||
start = min;
|
||||
bch2_btree_iter_set_pos(iter, POS(start, 0));
|
||||
goto again;
|
||||
}
|
||||
|
||||
return -ENOSPC;
|
||||
}
|
||||
|
||||
int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
|
||||
u64 min, u64 max, u64 *hint)
|
||||
{
|
||||
return bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC,
|
||||
__bch2_inode_create(&trans, inode_u, min, max, hint));
|
||||
}
|
||||
|
||||
int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size,
|
||||
struct extent_insert_hook *hook, u64 *journal_seq)
|
||||
{
|
||||
return bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
|
||||
POS(inode_nr, new_size),
|
||||
POS(inode_nr + 1, 0),
|
||||
ZERO_VERSION, NULL, hook,
|
||||
journal_seq);
|
||||
}
|
||||
|
||||
int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
|
||||
{
|
||||
struct btree_iter iter;
|
||||
struct bkey_i_inode_generation delete;
|
||||
int ret;
|
||||
|
||||
ret = bch2_inode_truncate(c, inode_nr, 0, NULL, NULL);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS,
|
||||
POS(inode_nr, 0),
|
||||
POS(inode_nr + 1, 0),
|
||||
ZERO_VERSION, NULL, NULL, NULL);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
/*
|
||||
* If this was a directory, there shouldn't be any real dirents left -
|
||||
* but there could be whiteouts (from hash collisions) that we should
|
||||
* delete:
|
||||
*
|
||||
* XXX: the dirent code could ideally delete whiteouts when they're no
|
||||
* longer needed
|
||||
*/
|
||||
ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
|
||||
POS(inode_nr, 0),
|
||||
POS(inode_nr + 1, 0),
|
||||
ZERO_VERSION, NULL, NULL, NULL);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inode_nr, 0),
|
||||
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
|
||||
do {
|
||||
struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
|
||||
u32 bi_generation = 0;
|
||||
|
||||
ret = btree_iter_err(k);
|
||||
if (ret) {
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
return ret;
|
||||
}
|
||||
|
||||
bch2_fs_inconsistent_on(k.k->type != BCH_INODE_FS, c,
|
||||
"inode %llu not found when deleting",
|
||||
inode_nr);
|
||||
|
||||
switch (k.k->type) {
|
||||
case BCH_INODE_FS: {
|
||||
struct bch_inode_unpacked inode_u;
|
||||
|
||||
if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u))
|
||||
bi_generation = inode_u.bi_generation + 1;
|
||||
break;
|
||||
}
|
||||
case BCH_INODE_GENERATION: {
|
||||
struct bkey_s_c_inode_generation g =
|
||||
bkey_s_c_to_inode_generation(k);
|
||||
bi_generation = le32_to_cpu(g.v->bi_generation);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!bi_generation) {
|
||||
bkey_init(&delete.k);
|
||||
delete.k.p.inode = inode_nr;
|
||||
} else {
|
||||
bkey_inode_generation_init(&delete.k_i);
|
||||
delete.k.p.inode = inode_nr;
|
||||
delete.v.bi_generation = cpu_to_le32(bi_generation);
|
||||
}
|
||||
|
||||
ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
|
||||
BTREE_INSERT_ATOMIC|
|
||||
BTREE_INSERT_NOFAIL,
|
||||
BTREE_INSERT_ENTRY(&iter, &delete.k_i));
|
||||
} while (ret == -EINTR);
|
||||
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,
|
||||
struct bch_inode_unpacked *inode)
|
||||
{
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
int ret = -ENOENT;
|
||||
|
||||
for_each_btree_key(&iter, c, BTREE_ID_INODES,
|
||||
POS(inode_nr, 0),
|
||||
BTREE_ITER_SLOTS, k) {
|
||||
switch (k.k->type) {
|
||||
case BCH_INODE_FS:
|
||||
ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
|
||||
break;
|
||||
default:
|
||||
/* hole, not found */
|
||||
break;
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
}
|
||||
|
||||
return bch2_btree_iter_unlock(&iter) ?: ret;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
void bch2_inode_pack_test(void)
|
||||
{
|
||||
struct bch_inode_unpacked *u, test_inodes[] = {
|
||||
{
|
||||
.bi_atime = U64_MAX,
|
||||
.bi_ctime = U64_MAX,
|
||||
.bi_mtime = U64_MAX,
|
||||
.bi_otime = U64_MAX,
|
||||
.bi_size = U64_MAX,
|
||||
.bi_sectors = U64_MAX,
|
||||
.bi_uid = U32_MAX,
|
||||
.bi_gid = U32_MAX,
|
||||
.bi_nlink = U32_MAX,
|
||||
.bi_generation = U32_MAX,
|
||||
.bi_dev = U32_MAX,
|
||||
},
|
||||
};
|
||||
|
||||
for (u = test_inodes;
|
||||
u < test_inodes + ARRAY_SIZE(test_inodes);
|
||||
u++) {
|
||||
struct bkey_inode_buf p;
|
||||
|
||||
bch2_inode_pack(&p, u);
|
||||
}
|
||||
}
|
||||
#endif
|
101
fs/bcachefs/inode.h
Normal file
101
fs/bcachefs/inode.h
Normal file
@ -0,0 +1,101 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_INODE_H
|
||||
#define _BCACHEFS_INODE_H
|
||||
|
||||
#include "opts.h"
|
||||
|
||||
#include <linux/math64.h>
|
||||
|
||||
const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c);
|
||||
void bch2_inode_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
|
||||
|
||||
#define bch2_bkey_inode_ops (struct bkey_ops) { \
|
||||
.key_invalid = bch2_inode_invalid, \
|
||||
.val_to_text = bch2_inode_to_text, \
|
||||
}
|
||||
|
||||
struct bch_inode_unpacked {
|
||||
u64 bi_inum;
|
||||
__le64 bi_hash_seed;
|
||||
u32 bi_flags;
|
||||
u16 bi_mode;
|
||||
|
||||
#define BCH_INODE_FIELD(_name, _bits) u##_bits _name;
|
||||
BCH_INODE_FIELDS()
|
||||
#undef BCH_INODE_FIELD
|
||||
};
|
||||
|
||||
struct bkey_inode_buf {
|
||||
struct bkey_i_inode inode;
|
||||
|
||||
#define BCH_INODE_FIELD(_name, _bits) + 8 + _bits / 8
|
||||
u8 _pad[0 + BCH_INODE_FIELDS()];
|
||||
#undef BCH_INODE_FIELD
|
||||
} __attribute__((packed, aligned(8)));
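/*
 * _pad is sized for the worst case: each variable length field can take up
 * to 13 bytes encoded (see byte_table[] in inode.c), and 8 + _bits / 8 is a
 * conservative upper bound on that per field.
 */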
|
||||
|
||||
void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
|
||||
int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
|
||||
|
||||
void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
|
||||
uid_t, gid_t, umode_t, dev_t,
|
||||
struct bch_inode_unpacked *);
|
||||
|
||||
int __bch2_inode_create(struct btree_trans *,
|
||||
struct bch_inode_unpacked *,
|
||||
u64, u64, u64 *);
|
||||
int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *,
|
||||
u64, u64, u64 *);
|
||||
|
||||
int bch2_inode_truncate(struct bch_fs *, u64, u64,
|
||||
struct extent_insert_hook *, u64 *);
|
||||
int bch2_inode_rm(struct bch_fs *, u64);
|
||||
|
||||
int bch2_inode_find_by_inum(struct bch_fs *, u64,
|
||||
struct bch_inode_unpacked *);
|
||||
|
||||
static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode)
|
||||
{
|
||||
struct bch_io_opts ret = { 0 };
|
||||
|
||||
#define BCH_INODE_OPT(_name, _bits) \
|
||||
if (inode->bi_##_name) \
|
||||
opt_set(ret, _name, inode->bi_##_name - 1);
|
||||
BCH_INODE_OPTS()
|
||||
#undef BCH_INODE_OPT
|
||||
return ret;
|
||||
}
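/*
 * Per inode IO options are stored biased by one: 0 means "not set, use the
 * filesystem wide default", so bch2_inode_opt_set() stores v + 1,
 * bch2_inode_opt_clear() stores 0, and bch2_inode_opts_get() above undoes
 * the bias.
 */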
|
||||
|
||||
static inline void __bch2_inode_opt_set(struct bch_inode_unpacked *inode,
|
||||
enum bch_opt_id id, u64 v)
|
||||
{
|
||||
switch (id) {
|
||||
#define BCH_INODE_OPT(_name, ...) \
|
||||
case Opt_##_name: \
|
||||
inode->bi_##_name = v; \
|
||||
break;
|
||||
BCH_INODE_OPTS()
|
||||
#undef BCH_INODE_OPT
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode,
|
||||
enum bch_opt_id id, u64 v)
|
||||
{
|
||||
return __bch2_inode_opt_set(inode, id, v + 1);
|
||||
}
|
||||
|
||||
static inline void bch2_inode_opt_clear(struct bch_inode_unpacked *inode,
|
||||
enum bch_opt_id id)
|
||||
{
|
||||
return __bch2_inode_opt_set(inode, id, 0);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
void bch2_inode_pack_test(void);
|
||||
#else
|
||||
static inline void bch2_inode_pack_test(void) {}
|
||||
#endif
|
||||
|
||||
#endif /* _BCACHEFS_INODE_H */
|
1875
fs/bcachefs/io.c
Normal file
1875
fs/bcachefs/io.c
Normal file
File diff suppressed because it is too large
144
fs/bcachefs/io.h
Normal file
144
fs/bcachefs/io.h
Normal file
@ -0,0 +1,144 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_IO_H
|
||||
#define _BCACHEFS_IO_H
|
||||
|
||||
#include "alloc.h"
|
||||
#include "checksum.h"
|
||||
#include "io_types.h"
|
||||
|
||||
#define to_wbio(_bio) \
|
||||
container_of((_bio), struct bch_write_bio, bio)
|
||||
|
||||
#define to_rbio(_bio) \
|
||||
container_of((_bio), struct bch_read_bio, bio)
|
||||
|
||||
void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
|
||||
void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
|
||||
void bch2_bio_alloc_more_pages_pool(struct bch_fs *, struct bio *, size_t);
|
||||
|
||||
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
|
||||
void bch2_latency_acct(struct bch_dev *, u64, int);
|
||||
#else
|
||||
static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
|
||||
#endif
|
||||
|
||||
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
|
||||
enum bch_data_type, const struct bkey_i *);
|
||||
|
||||
#define BLK_STS_REMOVED ((__force blk_status_t)128)
|
||||
|
||||
enum bch_write_flags {
|
||||
BCH_WRITE_ALLOC_NOWAIT = (1 << 0),
|
||||
BCH_WRITE_CACHED = (1 << 1),
|
||||
BCH_WRITE_FLUSH = (1 << 2),
|
||||
BCH_WRITE_DATA_ENCODED = (1 << 3),
|
||||
BCH_WRITE_PAGES_STABLE = (1 << 4),
|
||||
BCH_WRITE_PAGES_OWNED = (1 << 5),
|
||||
BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6),
|
||||
BCH_WRITE_NOPUT_RESERVATION = (1 << 7),
|
||||
BCH_WRITE_NOMARK_REPLICAS = (1 << 8),
|
||||
|
||||
/* Internal: */
|
||||
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9),
|
||||
};
|
||||
|
||||
static inline u64 *op_journal_seq(struct bch_write_op *op)
|
||||
{
|
||||
return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR)
|
||||
? op->journal_seq_p : &op->journal_seq;
|
||||
}
|
||||
|
||||
static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq)
|
||||
{
|
||||
op->journal_seq_p = journal_seq;
|
||||
op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
|
||||
}
|
||||
|
||||
static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
|
||||
{
|
||||
return op->alloc_reserve == RESERVE_MOVINGGC
|
||||
? op->c->copygc_wq
|
||||
: op->c->wq;
|
||||
}
|
||||
|
||||
int bch2_write_index_default(struct bch_write_op *);
|
||||
|
||||
static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
|
||||
struct bch_io_opts opts)
|
||||
{
|
||||
op->c = c;
|
||||
op->io_wq = index_update_wq(op);
|
||||
op->flags = 0;
|
||||
op->written = 0;
|
||||
op->error = 0;
|
||||
op->csum_type = bch2_data_checksum_type(c, opts.data_checksum);
|
||||
op->compression_type = bch2_compression_opt_to_type[opts.compression];
|
||||
op->nr_replicas = 0;
|
||||
op->nr_replicas_required = c->opts.data_replicas_required;
|
||||
op->alloc_reserve = RESERVE_NONE;
|
||||
op->open_buckets_nr = 0;
|
||||
op->devs_have.nr = 0;
|
||||
op->target = 0;
|
||||
op->opts = opts;
|
||||
op->pos = POS_MAX;
|
||||
op->version = ZERO_VERSION;
|
||||
op->write_point = (struct write_point_specifier) { 0 };
|
||||
op->res = (struct disk_reservation) { 0 };
|
||||
op->journal_seq = 0;
|
||||
op->index_update_fn = bch2_write_index_default;
|
||||
}
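/*
 * Rough sketch of how a write is submitted (the exact setup of the data bio
 * in op->wbio varies by caller):
 *
 *	struct bch_write_op op;
 *
 *	bch2_write_op_init(&op, c, io_opts);
 *	op.pos		= POS(inum, sector);
 *	op.nr_replicas	= c->opts.data_replicas;
 *	closure_call(&op.cl, bch2_write, index_update_wq(&op), &parent_cl);
 *
 * bch2_write() consumes the embedded closure; completion of both the data
 * write and the index update is signalled through the parent closure.
 */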
|
||||
|
||||
void bch2_write(struct closure *);
|
||||
|
||||
static inline struct bch_write_bio *wbio_init(struct bio *bio)
|
||||
{
|
||||
struct bch_write_bio *wbio = to_wbio(bio);
|
||||
|
||||
memset(&wbio->wbio, 0, sizeof(wbio->wbio));
|
||||
return wbio;
|
||||
}
|
||||
|
||||
struct bch_devs_mask;
|
||||
struct cache_promote_op;
|
||||
struct extent_pick_ptr;
|
||||
|
||||
int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
|
||||
struct bkey_s_c, struct bch_devs_mask *, unsigned);
|
||||
void bch2_read(struct bch_fs *, struct bch_read_bio *, u64);
|
||||
|
||||
enum bch_read_flags {
|
||||
BCH_READ_RETRY_IF_STALE = 1 << 0,
|
||||
BCH_READ_MAY_PROMOTE = 1 << 1,
|
||||
BCH_READ_USER_MAPPED = 1 << 2,
|
||||
BCH_READ_NODECODE = 1 << 3,
|
||||
BCH_READ_LAST_FRAGMENT = 1 << 4,
|
||||
|
||||
/* internal: */
|
||||
BCH_READ_MUST_BOUNCE = 1 << 5,
|
||||
BCH_READ_MUST_CLONE = 1 << 6,
|
||||
BCH_READ_IN_RETRY = 1 << 7,
|
||||
};
|
||||
|
||||
static inline void bch2_read_extent(struct bch_fs *c,
|
||||
struct bch_read_bio *rbio,
|
||||
struct bkey_s_c k,
|
||||
unsigned flags)
|
||||
{
|
||||
__bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, NULL, flags);
|
||||
}
|
||||
|
||||
static inline struct bch_read_bio *rbio_init(struct bio *bio,
|
||||
struct bch_io_opts opts)
|
||||
{
|
||||
struct bch_read_bio *rbio = to_rbio(bio);
|
||||
|
||||
rbio->_state = 0;
|
||||
rbio->promote = NULL;
|
||||
rbio->opts = opts;
|
||||
return rbio;
|
||||
}
|
||||
|
||||
void bch2_fs_io_exit(struct bch_fs *);
|
||||
int bch2_fs_io_init(struct bch_fs *);
|
||||
|
||||
#endif /* _BCACHEFS_IO_H */
|
148
fs/bcachefs/io_types.h
Normal file
148
fs/bcachefs/io_types.h
Normal file
@ -0,0 +1,148 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_IO_TYPES_H
|
||||
#define _BCACHEFS_IO_TYPES_H
|
||||
|
||||
#include "alloc_types.h"
|
||||
#include "btree_types.h"
|
||||
#include "buckets_types.h"
|
||||
#include "extents_types.h"
|
||||
#include "keylist_types.h"
|
||||
#include "opts.h"
|
||||
#include "super_types.h"
|
||||
|
||||
#include <linux/llist.h>
|
||||
#include <linux/workqueue.h>
|
||||
|
||||
struct bch_read_bio {
|
||||
struct bch_fs *c;
|
||||
u64 start_time;
|
||||
u64 submit_time;
|
||||
|
||||
/*
|
||||
* Reads will often have to be split, and if the extent being read from
|
||||
* was checksummed or compressed we'll also have to allocate bounce
|
||||
* buffers and copy the data back into the original bio.
|
||||
*
|
||||
* If we didn't have to split, we have to save and restore the original
|
||||
* bi_end_io - @split below indicates which:
|
||||
*/
|
||||
union {
|
||||
struct bch_read_bio *parent;
|
||||
bio_end_io_t *end_io;
|
||||
};
|
||||
|
||||
/*
|
||||
* Saved copy of bio->bi_iter, from submission time - allows us to
|
||||
* resubmit on IO error, and also to copy data back to the original bio
|
||||
* when we're bouncing:
|
||||
*/
|
||||
struct bvec_iter bvec_iter;
|
||||
|
||||
u16 flags;
|
||||
union {
|
||||
struct {
|
||||
u16 bounce:1,
|
||||
split:1,
|
||||
kmalloc:1,
|
||||
have_ioref:1,
|
||||
narrow_crcs:1,
|
||||
hole:1,
|
||||
retry:2,
|
||||
context:2;
|
||||
};
|
||||
u16 _state;
|
||||
};
|
||||
|
||||
struct bch_devs_list devs_have;
|
||||
|
||||
struct extent_pick_ptr pick;
|
||||
/* start pos of data we read (may not be pos of data we want) */
|
||||
struct bpos pos;
|
||||
struct bversion version;
|
||||
|
||||
struct promote_op *promote;
|
||||
|
||||
struct bch_io_opts opts;
|
||||
|
||||
struct work_struct work;
|
||||
|
||||
struct bio bio;
|
||||
};
|
||||
|
||||
struct bch_write_bio {
|
||||
struct_group(wbio,
|
||||
struct bch_fs *c;
|
||||
struct bch_write_bio *parent;
|
||||
|
||||
u64 submit_time;
|
||||
|
||||
struct bch_devs_list failed;
|
||||
u8 order;
|
||||
u8 dev;
|
||||
|
||||
unsigned split:1,
|
||||
bounce:1,
|
||||
put_bio:1,
|
||||
have_ioref:1,
|
||||
used_mempool:1;
|
||||
);
|
||||
|
||||
struct bio bio;
|
||||
};
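/*
 * The fields grouped in "wbio" above are exactly what wbio_init() (io.h)
 * zeroes when reusing a bio for a write - the embedded struct bio itself is
 * left untouched for the caller to initialize.
 */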
|
||||
|
||||
struct bch_write_op {
|
||||
struct closure cl;
|
||||
struct bch_fs *c;
|
||||
struct workqueue_struct *io_wq;
|
||||
u64 start_time;
|
||||
|
||||
unsigned written; /* sectors */
|
||||
u16 flags;
|
||||
s16 error; /* dio write path expects it to hold -ERESTARTSYS... */
|
||||
|
||||
unsigned csum_type:4;
|
||||
unsigned compression_type:4;
|
||||
unsigned nr_replicas:4;
|
||||
unsigned nr_replicas_required:4;
|
||||
unsigned alloc_reserve:4;
|
||||
|
||||
u8 open_buckets_nr;
|
||||
struct bch_devs_list devs_have;
|
||||
u16 target;
|
||||
u16 nonce;
|
||||
|
||||
struct bch_io_opts opts;
|
||||
|
||||
struct bpos pos;
|
||||
struct bversion version;
|
||||
|
||||
/* For BCH_WRITE_DATA_ENCODED: */
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
|
||||
struct write_point_specifier write_point;
|
||||
|
||||
struct disk_reservation res;
|
||||
|
||||
u8 open_buckets[16];
|
||||
|
||||
/*
|
||||
* If caller wants to flush but hasn't passed us a journal_seq ptr, we
|
||||
* still need to stash the journal_seq somewhere:
|
||||
*/
|
||||
union {
|
||||
u64 *journal_seq_p;
|
||||
u64 journal_seq;
|
||||
};
|
||||
|
||||
int (*index_update_fn)(struct bch_write_op *);
|
||||
|
||||
struct bch_devs_mask failed;
|
||||
|
||||
struct keylist insert_keys;
|
||||
u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2];
|
||||
|
||||
/* Must be last: */
|
||||
struct bch_write_bio wbio;
|
||||
};
|
||||
|
||||
#endif /* _BCACHEFS_IO_TYPES_H */
|
1140
fs/bcachefs/journal.c
Normal file
1140
fs/bcachefs/journal.c
Normal file
File diff suppressed because it is too large
383
fs/bcachefs/journal.h
Normal file
383
fs/bcachefs/journal.h
Normal file
@ -0,0 +1,383 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_JOURNAL_H
|
||||
#define _BCACHEFS_JOURNAL_H
|
||||
|
||||
/*
|
||||
* THE JOURNAL:
|
||||
*
|
||||
* The primary purpose of the journal is to log updates (insertions) to the
|
||||
* b-tree, to avoid having to do synchronous updates to the b-tree on disk.
|
||||
*
|
||||
* Without the journal, the b-tree is always internally consistent on
|
||||
* disk - and in fact, in the earliest incarnations bcache didn't have a journal
|
||||
* but did handle unclean shutdowns by doing all index updates synchronously
|
||||
* (with coalescing).
|
||||
*
|
||||
* Updates to interior nodes still happen synchronously and without the journal
|
||||
* (for simplicity) - this may change eventually but updates to interior nodes
|
||||
* are rare enough it's not a huge priority.
|
||||
*
|
||||
* This means the journal is relatively separate from the b-tree; it consists of
|
||||
* just a list of keys and journal replay consists of just redoing those
|
||||
* insertions in same order that they appear in the journal.
|
||||
*
|
||||
* PERSISTENCE:
|
||||
*
|
||||
* For synchronous updates (where we're waiting on the index update to hit
|
||||
* disk), the journal entry will be written out immediately (or as soon as
|
||||
* possible, if the write for the previous journal entry was still in flight).
|
||||
*
|
||||
* Synchronous updates are specified by passing a closure (@flush_cl) to
|
||||
* bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter
|
||||
* down to the journalling code. That closure will wait on the journal
|
||||
* write to complete (via closure_wait()).
|
||||
*
|
||||
* If the index update wasn't synchronous, the journal entry will be
|
||||
* written out after 10 ms have elapsed, by default (the delay_ms field
|
||||
* in struct journal).
|
||||
*
|
||||
* JOURNAL ENTRIES:
|
||||
*
|
||||
* A journal entry is variable size (struct jset), it's got a fixed length
|
||||
* header and then a variable number of struct jset_entry entries.
|
||||
*
|
||||
* Journal entries are identified by monotonically increasing 64 bit sequence
|
||||
* numbers - jset->seq; other places in the code refer to this sequence number.
|
||||
*
|
||||
* A jset_entry entry contains one or more bkeys (which is what gets inserted
|
||||
* into the b-tree). We need a container to indicate which b-tree the key is
|
||||
* for; also, the roots of the various b-trees are stored in jset_entry entries
|
||||
* (one for each b-tree) - this lets us add new b-tree types without changing
|
||||
* the on disk format.
|
||||
*
|
||||
* We also keep some things in the journal header that are logically part of the
|
||||
* superblock - all the things that are frequently updated. This is for future
|
||||
* bcache on raw flash support; the superblock (which will become another
|
||||
* journal) can't be moved or wear leveled, so it contains just enough
|
||||
* information to find the main journal, and the superblock only has to be
|
||||
* rewritten when we want to move/wear level the main journal.
|
||||
*
|
||||
* JOURNAL LAYOUT ON DISK:
|
||||
*
|
||||
* The journal is written to a ringbuffer of buckets (which is kept in the
|
||||
* superblock); the individual buckets are not necessarily contiguous on disk
|
||||
* which means that journal entries are not allowed to span buckets, but also
|
||||
* that we can resize the journal at runtime if desired (unimplemented).
|
||||
*
|
||||
* The journal buckets exist in the same pool as all the other buckets that are
|
||||
* managed by the allocator and garbage collection - garbage collection marks
|
||||
* the journal buckets as metadata buckets.
|
||||
*
|
||||
* OPEN/DIRTY JOURNAL ENTRIES:
|
||||
*
|
||||
* Open/dirty journal entries are journal entries that contain b-tree updates
|
||||
* that have not yet been written out to the b-tree on disk. We have to track
|
||||
* which journal entries are dirty, and we also have to avoid wrapping around
|
||||
* the journal and overwriting old but still dirty journal entries with new
|
||||
* journal entries.
|
||||
*
|
||||
* On disk, this is represented with the "last_seq" field of struct jset;
|
||||
* last_seq is the first sequence number that journal replay has to replay.
|
||||
*
|
||||
* To avoid overwriting dirty journal entries on disk, we keep a mapping (in
|
||||
* journal_device->seq): for each journal bucket, the highest sequence number of
|
||||
* any journal entry it contains. Then, by comparing that against last_seq we
|
||||
* can determine whether that journal bucket contains dirty journal entries or
|
||||
* not.
|
||||
*
|
||||
* To track which journal entries are dirty, we maintain a fifo of refcounts
|
||||
* (where each entry corresponds to a specific sequence number) - when a ref
|
||||
* goes to 0, that journal entry is no longer dirty.
|
||||
*
|
||||
* Journalling of index updates is done at the same time as the b-tree itself is
|
||||
* being modified (see btree_insert_key()); when we add the key to the journal
|
||||
* the pending b-tree write takes a ref on the journal entry the key was added
|
||||
* to. If a pending b-tree write would need to take refs on multiple dirty
|
||||
* journal entries, it only keeps the ref on the oldest one (since a newer
|
||||
* journal entry will still be replayed if an older entry was dirty).
|
||||
*
|
||||
* JOURNAL FILLING UP:
|
||||
*
|
||||
* There are two ways the journal could fill up; either we could run out of
|
||||
* space to write to, or we could have too many open journal entries and run out
|
||||
* of room in the fifo of refcounts. Since those refcounts are decremented
|
||||
* without any locking we can't safely resize that fifo, so we handle it the
|
||||
* same way.
|
||||
*
|
||||
* If the journal fills up, we start flushing dirty btree nodes until we can
|
||||
* allocate space for a journal write again - preferentially flushing btree
|
||||
* nodes that are pinning the oldest journal entries first.
|
||||
*/
|
||||
|
||||
#include <linux/hash.h>
|
||||
|
||||
#include "journal_types.h"
|
||||
|
||||
struct bch_fs;
|
||||
|
||||
static inline void journal_wake(struct journal *j)
|
||||
{
|
||||
wake_up(&j->wait);
|
||||
closure_wake_up(&j->async_wait);
|
||||
}
|
||||
|
||||
static inline struct journal_buf *journal_cur_buf(struct journal *j)
|
||||
{
|
||||
return j->buf + j->reservations.idx;
|
||||
}
|
||||
|
||||
static inline struct journal_buf *journal_prev_buf(struct journal *j)
|
||||
{
|
||||
return j->buf + !j->reservations.idx;
|
||||
}
|
||||
|
||||
/* Sequence number of oldest dirty journal entry */
|
||||
|
||||
static inline u64 journal_last_seq(struct journal *j)
|
||||
{
|
||||
return j->pin.front;
|
||||
}
|
||||
|
||||
static inline u64 journal_cur_seq(struct journal *j)
|
||||
{
|
||||
BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
|
||||
|
||||
return j->pin.back - 1;
|
||||
}
|
||||
|
||||
u64 bch2_inode_journal_seq(struct journal *, u64);
|
||||
|
||||
static inline int journal_state_count(union journal_res_state s, int idx)
|
||||
{
|
||||
return idx == 0 ? s.buf0_count : s.buf1_count;
|
||||
}
|
||||
|
||||
static inline void journal_state_inc(union journal_res_state *s)
|
||||
{
|
||||
s->buf0_count += s->idx == 0;
|
||||
s->buf1_count += s->idx == 1;
|
||||
}
|
||||
|
||||
static inline void bch2_journal_set_has_inode(struct journal *j,
|
||||
struct journal_res *res,
|
||||
u64 inum)
|
||||
{
|
||||
struct journal_buf *buf = &j->buf[res->idx];
|
||||
unsigned long bit = hash_64(inum, ilog2(sizeof(buf->has_inode) * 8));
|
||||
|
||||
/* avoid atomic op if possible */
|
||||
if (unlikely(!test_bit(bit, buf->has_inode)))
|
||||
set_bit(bit, buf->has_inode);
|
||||
}
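/*
 * buf->has_inode is a small hash based filter of the inodes that have keys
 * in this journal buffer: false positives (hash collisions) are harmless,
 * false negatives are not, which is why bits are only ever set here.
 * bch2_inode_journal_seq() uses it to avoid flushing journal entries that
 * can't contain a given inode's updates (e.g. for fsync).
 */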
|
||||
|
||||
/*
|
||||
* Amount of space that will be taken up by some keys in the journal (i.e.
|
||||
* including the jset_entry header)
|
||||
*/
|
||||
static inline unsigned jset_u64s(unsigned u64s)
|
||||
{
|
||||
return u64s + sizeof(struct jset_entry) / sizeof(u64);
|
||||
}
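/*
 * struct jset_entry is 8 bytes (one u64) of header, so e.g. journalling a
 * key of 3 u64s reserves jset_u64s(3) == 4 u64s in the journal buffer.
 */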
|
||||
|
||||
static inline struct jset_entry *
|
||||
bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
|
||||
{
|
||||
struct jset *jset = buf->data;
|
||||
struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s));
|
||||
|
||||
memset(entry, 0, sizeof(*entry));
|
||||
entry->u64s = cpu_to_le16(u64s);
|
||||
|
||||
le32_add_cpu(&jset->u64s, jset_u64s(u64s));
|
||||
|
||||
return entry;
|
||||
}
|
||||
|
||||
static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res,
|
||||
unsigned type, enum btree_id id,
|
||||
unsigned level,
|
||||
const void *data, unsigned u64s)
|
||||
{
|
||||
struct journal_buf *buf = &j->buf[res->idx];
|
||||
struct jset_entry *entry = vstruct_idx(buf->data, res->offset);
|
||||
unsigned actual = jset_u64s(u64s);
|
||||
|
||||
EBUG_ON(!res->ref);
|
||||
EBUG_ON(actual > res->u64s);
|
||||
|
||||
res->offset += actual;
|
||||
res->u64s -= actual;
|
||||
|
||||
entry->u64s = cpu_to_le16(u64s);
|
||||
entry->btree_id = id;
|
||||
entry->level = level;
|
||||
entry->type = type;
|
||||
entry->pad[0] = 0;
|
||||
entry->pad[1] = 0;
|
||||
entry->pad[2] = 0;
|
||||
memcpy_u64s(entry->_data, data, u64s);
|
||||
}
|
||||
|
||||
static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res,
|
||||
enum btree_id id, const struct bkey_i *k)
|
||||
{
|
||||
bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys,
|
||||
id, 0, k, k->k.u64s);
|
||||
}
|
||||
|
||||
void bch2_journal_buf_put_slowpath(struct journal *, bool);
|
||||
|
||||
static inline void bch2_journal_buf_put(struct journal *j, unsigned idx,
|
||||
bool need_write_just_set)
|
||||
{
|
||||
union journal_res_state s;
|
||||
|
||||
s.v = atomic64_sub_return(((union journal_res_state) {
|
||||
.buf0_count = idx == 0,
|
||||
.buf1_count = idx == 1,
|
||||
}).v, &j->reservations.counter);
|
||||
|
||||
EBUG_ON(s.idx != idx && !s.prev_buf_unwritten);
|
||||
|
||||
/*
|
||||
* Do not initiate a journal write if the journal is in an error state
|
||||
* (previous journal entry write may have failed)
|
||||
*/
|
||||
if (s.idx != idx &&
|
||||
!journal_state_count(s, idx) &&
|
||||
s.cur_entry_offset != JOURNAL_ENTRY_ERROR_VAL)
|
||||
bch2_journal_buf_put_slowpath(j, need_write_just_set);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function releases the journal write structure so other threads can
|
||||
* then proceed to add their keys as well.
|
||||
*/
|
||||
static inline void bch2_journal_res_put(struct journal *j,
|
||||
struct journal_res *res)
|
||||
{
|
||||
if (!res->ref)
|
||||
return;
|
||||
|
||||
lock_release(&j->res_map, _RET_IP_);
|
||||
|
||||
while (res->u64s)
|
||||
bch2_journal_add_entry(j, res,
|
||||
BCH_JSET_ENTRY_btree_keys,
|
||||
0, 0, NULL, 0);
|
||||
|
||||
bch2_journal_buf_put(j, res->idx, false);
|
||||
|
||||
res->ref = 0;
|
||||
}
|
||||
|
||||
int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
|
||||
unsigned, unsigned);
|
||||
|
||||
static inline int journal_res_get_fast(struct journal *j,
|
||||
struct journal_res *res,
|
||||
unsigned u64s_min,
|
||||
unsigned u64s_max)
|
||||
{
|
||||
union journal_res_state old, new;
|
||||
u64 v = atomic64_read(&j->reservations.counter);
|
||||
|
||||
do {
|
||||
old.v = new.v = v;
|
||||
|
||||
/*
|
||||
* Check if there is still room in the current journal
|
||||
* entry:
|
||||
*/
|
||||
if (old.cur_entry_offset + u64s_min > j->cur_entry_u64s)
|
||||
return 0;
|
||||
|
||||
res->offset = old.cur_entry_offset;
|
||||
res->u64s = min(u64s_max, j->cur_entry_u64s -
|
||||
old.cur_entry_offset);
|
||||
|
||||
journal_state_inc(&new);
|
||||
new.cur_entry_offset += res->u64s;
|
||||
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
|
||||
old.v, new.v)) != old.v);
|
||||
|
||||
res->ref = true;
|
||||
res->idx = new.idx;
|
||||
res->seq = le64_to_cpu(j->buf[res->idx].data->seq);
|
||||
return 1;
|
||||
}
|
||||
|
||||
static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res,
|
||||
unsigned u64s_min, unsigned u64s_max)
|
||||
{
|
||||
int ret;
|
||||
|
||||
EBUG_ON(res->ref);
|
||||
EBUG_ON(u64s_max < u64s_min);
|
||||
EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
|
||||
|
||||
if (journal_res_get_fast(j, res, u64s_min, u64s_max))
|
||||
goto out;
|
||||
|
||||
ret = bch2_journal_res_get_slowpath(j, res, u64s_min, u64s_max);
|
||||
if (ret)
|
||||
return ret;
|
||||
out:
|
||||
lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_);
|
||||
EBUG_ON(!res->ref);
|
||||
return 0;
|
||||
}
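/*
 * Typical use of a journal reservation, simplified:
 *
 *	struct journal_res res = { 0 };
 *
 *	ret = bch2_journal_res_get(j, &res, jset_u64s(k->k.u64s),
 *				   jset_u64s(k->k.u64s));
 *	if (ret)
 *		return ret;
 *
 *	bch2_journal_add_keys(j, &res, btree_id, k);
 *	bch2_journal_res_put(j, &res);
 *
 * The reservation holds the current journal entry open until it is released;
 * bch2_journal_res_put() pads any unused reserved space with empty
 * btree_keys entries before dropping the ref.
 */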
|
||||
|
||||
u64 bch2_journal_last_unwritten_seq(struct journal *);
|
||||
int bch2_journal_open_seq_async(struct journal *, u64, struct closure *);
|
||||
|
||||
void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *);
|
||||
void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
|
||||
void bch2_journal_flush_async(struct journal *, struct closure *);
|
||||
void bch2_journal_meta_async(struct journal *, struct closure *);
|
||||
|
||||
int bch2_journal_flush_seq(struct journal *, u64);
|
||||
int bch2_journal_flush(struct journal *);
|
||||
int bch2_journal_meta(struct journal *);
|
||||
|
||||
void bch2_journal_halt(struct journal *);
|
||||
|
||||
static inline int bch2_journal_error(struct journal *j)
|
||||
{
|
||||
return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL
|
||||
? -EIO : 0;
|
||||
}
|
||||
|
||||
struct bch_dev;
|
||||
|
||||
static inline bool journal_flushes_device(struct bch_dev *ca)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
int bch2_journal_mark(struct bch_fs *, struct list_head *);
|
||||
void bch2_journal_entries_free(struct list_head *);
|
||||
int bch2_journal_replay(struct bch_fs *, struct list_head *);
|
||||
|
||||
static inline void bch2_journal_set_replay_done(struct journal *j)
|
||||
{
|
||||
BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
|
||||
set_bit(JOURNAL_REPLAY_DONE, &j->flags);
|
||||
}
|
||||
|
||||
ssize_t bch2_journal_print_debug(struct journal *, char *);
|
||||
ssize_t bch2_journal_print_pins(struct journal *, char *);
|
||||
|
||||
int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
|
||||
unsigned nr);
|
||||
int bch2_dev_journal_alloc(struct bch_dev *);
|
||||
|
||||
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
|
||||
void bch2_fs_journal_stop(struct journal *);
|
||||
void bch2_fs_journal_start(struct journal *);
|
||||
void bch2_dev_journal_exit(struct bch_dev *);
|
||||
int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
|
||||
void bch2_fs_journal_exit(struct journal *);
|
||||
int bch2_fs_journal_init(struct journal *);
|
||||
|
||||
#endif /* _BCACHEFS_JOURNAL_H */
|
1392
fs/bcachefs/journal_io.c
Normal file
1392
fs/bcachefs/journal_io.c
Normal file
File diff suppressed because it is too large
44
fs/bcachefs/journal_io.h
Normal file
44
fs/bcachefs/journal_io.h
Normal file
@ -0,0 +1,44 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_JOURNAL_IO_H
#define _BCACHEFS_JOURNAL_IO_H

/*
 * Only used for holding the journal entries we read in bch2_journal_read()
 * during recovery
 */
struct journal_replay {
	struct list_head	list;
	struct bch_devs_list	devs;
	/* must be last: */
	struct jset		j;
};

static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
					struct jset_entry *entry, unsigned type)
{
	while (entry < vstruct_last(jset)) {
		if (entry->type == type)
			return entry;

		entry = vstruct_next(entry);
	}

	return NULL;
}

#define for_each_jset_entry_type(entry, jset, type)			\
	for (entry = (jset)->start;					\
	     (entry = __jset_entry_type_next(jset, entry, type));	\
	     entry = vstruct_next(entry))

#define for_each_jset_key(k, _n, entry, jset)				\
	for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)	\
		vstruct_for_each_safe(entry, k, _n)

int bch2_journal_set_seq(struct bch_fs *c, u64, u64);
int bch2_journal_read(struct bch_fs *, struct list_head *);

int bch2_journal_entry_sectors(struct journal *);
void bch2_journal_write(struct closure *);

#endif /* _BCACHEFS_JOURNAL_IO_H */
402
fs/bcachefs/journal_reclaim.c
Normal file
402
fs/bcachefs/journal_reclaim.c
Normal file
@ -0,0 +1,402 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "journal.h"
|
||||
#include "journal_reclaim.h"
|
||||
#include "replicas.h"
|
||||
#include "super.h"
|
||||
|
||||
/*
|
||||
* Journal entry pinning - machinery for holding a reference on a given journal
|
||||
* entry, holding it open to ensure it gets replayed during recovery:
|
||||
*/
|
||||
|
||||
static inline u64 journal_pin_seq(struct journal *j,
|
||||
struct journal_entry_pin_list *pin_list)
|
||||
{
|
||||
return fifo_entry_idx_abs(&j->pin, pin_list);
|
||||
}
|
||||
|
||||
u64 bch2_journal_pin_seq(struct journal *j, struct journal_entry_pin *pin)
|
||||
{
|
||||
u64 ret = 0;
|
||||
|
||||
spin_lock(&j->lock);
|
||||
if (journal_pin_active(pin))
|
||||
ret = journal_pin_seq(j, pin->pin_list);
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void __journal_pin_add(struct journal *j,
|
||||
struct journal_entry_pin_list *pin_list,
|
||||
struct journal_entry_pin *pin,
|
||||
journal_pin_flush_fn flush_fn)
|
||||
{
|
||||
BUG_ON(journal_pin_active(pin));
|
||||
BUG_ON(!atomic_read(&pin_list->count));
|
||||
|
||||
atomic_inc(&pin_list->count);
|
||||
pin->pin_list = pin_list;
|
||||
pin->flush = flush_fn;
|
||||
|
||||
if (flush_fn)
|
||||
list_add(&pin->list, &pin_list->list);
|
||||
else
|
||||
INIT_LIST_HEAD(&pin->list);
|
||||
|
||||
/*
|
||||
* If the journal is currently full, we might want to call flush_fn
|
||||
* immediately:
|
||||
*/
|
||||
journal_wake(j);
|
||||
}
|
||||
|
||||
void bch2_journal_pin_add(struct journal *j, u64 seq,
|
||||
struct journal_entry_pin *pin,
|
||||
journal_pin_flush_fn flush_fn)
|
||||
{
|
||||
spin_lock(&j->lock);
|
||||
__journal_pin_add(j, journal_seq_pin(j, seq), pin, flush_fn);
|
||||
spin_unlock(&j->lock);
|
||||
}
|
||||
|
||||
static inline void __journal_pin_drop(struct journal *j,
|
||||
struct journal_entry_pin *pin)
|
||||
{
|
||||
struct journal_entry_pin_list *pin_list = pin->pin_list;
|
||||
|
||||
if (!journal_pin_active(pin))
|
||||
return;
|
||||
|
||||
pin->pin_list = NULL;
|
||||
list_del_init(&pin->list);
|
||||
|
||||
/*
|
||||
* Unpinning a journal entry may make journal_next_bucket() succeed, if
|
||||
* writing a new last_seq will now make another bucket available:
|
||||
*/
|
||||
if (atomic_dec_and_test(&pin_list->count) &&
|
||||
pin_list == &fifo_peek_front(&j->pin))
|
||||
bch2_journal_reclaim_fast(j);
|
||||
}
|
||||
|
||||
void bch2_journal_pin_drop(struct journal *j,
|
||||
struct journal_entry_pin *pin)
|
||||
{
|
||||
spin_lock(&j->lock);
|
||||
__journal_pin_drop(j, pin);
|
||||
spin_unlock(&j->lock);
|
||||
}
|
||||
|
||||
void bch2_journal_pin_add_if_older(struct journal *j,
|
||||
struct journal_entry_pin *src_pin,
|
||||
struct journal_entry_pin *pin,
|
||||
journal_pin_flush_fn flush_fn)
|
||||
{
|
||||
spin_lock(&j->lock);
|
||||
|
||||
if (journal_pin_active(src_pin) &&
|
||||
(!journal_pin_active(pin) ||
|
||||
journal_pin_seq(j, src_pin->pin_list) <
|
||||
journal_pin_seq(j, pin->pin_list))) {
|
||||
__journal_pin_drop(j, pin);
|
||||
__journal_pin_add(j, src_pin->pin_list, pin, flush_fn);
|
||||
}
|
||||
|
||||
spin_unlock(&j->lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Journal reclaim: flush references to open journal entries to reclaim space in
|
||||
* the journal
|
||||
*
|
||||
* May be done by the journal code in the background as needed to free up space
|
||||
* for more journal entries, or as part of doing a clean shutdown, or to migrate
|
||||
* data off of a specific device:
|
||||
*/
|
||||
|
||||
/**
|
||||
* bch2_journal_reclaim_fast - do the fast part of journal reclaim
|
||||
*
|
||||
* Called from IO submission context, does not block. Cleans up after btree
|
||||
* write completions by advancing the journal pin and each cache's last_idx,
|
||||
* kicking off discards and background reclaim as necessary.
|
||||
*/
|
||||
void bch2_journal_reclaim_fast(struct journal *j)
|
||||
{
|
||||
struct journal_entry_pin_list temp;
|
||||
bool popped = false;
|
||||
|
||||
lockdep_assert_held(&j->lock);
|
||||
|
||||
/*
|
||||
* Unpin journal entries whose reference counts reached zero, meaning
|
||||
* all btree nodes got written out
|
||||
*/
|
||||
while (!atomic_read(&fifo_peek_front(&j->pin).count)) {
|
||||
BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
|
||||
BUG_ON(!fifo_pop(&j->pin, temp));
|
||||
popped = true;
|
||||
}
|
||||
|
||||
if (popped)
|
||||
journal_wake(j);
|
||||
}
|
||||
|
||||
static struct journal_entry_pin *
|
||||
__journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
|
||||
{
|
||||
struct journal_entry_pin_list *pin_list;
|
||||
struct journal_entry_pin *ret;
|
||||
u64 iter;
|
||||
|
||||
/* no need to iterate over empty fifo entries: */
|
||||
bch2_journal_reclaim_fast(j);
|
||||
|
||||
fifo_for_each_entry_ptr(pin_list, &j->pin, iter) {
|
||||
if (iter > seq_to_flush)
|
||||
break;
|
||||
|
||||
ret = list_first_entry_or_null(&pin_list->list,
|
||||
struct journal_entry_pin, list);
|
||||
if (ret) {
|
||||
/* must be list_del_init(), see bch2_journal_pin_drop() */
|
||||
list_move(&ret->list, &pin_list->flushed);
|
||||
*seq = iter;
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct journal_entry_pin *
|
||||
journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
|
||||
{
|
||||
struct journal_entry_pin *ret;
|
||||
|
||||
spin_lock(&j->lock);
|
||||
ret = __journal_get_next_pin(j, seq_to_flush, seq);
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
|
||||
{
|
||||
bool ret;
|
||||
|
||||
spin_lock(&j->lock);
|
||||
ret = ja->nr &&
|
||||
(ja->last_idx != ja->cur_idx &&
|
||||
ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk);
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* bch2_journal_reclaim_work - free up journal buckets
|
||||
*
|
||||
* Background journal reclaim writes out btree nodes. It should be run
|
||||
* early enough so that we never completely run out of journal buckets.
|
||||
*
|
||||
* High watermarks for triggering background reclaim:
|
||||
* - FIFO has fewer than 512 entries left
|
||||
* - fewer than 25% journal buckets free
|
||||
*
|
||||
* Background reclaim runs until low watermarks are reached:
|
||||
* - FIFO has more than 1024 entries left
|
||||
* - more than 50% journal buckets free
|
||||
*
|
||||
* As long as a reclaim can complete in the time it takes to fill up
|
||||
* 512 journal entries or 25% of all journal buckets, then
|
||||
* journal_next_bucket() should not stall.
|
||||
*/
|
||||
void bch2_journal_reclaim_work(struct work_struct *work)
|
||||
{
|
||||
struct bch_fs *c = container_of(to_delayed_work(work),
|
||||
struct bch_fs, journal.reclaim_work);
|
||||
struct journal *j = &c->journal;
|
||||
struct bch_dev *ca;
|
||||
struct journal_entry_pin *pin;
|
||||
u64 seq, seq_to_flush = 0;
|
||||
unsigned iter, bucket_to_flush;
|
||||
unsigned long next_flush;
|
||||
bool reclaim_lock_held = false, need_flush;
|
||||
|
||||
/*
|
||||
* Advance last_idx to point to the oldest journal entry containing
|
||||
* btree node updates that have not yet been written out
|
||||
*/
|
||||
for_each_rw_member(ca, c, iter) {
|
||||
struct journal_device *ja = &ca->journal;
|
||||
|
||||
if (!ja->nr)
|
||||
continue;
|
||||
|
||||
while (should_discard_bucket(j, ja)) {
|
||||
if (!reclaim_lock_held) {
|
||||
/*
|
||||
* ugh:
|
||||
* might be called from __journal_res_get()
|
||||
* under wait_event() - have to go back to
|
||||
* TASK_RUNNING before doing something that
|
||||
* would block, but only if we're doing work:
|
||||
*/
|
||||
__set_current_state(TASK_RUNNING);
|
||||
|
||||
mutex_lock(&j->reclaim_lock);
|
||||
reclaim_lock_held = true;
|
||||
/* recheck under reclaim_lock: */
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ca->mi.discard &&
|
||||
bdev_max_discard_sectors(ca->disk_sb.bdev))
|
||||
blkdev_issue_discard(ca->disk_sb.bdev,
|
||||
bucket_to_sector(ca,
|
||||
ja->buckets[ja->last_idx]),
|
||||
ca->mi.bucket_size, GFP_NOIO);
|
||||
|
||||
spin_lock(&j->lock);
|
||||
ja->last_idx = (ja->last_idx + 1) % ja->nr;
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
journal_wake(j);
|
||||
}
|
||||
|
||||
/*
|
||||
* Write out enough btree nodes to free up 50% journal
|
||||
* buckets
|
||||
*/
|
||||
spin_lock(&j->lock);
|
||||
bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
|
||||
seq_to_flush = max_t(u64, seq_to_flush,
|
||||
ja->bucket_seq[bucket_to_flush]);
|
||||
spin_unlock(&j->lock);
|
||||
}
|
||||
|
||||
if (reclaim_lock_held)
|
||||
mutex_unlock(&j->reclaim_lock);
|
||||
|
||||
/* Also flush if the pin fifo is more than half full */
|
||||
spin_lock(&j->lock);
|
||||
seq_to_flush = max_t(s64, seq_to_flush,
|
||||
(s64) journal_cur_seq(j) -
|
||||
(j->pin.size >> 1));
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
/*
|
||||
* If it's been longer than j->reclaim_delay_ms since we last flushed,
|
||||
* make sure to flush at least one journal pin:
|
||||
*/
|
||||
next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
|
||||
need_flush = time_after(jiffies, next_flush);
|
||||
|
||||
while ((pin = journal_get_next_pin(j, need_flush
|
||||
? U64_MAX
|
||||
: seq_to_flush, &seq))) {
|
||||
__set_current_state(TASK_RUNNING);
|
||||
pin->flush(j, pin, seq);
|
||||
need_flush = false;
|
||||
|
||||
j->last_flushed = jiffies;
|
||||
}
|
||||
|
||||
if (!test_bit(BCH_FS_RO, &c->flags))
|
||||
queue_delayed_work(system_freezable_wq, &j->reclaim_work,
|
||||
msecs_to_jiffies(j->reclaim_delay_ms));
|
||||
}
|
||||
|
||||
static int journal_flush_done(struct journal *j, u64 seq_to_flush,
|
||||
struct journal_entry_pin **pin,
|
||||
u64 *pin_seq)
|
||||
{
|
||||
int ret;
|
||||
|
||||
*pin = NULL;
|
||||
|
||||
ret = bch2_journal_error(j);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
spin_lock(&j->lock);
|
||||
/*
|
||||
* If journal replay hasn't completed, the unreplayed journal entries
|
||||
* hold refs on their corresponding sequence numbers
|
||||
*/
|
||||
ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL ||
|
||||
!test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
|
||||
journal_last_seq(j) > seq_to_flush ||
|
||||
(fifo_used(&j->pin) == 1 &&
|
||||
atomic_read(&fifo_peek_front(&j->pin).count) == 1);
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
|
||||
{
|
||||
struct journal_entry_pin *pin;
|
||||
u64 pin_seq;
|
||||
|
||||
if (!test_bit(JOURNAL_STARTED, &j->flags))
|
||||
return;
|
||||
|
||||
while (1) {
|
||||
wait_event(j->wait, journal_flush_done(j, seq_to_flush,
|
||||
&pin, &pin_seq));
|
||||
if (!pin)
|
||||
break;
|
||||
|
||||
pin->flush(j, pin, pin_seq);
|
||||
}
|
||||
}
|
||||
|
||||
int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
|
||||
{
|
||||
struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
||||
struct journal_entry_pin_list *p;
|
||||
struct bch_devs_list devs;
|
||||
u64 iter, seq = 0;
|
||||
int ret = 0;
|
||||
|
||||
spin_lock(&j->lock);
|
||||
fifo_for_each_entry_ptr(p, &j->pin, iter)
|
||||
if (dev_idx >= 0
|
||||
? bch2_dev_list_has_dev(p->devs, dev_idx)
|
||||
: p->devs.nr < c->opts.metadata_replicas)
|
||||
seq = iter;
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
bch2_journal_flush_pins(j, seq);
|
||||
|
||||
ret = bch2_journal_error(j);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
mutex_lock(&c->replicas_gc_lock);
|
||||
bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);
|
||||
|
||||
seq = 0;
|
||||
|
||||
spin_lock(&j->lock);
|
||||
while (!ret && seq < j->pin.back) {
|
||||
seq = max(seq, journal_last_seq(j));
|
||||
devs = journal_seq_pin(j, seq)->devs;
|
||||
seq++;
|
||||
|
||||
spin_unlock(&j->lock);
|
||||
ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs);
|
||||
spin_lock(&j->lock);
|
||||
}
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
ret = bch2_replicas_gc_end(c, ret);
|
||||
mutex_unlock(&c->replicas_gc_lock);
|
||||
|
||||
return ret;
|
||||
}
|
42
fs/bcachefs/journal_reclaim.h
Normal file
42
fs/bcachefs/journal_reclaim.h
Normal file
@ -0,0 +1,42 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_JOURNAL_RECLAIM_H
#define _BCACHEFS_JOURNAL_RECLAIM_H

#define JOURNAL_PIN	(32 * 1024)

static inline bool journal_pin_active(struct journal_entry_pin *pin)
{
	return pin->pin_list != NULL;
}

static inline struct journal_entry_pin_list *
journal_seq_pin(struct journal *j, u64 seq)
{
	BUG_ON(seq < j->pin.front || seq >= j->pin.back);

	return &j->pin.data[seq & j->pin.mask];
}

u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *);

void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *,
			  journal_pin_flush_fn);
void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
void bch2_journal_pin_add_if_older(struct journal *,
				   struct journal_entry_pin *,
				   struct journal_entry_pin *,
				   journal_pin_flush_fn);

void bch2_journal_reclaim_fast(struct journal *);
void bch2_journal_reclaim_work(struct work_struct *);

void bch2_journal_flush_pins(struct journal *, u64);

static inline void bch2_journal_flush_all_pins(struct journal *j)
{
	bch2_journal_flush_pins(j, U64_MAX);
}

int bch2_journal_flush_device_pins(struct journal *, int);

#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */
360
fs/bcachefs/journal_seq_blacklist.c
Normal file
360
fs/bcachefs/journal_seq_blacklist.c
Normal file
@ -0,0 +1,360 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "btree_update.h"
|
||||
#include "btree_update_interior.h"
|
||||
#include "error.h"
|
||||
#include "journal.h"
|
||||
#include "journal_io.h"
|
||||
#include "journal_reclaim.h"
|
||||
#include "journal_seq_blacklist.h"
|
||||
|
||||
/*
|
||||
* journal_seq_blacklist machinery:
|
||||
*
|
||||
* To guarantee order of btree updates after a crash, we need to detect when a
|
||||
* btree node entry (bset) is newer than the newest journal entry that was
|
||||
* successfully written, and ignore it - effectively ignoring any btree updates
|
||||
* that didn't make it into the journal.
|
||||
*
|
||||
* If we didn't do this, we might have two btree nodes, a and b, both with
|
||||
* updates that weren't written to the journal yet: if b was updated after a,
|
||||
* but b was flushed and not a - oops; on recovery we'll find that the updates
|
||||
* to b happened, but not the updates to a that happened before it.
|
||||
*
|
||||
* Ignoring bsets that are newer than the newest journal entry is always safe,
|
||||
* because everything they contain will also have been journalled - and must
|
||||
* still be present in the journal on disk until a journal entry has been
|
||||
* written _after_ that bset was written.
|
||||
*
|
||||
* To accomplish this, bsets record the newest journal sequence number they
|
||||
* contain updates for; then, on startup, the btree code queries the journal
|
||||
* code to ask "Is this sequence number newer than the newest journal entry? If
|
||||
* so, ignore it."
|
||||
*
|
||||
* When this happens, we must blacklist that journal sequence number: the
|
||||
* journal must not write any entries with that sequence number, and it must
|
||||
* record that it was blacklisted so that a) on recovery we don't think we have
|
||||
* missing journal entries and b) so that the btree code continues to ignore
|
||||
* that bset, until that btree node is rewritten.
|
||||
*
|
||||
* Blacklisted journal sequence numbers are themselves recorded as entries in
|
||||
* the journal.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Called when journal needs to evict a blacklist entry to reclaim space: find
|
||||
* any btree nodes that refer to the blacklist journal sequence numbers, and
|
||||
* rewrite them:
|
||||
*/
|
||||
static void journal_seq_blacklist_flush(struct journal *j,
|
||||
struct journal_entry_pin *pin, u64 seq)
|
||||
{
|
||||
struct bch_fs *c =
|
||||
container_of(j, struct bch_fs, journal);
|
||||
struct journal_seq_blacklist *bl =
|
||||
container_of(pin, struct journal_seq_blacklist, pin);
|
||||
struct blacklisted_node n;
|
||||
struct closure cl;
|
||||
unsigned i;
|
||||
int ret;
|
||||
|
||||
closure_init_stack(&cl);
|
||||
|
||||
for (i = 0;; i++) {
|
||||
struct btree_iter iter;
|
||||
struct btree *b;
|
||||
|
||||
mutex_lock(&j->blacklist_lock);
|
||||
if (i >= bl->nr_entries) {
|
||||
mutex_unlock(&j->blacklist_lock);
|
||||
break;
|
||||
}
|
||||
n = bl->entries[i];
|
||||
mutex_unlock(&j->blacklist_lock);
|
||||
|
||||
__bch2_btree_iter_init(&iter, c, n.btree_id, n.pos,
|
||||
0, 0, BTREE_ITER_NODES);
|
||||
|
||||
b = bch2_btree_iter_peek_node(&iter);
|
||||
|
||||
/* The node might have already been rewritten: */
|
||||
|
||||
if (b->data->keys.seq == n.seq) {
|
||||
ret = bch2_btree_node_rewrite(c, &iter, n.seq, 0);
|
||||
if (ret) {
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
bch2_fs_fatal_error(c,
|
||||
"error %i rewriting btree node with blacklisted journal seq",
|
||||
ret);
|
||||
bch2_journal_halt(j);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
}
|
||||
|
||||
for (i = 0;; i++) {
|
||||
struct btree_update *as;
|
||||
struct pending_btree_node_free *d;
|
||||
|
||||
mutex_lock(&j->blacklist_lock);
|
||||
if (i >= bl->nr_entries) {
|
||||
mutex_unlock(&j->blacklist_lock);
|
||||
break;
|
||||
}
|
||||
n = bl->entries[i];
|
||||
mutex_unlock(&j->blacklist_lock);
|
||||
redo_wait:
|
||||
mutex_lock(&c->btree_interior_update_lock);
|
||||
|
||||
/*
|
||||
* Is the node on the list of pending interior node updates -
|
||||
* being freed? If so, wait for that to finish:
|
||||
*/
|
||||
for_each_pending_btree_node_free(c, as, d)
|
||||
if (n.seq == d->seq &&
|
||||
n.btree_id == d->btree_id &&
|
||||
!d->level &&
|
||||
!bkey_cmp(n.pos, d->key.k.p)) {
|
||||
closure_wait(&as->wait, &cl);
|
||||
mutex_unlock(&c->btree_interior_update_lock);
|
||||
closure_sync(&cl);
|
||||
goto redo_wait;
|
||||
}
|
||||
|
||||
mutex_unlock(&c->btree_interior_update_lock);
|
||||
}
|
||||
|
||||
mutex_lock(&j->blacklist_lock);
|
||||
|
||||
bch2_journal_pin_drop(j, &bl->pin);
|
||||
list_del(&bl->list);
|
||||
kfree(bl->entries);
|
||||
kfree(bl);
|
||||
|
||||
mutex_unlock(&j->blacklist_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Determine if a particular sequence number is blacklisted - if so, return
|
||||
* blacklist entry:
|
||||
*/
|
||||
struct journal_seq_blacklist *
|
||||
bch2_journal_seq_blacklist_find(struct journal *j, u64 seq)
|
||||
{
|
||||
struct journal_seq_blacklist *bl;
|
||||
|
||||
lockdep_assert_held(&j->blacklist_lock);
|
||||
|
||||
list_for_each_entry(bl, &j->seq_blacklist, list)
|
||||
if (seq >= bl->start && seq <= bl->end)
|
||||
return bl;
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate a new, in memory blacklist entry:
|
||||
*/
|
||||
static struct journal_seq_blacklist *
|
||||
bch2_journal_seq_blacklisted_new(struct journal *j, u64 start, u64 end)
|
||||
{
|
||||
struct journal_seq_blacklist *bl;
|
||||
|
||||
lockdep_assert_held(&j->blacklist_lock);
|
||||
|
||||
/*
|
||||
* When we start the journal, bch2_journal_start() will skip over @seq:
|
||||
*/
|
||||
|
||||
bl = kzalloc(sizeof(*bl), GFP_KERNEL);
|
||||
if (!bl)
|
||||
return NULL;
|
||||
|
||||
bl->start = start;
|
||||
bl->end = end;
|
||||
|
||||
list_add_tail(&bl->list, &j->seq_blacklist);
|
||||
return bl;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns true if @seq is newer than the most recent journal entry that got
|
||||
* written, and data corresponding to @seq should be ignored - also marks @seq
|
||||
* as blacklisted so that on future restarts the corresponding data will still
|
||||
* be ignored:
|
||||
*/
|
||||
int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b)
|
||||
{
|
||||
struct journal *j = &c->journal;
|
||||
struct journal_seq_blacklist *bl = NULL;
|
||||
struct blacklisted_node *n;
|
||||
u64 journal_seq;
|
||||
int ret = 0;
|
||||
|
||||
if (!seq)
|
||||
return 0;
|
||||
|
||||
spin_lock(&j->lock);
|
||||
journal_seq = journal_cur_seq(j);
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
/* Interior updates aren't journalled: */
|
||||
BUG_ON(b->level);
|
||||
BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags));
|
||||
|
||||
/*
|
||||
* Decrease this back to j->seq + 2 when we next rev the on disk format:
|
||||
* increasing it temporarily to work around bug in old kernels
|
||||
*/
|
||||
fsck_err_on(seq > journal_seq + 4, c,
|
||||
"bset journal seq too far in the future: %llu > %llu",
|
||||
seq, journal_seq);
|
||||
|
||||
if (seq <= journal_seq &&
|
||||
list_empty_careful(&j->seq_blacklist))
|
||||
return 0;
|
||||
|
||||
mutex_lock(&j->blacklist_lock);
|
||||
|
||||
if (seq <= journal_seq) {
|
||||
bl = bch2_journal_seq_blacklist_find(j, seq);
|
||||
if (!bl)
|
||||
goto out;
|
||||
} else {
|
||||
bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting",
|
||||
b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq);
|
||||
|
||||
if (!j->new_blacklist) {
|
||||
j->new_blacklist = bch2_journal_seq_blacklisted_new(j,
|
||||
journal_seq + 1,
|
||||
journal_seq + 1);
|
||||
if (!j->new_blacklist) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
bl = j->new_blacklist;
|
||||
bl->end = max(bl->end, seq);
|
||||
}
|
||||
|
||||
for (n = bl->entries; n < bl->entries + bl->nr_entries; n++)
|
||||
if (b->data->keys.seq == n->seq &&
|
||||
b->btree_id == n->btree_id &&
|
||||
!bkey_cmp(b->key.k.p, n->pos))
|
||||
goto found_entry;
|
||||
|
||||
if (!bl->nr_entries ||
|
||||
is_power_of_2(bl->nr_entries)) {
|
||||
n = krealloc(bl->entries,
|
||||
max_t(size_t, bl->nr_entries * 2, 8) * sizeof(*n),
|
||||
GFP_KERNEL);
|
||||
if (!n) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
bl->entries = n;
|
||||
}
|
||||
|
||||
bl->entries[bl->nr_entries++] = (struct blacklisted_node) {
|
||||
.seq = b->data->keys.seq,
|
||||
.btree_id = b->btree_id,
|
||||
.pos = b->key.k.p,
|
||||
};
|
||||
found_entry:
|
||||
ret = 1;
|
||||
out:
|
||||
fsck_err:
|
||||
mutex_unlock(&j->blacklist_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int __bch2_journal_seq_blacklist_read(struct journal *j,
|
||||
struct journal_replay *i,
|
||||
u64 start, u64 end)
|
||||
{
|
||||
struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
||||
struct journal_seq_blacklist *bl;
|
||||
|
||||
bch_verbose(c, "blacklisting existing journal seq %llu-%llu",
|
||||
start, end);
|
||||
|
||||
bl = bch2_journal_seq_blacklisted_new(j, start, end);
|
||||
if (!bl)
|
||||
return -ENOMEM;
|
||||
|
||||
bch2_journal_pin_add(j, le64_to_cpu(i->j.seq), &bl->pin,
|
||||
journal_seq_blacklist_flush);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* After reading the journal, find existing journal seq blacklist entries and
|
||||
* read them into memory:
|
||||
*/
|
||||
int bch2_journal_seq_blacklist_read(struct journal *j,
|
||||
struct journal_replay *i)
|
||||
{
|
||||
struct jset_entry *entry;
|
||||
int ret = 0;
|
||||
|
||||
vstruct_for_each(&i->j, entry) {
|
||||
switch (entry->type) {
|
||||
case BCH_JSET_ENTRY_blacklist: {
|
||||
struct jset_entry_blacklist *bl_entry =
|
||||
container_of(entry, struct jset_entry_blacklist, entry);
|
||||
|
||||
ret = __bch2_journal_seq_blacklist_read(j, i,
|
||||
le64_to_cpu(bl_entry->seq),
|
||||
le64_to_cpu(bl_entry->seq));
|
||||
break;
|
||||
}
|
||||
case BCH_JSET_ENTRY_blacklist_v2: {
|
||||
struct jset_entry_blacklist_v2 *bl_entry =
|
||||
container_of(entry, struct jset_entry_blacklist_v2, entry);
|
||||
|
||||
ret = __bch2_journal_seq_blacklist_read(j, i,
|
||||
le64_to_cpu(bl_entry->start),
|
||||
le64_to_cpu(bl_entry->end));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* After reading the journal and walking the btree, we might have new journal
|
||||
* sequence numbers to blacklist - add entries to the next journal entry to be
|
||||
* written:
|
||||
*/
|
||||
void bch2_journal_seq_blacklist_write(struct journal *j)
|
||||
{
|
||||
struct journal_seq_blacklist *bl = j->new_blacklist;
|
||||
struct jset_entry_blacklist_v2 *bl_entry;
|
||||
struct jset_entry *entry;
|
||||
|
||||
if (!bl)
|
||||
return;
|
||||
|
||||
entry = bch2_journal_add_entry_noreservation(journal_cur_buf(j),
|
||||
(sizeof(*bl_entry) - sizeof(*entry)) / sizeof(u64));
|
||||
|
||||
bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
|
||||
bl_entry->entry.type = BCH_JSET_ENTRY_blacklist_v2;
|
||||
bl_entry->start = cpu_to_le64(bl->start);
|
||||
bl_entry->end = cpu_to_le64(bl->end);
|
||||
|
||||
bch2_journal_pin_add(j,
|
||||
journal_cur_seq(j),
|
||||
&bl->pin,
|
||||
journal_seq_blacklist_flush);
|
||||
|
||||
j->new_blacklist = NULL;
|
||||
}
|
14
fs/bcachefs/journal_seq_blacklist.h
Normal file
@ -0,0 +1,14 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H

struct journal_replay;

struct journal_seq_blacklist *
bch2_journal_seq_blacklist_find(struct journal *, u64);
int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *);
int bch2_journal_seq_blacklist_read(struct journal *,
				    struct journal_replay *);
void bch2_journal_seq_blacklist_write(struct journal *);

#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */
242
fs/bcachefs/journal_types.h
Normal file
@ -0,0 +1,242 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_JOURNAL_TYPES_H
#define _BCACHEFS_JOURNAL_TYPES_H

#include <linux/cache.h>
#include <linux/workqueue.h>

#include "alloc_types.h"
#include "super_types.h"
#include "fifo.h"

struct journal_res;

/*
 * We put two of these in struct journal; we use them for writes to the
 * journal that are being staged or in flight.
 */
struct journal_buf {
	struct jset		*data;

	BKEY_PADDED(key);

	struct closure_waitlist	wait;

	unsigned		size;
	unsigned		disk_sectors;
	/* bloom filter: */
	unsigned long		has_inode[1024 / sizeof(unsigned long)];
};

/*
 * Something that makes a journal entry dirty - i.e. a btree node that has to be
 * flushed:
 */

struct journal_entry_pin_list {
	struct list_head	list;
	struct list_head	flushed;
	atomic_t		count;
	struct bch_devs_list	devs;
};

struct journal;
struct journal_entry_pin;
typedef void (*journal_pin_flush_fn)(struct journal *j,
				struct journal_entry_pin *, u64);

struct journal_entry_pin {
	struct list_head	list;
	journal_pin_flush_fn	flush;
	struct journal_entry_pin_list *pin_list;
};

/* corresponds to a btree node with a blacklisted bset: */
struct blacklisted_node {
	__le64			seq;
	enum btree_id		btree_id;
	struct bpos		pos;
};

struct journal_seq_blacklist {
	struct list_head	list;
	u64			start;
	u64			end;

	struct journal_entry_pin pin;

	struct blacklisted_node	*entries;
	size_t			nr_entries;
};

struct journal_res {
	bool			ref;
	u8			idx;
	u16			u64s;
	u32			offset;
	u64			seq;
};

union journal_res_state {
	struct {
		atomic64_t	counter;
	};

	struct {
		u64		v;
	};

	struct {
		u64		cur_entry_offset:20,
				idx:1,
				prev_buf_unwritten:1,
				buf0_count:21,
				buf1_count:21;
	};
};

/* bytes: */
#define JOURNAL_ENTRY_SIZE_MIN		(64U << 10) /* 64k */
#define JOURNAL_ENTRY_SIZE_MAX		(4U << 20) /* 4M */

/*
 * We stash some journal state as sentinel values in cur_entry_offset:
 * note - cur_entry_offset is in units of u64s
 */
#define JOURNAL_ENTRY_OFFSET_MAX	((1U << 20) - 1)

#define JOURNAL_ENTRY_CLOSED_VAL	(JOURNAL_ENTRY_OFFSET_MAX - 1)
#define JOURNAL_ENTRY_ERROR_VAL		(JOURNAL_ENTRY_OFFSET_MAX)

/*
 * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP,
 * either because something's waiting on the write to complete or because it's
 * been dirty too long and the timer's expired.
 */

enum {
	JOURNAL_REPLAY_DONE,
	JOURNAL_STARTED,
	JOURNAL_NEED_WRITE,
	JOURNAL_NOT_EMPTY,
};

/* Embedded in struct bch_fs */
struct journal {
	/* Fastpath stuff up front: */

	unsigned long		flags;

	union journal_res_state reservations;
	unsigned		cur_entry_u64s;
	unsigned		prev_buf_sectors;
	unsigned		cur_buf_sectors;
	unsigned		buf_size_want;

	/*
	 * Two journal entries -- one is currently open for new entries, the
	 * other is possibly being written out.
	 */
	struct journal_buf	buf[2];

	spinlock_t		lock;

	/* Used when waiting because the journal was full */
	wait_queue_head_t	wait;
	struct closure_waitlist	async_wait;

	struct closure		io;
	struct delayed_work	write_work;

	/* Sequence number of most recent journal entry (last entry in @pin) */
	atomic64_t		seq;

	/* last_seq from the most recent journal entry written */
	u64			last_seq_ondisk;

	/*
	 * FIFO of journal entries whose btree updates have not yet been
	 * written out.
	 *
	 * Each entry is a reference count. The position in the FIFO is the
	 * entry's sequence number relative to @seq.
	 *
	 * The journal entry itself holds a reference count, put when the
	 * journal entry is written out. Each btree node modified by the journal
	 * entry also holds a reference count, put when the btree node is
	 * written.
	 *
	 * When a reference count reaches zero, the journal entry is no longer
	 * needed. When all journal entries in the oldest journal bucket are no
	 * longer needed, the bucket can be discarded and reused.
	 */
	struct {
		u64		front, back, size, mask;
		struct journal_entry_pin_list *data;
	} pin;
	u64			replay_journal_seq;

	struct mutex		blacklist_lock;
	struct list_head	seq_blacklist;
	struct journal_seq_blacklist *new_blacklist;

	BKEY_PADDED(key);
	struct write_point	wp;
	spinlock_t		err_lock;

	struct delayed_work	reclaim_work;
	unsigned long		last_flushed;

	/* protects advancing ja->last_idx: */
	struct mutex		reclaim_lock;
	unsigned		write_delay_ms;
	unsigned		reclaim_delay_ms;

	u64			res_get_blocked_start;
	u64			need_write_time;
	u64			write_start_time;

	struct bch2_time_stats	*write_time;
	struct bch2_time_stats	*delay_time;
	struct bch2_time_stats	*blocked_time;
	struct bch2_time_stats	*flush_seq_time;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	struct lockdep_map	res_map;
#endif
};

/*
 * Embedded in struct bch_dev. First three fields refer to the array of journal
 * buckets, in bch_sb.
 */
struct journal_device {
	/*
	 * For each journal bucket, contains the max sequence number of the
	 * journal writes it contains - so we know when a bucket can be reused.
	 */
	u64			*bucket_seq;

	unsigned		sectors_free;

	/* Journal bucket we're currently writing to */
	unsigned		cur_idx;

	/* Last journal bucket that still contains an open journal entry */

	/*
	 * j->lock and j->reclaim_lock must both be held to modify, j->lock
	 * sufficient to read:
	 */
	unsigned		last_idx;
	unsigned		nr;
	u64			*buckets;

	/* Bio for journal reads/writes to this device */
	struct bio		*bio;

	/* for bch_journal_read_device */
	struct closure		read;
};

#endif /* _BCACHEFS_JOURNAL_TYPES_H */
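
The interesting piece of journal_types.h is union journal_res_state: five small counters share one 64-bit word with an atomic64_t, so the whole reservation state can be read, modified and published with a single 64-bit atomic operation. A reduced userspace sketch of the same packing follows; names are simplified, and the bitfield layout, like the kernel's, relies on GCC/Clang behaviour rather than anything the C standard guarantees.

#include <stdint.h>
#include <stdio.h>

/* Several counters packed into one u64 so they can be snapshotted as a unit. */
union res_state {
	uint64_t	v;
	struct {
		uint64_t	cur_entry_offset:20,
				idx:1,
				prev_buf_unwritten:1,
				buf0_count:21,
				buf1_count:21;
	};
};

int main(void)
{
	union res_state s = { .v = 0 };

	s.cur_entry_offset = 100;
	s.buf0_count = 3;

	/* The whole state round-trips through a single 64-bit value: */
	union res_state copy = { .v = s.v };

	printf("offset=%u buf0=%u\n",
	       (unsigned) copy.cur_entry_offset,
	       (unsigned) copy.buf0_count);
	return 0;
}
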
67
fs/bcachefs/keylist.c
Normal file
@ -0,0 +1,67 @@
// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "keylist.h"

int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s,
			size_t nr_inline_u64s, size_t new_u64s)
{
	size_t oldsize = bch_keylist_u64s(l);
	size_t newsize = oldsize + new_u64s;
	u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p;
	u64 *new_keys;

	newsize = roundup_pow_of_two(newsize);

	if (newsize <= nr_inline_u64s ||
	    (old_buf && roundup_pow_of_two(oldsize) == newsize))
		return 0;

	new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOIO);
	if (!new_keys)
		return -ENOMEM;

	if (!old_buf)
		memcpy_u64s(new_keys, inline_u64s, oldsize);

	l->keys_p = new_keys;
	l->top_p = new_keys + oldsize;

	return 0;
}

void bch2_keylist_add_in_order(struct keylist *l, struct bkey_i *insert)
{
	struct bkey_i *where;

	for_each_keylist_key(l, where)
		if (bkey_cmp(insert->k.p, where->k.p) < 0)
			break;

	memmove_u64s_up((u64 *) where + insert->k.u64s,
			where,
			((u64 *) l->top) - ((u64 *) where));

	l->top_p += insert->k.u64s;
	bkey_copy(where, insert);
}

void bch2_keylist_pop_front(struct keylist *l)
{
	l->top_p -= bch2_keylist_front(l)->k.u64s;

	memmove_u64s_down(l->keys,
			  bkey_next(l->keys),
			  bch_keylist_u64s(l));
}

#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_verify_keylist_sorted(struct keylist *l)
{
	struct bkey_i *k;

	for_each_keylist_key(l, k)
		BUG_ON(bkey_next(k) != l->top &&
		       bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0);
}
#endif
76
fs/bcachefs/keylist.h
Normal file
@ -0,0 +1,76 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_KEYLIST_H
#define _BCACHEFS_KEYLIST_H

#include "keylist_types.h"

int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t);
void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *);
void bch2_keylist_pop_front(struct keylist *);

static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys)
{
	l->top_p = l->keys_p = inline_keys;
}

static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys)
{
	if (l->keys_p != inline_keys)
		kfree(l->keys_p);
	bch2_keylist_init(l, inline_keys);
}

static inline void bch2_keylist_push(struct keylist *l)
{
	l->top = bkey_next(l->top);
}

static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k)
{
	bkey_copy(l->top, k);
	bch2_keylist_push(l);
}

static inline bool bch2_keylist_empty(struct keylist *l)
{
	return l->top == l->keys;
}

static inline size_t bch_keylist_u64s(struct keylist *l)
{
	return l->top_p - l->keys_p;
}

static inline size_t bch2_keylist_bytes(struct keylist *l)
{
	return bch_keylist_u64s(l) * sizeof(u64);
}

static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
{
	return l->keys;
}

#define for_each_keylist_key(_keylist, _k)			\
	for (_k = (_keylist)->keys;				\
	     _k != (_keylist)->top;				\
	     _k = bkey_next(_k))

static inline u64 keylist_sectors(struct keylist *keys)
{
	struct bkey_i *k;
	u64 ret = 0;

	for_each_keylist_key(keys, k)
		ret += k->k.size;

	return ret;
}

#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_verify_keylist_sorted(struct keylist *);
#else
static inline void bch2_verify_keylist_sorted(struct keylist *l) {}
#endif

#endif /* _BCACHEFS_KEYLIST_H */
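
The keylist API above follows a common pattern: the caller provides a small inline buffer, bch2_keylist_init() points the list at it, and bch2_keylist_realloc() only moves the contents to the heap once they outgrow the inline space, so bch2_keylist_free() must only kfree() when the storage actually left the inline buffer. A simplified standalone sketch of that pattern follows; it stores plain u64s rather than variable-size bkeys and grows by exact amounts instead of rounding to powers of two, so it illustrates the idea rather than the bcachefs implementation.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Growable u64 array that starts in caller-provided inline storage. */
struct u64_list {
	uint64_t	*start, *top;		/* like keys_p / top_p */
};

static void list_init(struct u64_list *l, uint64_t *inline_buf)
{
	l->start = l->top = inline_buf;
}

static int list_make_room(struct u64_list *l, uint64_t *inline_buf,
			  size_t nr_inline, size_t new_u64s)
{
	size_t used = l->top - l->start;
	size_t want = used + new_u64s;
	uint64_t *heap_buf = l->start == inline_buf ? NULL : l->start;
	uint64_t *n;

	if (want <= nr_inline)			/* still fits inline */
		return 0;

	n = realloc(heap_buf, want * sizeof(uint64_t));
	if (!n)
		return -1;
	if (!heap_buf)				/* first spill: copy inline contents */
		memcpy(n, inline_buf, used * sizeof(uint64_t));

	l->start = n;
	l->top = n + used;
	return 0;
}

static void list_free(struct u64_list *l, uint64_t *inline_buf)
{
	if (l->start != inline_buf)		/* only free what left the inline buffer */
		free(l->start);
	list_init(l, inline_buf);
}

int main(void)
{
	uint64_t inline_buf[4];
	struct u64_list l;

	list_init(&l, inline_buf);
	for (uint64_t i = 0; i < 10; i++) {
		if (list_make_room(&l, inline_buf, 4, 1))
			return 1;
		*l.top++ = i;
	}
	list_free(&l, inline_buf);
	return 0;
}
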
16
fs/bcachefs/keylist_types.h
Normal file
@ -0,0 +1,16 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_KEYLIST_TYPES_H
#define _BCACHEFS_KEYLIST_TYPES_H

struct keylist {
	union {
		struct bkey_i		*keys;
		u64			*keys_p;
	};
	union {
		struct bkey_i		*top;
		u64			*top_p;
	};
};

#endif /* _BCACHEFS_KEYLIST_TYPES_H */
178
fs/bcachefs/migrate.c
Normal file
@ -0,0 +1,178 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Code for moving data off a device.
|
||||
*/
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "btree_update.h"
|
||||
#include "buckets.h"
|
||||
#include "extents.h"
|
||||
#include "io.h"
|
||||
#include "journal.h"
|
||||
#include "keylist.h"
|
||||
#include "migrate.h"
|
||||
#include "move.h"
|
||||
#include "replicas.h"
|
||||
#include "super-io.h"
|
||||
|
||||
static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
|
||||
unsigned dev_idx, int flags, bool metadata)
|
||||
{
|
||||
unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
|
||||
unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
|
||||
unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED;
|
||||
unsigned nr_good;
|
||||
|
||||
bch2_extent_drop_device(e, dev_idx);
|
||||
|
||||
nr_good = bch2_extent_durability(c, e.c);
|
||||
if ((!nr_good && !(flags & lost)) ||
|
||||
(nr_good < replicas && !(flags & degraded)))
|
||||
return -EINVAL;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
|
||||
{
|
||||
struct bkey_s_c k;
|
||||
struct bkey_s_extent e;
|
||||
BKEY_PADDED(key) tmp;
|
||||
struct btree_iter iter;
|
||||
int ret = 0;
|
||||
|
||||
mutex_lock(&c->replicas_gc_lock);
|
||||
bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED));
|
||||
|
||||
bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
|
||||
POS_MIN, BTREE_ITER_PREFETCH);
|
||||
|
||||
while ((k = bch2_btree_iter_peek(&iter)).k &&
|
||||
!(ret = btree_iter_err(k))) {
|
||||
if (!bkey_extent_is_data(k.k) ||
|
||||
!bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) {
|
||||
ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k);
|
||||
if (ret)
|
||||
break;
|
||||
bch2_btree_iter_next(&iter);
|
||||
continue;
|
||||
}
|
||||
|
||||
bkey_reassemble(&tmp.key, k);
|
||||
e = bkey_i_to_s_extent(&tmp.key);
|
||||
|
||||
ret = drop_dev_ptrs(c, e, dev_idx, flags, false);
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
/*
|
||||
* If the new extent no longer has any pointers, bch2_extent_normalize()
|
||||
* will do the appropriate thing with it (turning it into a
|
||||
* KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
|
||||
*/
|
||||
bch2_extent_normalize(c, e.s);
|
||||
|
||||
ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER,
|
||||
bkey_i_to_s_c(&tmp.key));
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
iter.pos = bkey_start_pos(&tmp.key.k);
|
||||
|
||||
ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
|
||||
BTREE_INSERT_ATOMIC|
|
||||
BTREE_INSERT_NOFAIL,
|
||||
BTREE_INSERT_ENTRY(&iter, &tmp.key));
|
||||
|
||||
/*
|
||||
* don't want to leave ret == -EINTR, since if we raced and
|
||||
* something else overwrote the key we could spuriously return
|
||||
* -EINTR below:
|
||||
*/
|
||||
if (ret == -EINTR)
|
||||
ret = 0;
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
|
||||
bch2_replicas_gc_end(c, ret);
|
||||
mutex_unlock(&c->replicas_gc_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
|
||||
{
|
||||
struct btree_iter iter;
|
||||
struct closure cl;
|
||||
struct btree *b;
|
||||
unsigned id;
|
||||
int ret;
|
||||
|
||||
/* don't handle this yet: */
|
||||
if (flags & BCH_FORCE_IF_METADATA_LOST)
|
||||
return -EINVAL;
|
||||
|
||||
closure_init_stack(&cl);
|
||||
|
||||
mutex_lock(&c->replicas_gc_lock);
|
||||
bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
|
||||
|
||||
for (id = 0; id < BTREE_ID_NR; id++) {
|
||||
for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
|
||||
__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
|
||||
struct bkey_i_extent *new_key;
|
||||
retry:
|
||||
if (!bch2_extent_has_device(bkey_i_to_s_c_extent(&b->key),
|
||||
dev_idx)) {
|
||||
/*
|
||||
* we might have found a btree node key we
|
||||
* needed to update, and then tried to update it
|
||||
* but got -EINTR after upgrading the iter, but
|
||||
* then raced and the node is now gone:
|
||||
*/
|
||||
bch2_btree_iter_downgrade(&iter);
|
||||
|
||||
ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
|
||||
bkey_i_to_s_c(&b->key));
|
||||
if (ret)
|
||||
goto err;
|
||||
} else {
|
||||
bkey_copy(&tmp.k, &b->key);
|
||||
new_key = bkey_i_to_extent(&tmp.k);
|
||||
|
||||
ret = drop_dev_ptrs(c, extent_i_to_s(new_key),
|
||||
dev_idx, flags, true);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
ret = bch2_btree_node_update_key(c, &iter, b, new_key);
|
||||
if (ret == -EINTR) {
|
||||
b = bch2_btree_iter_peek_node(&iter);
|
||||
goto retry;
|
||||
}
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
ret = bch2_replicas_gc_end(c, ret);
|
||||
mutex_unlock(&c->replicas_gc_lock);
|
||||
|
||||
return ret;
|
||||
err:
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
goto out;
|
||||
}
|
||||
|
||||
int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
|
||||
{
|
||||
return bch2_dev_usrdata_drop(c, dev_idx, flags) ?:
|
||||
bch2_dev_metadata_drop(c, dev_idx, flags);
|
||||
}
|
7
fs/bcachefs/migrate.h
Normal file
@ -0,0 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_MIGRATE_H
#define _BCACHEFS_MIGRATE_H

int bch2_dev_data_drop(struct bch_fs *, unsigned, int);

#endif /* _BCACHEFS_MIGRATE_H */
761
fs/bcachefs/move.c
Normal file
@ -0,0 +1,761 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "btree_gc.h"
|
||||
#include "btree_update.h"
|
||||
#include "buckets.h"
|
||||
#include "inode.h"
|
||||
#include "io.h"
|
||||
#include "journal_reclaim.h"
|
||||
#include "keylist.h"
|
||||
#include "move.h"
|
||||
#include "replicas.h"
|
||||
#include "super-io.h"
|
||||
#include "trace.h"
|
||||
|
||||
#include <linux/ioprio.h>
|
||||
#include <linux/kthread.h>
|
||||
|
||||
#define SECTORS_IN_FLIGHT_PER_DEVICE 2048
|
||||
|
||||
struct moving_io {
|
||||
struct list_head list;
|
||||
struct closure cl;
|
||||
bool read_completed;
|
||||
|
||||
unsigned read_sectors;
|
||||
unsigned write_sectors;
|
||||
|
||||
struct bch_read_bio rbio;
|
||||
|
||||
struct migrate_write write;
|
||||
/* Must be last since it is variable size */
|
||||
struct bio_vec bi_inline_vecs[0];
|
||||
};
|
||||
|
||||
struct moving_context {
|
||||
/* Closure for waiting on all reads and writes to complete */
|
||||
struct closure cl;
|
||||
|
||||
struct bch_move_stats *stats;
|
||||
|
||||
struct list_head reads;
|
||||
|
||||
/* in flight sectors: */
|
||||
atomic_t read_sectors;
|
||||
atomic_t write_sectors;
|
||||
|
||||
wait_queue_head_t wait;
|
||||
};
|
||||
|
||||
static int bch2_migrate_index_update(struct bch_write_op *op)
|
||||
{
|
||||
struct bch_fs *c = op->c;
|
||||
struct migrate_write *m =
|
||||
container_of(op, struct migrate_write, op);
|
||||
struct keylist *keys = &op->insert_keys;
|
||||
struct btree_iter iter;
|
||||
int ret = 0;
|
||||
|
||||
bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
|
||||
bkey_start_pos(&bch2_keylist_front(keys)->k),
|
||||
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
|
||||
|
||||
while (1) {
|
||||
struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
|
||||
struct bkey_i_extent *insert, *new =
|
||||
bkey_i_to_extent(bch2_keylist_front(keys));
|
||||
BKEY_PADDED(k) _new, _insert;
|
||||
struct bch_extent_ptr *ptr;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
bool did_work = false;
|
||||
int nr;
|
||||
|
||||
if (btree_iter_err(k)) {
|
||||
ret = bch2_btree_iter_unlock(&iter);
|
||||
break;
|
||||
}
|
||||
|
||||
if (bversion_cmp(k.k->version, new->k.version) ||
|
||||
!bkey_extent_is_data(k.k) ||
|
||||
!bch2_extent_matches_ptr(c, bkey_s_c_to_extent(k),
|
||||
m->ptr, m->offset))
|
||||
goto nomatch;
|
||||
|
||||
if (m->data_cmd == DATA_REWRITE &&
|
||||
!bch2_extent_has_device(bkey_s_c_to_extent(k),
|
||||
m->data_opts.rewrite_dev))
|
||||
goto nomatch;
|
||||
|
||||
bkey_reassemble(&_insert.k, k);
|
||||
insert = bkey_i_to_extent(&_insert.k);
|
||||
|
||||
bkey_copy(&_new.k, bch2_keylist_front(keys));
|
||||
new = bkey_i_to_extent(&_new.k);
|
||||
|
||||
bch2_cut_front(iter.pos, &insert->k_i);
|
||||
bch2_cut_back(new->k.p, &insert->k);
|
||||
bch2_cut_back(insert->k.p, &new->k);
|
||||
|
||||
if (m->data_cmd == DATA_REWRITE) {
|
||||
ptr = (struct bch_extent_ptr *)
|
||||
bch2_extent_has_device(extent_i_to_s_c(insert),
|
||||
m->data_opts.rewrite_dev);
|
||||
bch2_extent_drop_ptr(extent_i_to_s(insert), ptr);
|
||||
}
|
||||
|
||||
extent_for_each_ptr_crc(extent_i_to_s(new), ptr, crc) {
|
||||
if (bch2_extent_has_device(extent_i_to_s_c(insert), ptr->dev)) {
|
||||
/*
|
||||
* raced with another move op? extent already
|
||||
* has a pointer to the device we just wrote
|
||||
* data to
|
||||
*/
|
||||
continue;
|
||||
}
|
||||
|
||||
bch2_extent_crc_append(insert, crc);
|
||||
extent_ptr_append(insert, *ptr);
|
||||
did_work = true;
|
||||
}
|
||||
|
||||
if (!did_work)
|
||||
goto nomatch;
|
||||
|
||||
bch2_extent_narrow_crcs(insert,
|
||||
(struct bch_extent_crc_unpacked) { 0 });
|
||||
bch2_extent_normalize(c, extent_i_to_s(insert).s);
|
||||
bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert),
|
||||
op->opts.background_target,
|
||||
op->opts.data_replicas);
|
||||
|
||||
/*
|
||||
* It's possible we race, and for whatever reason the extent now
|
||||
* has fewer replicas than when we last looked at it - meaning
|
||||
* we need to get a disk reservation here:
|
||||
*/
|
||||
nr = bch2_extent_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i)) -
|
||||
(bch2_extent_nr_dirty_ptrs(k) + m->nr_ptrs_reserved);
|
||||
if (nr > 0) {
|
||||
/*
|
||||
* can't call bch2_disk_reservation_add() with btree
|
||||
* locks held, at least not without a song and dance
|
||||
*/
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
|
||||
ret = bch2_disk_reservation_add(c, &op->res,
|
||||
keylist_sectors(keys) * nr, 0);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
m->nr_ptrs_reserved += nr;
|
||||
goto next;
|
||||
}
|
||||
|
||||
ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER,
|
||||
extent_i_to_s_c(insert).s_c);
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
ret = bch2_btree_insert_at(c, &op->res,
|
||||
NULL, op_journal_seq(op),
|
||||
BTREE_INSERT_ATOMIC|
|
||||
BTREE_INSERT_NOFAIL|
|
||||
BTREE_INSERT_USE_RESERVE|
|
||||
m->data_opts.btree_insert_flags,
|
||||
BTREE_INSERT_ENTRY(&iter, &insert->k_i));
|
||||
if (!ret)
|
||||
atomic_long_inc(&c->extent_migrate_done);
|
||||
if (ret == -EINTR)
|
||||
ret = 0;
|
||||
if (ret)
|
||||
break;
|
||||
next:
|
||||
while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) {
|
||||
bch2_keylist_pop_front(keys);
|
||||
if (bch2_keylist_empty(keys))
|
||||
goto out;
|
||||
}
|
||||
|
||||
bch2_cut_front(iter.pos, bch2_keylist_front(keys));
|
||||
continue;
|
||||
nomatch:
|
||||
if (m->ctxt)
|
||||
atomic64_add(k.k->p.offset - iter.pos.offset,
|
||||
&m->ctxt->stats->sectors_raced);
|
||||
atomic_long_inc(&c->extent_migrate_raced);
|
||||
trace_move_race(&new->k);
|
||||
bch2_btree_iter_next_slot(&iter);
|
||||
goto next;
|
||||
}
|
||||
out:
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
return ret;
|
||||
}
|
||||
|
||||
void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio)
|
||||
{
|
||||
/* write bio must own pages: */
|
||||
BUG_ON(!m->op.wbio.bio.bi_vcnt);
|
||||
|
||||
m->ptr = rbio->pick.ptr;
|
||||
m->offset = rbio->pos.offset - rbio->pick.crc.offset;
|
||||
m->op.devs_have = rbio->devs_have;
|
||||
m->op.pos = rbio->pos;
|
||||
m->op.version = rbio->version;
|
||||
m->op.crc = rbio->pick.crc;
|
||||
m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
|
||||
|
||||
if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) {
|
||||
m->op.nonce = m->op.crc.nonce + m->op.crc.offset;
|
||||
m->op.csum_type = m->op.crc.csum_type;
|
||||
}
|
||||
|
||||
if (m->data_cmd == DATA_REWRITE)
|
||||
bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev);
|
||||
}
|
||||
|
||||
int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
|
||||
struct write_point_specifier wp,
|
||||
struct bch_io_opts io_opts,
|
||||
enum data_cmd data_cmd,
|
||||
struct data_opts data_opts,
|
||||
struct bkey_s_c k)
|
||||
{
|
||||
int ret;
|
||||
|
||||
m->data_cmd = data_cmd;
|
||||
m->data_opts = data_opts;
|
||||
m->nr_ptrs_reserved = 0;
|
||||
|
||||
bch2_write_op_init(&m->op, c, io_opts);
|
||||
m->op.compression_type =
|
||||
bch2_compression_opt_to_type[io_opts.background_compression ?:
|
||||
io_opts.compression];
|
||||
m->op.target = data_opts.target,
|
||||
m->op.write_point = wp;
|
||||
|
||||
if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE)
|
||||
m->op.alloc_reserve = RESERVE_MOVINGGC;
|
||||
|
||||
m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS|
|
||||
BCH_WRITE_PAGES_STABLE|
|
||||
BCH_WRITE_PAGES_OWNED|
|
||||
BCH_WRITE_DATA_ENCODED|
|
||||
BCH_WRITE_NOMARK_REPLICAS;
|
||||
|
||||
m->op.nr_replicas = 1;
|
||||
m->op.nr_replicas_required = 1;
|
||||
m->op.index_update_fn = bch2_migrate_index_update;
|
||||
|
||||
switch (data_cmd) {
|
||||
case DATA_ADD_REPLICAS: {
|
||||
int nr = (int) io_opts.data_replicas -
|
||||
bch2_extent_nr_dirty_ptrs(k);
|
||||
|
||||
if (nr > 0) {
|
||||
m->op.nr_replicas = m->nr_ptrs_reserved = nr;
|
||||
|
||||
ret = bch2_disk_reservation_get(c, &m->op.res,
|
||||
k.k->size, m->op.nr_replicas, 0);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case DATA_REWRITE:
|
||||
break;
|
||||
case DATA_PROMOTE:
|
||||
m->op.flags |= BCH_WRITE_ALLOC_NOWAIT;
|
||||
m->op.flags |= BCH_WRITE_CACHED;
|
||||
break;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void move_free(struct closure *cl)
|
||||
{
|
||||
struct moving_io *io = container_of(cl, struct moving_io, cl);
|
||||
struct moving_context *ctxt = io->write.ctxt;
|
||||
struct bvec_iter_all iter;
|
||||
struct bio_vec *bv;
|
||||
|
||||
bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);
|
||||
|
||||
bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter)
|
||||
if (bv->bv_page)
|
||||
__free_page(bv->bv_page);
|
||||
|
||||
wake_up(&ctxt->wait);
|
||||
|
||||
kfree(io);
|
||||
}
|
||||
|
||||
static void move_write_done(struct closure *cl)
|
||||
{
|
||||
struct moving_io *io = container_of(cl, struct moving_io, cl);
|
||||
|
||||
atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
|
||||
closure_return_with_destructor(cl, move_free);
|
||||
}
|
||||
|
||||
static void move_write(struct closure *cl)
|
||||
{
|
||||
struct moving_io *io = container_of(cl, struct moving_io, cl);
|
||||
|
||||
if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
|
||||
closure_return_with_destructor(cl, move_free);
|
||||
return;
|
||||
}
|
||||
|
||||
bch2_migrate_read_done(&io->write, &io->rbio);
|
||||
|
||||
atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
|
||||
closure_call(&io->write.op.cl, bch2_write, NULL, cl);
|
||||
continue_at(cl, move_write_done, NULL);
|
||||
}
|
||||
|
||||
static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
|
||||
{
|
||||
struct moving_io *io =
|
||||
list_first_entry_or_null(&ctxt->reads, struct moving_io, list);
|
||||
|
||||
return io && io->read_completed ? io : NULL;
|
||||
}
|
||||
|
||||
static void move_read_endio(struct bio *bio)
|
||||
{
|
||||
struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
|
||||
struct moving_context *ctxt = io->write.ctxt;
|
||||
|
||||
atomic_sub(io->read_sectors, &ctxt->read_sectors);
|
||||
io->read_completed = true;
|
||||
|
||||
if (next_pending_write(ctxt))
|
||||
wake_up(&ctxt->wait);
|
||||
|
||||
closure_put(&ctxt->cl);
|
||||
}
|
||||
|
||||
static void do_pending_writes(struct moving_context *ctxt)
|
||||
{
|
||||
struct moving_io *io;
|
||||
|
||||
while ((io = next_pending_write(ctxt))) {
|
||||
list_del(&io->list);
|
||||
closure_call(&io->cl, move_write, NULL, &ctxt->cl);
|
||||
}
|
||||
}
|
||||
|
||||
#define move_ctxt_wait_event(_ctxt, _cond) \
|
||||
do { \
|
||||
do_pending_writes(_ctxt); \
|
||||
\
|
||||
if (_cond) \
|
||||
break; \
|
||||
__wait_event((_ctxt)->wait, \
|
||||
next_pending_write(_ctxt) || (_cond)); \
|
||||
} while (1)
|
||||
|
||||
static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
|
||||
{
|
||||
unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
|
||||
|
||||
move_ctxt_wait_event(ctxt,
|
||||
!atomic_read(&ctxt->write_sectors) ||
|
||||
atomic_read(&ctxt->write_sectors) != sectors_pending);
|
||||
}
|
||||
|
||||
static int bch2_move_extent(struct bch_fs *c,
|
||||
struct moving_context *ctxt,
|
||||
struct write_point_specifier wp,
|
||||
struct bch_io_opts io_opts,
|
||||
struct bkey_s_c_extent e,
|
||||
enum data_cmd data_cmd,
|
||||
struct data_opts data_opts)
|
||||
{
|
||||
struct moving_io *io;
|
||||
const struct bch_extent_ptr *ptr;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
unsigned sectors = e.k->size, pages;
|
||||
int ret = -ENOMEM;
|
||||
|
||||
move_ctxt_wait_event(ctxt,
|
||||
atomic_read(&ctxt->write_sectors) <
|
||||
SECTORS_IN_FLIGHT_PER_DEVICE);
|
||||
|
||||
move_ctxt_wait_event(ctxt,
|
||||
atomic_read(&ctxt->read_sectors) <
|
||||
SECTORS_IN_FLIGHT_PER_DEVICE);
|
||||
|
||||
/* write path might have to decompress data: */
|
||||
extent_for_each_ptr_crc(e, ptr, crc)
|
||||
sectors = max_t(unsigned, sectors, crc.uncompressed_size);
|
||||
|
||||
pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
|
||||
io = kzalloc(sizeof(struct moving_io) +
|
||||
sizeof(struct bio_vec) * pages, GFP_KERNEL);
|
||||
if (!io)
|
||||
goto err;
|
||||
|
||||
io->write.ctxt = ctxt;
|
||||
io->read_sectors = e.k->size;
|
||||
io->write_sectors = e.k->size;
|
||||
|
||||
bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
|
||||
bio_set_prio(&io->write.op.wbio.bio,
|
||||
IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
|
||||
|
||||
if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
|
||||
GFP_KERNEL))
|
||||
goto err_free;
|
||||
|
||||
io->rbio.opts = io_opts;
|
||||
bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
|
||||
io->rbio.bio.bi_vcnt = pages;
|
||||
bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
|
||||
io->rbio.bio.bi_iter.bi_size = sectors << 9;
|
||||
|
||||
io->rbio.bio.bi_opf = REQ_OP_READ;
|
||||
io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(e.k);
|
||||
io->rbio.bio.bi_end_io = move_read_endio;
|
||||
|
||||
ret = bch2_migrate_write_init(c, &io->write, wp, io_opts,
|
||||
data_cmd, data_opts, e.s_c);
|
||||
if (ret)
|
||||
goto err_free_pages;
|
||||
|
||||
atomic64_inc(&ctxt->stats->keys_moved);
|
||||
atomic64_add(e.k->size, &ctxt->stats->sectors_moved);
|
||||
|
||||
trace_move_extent(e.k);
|
||||
|
||||
atomic_add(io->read_sectors, &ctxt->read_sectors);
|
||||
list_add_tail(&io->list, &ctxt->reads);
|
||||
|
||||
/*
|
||||
* dropped by move_read_endio() - guards against use after free of
|
||||
* ctxt when doing wakeup
|
||||
*/
|
||||
closure_get(&ctxt->cl);
|
||||
bch2_read_extent(c, &io->rbio, e.s_c,
|
||||
BCH_READ_NODECODE|
|
||||
BCH_READ_LAST_FRAGMENT);
|
||||
return 0;
|
||||
err_free_pages:
|
||||
bio_free_pages(&io->write.op.wbio.bio);
|
||||
err_free:
|
||||
kfree(io);
|
||||
err:
|
||||
trace_move_alloc_fail(e.k);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int bch2_move_data(struct bch_fs *c,
|
||||
struct bch_ratelimit *rate,
|
||||
struct write_point_specifier wp,
|
||||
struct bpos start,
|
||||
struct bpos end,
|
||||
move_pred_fn pred, void *arg,
|
||||
struct bch_move_stats *stats)
|
||||
{
|
||||
bool kthread = (current->flags & PF_KTHREAD) != 0;
|
||||
struct moving_context ctxt = { .stats = stats };
|
||||
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
|
||||
BKEY_PADDED(k) tmp;
|
||||
struct bkey_s_c k;
|
||||
struct bkey_s_c_extent e;
|
||||
struct data_opts data_opts;
|
||||
enum data_cmd data_cmd;
|
||||
u64 cur_inum = U64_MAX;
|
||||
int ret = 0, ret2;
|
||||
|
||||
closure_init_stack(&ctxt.cl);
|
||||
INIT_LIST_HEAD(&ctxt.reads);
|
||||
init_waitqueue_head(&ctxt.wait);
|
||||
|
||||
stats->data_type = BCH_DATA_USER;
|
||||
bch2_btree_iter_init(&stats->iter, c, BTREE_ID_EXTENTS, start,
|
||||
BTREE_ITER_PREFETCH);
|
||||
|
||||
if (rate)
|
||||
bch2_ratelimit_reset(rate);
|
||||
|
||||
while (!kthread || !(ret = kthread_should_stop())) {
|
||||
if (rate &&
|
||||
bch2_ratelimit_delay(rate) &&
|
||||
(bch2_btree_iter_unlock(&stats->iter),
|
||||
(ret = bch2_ratelimit_wait_freezable_stoppable(rate))))
|
||||
break;
|
||||
peek:
|
||||
k = bch2_btree_iter_peek(&stats->iter);
|
||||
if (!k.k)
|
||||
break;
|
||||
ret = btree_iter_err(k);
|
||||
if (ret)
|
||||
break;
|
||||
if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
|
||||
break;
|
||||
|
||||
if (!bkey_extent_is_data(k.k))
|
||||
goto next_nondata;
|
||||
|
||||
e = bkey_s_c_to_extent(k);
|
||||
|
||||
if (cur_inum != k.k->p.inode) {
|
||||
struct bch_inode_unpacked inode;
|
||||
|
||||
/* don't hold btree locks while looking up inode: */
|
||||
bch2_btree_iter_unlock(&stats->iter);
|
||||
|
||||
io_opts = bch2_opts_to_inode_opts(c->opts);
|
||||
if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode))
|
||||
bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode));
|
||||
cur_inum = k.k->p.inode;
|
||||
goto peek;
|
||||
}
|
||||
|
||||
switch ((data_cmd = pred(c, arg, BKEY_TYPE_EXTENTS, e,
|
||||
&io_opts, &data_opts))) {
|
||||
case DATA_SKIP:
|
||||
goto next;
|
||||
case DATA_SCRUB:
|
||||
BUG();
|
||||
case DATA_ADD_REPLICAS:
|
||||
case DATA_REWRITE:
|
||||
case DATA_PROMOTE:
|
||||
break;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
|
||||
/* unlock before doing IO: */
|
||||
bkey_reassemble(&tmp.k, k);
|
||||
k = bkey_i_to_s_c(&tmp.k);
|
||||
bch2_btree_iter_unlock(&stats->iter);
|
||||
|
||||
ret2 = bch2_move_extent(c, &ctxt, wp, io_opts,
|
||||
bkey_s_c_to_extent(k),
|
||||
data_cmd, data_opts);
|
||||
if (ret2) {
|
||||
if (ret2 == -ENOMEM) {
|
||||
/* memory allocation failure, wait for some IO to finish */
|
||||
bch2_move_ctxt_wait_for_io(&ctxt);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* XXX signal failure */
|
||||
goto next;
|
||||
}
|
||||
|
||||
if (rate)
|
||||
bch2_ratelimit_increment(rate, k.k->size);
|
||||
next:
|
||||
atomic64_add(k.k->size * bch2_extent_nr_dirty_ptrs(k),
|
||||
&stats->sectors_seen);
|
||||
next_nondata:
|
||||
bch2_btree_iter_next(&stats->iter);
|
||||
bch2_btree_iter_cond_resched(&stats->iter);
|
||||
}
|
||||
|
||||
bch2_btree_iter_unlock(&stats->iter);
|
||||
|
||||
move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
|
||||
closure_sync(&ctxt.cl);
|
||||
|
||||
EBUG_ON(atomic_read(&ctxt.write_sectors));
|
||||
|
||||
trace_move_data(c,
|
||||
atomic64_read(&stats->sectors_moved),
|
||||
atomic64_read(&stats->keys_moved));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int bch2_gc_data_replicas(struct bch_fs *c)
|
||||
{
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
int ret;
|
||||
|
||||
mutex_lock(&c->replicas_gc_lock);
|
||||
bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED));
|
||||
|
||||
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
|
||||
BTREE_ITER_PREFETCH, k) {
|
||||
ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k);
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
ret = bch2_btree_iter_unlock(&iter) ?: ret;
|
||||
|
||||
bch2_replicas_gc_end(c, ret);
|
||||
mutex_unlock(&c->replicas_gc_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int bch2_gc_btree_replicas(struct bch_fs *c)
|
||||
{
|
||||
struct btree_iter iter;
|
||||
struct btree *b;
|
||||
unsigned id;
|
||||
int ret = 0;
|
||||
|
||||
mutex_lock(&c->replicas_gc_lock);
|
||||
bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
|
||||
|
||||
for (id = 0; id < BTREE_ID_NR; id++) {
|
||||
for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
|
||||
ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
|
||||
bkey_i_to_s_c(&b->key));
|
||||
|
||||
bch2_btree_iter_cond_resched(&iter);
|
||||
}
|
||||
|
||||
ret = bch2_btree_iter_unlock(&iter) ?: ret;
|
||||
}
|
||||
|
||||
bch2_replicas_gc_end(c, ret);
|
||||
mutex_unlock(&c->replicas_gc_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int bch2_move_btree(struct bch_fs *c,
|
||||
move_pred_fn pred,
|
||||
void *arg,
|
||||
struct bch_move_stats *stats)
|
||||
{
|
||||
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
|
||||
struct btree *b;
|
||||
unsigned id;
|
||||
struct data_opts data_opts;
|
||||
enum data_cmd cmd;
|
||||
int ret = 0;
|
||||
|
||||
stats->data_type = BCH_DATA_BTREE;
|
||||
|
||||
for (id = 0; id < BTREE_ID_NR; id++) {
|
||||
for_each_btree_node(&stats->iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
|
||||
switch ((cmd = pred(c, arg, BKEY_TYPE_BTREE,
|
||||
bkey_i_to_s_c_extent(&b->key),
|
||||
&io_opts,
|
||||
&data_opts))) {
|
||||
case DATA_SKIP:
|
||||
goto next;
|
||||
case DATA_SCRUB:
|
||||
BUG();
|
||||
case DATA_ADD_REPLICAS:
|
||||
case DATA_REWRITE:
|
||||
break;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
|
||||
ret = bch2_btree_node_rewrite(c, &stats->iter,
|
||||
b->data->keys.seq, 0) ?: ret;
|
||||
next:
|
||||
bch2_btree_iter_cond_resched(&stats->iter);
|
||||
}
|
||||
|
||||
ret = bch2_btree_iter_unlock(&stats->iter) ?: ret;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
#if 0
|
||||
static enum data_cmd scrub_pred(struct bch_fs *c, void *arg,
|
||||
enum bkey_type type,
|
||||
struct bkey_s_c_extent e,
|
||||
struct bch_io_opts *io_opts,
|
||||
struct data_opts *data_opts)
|
||||
{
|
||||
return DATA_SCRUB;
|
||||
}
|
||||
#endif
|
||||
|
||||
static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
|
||||
enum bkey_type type,
|
||||
struct bkey_s_c_extent e,
|
||||
struct bch_io_opts *io_opts,
|
||||
struct data_opts *data_opts)
|
||||
{
|
||||
unsigned nr_good = bch2_extent_durability(c, e);
|
||||
unsigned replicas = type == BKEY_TYPE_BTREE
|
||||
? c->opts.metadata_replicas
|
||||
: io_opts->data_replicas;
|
||||
|
||||
if (!nr_good || nr_good >= replicas)
|
||||
return DATA_SKIP;
|
||||
|
||||
data_opts->target = 0;
|
||||
data_opts->btree_insert_flags = 0;
|
||||
return DATA_ADD_REPLICAS;
|
||||
}
|
||||
|
||||
static enum data_cmd migrate_pred(struct bch_fs *c, void *arg,
|
||||
enum bkey_type type,
|
||||
struct bkey_s_c_extent e,
|
||||
struct bch_io_opts *io_opts,
|
||||
struct data_opts *data_opts)
|
||||
{
|
||||
struct bch_ioctl_data *op = arg;
|
||||
|
||||
if (!bch2_extent_has_device(e, op->migrate.dev))
|
||||
return DATA_SKIP;
|
||||
|
||||
data_opts->target = 0;
|
||||
data_opts->btree_insert_flags = 0;
|
||||
data_opts->rewrite_dev = op->migrate.dev;
|
||||
return DATA_REWRITE;
|
||||
}
|
||||
|
||||
int bch2_data_job(struct bch_fs *c,
|
||||
struct bch_move_stats *stats,
|
||||
struct bch_ioctl_data op)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
switch (op.op) {
|
||||
case BCH_DATA_OP_REREPLICATE:
|
||||
stats->data_type = BCH_DATA_JOURNAL;
|
||||
ret = bch2_journal_flush_device_pins(&c->journal, -1);
|
||||
|
||||
ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
|
||||
ret = bch2_gc_btree_replicas(c) ?: ret;
|
||||
|
||||
ret = bch2_move_data(c, NULL,
|
||||
writepoint_hashed((unsigned long) current),
|
||||
op.start,
|
||||
op.end,
|
||||
rereplicate_pred, c, stats) ?: ret;
|
||||
ret = bch2_gc_data_replicas(c) ?: ret;
|
||||
break;
|
||||
case BCH_DATA_OP_MIGRATE:
|
||||
if (op.migrate.dev >= c->sb.nr_devices)
|
||||
return -EINVAL;
|
||||
|
||||
stats->data_type = BCH_DATA_JOURNAL;
|
||||
ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
|
||||
|
||||
ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret;
|
||||
ret = bch2_gc_btree_replicas(c) ?: ret;
|
||||
|
||||
ret = bch2_move_data(c, NULL,
|
||||
writepoint_hashed((unsigned long) current),
|
||||
op.start,
|
||||
op.end,
|
||||
migrate_pred, &op, stats) ?: ret;
|
||||
ret = bch2_gc_data_replicas(c) ?: ret;
|
||||
break;
|
||||
default:
|
||||
ret = -EINVAL;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
63
fs/bcachefs/move.h
Normal file
@ -0,0 +1,63 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_MOVE_H
#define _BCACHEFS_MOVE_H

#include "btree_iter.h"
#include "buckets.h"
#include "io_types.h"
#include "move_types.h"

struct bch_read_bio;
struct moving_context;

enum data_cmd {
	DATA_SKIP,
	DATA_SCRUB,
	DATA_ADD_REPLICAS,
	DATA_REWRITE,
	DATA_PROMOTE,
};

struct data_opts {
	u16		target;
	unsigned	rewrite_dev;
	int		btree_insert_flags;
};

struct migrate_write {
	enum data_cmd	data_cmd;
	struct data_opts data_opts;

	unsigned	nr_ptrs_reserved;

	struct moving_context *ctxt;

	/* what we read: */
	struct bch_extent_ptr ptr;
	u64		offset;

	struct bch_write_op op;
};

void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *);
int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
			    struct write_point_specifier,
			    struct bch_io_opts,
			    enum data_cmd, struct data_opts,
			    struct bkey_s_c);

typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
				enum bkey_type, struct bkey_s_c_extent,
				struct bch_io_opts *, struct data_opts *);

int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
		   struct write_point_specifier,
		   struct bpos, struct bpos,
		   move_pred_fn, void *,
		   struct bch_move_stats *);

int bch2_data_job(struct bch_fs *,
		  struct bch_move_stats *,
		  struct bch_ioctl_data);

#endif /* _BCACHEFS_MOVE_H */
15
fs/bcachefs/move_types.h
Normal file
@ -0,0 +1,15 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_MOVE_TYPES_H
#define _BCACHEFS_MOVE_TYPES_H

struct bch_move_stats {
	enum bch_data_type	data_type;
	struct btree_iter	iter;

	atomic64_t		keys_moved;
	atomic64_t		sectors_moved;
	atomic64_t		sectors_seen;
	atomic64_t		sectors_raced;
};

#endif /* _BCACHEFS_MOVE_TYPES_H */
309
fs/bcachefs/movinggc.c
Normal file
@ -0,0 +1,309 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Moving/copying garbage collector
|
||||
*
|
||||
* Copyright 2012 Google, Inc.
|
||||
*/
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "btree_iter.h"
|
||||
#include "btree_update.h"
|
||||
#include "buckets.h"
|
||||
#include "clock.h"
|
||||
#include "disk_groups.h"
|
||||
#include "extents.h"
|
||||
#include "eytzinger.h"
|
||||
#include "io.h"
|
||||
#include "keylist.h"
|
||||
#include "move.h"
|
||||
#include "movinggc.h"
|
||||
#include "super-io.h"
|
||||
#include "trace.h"
|
||||
|
||||
#include <linux/freezer.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/math64.h>
|
||||
#include <linux/sched/task.h>
|
||||
#include <linux/sort.h>
|
||||
#include <linux/wait.h>
|
||||
|
||||
/*
|
||||
* We can't use the entire copygc reserve in one iteration of copygc: we may
|
||||
* need the buckets we're freeing up to go back into the copygc reserve to make
|
||||
* forward progress, but if the copygc reserve is full they'll be available for
|
||||
* any allocation - and it's possible that in a given iteration, we free up most
|
||||
* of the buckets we're going to free before we allocate most of the buckets
|
||||
* we're going to allocate.
|
||||
*
|
||||
* If we only use half of the reserve per iteration, then in steady state we'll
|
||||
* always have room in the reserve for the buckets we're going to need in the
|
||||
* next iteration:
|
||||
*/
|
||||
#define COPYGC_BUCKETS_PER_ITER(ca) \
|
||||
((ca)->free[RESERVE_MOVINGGC].size / 2)
|
||||
|
||||
/*
|
||||
* Max sectors to move per iteration: Have to take into account internal
|
||||
* fragmentation from the multiple write points for each generation:
|
||||
*/
|
||||
#define COPYGC_SECTORS_PER_ITER(ca) \
|
||||
((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca))
|
||||
|
||||
static inline int sectors_used_cmp(copygc_heap *heap,
|
||||
struct copygc_heap_entry l,
|
||||
struct copygc_heap_entry r)
|
||||
{
|
||||
return (l.sectors > r.sectors) - (l.sectors < r.sectors);
|
||||
}
|
||||
|
||||
static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
|
||||
{
|
||||
const struct copygc_heap_entry *l = _l;
|
||||
const struct copygc_heap_entry *r = _r;
|
||||
|
||||
return (l->offset > r->offset) - (l->offset < r->offset);
|
||||
}
|
||||
|
||||
static bool __copygc_pred(struct bch_dev *ca,
|
||||
struct bkey_s_c_extent e)
|
||||
{
|
||||
copygc_heap *h = &ca->copygc_heap;
|
||||
const struct bch_extent_ptr *ptr =
|
||||
bch2_extent_has_device(e, ca->dev_idx);
|
||||
|
||||
if (ptr) {
|
||||
struct copygc_heap_entry search = { .offset = ptr->offset };
|
||||
|
||||
ssize_t i = eytzinger0_find_le(h->data, h->used,
|
||||
sizeof(h->data[0]),
|
||||
bucket_offset_cmp, &search);
|
||||
|
||||
return (i >= 0 &&
|
||||
ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
|
||||
ptr->gen == h->data[i].gen);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
|
||||
enum bkey_type type,
|
||||
struct bkey_s_c_extent e,
|
||||
struct bch_io_opts *io_opts,
|
||||
struct data_opts *data_opts)
|
||||
{
|
||||
struct bch_dev *ca = arg;
|
||||
|
||||
if (!__copygc_pred(ca, e))
|
||||
return DATA_SKIP;
|
||||
|
||||
data_opts->target = dev_to_target(ca->dev_idx);
|
||||
data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE;
|
||||
data_opts->rewrite_dev = ca->dev_idx;
|
||||
return DATA_REWRITE;
|
||||
}
|
||||
|
||||
static bool have_copygc_reserve(struct bch_dev *ca)
|
||||
{
|
||||
bool ret;
|
||||
|
||||
spin_lock(&ca->freelist_lock);
|
||||
ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) ||
|
||||
ca->allocator_blocked;
|
||||
spin_unlock(&ca->freelist_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
copygc_heap *h = &ca->copygc_heap;
|
||||
struct copygc_heap_entry e, *i;
|
||||
struct bucket_array *buckets;
|
||||
struct bch_move_stats move_stats;
|
||||
u64 sectors_to_move = 0, sectors_not_moved = 0;
|
||||
u64 buckets_to_move, buckets_not_moved = 0;
|
||||
size_t b;
|
||||
int ret;
|
||||
|
||||
memset(&move_stats, 0, sizeof(move_stats));
|
||||
closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
|
||||
|
||||
/*
|
||||
* Find buckets with lowest sector counts, skipping completely
|
||||
* empty buckets, by building a maxheap sorted by sector count,
|
||||
* and repeatedly replacing the maximum element until all
|
||||
* buckets have been visited.
|
||||
*/
|
||||
h->used = 0;
|
||||
|
||||
/*
|
||||
* We need bucket marks to be up to date - gc can't be recalculating
|
||||
* them:
|
||||
*/
|
||||
down_read(&c->gc_lock);
|
||||
down_read(&ca->bucket_lock);
|
||||
buckets = bucket_array(ca);
|
||||
|
||||
for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
|
||||
struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
|
||||
struct copygc_heap_entry e;
|
||||
|
||||
if (m.owned_by_allocator ||
|
||||
m.data_type != BCH_DATA_USER ||
|
||||
!bucket_sectors_used(m) ||
|
||||
bucket_sectors_used(m) >= ca->mi.bucket_size)
|
||||
continue;
|
||||
|
||||
e = (struct copygc_heap_entry) {
|
||||
.gen = m.gen,
|
||||
.sectors = bucket_sectors_used(m),
|
||||
.offset = bucket_to_sector(ca, b),
|
||||
};
|
||||
heap_add_or_replace(h, e, -sectors_used_cmp);
|
||||
}
|
||||
up_read(&ca->bucket_lock);
|
||||
up_read(&c->gc_lock);
|
||||
|
||||
for (i = h->data; i < h->data + h->used; i++)
|
||||
sectors_to_move += i->sectors;
|
||||
|
||||
while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) {
|
||||
BUG_ON(!heap_pop(h, e, -sectors_used_cmp));
|
||||
sectors_to_move -= e.sectors;
|
||||
}
|
||||
|
||||
buckets_to_move = h->used;
|
||||
|
||||
if (!buckets_to_move)
|
||||
return;
|
||||
|
||||
eytzinger0_sort(h->data, h->used,
|
||||
sizeof(h->data[0]),
|
||||
bucket_offset_cmp, NULL);
|
||||
|
||||
ret = bch2_move_data(c, &ca->copygc_pd.rate,
|
||||
writepoint_ptr(&ca->copygc_write_point),
|
||||
POS_MIN, POS_MAX,
|
||||
copygc_pred, ca,
|
||||
&move_stats);
|
||||
|
||||
down_read(&ca->bucket_lock);
|
||||
buckets = bucket_array(ca);
|
||||
for (i = h->data; i < h->data + h->used; i++) {
|
||||
size_t b = sector_to_bucket(ca, i->offset);
|
||||
struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
|
||||
|
||||
if (i->gen == m.gen && bucket_sectors_used(m)) {
|
||||
sectors_not_moved += bucket_sectors_used(m);
|
||||
buckets_not_moved++;
|
||||
}
|
||||
}
|
||||
up_read(&ca->bucket_lock);
|
||||
|
||||
if (sectors_not_moved && !ret)
|
||||
bch_warn(c, "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved",
|
||||
sectors_not_moved, sectors_to_move,
|
||||
buckets_not_moved, buckets_to_move);
|
||||
|
||||
trace_copygc(ca,
|
||||
atomic64_read(&move_stats.sectors_moved), sectors_not_moved,
|
||||
buckets_to_move, buckets_not_moved);
|
||||
}
|
||||
|
||||
static int bch2_copygc_thread(void *arg)
|
||||
{
|
||||
struct bch_dev *ca = arg;
|
||||
struct bch_fs *c = ca->fs;
|
||||
struct io_clock *clock = &c->io_clock[WRITE];
|
||||
struct bch_dev_usage usage;
|
||||
unsigned long last;
|
||||
u64 available, fragmented, reserve, next;
|
||||
|
||||
set_freezable();
|
||||
|
||||
while (!kthread_should_stop()) {
|
||||
if (kthread_wait_freezable(c->copy_gc_enabled))
|
||||
break;
|
||||
|
||||
last = atomic_long_read(&clock->now);
|
||||
|
||||
reserve = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) *
|
||||
ca->mi.bucket_size *
|
||||
c->opts.gc_reserve_percent, 200);
|
||||
|
||||
usage = bch2_dev_usage_read(c, ca);
|
||||
|
||||
/*
|
||||
* don't start copygc until less than half the gc reserve is
|
||||
* available:
|
||||
*/
|
||||
available = __dev_buckets_available(ca, usage) *
|
||||
ca->mi.bucket_size;
|
||||
if (available > reserve) {
|
||||
next = last + available - reserve;
|
||||
bch2_kthread_io_clock_wait(clock, next,
|
||||
MAX_SCHEDULE_TIMEOUT);
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* don't start copygc until there's more than half the copygc
|
||||
* reserve of fragmented space:
|
||||
*/
|
||||
fragmented = usage.sectors_fragmented;
|
||||
if (fragmented < reserve) {
|
||||
next = last + reserve - fragmented;
|
||||
bch2_kthread_io_clock_wait(clock, next,
|
||||
MAX_SCHEDULE_TIMEOUT);
|
||||
continue;
|
||||
}
|
||||
|
||||
bch2_copygc(c, ca);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void bch2_copygc_stop(struct bch_dev *ca)
|
||||
{
|
||||
ca->copygc_pd.rate.rate = UINT_MAX;
|
||||
bch2_ratelimit_reset(&ca->copygc_pd.rate);
|
||||
|
||||
if (ca->copygc_thread) {
|
||||
kthread_stop(ca->copygc_thread);
|
||||
put_task_struct(ca->copygc_thread);
|
||||
}
|
||||
ca->copygc_thread = NULL;
|
||||
}
|
||||
|
||||
int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
struct task_struct *t;
|
||||
|
||||
BUG_ON(ca->copygc_thread);
|
||||
|
||||
if (c->opts.nochanges)
|
||||
return 0;
|
||||
|
||||
if (bch2_fs_init_fault("copygc_start"))
|
||||
return -ENOMEM;
|
||||
|
||||
t = kthread_create(bch2_copygc_thread, ca,
|
||||
"bch_copygc[%s]", ca->name);
|
||||
if (IS_ERR(t))
|
||||
return PTR_ERR(t);
|
||||
|
||||
get_task_struct(t);
|
||||
|
||||
ca->copygc_thread = t;
|
||||
wake_up_process(ca->copygc_thread);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void bch2_dev_copygc_init(struct bch_dev *ca)
|
||||
{
|
||||
bch2_pd_controller_init(&ca->copygc_pd);
|
||||
ca->copygc_pd.d_term = 0;
|
||||
}
|
9
fs/bcachefs/movinggc.h
Normal file
@ -0,0 +1,9 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_MOVINGGC_H
|
||||
#define _BCACHEFS_MOVINGGC_H
|
||||
|
||||
void bch2_copygc_stop(struct bch_dev *);
|
||||
int bch2_copygc_start(struct bch_fs *, struct bch_dev *);
|
||||
void bch2_dev_copygc_init(struct bch_dev *);
|
||||
|
||||
#endif /* _BCACHEFS_MOVINGGC_H */
|
381
fs/bcachefs/opts.c
Normal file
@ -0,0 +1,381 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include <linux/kernel.h>
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "disk_groups.h"
|
||||
#include "opts.h"
|
||||
#include "super-io.h"
|
||||
#include "util.h"
|
||||
|
||||
const char * const bch2_error_actions[] = {
|
||||
"continue",
|
||||
"remount-ro",
|
||||
"panic",
|
||||
NULL
|
||||
};
|
||||
|
||||
const char * const bch2_csum_types[] = {
|
||||
"none",
|
||||
"crc32c",
|
||||
"crc64",
|
||||
NULL
|
||||
};
|
||||
|
||||
const char * const bch2_compression_types[] = {
|
||||
"none",
|
||||
"lz4",
|
||||
"gzip",
|
||||
"zstd",
|
||||
NULL
|
||||
};
|
||||
|
||||
const char * const bch2_str_hash_types[] = {
|
||||
"crc32c",
|
||||
"crc64",
|
||||
"siphash",
|
||||
NULL
|
||||
};
|
||||
|
||||
const char * const bch2_data_types[] = {
|
||||
"none",
|
||||
"sb",
|
||||
"journal",
|
||||
"btree",
|
||||
"data",
|
||||
"cached",
|
||||
NULL
|
||||
};
|
||||
|
||||
const char * const bch2_cache_replacement_policies[] = {
|
||||
"lru",
|
||||
"fifo",
|
||||
"random",
|
||||
NULL
|
||||
};
|
||||
|
||||
/* Default is -1; we skip past it for struct cached_dev's cache mode */
|
||||
const char * const bch2_cache_modes[] = {
|
||||
"default",
|
||||
"writethrough",
|
||||
"writeback",
|
||||
"writearound",
|
||||
"none",
|
||||
NULL
|
||||
};
|
||||
|
||||
const char * const bch2_dev_state[] = {
|
||||
"readwrite",
|
||||
"readonly",
|
||||
"failed",
|
||||
"spare",
|
||||
NULL
|
||||
};
|
||||
|
||||
void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
|
||||
{
|
||||
#define BCH_OPT(_name, ...) \
|
||||
if (opt_defined(src, _name)) \
|
||||
opt_set(*dst, _name, src._name);
|
||||
|
||||
BCH_OPTS()
|
||||
#undef BCH_OPT
|
||||
}
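/*
 * Illustration of the x-macro pattern used throughout this file: for a
 * hypothetical option named "foo", the BCH_OPTS() expansion above turns
 * into
 *
 *	if (opt_defined(src, foo))
 *		opt_set(*dst, foo, src.foo);
 *
 * and the *_by_id() helpers below get one switch case per option the same
 * way.
 */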
|
||||
|
||||
bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id)
|
||||
{
|
||||
switch (id) {
|
||||
#define BCH_OPT(_name, ...) \
|
||||
case Opt_##_name: \
|
||||
return opt_defined(*opts, _name);
|
||||
BCH_OPTS()
|
||||
#undef BCH_OPT
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id)
|
||||
{
|
||||
switch (id) {
|
||||
#define BCH_OPT(_name, ...) \
|
||||
case Opt_##_name: \
|
||||
return opts->_name;
|
||||
BCH_OPTS()
|
||||
#undef BCH_OPT
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v)
|
||||
{
|
||||
switch (id) {
|
||||
#define BCH_OPT(_name, ...) \
|
||||
case Opt_##_name: \
|
||||
opt_set(*opts, _name, v); \
|
||||
break;
|
||||
BCH_OPTS()
|
||||
#undef BCH_OPT
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Initial options from superblock - here we don't want any options undefined,
|
||||
* any options the superblock doesn't specify are set to 0:
|
||||
*/
|
||||
struct bch_opts bch2_opts_from_sb(struct bch_sb *sb)
|
||||
{
|
||||
struct bch_opts opts = bch2_opts_empty();
|
||||
|
||||
#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \
|
||||
if (_sb_opt != NO_SB_OPT) \
|
||||
opt_set(opts, _name, _sb_opt(sb));
|
||||
BCH_OPTS()
|
||||
#undef BCH_OPT
|
||||
|
||||
return opts;
|
||||
}
|
||||
|
||||
const struct bch_option bch2_opt_table[] = {
|
||||
#define OPT_BOOL() .type = BCH_OPT_BOOL
|
||||
#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, .min = _min, .max = _max
|
||||
#define OPT_STR(_choices) .type = BCH_OPT_STR, .choices = _choices
|
||||
#define OPT_FN(_fn) .type = BCH_OPT_FN, \
|
||||
.parse = _fn##_parse, \
|
||||
.print = _fn##_print
|
||||
|
||||
#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \
|
||||
[Opt_##_name] = { \
|
||||
.attr = { \
|
||||
.name = #_name, \
|
||||
.mode = _mode == OPT_RUNTIME ? 0644 : 0444, \
|
||||
}, \
|
||||
.mode = _mode, \
|
||||
.set_sb = SET_##_sb_opt, \
|
||||
_type \
|
||||
},
|
||||
|
||||
BCH_OPTS()
|
||||
#undef BCH_OPT
|
||||
};
|
||||
|
||||
int bch2_opt_lookup(const char *name)
|
||||
{
|
||||
const struct bch_option *i;
|
||||
|
||||
for (i = bch2_opt_table;
|
||||
i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table);
|
||||
i++)
|
||||
if (!strcmp(name, i->attr.name))
|
||||
return i - bch2_opt_table;
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
struct synonym {
|
||||
const char *s1, *s2;
|
||||
};
|
||||
|
||||
static const struct synonym bch_opt_synonyms[] = {
|
||||
{ "quota", "usrquota" },
|
||||
};
|
||||
|
||||
static int bch2_mount_opt_lookup(const char *name)
|
||||
{
|
||||
const struct synonym *i;
|
||||
|
||||
for (i = bch_opt_synonyms;
|
||||
i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms);
|
||||
i++)
|
||||
if (!strcmp(name, i->s1))
|
||||
name = i->s2;
|
||||
|
||||
return bch2_opt_lookup(name);
|
||||
}
|
||||
|
||||
int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt,
|
||||
const char *val, u64 *res)
|
||||
{
|
||||
ssize_t ret;
|
||||
|
||||
switch (opt->type) {
|
||||
case BCH_OPT_BOOL:
|
||||
ret = kstrtou64(val, 10, res);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
if (*res > 1)
|
||||
return -ERANGE;
|
||||
break;
|
||||
case BCH_OPT_UINT:
|
||||
ret = kstrtou64(val, 10, res);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
if (*res < opt->min || *res >= opt->max)
|
||||
return -ERANGE;
|
||||
break;
|
||||
case BCH_OPT_STR:
|
||||
ret = match_string(opt->choices, -1, val);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
*res = ret;
|
||||
break;
|
||||
case BCH_OPT_FN:
|
||||
if (!c)
|
||||
return -EINVAL;
|
||||
|
||||
return opt->parse(c, val, res);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bch2_opt_to_text(struct bch_fs *c, char *buf, size_t len,
|
||||
const struct bch_option *opt, u64 v,
|
||||
unsigned flags)
|
||||
{
|
||||
char *out = buf, *end = buf + len;
|
||||
|
||||
if (flags & OPT_SHOW_MOUNT_STYLE) {
|
||||
if (opt->type == BCH_OPT_BOOL)
|
||||
return scnprintf(out, end - out, "%s%s",
|
||||
v ? "" : "no",
|
||||
opt->attr.name);
|
||||
|
||||
out += scnprintf(out, end - out, "%s=", opt->attr.name);
|
||||
}
|
||||
|
||||
switch (opt->type) {
|
||||
case BCH_OPT_BOOL:
|
||||
case BCH_OPT_UINT:
|
||||
out += scnprintf(out, end - out, "%lli", v);
|
||||
break;
|
||||
case BCH_OPT_STR:
|
||||
out += (flags & OPT_SHOW_FULL_LIST)
|
||||
? bch2_scnprint_string_list(out, end - out, opt->choices, v)
|
||||
: scnprintf(out, end - out, "%s", opt->choices[v]);
|
||||
break;
|
||||
case BCH_OPT_FN:
|
||||
return opt->print(c, out, end - out, v);
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
|
||||
return out - buf;
|
||||
}
|
||||
|
||||
int bch2_parse_mount_opts(struct bch_opts *opts, char *options)
|
||||
{
|
||||
char *opt, *name, *val;
|
||||
int ret, id;
|
||||
u64 v;
|
||||
|
||||
while ((opt = strsep(&options, ",")) != NULL) {
|
||||
name = strsep(&opt, "=");
|
||||
val = opt;
|
||||
|
||||
if (val) {
|
||||
id = bch2_mount_opt_lookup(name);
|
||||
if (id < 0)
|
||||
goto bad_opt;
|
||||
|
||||
ret = bch2_opt_parse(NULL, &bch2_opt_table[id], val, &v);
|
||||
if (ret < 0)
|
||||
goto bad_val;
|
||||
} else {
|
||||
id = bch2_mount_opt_lookup(name);
|
||||
v = 1;
|
||||
|
||||
if (id < 0 &&
|
||||
!strncmp("no", name, 2)) {
|
||||
id = bch2_mount_opt_lookup(name + 2);
|
||||
v = 0;
|
||||
}
|
||||
|
||||
if (id < 0)
|
||||
goto bad_opt;
|
||||
|
||||
if (bch2_opt_table[id].type != BCH_OPT_BOOL)
|
||||
goto no_val;
|
||||
}
|
||||
|
||||
if (bch2_opt_table[id].mode < OPT_MOUNT)
|
||||
goto bad_opt;
|
||||
|
||||
if (id == Opt_acl &&
|
||||
!IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL))
|
||||
goto bad_opt;
|
||||
|
||||
if ((id == Opt_usrquota ||
|
||||
id == Opt_grpquota) &&
|
||||
!IS_ENABLED(CONFIG_BCACHEFS_QUOTA))
|
||||
goto bad_opt;
|
||||
|
||||
bch2_opt_set_by_id(opts, id, v);
|
||||
}
|
||||
|
||||
return 0;
|
||||
bad_opt:
|
||||
pr_err("Bad mount option %s", name);
|
||||
return -1;
|
||||
bad_val:
|
||||
pr_err("Invalid value %s for mount option %s", val, name);
|
||||
return -1;
|
||||
no_val:
|
||||
pr_err("Mount option %s requires a value", name);
|
||||
return -1;
|
||||
}
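/*
 * Example with a made-up mount string: "usrquota,compression=lz4,noacl"
 * is split on ',' and then on '='. "usrquota" has no value and is a bool,
 * so it becomes 1; "compression=lz4" goes through bch2_opt_parse(), which
 * matches "lz4" against bch2_compression_types; "noacl" fails the direct
 * lookup, so the "no" prefix is stripped and acl is set to 0.
 */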
|
||||
|
||||
/* io opts: */
|
||||
|
||||
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
|
||||
{
|
||||
struct bch_io_opts ret = { 0 };
|
||||
#define BCH_INODE_OPT(_name, _bits) \
|
||||
if (opt_defined(src, _name)) \
|
||||
opt_set(ret, _name, src._name);
|
||||
BCH_INODE_OPTS()
|
||||
#undef BCH_INODE_OPT
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src)
|
||||
{
|
||||
struct bch_opts ret = { 0 };
|
||||
#define BCH_INODE_OPT(_name, _bits) \
|
||||
if (opt_defined(src, _name)) \
|
||||
opt_set(ret, _name, src._name);
|
||||
BCH_INODE_OPTS()
|
||||
#undef BCH_INODE_OPT
|
||||
return ret;
|
||||
}
|
||||
|
||||
void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src)
|
||||
{
|
||||
#define BCH_INODE_OPT(_name, _bits) \
|
||||
if (opt_defined(src, _name)) \
|
||||
opt_set(*dst, _name, src._name);
|
||||
BCH_INODE_OPTS()
|
||||
#undef BCH_INODE_OPT
|
||||
}
|
||||
|
||||
bool bch2_opt_is_inode_opt(enum bch_opt_id id)
|
||||
{
|
||||
static const enum bch_opt_id inode_opt_list[] = {
|
||||
#define BCH_INODE_OPT(_name, _bits) Opt_##_name,
|
||||
BCH_INODE_OPTS()
|
||||
#undef BCH_INODE_OPT
|
||||
};
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++)
|
||||
if (inode_opt_list[i] == id)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
296
fs/bcachefs/opts.h
Normal file
@ -0,0 +1,296 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_OPTS_H
|
||||
#define _BCACHEFS_OPTS_H
|
||||
|
||||
#include <linux/bug.h>
|
||||
#include <linux/log2.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/sysfs.h>
|
||||
#include "bcachefs_format.h"
|
||||
|
||||
extern const char * const bch2_error_actions[];
|
||||
extern const char * const bch2_csum_types[];
|
||||
extern const char * const bch2_compression_types[];
|
||||
extern const char * const bch2_str_hash_types[];
|
||||
extern const char * const bch2_data_types[];
|
||||
extern const char * const bch2_cache_replacement_policies[];
|
||||
extern const char * const bch2_cache_modes[];
|
||||
extern const char * const bch2_dev_state[];
|
||||
|
||||
/*
|
||||
* Mount options; we also store defaults in the superblock.
|
||||
*
|
||||
* Also exposed via sysfs: if an option is writeable, and it's also stored in
|
||||
* the superblock, changing it via sysfs (currently? might change this) also
|
||||
* updates the superblock.
|
||||
*
|
||||
* Each option carries its own "defined" bit (see struct bch_opts below), so we
* can pass the mount options to bch2_fs_alloc() as a whole struct, and then only
* apply the options from that struct that were actually set.
|
||||
*/
|
||||
|
||||
/* dummy option, for options that aren't stored in the superblock */
|
||||
LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0);
|
||||
|
||||
enum opt_mode {
|
||||
OPT_INTERNAL,
|
||||
OPT_FORMAT,
|
||||
OPT_MOUNT,
|
||||
OPT_RUNTIME,
|
||||
};
|
||||
|
||||
enum opt_type {
|
||||
BCH_OPT_BOOL,
|
||||
BCH_OPT_UINT,
|
||||
BCH_OPT_STR,
|
||||
BCH_OPT_FN,
|
||||
};
|
||||
|
||||
/**
|
||||
* BCH_OPT(name, in-memory type, mode, option type, sb_opt, default)
|
||||
*
|
||||
* @name - name of mount option, sysfs attribute, and struct bch_opts
|
||||
* member
|
||||
*
|
||||
* @mode - when opt may be set
|
||||
*
|
||||
* @sb_opt - name of corresponding superblock option
|
||||
*
|
||||
* @type - one of OPT_BOOL, OPT_UINT, OPT_STR, OPT_FN
|
||||
*/
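/*
 * For example, the block_size entry below describes a u16 option that can
 * only be set at format time, takes an integer in [1, 128), is stored in
 * the superblock via BCH_SB_BLOCK_SIZE, and defaults to 8 (presumably in
 * 512-byte sectors, i.e. 4k blocks).
 */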
|
||||
|
||||
/*
|
||||
* XXX: add fields for
|
||||
* - default value
|
||||
* - helptext
|
||||
*/
|
||||
|
||||
#define BCH_OPTS() \
|
||||
BCH_OPT(block_size, u16, OPT_FORMAT, \
|
||||
OPT_UINT(1, 128), \
|
||||
BCH_SB_BLOCK_SIZE, 8) \
|
||||
BCH_OPT(btree_node_size, u16, OPT_FORMAT, \
|
||||
OPT_UINT(1, 128), \
|
||||
BCH_SB_BTREE_NODE_SIZE, 512) \
|
||||
BCH_OPT(errors, u8, OPT_RUNTIME, \
|
||||
OPT_STR(bch2_error_actions), \
|
||||
BCH_SB_ERROR_ACTION, BCH_ON_ERROR_RO) \
|
||||
BCH_OPT(metadata_replicas, u8, OPT_RUNTIME, \
|
||||
OPT_UINT(1, BCH_REPLICAS_MAX), \
|
||||
BCH_SB_META_REPLICAS_WANT, 1) \
|
||||
BCH_OPT(data_replicas, u8, OPT_RUNTIME, \
|
||||
OPT_UINT(1, BCH_REPLICAS_MAX), \
|
||||
BCH_SB_DATA_REPLICAS_WANT, 1) \
|
||||
BCH_OPT(metadata_replicas_required, u8, OPT_MOUNT, \
|
||||
OPT_UINT(1, BCH_REPLICAS_MAX), \
|
||||
BCH_SB_META_REPLICAS_REQ, 1) \
|
||||
BCH_OPT(data_replicas_required, u8, OPT_MOUNT, \
|
||||
OPT_UINT(1, BCH_REPLICAS_MAX), \
|
||||
BCH_SB_DATA_REPLICAS_REQ, 1) \
|
||||
BCH_OPT(metadata_checksum, u8, OPT_RUNTIME, \
|
||||
OPT_STR(bch2_csum_types), \
|
||||
BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_CRC32C) \
|
||||
BCH_OPT(data_checksum, u8, OPT_RUNTIME, \
|
||||
OPT_STR(bch2_csum_types), \
|
||||
BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_CRC32C) \
|
||||
BCH_OPT(compression, u8, OPT_RUNTIME, \
|
||||
OPT_STR(bch2_compression_types), \
|
||||
BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_NONE)\
|
||||
BCH_OPT(background_compression, u8, OPT_RUNTIME, \
|
||||
OPT_STR(bch2_compression_types), \
|
||||
BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_NONE)\
|
||||
BCH_OPT(str_hash, u8, OPT_RUNTIME, \
|
||||
OPT_STR(bch2_str_hash_types), \
|
||||
BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_SIPHASH) \
|
||||
BCH_OPT(foreground_target, u16, OPT_RUNTIME, \
|
||||
OPT_FN(bch2_opt_target), \
|
||||
BCH_SB_FOREGROUND_TARGET, 0) \
|
||||
BCH_OPT(background_target, u16, OPT_RUNTIME, \
|
||||
OPT_FN(bch2_opt_target), \
|
||||
BCH_SB_BACKGROUND_TARGET, 0) \
|
||||
BCH_OPT(promote_target, u16, OPT_RUNTIME, \
|
||||
OPT_FN(bch2_opt_target), \
|
||||
BCH_SB_PROMOTE_TARGET, 0) \
|
||||
BCH_OPT(inodes_32bit, u8, OPT_RUNTIME, \
|
||||
OPT_BOOL(), \
|
||||
BCH_SB_INODE_32BIT, false) \
|
||||
BCH_OPT(gc_reserve_percent, u8, OPT_MOUNT, \
|
||||
OPT_UINT(5, 21), \
|
||||
BCH_SB_GC_RESERVE, 8) \
|
||||
BCH_OPT(root_reserve_percent, u8, OPT_MOUNT, \
|
||||
OPT_UINT(0, 100), \
|
||||
BCH_SB_ROOT_RESERVE, 0) \
|
||||
BCH_OPT(wide_macs, u8, OPT_RUNTIME, \
|
||||
OPT_BOOL(), \
|
||||
BCH_SB_128_BIT_MACS, false) \
|
||||
BCH_OPT(acl, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
BCH_SB_POSIX_ACL, true) \
|
||||
BCH_OPT(usrquota, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
BCH_SB_USRQUOTA, false) \
|
||||
BCH_OPT(grpquota, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
BCH_SB_GRPQUOTA, false) \
|
||||
BCH_OPT(prjquota, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
BCH_SB_PRJQUOTA, false) \
|
||||
BCH_OPT(degraded, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false) \
|
||||
BCH_OPT(discard, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false) \
|
||||
BCH_OPT(verbose_recovery, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false) \
|
||||
BCH_OPT(verbose_init, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false) \
|
||||
BCH_OPT(journal_flush_disabled, u8, OPT_RUNTIME, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false) \
|
||||
BCH_OPT(nofsck, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false) \
|
||||
BCH_OPT(fix_errors, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false) \
|
||||
BCH_OPT(nochanges, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false) \
|
||||
BCH_OPT(noreplay, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false) \
|
||||
BCH_OPT(norecovery, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false) \
|
||||
BCH_OPT(noexcl, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false) \
|
||||
BCH_OPT(sb, u64, OPT_MOUNT, \
|
||||
OPT_UINT(0, S64_MAX), \
|
||||
NO_SB_OPT, BCH_SB_SECTOR) \
|
||||
BCH_OPT(read_only, u8, OPT_INTERNAL, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false) \
|
||||
BCH_OPT(nostart, u8, OPT_INTERNAL, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false) \
|
||||
BCH_OPT(no_data_io, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false)
|
||||
|
||||
struct bch_opts {
|
||||
#define BCH_OPT(_name, _bits, ...) unsigned _name##_defined:1;
|
||||
BCH_OPTS()
|
||||
#undef BCH_OPT
|
||||
|
||||
#define BCH_OPT(_name, _bits, ...) _bits _name;
|
||||
BCH_OPTS()
|
||||
#undef BCH_OPT
|
||||
};
|
||||
|
||||
static const struct bch_opts bch2_opts_default = {
|
||||
#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \
|
||||
._name##_defined = true, \
|
||||
._name = _default, \
|
||||
|
||||
BCH_OPTS()
|
||||
#undef BCH_OPT
|
||||
};
|
||||
|
||||
#define opt_defined(_opts, _name) ((_opts)._name##_defined)
|
||||
|
||||
#define opt_get(_opts, _name) \
|
||||
(opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name)
|
||||
|
||||
#define opt_set(_opts, _name, _v) \
|
||||
do { \
|
||||
(_opts)._name##_defined = true; \
|
||||
(_opts)._name = _v; \
|
||||
} while (0)
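/*
 * Typical usage, as a sketch: options start out entirely undefined,
 * opt_set() both defines and assigns a field, and opt_get() falls back to
 * bch2_opts_default for anything still undefined:
 *
 *	struct bch_opts opts = bch2_opts_empty();
 *	opt_set(opts, errors, BCH_ON_ERROR_RO);
 *	opt_get(opts, metadata_replicas);	// not set -> default, 1
 */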
|
||||
|
||||
static inline struct bch_opts bch2_opts_empty(void)
|
||||
{
|
||||
return (struct bch_opts) { 0 };
|
||||
}
|
||||
|
||||
void bch2_opts_apply(struct bch_opts *, struct bch_opts);
|
||||
|
||||
enum bch_opt_id {
|
||||
#define BCH_OPT(_name, ...) Opt_##_name,
|
||||
BCH_OPTS()
|
||||
#undef BCH_OPT
|
||||
bch2_opts_nr
|
||||
};
|
||||
|
||||
struct bch_fs;
|
||||
|
||||
struct bch_option {
|
||||
struct attribute attr;
|
||||
void (*set_sb)(struct bch_sb *, u64);
|
||||
enum opt_mode mode;
|
||||
enum opt_type type;
|
||||
|
||||
union {
|
||||
struct {
|
||||
u64 min, max;
|
||||
};
|
||||
struct {
|
||||
const char * const *choices;
|
||||
};
|
||||
struct {
|
||||
int (*parse)(struct bch_fs *, const char *, u64 *);
|
||||
int (*print)(struct bch_fs *, char *, size_t, u64);
|
||||
};
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
extern const struct bch_option bch2_opt_table[];
|
||||
|
||||
bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
|
||||
u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
|
||||
void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
|
||||
|
||||
struct bch_opts bch2_opts_from_sb(struct bch_sb *);
|
||||
|
||||
int bch2_opt_lookup(const char *);
|
||||
int bch2_opt_parse(struct bch_fs *, const struct bch_option *, const char *, u64 *);
|
||||
|
||||
#define OPT_SHOW_FULL_LIST (1 << 0)
|
||||
#define OPT_SHOW_MOUNT_STYLE (1 << 1)
|
||||
|
||||
int bch2_opt_to_text(struct bch_fs *, char *, size_t,
|
||||
const struct bch_option *, u64, unsigned);
|
||||
|
||||
int bch2_parse_mount_opts(struct bch_opts *, char *);
|
||||
|
||||
/* inode opts: */
|
||||
|
||||
#define BCH_INODE_OPTS() \
|
||||
BCH_INODE_OPT(data_checksum, 8) \
|
||||
BCH_INODE_OPT(compression, 8) \
|
||||
BCH_INODE_OPT(background_compression, 8) \
|
||||
BCH_INODE_OPT(data_replicas, 8) \
|
||||
BCH_INODE_OPT(promote_target, 16) \
|
||||
BCH_INODE_OPT(foreground_target, 16) \
|
||||
BCH_INODE_OPT(background_target, 16)
|
||||
|
||||
struct bch_io_opts {
|
||||
#define BCH_INODE_OPT(_name, _bits) unsigned _name##_defined:1;
|
||||
BCH_INODE_OPTS()
|
||||
#undef BCH_INODE_OPT
|
||||
|
||||
#define BCH_INODE_OPT(_name, _bits) u##_bits _name;
|
||||
BCH_INODE_OPTS()
|
||||
#undef BCH_INODE_OPT
|
||||
};
|
||||
|
||||
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
|
||||
struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts);
|
||||
void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts);
|
||||
bool bch2_opt_is_inode_opt(enum bch_opt_id);
|
||||
|
||||
#endif /* _BCACHEFS_OPTS_H */
|
790
fs/bcachefs/quota.c
Normal file
@ -0,0 +1,790 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include "bcachefs.h"
|
||||
#include "btree_update.h"
|
||||
#include "inode.h"
|
||||
#include "quota.h"
|
||||
#include "super-io.h"
|
||||
|
||||
static const char *bch2_sb_validate_quota(struct bch_sb *sb,
|
||||
struct bch_sb_field *f)
|
||||
{
|
||||
struct bch_sb_field_quota *q = field_to_type(f, quota);
|
||||
|
||||
if (vstruct_bytes(&q->field) != sizeof(*q))
|
||||
return "invalid field quota: wrong size";
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
const struct bch_sb_field_ops bch_sb_field_ops_quota = {
|
||||
.validate = bch2_sb_validate_quota,
|
||||
};
|
||||
|
||||
const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k)
|
||||
{
|
||||
struct bkey_s_c_quota dq;
|
||||
|
||||
if (k.k->p.inode >= QTYP_NR)
|
||||
return "invalid quota type";
|
||||
|
||||
switch (k.k->type) {
|
||||
case BCH_QUOTA: {
|
||||
dq = bkey_s_c_to_quota(k);
|
||||
|
||||
if (bkey_val_bytes(k.k) != sizeof(struct bch_quota))
|
||||
return "incorrect value size";
|
||||
|
||||
return NULL;
|
||||
}
|
||||
default:
|
||||
return "invalid type";
|
||||
}
|
||||
}
|
||||
|
||||
static const char * const bch2_quota_counters[] = {
|
||||
"space",
|
||||
"inodes",
|
||||
};
|
||||
|
||||
void bch2_quota_to_text(struct bch_fs *c, char *buf,
|
||||
size_t size, struct bkey_s_c k)
|
||||
{
|
||||
char *out = buf, *end = buf + size;
|
||||
struct bkey_s_c_quota dq;
|
||||
unsigned i;
|
||||
|
||||
switch (k.k->type) {
|
||||
case BCH_QUOTA:
|
||||
dq = bkey_s_c_to_quota(k);
|
||||
|
||||
for (i = 0; i < Q_COUNTERS; i++)
|
||||
out += scnprintf(out, end - out, "%s hardlimit %llu softlimit %llu",
|
||||
bch2_quota_counters[i],
|
||||
le64_to_cpu(dq.v->c[i].hardlimit),
|
||||
le64_to_cpu(dq.v->c[i].softlimit));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_QUOTA
|
||||
|
||||
#include <linux/cred.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/quota.h>
|
||||
|
||||
static inline unsigned __next_qtype(unsigned i, unsigned qtypes)
|
||||
{
|
||||
qtypes >>= i;
|
||||
return qtypes ? i + __ffs(qtypes) : QTYP_NR;
|
||||
}
|
||||
|
||||
#define for_each_set_qtype(_c, _i, _q, _qtypes) \
|
||||
for (_i = 0; \
|
||||
(_i = __next_qtype(_i, _qtypes), \
|
||||
_q = &(_c)->quotas[_i], \
|
||||
_i < QTYP_NR); \
|
||||
_i++)
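/*
 * Illustration: if only the user and project quota bits are set in
 * _qtypes (assuming the usual QTYP_USR < QTYP_GRP < QTYP_PRJ ordering),
 * __next_qtype() skips the unset group bit, so the loop body runs with
 * _i == QTYP_USR and then _i == QTYP_PRJ, _q pointing at the matching
 * c->quotas[] slot each time, and the loop ends once __next_qtype()
 * returns QTYP_NR.
 */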
|
||||
|
||||
static bool ignore_hardlimit(struct bch_memquota_type *q)
|
||||
{
|
||||
if (capable(CAP_SYS_RESOURCE))
|
||||
return true;
|
||||
#if 0
|
||||
struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];
|
||||
|
||||
return capable(CAP_SYS_RESOURCE) &&
|
||||
(info->dqi_format->qf_fmt_id != QFMT_VFS_OLD ||
|
||||
!(info->dqi_flags & DQF_ROOT_SQUASH));
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
enum quota_msg {
|
||||
SOFTWARN, /* Softlimit reached */
|
||||
SOFTLONGWARN, /* Grace time expired */
|
||||
HARDWARN, /* Hardlimit reached */
|
||||
|
||||
HARDBELOW, /* Usage got below inode hardlimit */
|
||||
SOFTBELOW, /* Usage got below inode softlimit */
|
||||
};
|
||||
|
||||
static int quota_nl[][Q_COUNTERS] = {
|
||||
[HARDWARN][Q_SPC] = QUOTA_NL_BHARDWARN,
|
||||
[SOFTLONGWARN][Q_SPC] = QUOTA_NL_BSOFTLONGWARN,
|
||||
[SOFTWARN][Q_SPC] = QUOTA_NL_BSOFTWARN,
|
||||
[HARDBELOW][Q_SPC] = QUOTA_NL_BHARDBELOW,
|
||||
[SOFTBELOW][Q_SPC] = QUOTA_NL_BSOFTBELOW,
|
||||
|
||||
[HARDWARN][Q_INO] = QUOTA_NL_IHARDWARN,
|
||||
[SOFTLONGWARN][Q_INO] = QUOTA_NL_ISOFTLONGWARN,
|
||||
[SOFTWARN][Q_INO] = QUOTA_NL_ISOFTWARN,
|
||||
[HARDBELOW][Q_INO] = QUOTA_NL_IHARDBELOW,
|
||||
[SOFTBELOW][Q_INO] = QUOTA_NL_ISOFTBELOW,
|
||||
};
|
||||
|
||||
struct quota_msgs {
|
||||
u8 nr;
|
||||
struct {
|
||||
u8 qtype;
|
||||
u8 msg;
|
||||
} m[QTYP_NR * Q_COUNTERS];
|
||||
};
|
||||
|
||||
static void prepare_msg(unsigned qtype,
|
||||
enum quota_counters counter,
|
||||
struct quota_msgs *msgs,
|
||||
enum quota_msg msg_type)
|
||||
{
|
||||
BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m));
|
||||
|
||||
msgs->m[msgs->nr].qtype = qtype;
|
||||
msgs->m[msgs->nr].msg = quota_nl[msg_type][counter];
|
||||
msgs->nr++;
|
||||
}
|
||||
|
||||
static void prepare_warning(struct memquota_counter *qc,
|
||||
unsigned qtype,
|
||||
enum quota_counters counter,
|
||||
struct quota_msgs *msgs,
|
||||
enum quota_msg msg_type)
|
||||
{
|
||||
if (qc->warning_issued & (1 << msg_type))
|
||||
return;
|
||||
|
||||
prepare_msg(qtype, counter, msgs, msg_type);
|
||||
}
|
||||
|
||||
static void flush_warnings(struct bch_qid qid,
|
||||
struct super_block *sb,
|
||||
struct quota_msgs *msgs)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < msgs->nr; i++)
|
||||
quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, qid.q[i]),
|
||||
sb->s_dev, msgs->m[i].msg);
|
||||
}
|
||||
|
||||
static int bch2_quota_check_limit(struct bch_fs *c,
|
||||
unsigned qtype,
|
||||
struct bch_memquota *mq,
|
||||
struct quota_msgs *msgs,
|
||||
enum quota_counters counter,
|
||||
s64 v,
|
||||
enum quota_acct_mode mode)
|
||||
{
|
||||
struct bch_memquota_type *q = &c->quotas[qtype];
|
||||
struct memquota_counter *qc = &mq->c[counter];
|
||||
u64 n = qc->v + v;
|
||||
|
||||
BUG_ON((s64) n < 0);
|
||||
|
||||
if (mode == BCH_QUOTA_NOCHECK)
|
||||
return 0;
|
||||
|
||||
if (v <= 0) {
|
||||
if (n < qc->hardlimit &&
|
||||
(qc->warning_issued & (1 << HARDWARN))) {
|
||||
qc->warning_issued &= ~(1 << HARDWARN);
|
||||
prepare_msg(qtype, counter, msgs, HARDBELOW);
|
||||
}
|
||||
|
||||
if (n < qc->softlimit &&
|
||||
(qc->warning_issued & (1 << SOFTWARN))) {
|
||||
qc->warning_issued &= ~(1 << SOFTWARN);
|
||||
prepare_msg(qtype, counter, msgs, SOFTBELOW);
|
||||
}
|
||||
|
||||
qc->warning_issued = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (qc->hardlimit &&
|
||||
qc->hardlimit < n &&
|
||||
!ignore_hardlimit(q)) {
|
||||
if (mode == BCH_QUOTA_PREALLOC)
|
||||
return -EDQUOT;
|
||||
|
||||
prepare_warning(qc, qtype, counter, msgs, HARDWARN);
|
||||
}
|
||||
|
||||
if (qc->softlimit &&
|
||||
qc->softlimit < n &&
|
||||
qc->timer &&
|
||||
ktime_get_real_seconds() >= qc->timer &&
|
||||
!ignore_hardlimit(q)) {
|
||||
if (mode == BCH_QUOTA_PREALLOC)
|
||||
return -EDQUOT;
|
||||
|
||||
prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN);
|
||||
}
|
||||
|
||||
if (qc->softlimit &&
|
||||
qc->softlimit < n &&
|
||||
qc->timer == 0) {
|
||||
if (mode == BCH_QUOTA_PREALLOC)
|
||||
return -EDQUOT;
|
||||
|
||||
prepare_warning(qc, qtype, counter, msgs, SOFTWARN);
|
||||
|
||||
/* XXX is this the right one? */
|
||||
qc->timer = ktime_get_real_seconds() +
|
||||
q->limits[counter].warnlimit;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
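/*
 * Summary of the checks above: releases (v <= 0) only clear
 * previously-issued warnings and note when usage drops back below the
 * limits; for allocations in BCH_QUOTA_PREALLOC mode, exceeding the hard
 * limit, exceeding the soft limit after the grace timer expired, or newly
 * crossing the soft limit all fail with -EDQUOT, while in the other modes
 * the same conditions just queue netlink warnings (and newly crossing the
 * soft limit also starts the grace timer).
 */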
|
||||
|
||||
int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
|
||||
enum quota_counters counter, s64 v,
|
||||
enum quota_acct_mode mode)
|
||||
{
|
||||
unsigned qtypes = enabled_qtypes(c);
|
||||
struct bch_memquota_type *q;
|
||||
struct bch_memquota *mq[QTYP_NR];
|
||||
struct quota_msgs msgs;
|
||||
unsigned i;
|
||||
int ret = 0;
|
||||
|
||||
memset(&msgs, 0, sizeof(msgs));
|
||||
|
||||
for_each_set_qtype(c, i, q, qtypes)
|
||||
mutex_lock_nested(&q->lock, i);
|
||||
|
||||
for_each_set_qtype(c, i, q, qtypes) {
|
||||
mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_NOFS);
|
||||
if (!mq[i]) {
|
||||
ret = -ENOMEM;
|
||||
goto err;
|
||||
}
|
||||
|
||||
ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode);
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
|
||||
for_each_set_qtype(c, i, q, qtypes)
|
||||
mq[i]->c[counter].v += v;
|
||||
err:
|
||||
for_each_set_qtype(c, i, q, qtypes)
|
||||
mutex_unlock(&q->lock);
|
||||
|
||||
flush_warnings(qid, c->vfs_sb, &msgs);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void __bch2_quota_transfer(struct bch_memquota *src_q,
|
||||
struct bch_memquota *dst_q,
|
||||
enum quota_counters counter, s64 v)
|
||||
{
|
||||
BUG_ON(v > src_q->c[counter].v);
|
||||
BUG_ON(v + dst_q->c[counter].v < v);
|
||||
|
||||
src_q->c[counter].v -= v;
|
||||
dst_q->c[counter].v += v;
|
||||
}
|
||||
|
||||
int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
|
||||
struct bch_qid dst,
|
||||
struct bch_qid src, u64 space)
|
||||
{
|
||||
struct bch_memquota_type *q;
|
||||
struct bch_memquota *src_q[3], *dst_q[3];
|
||||
struct quota_msgs msgs;
|
||||
unsigned i;
|
||||
int ret = 0;
|
||||
|
||||
qtypes &= enabled_qtypes(c);
|
||||
|
||||
memset(&msgs, 0, sizeof(msgs));
|
||||
|
||||
for_each_set_qtype(c, i, q, qtypes)
|
||||
mutex_lock_nested(&q->lock, i);
|
||||
|
||||
for_each_set_qtype(c, i, q, qtypes) {
|
||||
src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_NOFS);
|
||||
dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_NOFS);
|
||||
|
||||
if (!src_q[i] || !dst_q[i]) {
|
||||
ret = -ENOMEM;
|
||||
goto err;
|
||||
}
|
||||
|
||||
ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC,
|
||||
dst_q[i]->c[Q_SPC].v + space,
|
||||
BCH_QUOTA_PREALLOC);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO,
|
||||
dst_q[i]->c[Q_INO].v + 1,
|
||||
BCH_QUOTA_PREALLOC);
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
|
||||
for_each_set_qtype(c, i, q, qtypes) {
|
||||
__bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space);
|
||||
__bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1);
|
||||
}
|
||||
|
||||
err:
|
||||
for_each_set_qtype(c, i, q, qtypes)
|
||||
mutex_unlock(&q->lock);
|
||||
|
||||
flush_warnings(dst, c->vfs_sb, &msgs);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k)
|
||||
{
|
||||
struct bkey_s_c_quota dq;
|
||||
struct bch_memquota_type *q;
|
||||
struct bch_memquota *mq;
|
||||
unsigned i;
|
||||
|
||||
BUG_ON(k.k->p.inode >= QTYP_NR);
|
||||
|
||||
switch (k.k->type) {
|
||||
case BCH_QUOTA:
|
||||
dq = bkey_s_c_to_quota(k);
|
||||
q = &c->quotas[k.k->p.inode];
|
||||
|
||||
mutex_lock(&q->lock);
|
||||
mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL);
|
||||
if (!mq) {
|
||||
mutex_unlock(&q->lock);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
for (i = 0; i < Q_COUNTERS; i++) {
|
||||
mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit);
|
||||
mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit);
|
||||
}
|
||||
|
||||
mutex_unlock(&q->lock);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type)
|
||||
{
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
int ret = 0;
|
||||
|
||||
for_each_btree_key(&iter, c, BTREE_ID_QUOTAS, POS(type, 0),
|
||||
BTREE_ITER_PREFETCH, k) {
|
||||
if (k.k->p.inode != type)
|
||||
break;
|
||||
|
||||
ret = __bch2_quota_set(c, k);
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
|
||||
return bch2_btree_iter_unlock(&iter) ?: ret;
|
||||
}
|
||||
|
||||
void bch2_fs_quota_exit(struct bch_fs *c)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(c->quotas); i++)
|
||||
genradix_free(&c->quotas[i].table);
|
||||
}
|
||||
|
||||
void bch2_fs_quota_init(struct bch_fs *c)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(c->quotas); i++)
|
||||
mutex_init(&c->quotas[i].lock);
|
||||
}
|
||||
|
||||
static void bch2_sb_quota_read(struct bch_fs *c)
|
||||
{
|
||||
struct bch_sb_field_quota *sb_quota;
|
||||
unsigned i, j;
|
||||
|
||||
sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
|
||||
if (!sb_quota)
|
||||
return;
|
||||
|
||||
for (i = 0; i < QTYP_NR; i++) {
|
||||
struct bch_memquota_type *q = &c->quotas[i];
|
||||
|
||||
for (j = 0; j < Q_COUNTERS; j++) {
|
||||
q->limits[j].timelimit =
|
||||
le32_to_cpu(sb_quota->q[i].c[j].timelimit);
|
||||
q->limits[j].warnlimit =
|
||||
le32_to_cpu(sb_quota->q[i].c[j].warnlimit);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int bch2_fs_quota_read(struct bch_fs *c)
|
||||
{
|
||||
unsigned i, qtypes = enabled_qtypes(c);
|
||||
struct bch_memquota_type *q;
|
||||
struct btree_iter iter;
|
||||
struct bch_inode_unpacked u;
|
||||
struct bkey_s_c k;
|
||||
int ret = 0;
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
bch2_sb_quota_read(c);
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
for_each_set_qtype(c, i, q, qtypes) {
|
||||
ret = bch2_quota_init_type(c, i);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN,
|
||||
BTREE_ITER_PREFETCH, k) {
|
||||
switch (k.k->type) {
|
||||
case BCH_INODE_FS:
|
||||
ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors,
|
||||
BCH_QUOTA_NOCHECK);
|
||||
bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
|
||||
BCH_QUOTA_NOCHECK);
|
||||
}
|
||||
}
|
||||
return bch2_btree_iter_unlock(&iter) ?: ret;
|
||||
}
|
||||
|
||||
/* Enable/disable/delete quotas for an entire filesystem: */
|
||||
|
||||
static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
|
||||
{
|
||||
struct bch_fs *c = sb->s_fs_info;
|
||||
|
||||
if (sb->s_flags & SB_RDONLY)
|
||||
return -EROFS;
|
||||
|
||||
/* Accounting must be enabled at mount time: */
|
||||
if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT))
|
||||
return -EINVAL;
|
||||
|
||||
/* Can't enable enforcement without accounting: */
|
||||
if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota)
|
||||
return -EINVAL;
|
||||
|
||||
if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota)
|
||||
return -EINVAL;
|
||||
|
||||
if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota)
|
||||
return -EINVAL;
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
if (uflags & FS_QUOTA_UDQ_ENFD)
|
||||
SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true);
|
||||
|
||||
if (uflags & FS_QUOTA_GDQ_ENFD)
|
||||
SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true);
|
||||
|
||||
if (uflags & FS_QUOTA_PDQ_ENFD)
|
||||
SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true);
|
||||
|
||||
bch2_write_super(c);
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int bch2_quota_disable(struct super_block *sb, unsigned uflags)
|
||||
{
|
||||
struct bch_fs *c = sb->s_fs_info;
|
||||
|
||||
if (sb->s_flags & SB_RDONLY)
|
||||
return -EROFS;
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
if (uflags & FS_QUOTA_UDQ_ENFD)
|
||||
SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false);
|
||||
|
||||
if (uflags & FS_QUOTA_GDQ_ENFD)
|
||||
SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false);
|
||||
|
||||
if (uflags & FS_QUOTA_PDQ_ENFD)
|
||||
SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false);
|
||||
|
||||
bch2_write_super(c);
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
|
||||
{
|
||||
struct bch_fs *c = sb->s_fs_info;
|
||||
int ret;
|
||||
|
||||
if (sb->s_flags & SB_RDONLY)
|
||||
return -EROFS;
|
||||
|
||||
if (uflags & FS_USER_QUOTA) {
|
||||
if (c->opts.usrquota)
|
||||
return -EINVAL;
|
||||
|
||||
ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
|
||||
POS(QTYP_USR, 0),
|
||||
POS(QTYP_USR + 1, 0),
|
||||
ZERO_VERSION, NULL, NULL, NULL);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (uflags & FS_GROUP_QUOTA) {
|
||||
if (c->opts.grpquota)
|
||||
return -EINVAL;
|
||||
|
||||
ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
|
||||
POS(QTYP_GRP, 0),
|
||||
POS(QTYP_GRP + 1, 0),
|
||||
ZERO_VERSION, NULL, NULL, NULL);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (uflags & FS_PROJ_QUOTA) {
|
||||
if (c->opts.prjquota)
|
||||
return -EINVAL;
|
||||
|
||||
ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
|
||||
POS(QTYP_PRJ, 0),
|
||||
POS(QTYP_PRJ + 1, 0),
|
||||
ZERO_VERSION, NULL, NULL, NULL);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return quota status information, such as enforcements, quota file inode
|
||||
* numbers etc.
|
||||
*/
|
||||
static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state)
|
||||
{
|
||||
struct bch_fs *c = sb->s_fs_info;
|
||||
unsigned qtypes = enabled_qtypes(c);
|
||||
unsigned i;
|
||||
|
||||
memset(state, 0, sizeof(*state));
|
||||
|
||||
for (i = 0; i < QTYP_NR; i++) {
|
||||
state->s_state[i].flags |= QCI_SYSFILE;
|
||||
|
||||
if (!(qtypes & (1 << i)))
|
||||
continue;
|
||||
|
||||
state->s_state[i].flags |= QCI_ACCT_ENABLED;
|
||||
|
||||
state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit;
|
||||
state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit;
|
||||
|
||||
state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit;
|
||||
state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Adjust quota timers & warnings
|
||||
*/
|
||||
static int bch2_quota_set_info(struct super_block *sb, int type,
|
||||
struct qc_info *info)
|
||||
{
|
||||
struct bch_fs *c = sb->s_fs_info;
|
||||
struct bch_sb_field_quota *sb_quota;
|
||||
struct bch_memquota_type *q;
|
||||
|
||||
if (sb->s_flags & SB_RDONLY)
|
||||
return -EROFS;
|
||||
|
||||
if (type >= QTYP_NR)
|
||||
return -EINVAL;
|
||||
|
||||
if (!((1 << type) & enabled_qtypes(c)))
|
||||
return -ESRCH;
|
||||
|
||||
if (info->i_fieldmask &
|
||||
~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS))
|
||||
return -EINVAL;
|
||||
|
||||
q = &c->quotas[type];
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
|
||||
if (!sb_quota) {
|
||||
sb_quota = bch2_sb_resize_quota(&c->disk_sb,
|
||||
sizeof(*sb_quota) / sizeof(u64));
|
||||
if (!sb_quota)
|
||||
return -ENOSPC;
|
||||
}
|
||||
|
||||
if (info->i_fieldmask & QC_SPC_TIMER)
|
||||
sb_quota->q[type].c[Q_SPC].timelimit =
|
||||
cpu_to_le32(info->i_spc_timelimit);
|
||||
|
||||
if (info->i_fieldmask & QC_SPC_WARNS)
|
||||
sb_quota->q[type].c[Q_SPC].warnlimit =
|
||||
cpu_to_le32(info->i_spc_warnlimit);
|
||||
|
||||
if (info->i_fieldmask & QC_INO_TIMER)
|
||||
sb_quota->q[type].c[Q_INO].timelimit =
|
||||
cpu_to_le32(info->i_ino_timelimit);
|
||||
|
||||
if (info->i_fieldmask & QC_INO_WARNS)
|
||||
sb_quota->q[type].c[Q_INO].warnlimit =
|
||||
cpu_to_le32(info->i_ino_warnlimit);
|
||||
|
||||
bch2_sb_quota_read(c);
|
||||
|
||||
bch2_write_super(c);
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Get/set individual quotas: */
|
||||
|
||||
static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src)
|
||||
{
|
||||
dst->d_space = src->c[Q_SPC].v << 9;
|
||||
dst->d_spc_hardlimit = src->c[Q_SPC].hardlimit << 9;
|
||||
dst->d_spc_softlimit = src->c[Q_SPC].softlimit << 9;
|
||||
dst->d_spc_timer = src->c[Q_SPC].timer;
|
||||
dst->d_spc_warns = src->c[Q_SPC].warns;
|
||||
|
||||
dst->d_ino_count = src->c[Q_INO].v;
|
||||
dst->d_ino_hardlimit = src->c[Q_INO].hardlimit;
|
||||
dst->d_ino_softlimit = src->c[Q_INO].softlimit;
|
||||
dst->d_ino_timer = src->c[Q_INO].timer;
|
||||
dst->d_ino_warns = src->c[Q_INO].warns;
|
||||
}
|
||||
|
||||
static int bch2_get_quota(struct super_block *sb, struct kqid kqid,
|
||||
struct qc_dqblk *qdq)
|
||||
{
|
||||
struct bch_fs *c = sb->s_fs_info;
|
||||
struct bch_memquota_type *q = &c->quotas[kqid.type];
|
||||
qid_t qid = from_kqid(&init_user_ns, kqid);
|
||||
struct bch_memquota *mq;
|
||||
|
||||
memset(qdq, 0, sizeof(*qdq));
|
||||
|
||||
mutex_lock(&q->lock);
|
||||
mq = genradix_ptr(&q->table, qid);
|
||||
if (mq)
|
||||
__bch2_quota_get(qdq, mq);
|
||||
mutex_unlock(&q->lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid,
|
||||
struct qc_dqblk *qdq)
|
||||
{
|
||||
struct bch_fs *c = sb->s_fs_info;
|
||||
struct bch_memquota_type *q = &c->quotas[kqid->type];
|
||||
qid_t qid = from_kqid(&init_user_ns, *kqid);
|
||||
struct genradix_iter iter = genradix_iter_init(&q->table, qid);
|
||||
struct bch_memquota *mq;
|
||||
int ret = 0;
|
||||
|
||||
mutex_lock(&q->lock);
|
||||
|
||||
while ((mq = genradix_iter_peek(&iter, &q->table))) {
|
||||
if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) {
|
||||
__bch2_quota_get(qdq, mq);
|
||||
*kqid = make_kqid(current_user_ns(), kqid->type, iter.pos);
|
||||
goto found;
|
||||
}
|
||||
|
||||
genradix_iter_advance(&iter, &q->table);
|
||||
}
|
||||
|
||||
ret = -ENOENT;
|
||||
found:
|
||||
mutex_unlock(&q->lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int bch2_set_quota(struct super_block *sb, struct kqid qid,
|
||||
struct qc_dqblk *qdq)
|
||||
{
|
||||
struct bch_fs *c = sb->s_fs_info;
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
struct bkey_i_quota new_quota;
|
||||
int ret;
|
||||
|
||||
if (sb->s_flags & SB_RDONLY)
|
||||
return -EROFS;
|
||||
|
||||
bkey_quota_init(&new_quota.k_i);
|
||||
new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
|
||||
|
||||
bch2_btree_iter_init(&iter, c, BTREE_ID_QUOTAS, new_quota.k.p,
|
||||
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
|
||||
k = bch2_btree_iter_peek_slot(&iter);
|
||||
|
||||
ret = btree_iter_err(k);
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
|
||||
switch (k.k->type) {
|
||||
case BCH_QUOTA:
|
||||
new_quota.v = *bkey_s_c_to_quota(k).v;
|
||||
break;
|
||||
}
|
||||
|
||||
if (qdq->d_fieldmask & QC_SPC_SOFT)
|
||||
new_quota.v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9);
|
||||
if (qdq->d_fieldmask & QC_SPC_HARD)
|
||||
new_quota.v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9);
|
||||
|
||||
if (qdq->d_fieldmask & QC_INO_SOFT)
|
||||
new_quota.v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit);
|
||||
if (qdq->d_fieldmask & QC_INO_HARD)
|
||||
new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);
|
||||
|
||||
ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
|
||||
BTREE_INSERT_ENTRY(&iter, &new_quota.k_i));
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
const struct quotactl_ops bch2_quotactl_operations = {
|
||||
.quota_enable = bch2_quota_enable,
|
||||
.quota_disable = bch2_quota_disable,
|
||||
.rm_xquota = bch2_quota_remove,
|
||||
|
||||
.get_state = bch2_quota_get_state,
|
||||
.set_info = bch2_quota_set_info,
|
||||
|
||||
.get_dqblk = bch2_get_quota,
|
||||
.get_nextdqblk = bch2_get_next_quota,
|
||||
.set_dqblk = bch2_set_quota,
|
||||
};
|
||||
|
||||
#endif /* CONFIG_BCACHEFS_QUOTA */
|
76
fs/bcachefs/quota.h
Normal file
@ -0,0 +1,76 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_QUOTA_H
|
||||
#define _BCACHEFS_QUOTA_H
|
||||
|
||||
#include "inode.h"
|
||||
#include "quota_types.h"
|
||||
|
||||
extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
|
||||
|
||||
const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c);
|
||||
void bch2_quota_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
|
||||
|
||||
#define bch2_bkey_quota_ops (struct bkey_ops) { \
|
||||
.key_invalid = bch2_quota_invalid, \
|
||||
.val_to_text = bch2_quota_to_text, \
|
||||
}
|
||||
|
||||
enum quota_acct_mode {
|
||||
BCH_QUOTA_PREALLOC,
|
||||
BCH_QUOTA_WARN,
|
||||
BCH_QUOTA_NOCHECK,
|
||||
};
|
||||
|
||||
static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u)
|
||||
{
|
||||
return (struct bch_qid) {
|
||||
.q[QTYP_USR] = u->bi_uid,
|
||||
.q[QTYP_GRP] = u->bi_gid,
|
||||
.q[QTYP_PRJ] = u->bi_project,
|
||||
};
|
||||
}
|
||||
|
||||
static inline unsigned enabled_qtypes(struct bch_fs *c)
|
||||
{
|
||||
return ((c->opts.usrquota << QTYP_USR)|
|
||||
(c->opts.grpquota << QTYP_GRP)|
|
||||
(c->opts.prjquota << QTYP_PRJ));
|
||||
}
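/*
 * e.g. a filesystem mounted with usrquota and grpquota but not prjquota
 * gets a mask with exactly the QTYP_USR and QTYP_GRP bits set, which is
 * what bch2_quota_acct() and bch2_quota_transfer() iterate over via
 * for_each_set_qtype().
 */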
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_QUOTA
|
||||
|
||||
int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters,
|
||||
s64, enum quota_acct_mode);
|
||||
|
||||
int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid,
|
||||
struct bch_qid, u64);
|
||||
|
||||
void bch2_fs_quota_exit(struct bch_fs *);
|
||||
void bch2_fs_quota_init(struct bch_fs *);
|
||||
int bch2_fs_quota_read(struct bch_fs *);
|
||||
|
||||
extern const struct quotactl_ops bch2_quotactl_operations;
|
||||
|
||||
#else
|
||||
|
||||
static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
|
||||
enum quota_counters counter, s64 v,
|
||||
enum quota_acct_mode mode)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
|
||||
struct bch_qid dst,
|
||||
struct bch_qid src, u64 space)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void bch2_fs_quota_exit(struct bch_fs *c) {}
|
||||
static inline void bch2_fs_quota_init(struct bch_fs *c) {}
|
||||
static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; }
|
||||
|
||||
#endif
|
||||
|
||||
#endif /* _BCACHEFS_QUOTA_H */
|
37
fs/bcachefs/quota_types.h
Normal file
@ -0,0 +1,37 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_QUOTA_TYPES_H
|
||||
#define _BCACHEFS_QUOTA_TYPES_H
|
||||
|
||||
#include <linux/generic-radix-tree.h>
|
||||
|
||||
struct bch_qid {
|
||||
u32 q[QTYP_NR];
|
||||
};
|
||||
|
||||
struct memquota_counter {
|
||||
u64 v;
|
||||
u64 hardlimit;
|
||||
u64 softlimit;
|
||||
s64 timer;
|
||||
int warns;
|
||||
int warning_issued;
|
||||
};
|
||||
|
||||
struct bch_memquota {
|
||||
struct memquota_counter c[Q_COUNTERS];
|
||||
};
|
||||
|
||||
typedef GENRADIX(struct bch_memquota) bch_memquota_table;
|
||||
|
||||
struct quota_limit {
|
||||
u32 timelimit;
|
||||
u32 warnlimit;
|
||||
};
|
||||
|
||||
struct bch_memquota_type {
|
||||
struct quota_limit limits[Q_COUNTERS];
|
||||
bch_memquota_table table;
|
||||
struct mutex lock;
|
||||
};
|
||||
|
||||
#endif /* _BCACHEFS_QUOTA_TYPES_H */
|
342
fs/bcachefs/rebalance.c
Normal file
@ -0,0 +1,342 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "alloc.h"
|
||||
#include "btree_iter.h"
|
||||
#include "buckets.h"
|
||||
#include "clock.h"
|
||||
#include "disk_groups.h"
|
||||
#include "extents.h"
|
||||
#include "io.h"
|
||||
#include "move.h"
|
||||
#include "rebalance.h"
|
||||
#include "super-io.h"
|
||||
#include "trace.h"
|
||||
|
||||
#include <linux/freezer.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/sched/cputime.h>
|
||||
|
||||
static inline bool rebalance_ptr_pred(struct bch_fs *c,
|
||||
const struct bch_extent_ptr *ptr,
|
||||
struct bch_extent_crc_unpacked crc,
|
||||
struct bch_io_opts *io_opts)
|
||||
{
|
||||
if (io_opts->background_target &&
|
||||
!bch2_dev_in_target(c, ptr->dev, io_opts->background_target) &&
|
||||
!ptr->cached)
|
||||
return true;
|
||||
|
||||
if (io_opts->background_compression &&
|
||||
crc.compression_type !=
|
||||
bch2_compression_opt_to_type[io_opts->background_compression])
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
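/*
 * Put differently: a replica wants to be moved by rebalance either
 * because the extent has a background_target and this non-cached pointer
 * isn't on a device in that target, or because background_compression is
 * set and the replica isn't compressed with that algorithm.
 */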
|
||||
|
||||
void bch2_rebalance_add_key(struct bch_fs *c,
|
||||
struct bkey_s_c k,
|
||||
struct bch_io_opts *io_opts)
|
||||
{
|
||||
const struct bch_extent_ptr *ptr;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
struct bkey_s_c_extent e;
|
||||
|
||||
if (!bkey_extent_is_data(k.k))
|
||||
return;
|
||||
|
||||
if (!io_opts->background_target &&
|
||||
!io_opts->background_compression)
|
||||
return;
|
||||
|
||||
e = bkey_s_c_to_extent(k);
|
||||
|
||||
extent_for_each_ptr_crc(e, ptr, crc)
|
||||
if (rebalance_ptr_pred(c, ptr, crc, io_opts)) {
|
||||
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
|
||||
|
||||
if (atomic64_add_return(crc.compressed_size,
|
||||
&ca->rebalance_work) ==
|
||||
crc.compressed_size)
|
||||
rebalance_wakeup(c);
|
||||
}
|
||||
}
|
||||
|
||||
void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
|
||||
{
|
||||
if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
|
||||
sectors)
|
||||
rebalance_wakeup(c);
|
||||
}
|
||||
|
||||
static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
|
||||
enum bkey_type type,
|
||||
struct bkey_s_c_extent e,
|
||||
struct bch_io_opts *io_opts,
|
||||
struct data_opts *data_opts)
|
||||
{
|
||||
const struct bch_extent_ptr *ptr;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
|
||||
/* Make sure we have room to add a new pointer: */
|
||||
if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
|
||||
BKEY_EXTENT_VAL_U64s_MAX)
|
||||
return DATA_SKIP;
|
||||
|
||||
extent_for_each_ptr_crc(e, ptr, crc)
|
||||
if (rebalance_ptr_pred(c, ptr, crc, io_opts))
|
||||
goto found;
|
||||
|
||||
return DATA_SKIP;
|
||||
found:
|
||||
data_opts->target = io_opts->background_target;
|
||||
data_opts->btree_insert_flags = 0;
|
||||
return DATA_ADD_REPLICAS;
|
||||
}
|
||||
|
||||
struct rebalance_work {
|
||||
int dev_most_full_idx;
|
||||
unsigned dev_most_full_percent;
|
||||
u64 dev_most_full_work;
|
||||
u64 dev_most_full_capacity;
|
||||
u64 total_work;
|
||||
};
|
||||
|
||||
static void rebalance_work_accumulate(struct rebalance_work *w,
|
||||
u64 dev_work, u64 unknown_dev, u64 capacity, int idx)
|
||||
{
|
||||
unsigned percent_full;
|
||||
u64 work = dev_work + unknown_dev;
|
||||
|
||||
if (work < dev_work || work < unknown_dev)
|
||||
work = U64_MAX;
|
||||
work = min(work, capacity);
|
||||
|
||||
percent_full = div_u64(work * 100, capacity);
|
||||
|
||||
if (percent_full >= w->dev_most_full_percent) {
|
||||
w->dev_most_full_idx = idx;
|
||||
w->dev_most_full_percent = percent_full;
|
||||
w->dev_most_full_work = work;
|
||||
w->dev_most_full_capacity = capacity;
|
||||
}
|
||||
|
||||
if (w->total_work + dev_work >= w->total_work &&
|
||||
w->total_work + dev_work >= dev_work)
|
||||
w->total_work += dev_work;
|
||||
}
|
||||
|
||||
static struct rebalance_work rebalance_work(struct bch_fs *c)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
struct rebalance_work ret = { .dev_most_full_idx = -1 };
|
||||
u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev);
|
||||
unsigned i;
|
||||
|
||||
for_each_online_member(ca, c, i)
|
||||
rebalance_work_accumulate(&ret,
|
||||
atomic64_read(&ca->rebalance_work),
|
||||
unknown_dev,
|
||||
bucket_to_sector(ca, ca->mi.nbuckets -
|
||||
ca->mi.first_bucket),
|
||||
i);
|
||||
|
||||
rebalance_work_accumulate(&ret,
|
||||
unknown_dev, 0, c->capacity, -1);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void rebalance_work_reset(struct bch_fs *c)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
unsigned i;
|
||||
|
||||
for_each_online_member(ca, c, i)
|
||||
atomic64_set(&ca->rebalance_work, 0);
|
||||
|
||||
atomic64_set(&c->rebalance.work_unknown_dev, 0);
|
||||
}
|
||||
|
||||
static unsigned long curr_cputime(void)
|
||||
{
|
||||
u64 utime, stime;
|
||||
|
||||
task_cputime_adjusted(current, &utime, &stime);
|
||||
return nsecs_to_jiffies(utime + stime);
|
||||
}
|
||||
|
||||
static int bch2_rebalance_thread(void *arg)
{
	struct bch_fs *c = arg;
	struct bch_fs_rebalance *r = &c->rebalance;
	struct io_clock *clock = &c->io_clock[WRITE];
	struct rebalance_work w, p;
	unsigned long start, prev_start;
	unsigned long prev_run_time, prev_run_cputime;
	unsigned long cputime, prev_cputime;
	unsigned long io_start;
	long throttle;

	set_freezable();

	io_start = atomic_long_read(&clock->now);
	p = rebalance_work(c);
	prev_start = jiffies;
	prev_cputime = curr_cputime();

	while (!kthread_wait_freezable(r->enabled)) {
		start = jiffies;
		cputime = curr_cputime();

		prev_run_time = start - prev_start;
		prev_run_cputime = cputime - prev_cputime;

		w = rebalance_work(c);
		BUG_ON(!w.dev_most_full_capacity);

		if (!w.total_work) {
			r->state = REBALANCE_WAITING;
			kthread_wait_freezable(rebalance_work(c).total_work);
			continue;
		}

		/*
		 * If there isn't much work to do, throttle cpu usage:
		 */
		throttle = prev_run_cputime * 100 /
			max(1U, w.dev_most_full_percent) -
			prev_run_time;

		if (w.dev_most_full_percent < 20 && throttle > 0) {
			r->state = REBALANCE_THROTTLED;
			r->throttled_until_iotime = io_start +
				div_u64(w.dev_most_full_capacity *
					(20 - w.dev_most_full_percent),
					50);
			r->throttled_until_cputime = start + throttle;

			bch2_kthread_io_clock_wait(clock,
					r->throttled_until_iotime,
					throttle);
			continue;
		}

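		/*
		 * Note: pd.rate.rate appears to be in 512-byte sectors per
		 * second here, so the 1 << 11 (2048) floor below corresponds
		 * to the "1 mb/sec" in the comment.
		 */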
		/* minimum 1 mb/sec: */
		r->pd.rate.rate =
			max_t(u64, 1 << 11,
			      r->pd.rate.rate *
			      max(p.dev_most_full_percent, 1U) /
			      max(w.dev_most_full_percent, 1U));

		io_start = atomic_long_read(&clock->now);
		p = w;
		prev_start = start;
		prev_cputime = cputime;

		r->state = REBALANCE_RUNNING;
		memset(&r->move_stats, 0, sizeof(r->move_stats));
		rebalance_work_reset(c);

		bch2_move_data(c,
			       /* ratelimiting disabled for now */
			       NULL, /* &r->pd.rate, */
			       writepoint_ptr(&c->rebalance_write_point),
			       POS_MIN, POS_MAX,
			       rebalance_pred, NULL,
			       &r->move_stats);
	}

	return 0;
}

ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf)
{
	char *out = buf, *end = out + PAGE_SIZE;
	struct bch_fs_rebalance *r = &c->rebalance;
	struct rebalance_work w = rebalance_work(c);
	char h1[21], h2[21];

	bch2_hprint(h1, w.dev_most_full_work << 9);
	bch2_hprint(h2, w.dev_most_full_capacity << 9);
	out += scnprintf(out, end - out,
			 "fullest_dev (%i):\t%s/%s\n",
			 w.dev_most_full_idx, h1, h2);

	bch2_hprint(h1, w.total_work << 9);
	bch2_hprint(h2, c->capacity << 9);
	out += scnprintf(out, end - out,
			 "total work:\t\t%s/%s\n",
			 h1, h2);

	out += scnprintf(out, end - out,
			 "rate:\t\t\t%u\n",
			 r->pd.rate.rate);

	switch (r->state) {
	case REBALANCE_WAITING:
		out += scnprintf(out, end - out, "waiting\n");
		break;
	case REBALANCE_THROTTLED:
		bch2_hprint(h1,
			    (r->throttled_until_iotime -
			     atomic_long_read(&c->io_clock[WRITE].now)) << 9);
		out += scnprintf(out, end - out,
				 "throttled for %lu sec or %s io\n",
				 (r->throttled_until_cputime - jiffies) / HZ,
				 h1);
		break;
	case REBALANCE_RUNNING:
		out += scnprintf(out, end - out, "running\n");
		out += scnprintf(out, end - out, "pos %llu:%llu\n",
				 r->move_stats.iter.pos.inode,
				 r->move_stats.iter.pos.offset);
		break;
	}

	return out - buf;
}

void bch2_rebalance_stop(struct bch_fs *c)
{
	struct task_struct *p;

	c->rebalance.pd.rate.rate = UINT_MAX;
	bch2_ratelimit_reset(&c->rebalance.pd.rate);

	p = rcu_dereference_protected(c->rebalance.thread, 1);
	c->rebalance.thread = NULL;

	if (p) {
		/* for synchronizing with rebalance_wakeup() */
		synchronize_rcu();

		kthread_stop(p);
		put_task_struct(p);
	}
}

int bch2_rebalance_start(struct bch_fs *c)
{
	struct task_struct *p;

	if (c->opts.nochanges)
		return 0;

	p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance");
	if (IS_ERR(p))
		return PTR_ERR(p);

	get_task_struct(p);
	rcu_assign_pointer(c->rebalance.thread, p);
	wake_up_process(p);
	return 0;
}

void bch2_fs_rebalance_init(struct bch_fs *c)
{
	bch2_pd_controller_init(&c->rebalance.pd);

	atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX);
}

28
fs/bcachefs/rebalance.h
Normal file
28
fs/bcachefs/rebalance.h
Normal file
@ -0,0 +1,28 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_REBALANCE_H
#define _BCACHEFS_REBALANCE_H

#include "rebalance_types.h"

static inline void rebalance_wakeup(struct bch_fs *c)
{
	struct task_struct *p;

	rcu_read_lock();
	p = rcu_dereference(c->rebalance.thread);
	if (p)
		wake_up_process(p);
	rcu_read_unlock();
}

void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c,
			    struct bch_io_opts *);
void bch2_rebalance_add_work(struct bch_fs *, u64);

ssize_t bch2_rebalance_work_show(struct bch_fs *, char *);

void bch2_rebalance_stop(struct bch_fs *);
int bch2_rebalance_start(struct bch_fs *);
void bch2_fs_rebalance_init(struct bch_fs *);

#endif /* _BCACHEFS_REBALANCE_H */

27
fs/bcachefs/rebalance_types.h
Normal file
27
fs/bcachefs/rebalance_types.h
Normal file
@ -0,0 +1,27 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_REBALANCE_TYPES_H
#define _BCACHEFS_REBALANCE_TYPES_H

#include "move_types.h"

enum rebalance_state {
	REBALANCE_WAITING,
	REBALANCE_THROTTLED,
	REBALANCE_RUNNING,
};

struct bch_fs_rebalance {
	struct task_struct __rcu *thread;
	struct bch_pd_controller pd;

	atomic64_t work_unknown_dev;

	enum rebalance_state state;
	unsigned long throttled_until_iotime;
	unsigned long throttled_until_cputime;
	struct bch_move_stats move_stats;

	unsigned enabled:1;
};

#endif /* _BCACHEFS_REBALANCE_TYPES_H */

377
fs/bcachefs/recovery.c
Normal file
377
fs/bcachefs/recovery.c
Normal file
@ -0,0 +1,377 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "alloc.h"
|
||||
#include "btree_gc.h"
|
||||
#include "btree_update.h"
|
||||
#include "btree_update_interior.h"
|
||||
#include "btree_io.h"
|
||||
#include "dirent.h"
|
||||
#include "error.h"
|
||||
#include "fsck.h"
|
||||
#include "journal_io.h"
|
||||
#include "quota.h"
|
||||
#include "recovery.h"
|
||||
#include "super-io.h"
|
||||
|
||||
#include <linux/stat.h>
|
||||
|
||||
#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
|
||||
|
||||
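/*
 * Finds the root of a given btree either in the superblock's clean section
 * (after a clean shutdown) or in the last journal entry, filling in *level;
 * returns NULL if there is no root entry and ERR_PTR(-EINVAL) if the entry
 * is present but empty.
 */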
struct bkey_i *btree_root_find(struct bch_fs *c,
|
||||
struct bch_sb_field_clean *clean,
|
||||
struct jset *j,
|
||||
enum btree_id id, unsigned *level)
|
||||
{
|
||||
struct bkey_i *k;
|
||||
struct jset_entry *entry, *start, *end;
|
||||
|
||||
if (clean) {
|
||||
start = clean->start;
|
||||
end = vstruct_end(&clean->field);
|
||||
} else {
|
||||
start = j->start;
|
||||
end = vstruct_last(j);
|
||||
}
|
||||
|
||||
for (entry = start; entry < end; entry = vstruct_next(entry))
|
||||
if (entry->type == BCH_JSET_ENTRY_btree_root &&
|
||||
entry->btree_id == id)
|
||||
goto found;
|
||||
|
||||
return NULL;
|
||||
found:
|
||||
if (!entry->u64s)
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
k = entry->start;
|
||||
*level = entry->level;
|
||||
return k;
|
||||
}
|
||||
|
||||
static int verify_superblock_clean(struct bch_fs *c,
|
||||
struct bch_sb_field_clean *clean,
|
||||
struct jset *j)
|
||||
{
|
||||
unsigned i;
|
||||
int ret = 0;
|
||||
|
||||
if (!clean || !j)
|
||||
return 0;
|
||||
|
||||
if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
|
||||
"superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
|
||||
le64_to_cpu(clean->journal_seq),
|
||||
le64_to_cpu(j->seq)))
|
||||
bch2_fs_mark_clean(c, false);
|
||||
|
||||
mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
|
||||
"superblock read clock doesn't match journal after clean shutdown");
|
||||
mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
"superblock write clock doesn't match journal after clean shutdown");
|
||||
|
||||
for (i = 0; i < BTREE_ID_NR; i++) {
|
||||
struct bkey_i *k1, *k2;
|
||||
unsigned l1 = 0, l2 = 0;
|
||||
|
||||
k1 = btree_root_find(c, clean, NULL, i, &l1);
|
||||
k2 = btree_root_find(c, NULL, j, i, &l2);
|
||||
|
||||
if (!k1 && !k2)
|
||||
continue;
|
||||
|
||||
mustfix_fsck_err_on(!k1 || !k2 ||
|
||||
IS_ERR(k1) ||
|
||||
IS_ERR(k2) ||
|
||||
k1->k.u64s != k2->k.u64s ||
|
||||
memcmp(k1, k2, bkey_bytes(k1)) ||
|
||||
l1 != l2, c,
|
||||
"superblock btree root doesn't match journal after clean shutdown");
|
||||
}
|
||||
fsck_err:
|
||||
return ret;
|
||||
}
|
||||
|
||||
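/*
 * The journal is treated as empty if its newest entry has last_seq == seq
 * and no entry contains anything other than btree roots or empty btree_keys
 * entries - i.e. there is nothing left to replay.
 */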
static bool journal_empty(struct list_head *journal)
|
||||
{
|
||||
struct journal_replay *i;
|
||||
struct jset_entry *entry;
|
||||
|
||||
if (list_empty(journal))
|
||||
return true;
|
||||
|
||||
i = list_last_entry(journal, struct journal_replay, list);
|
||||
|
||||
if (i->j.last_seq != i->j.seq)
|
||||
return false;
|
||||
|
||||
list_for_each_entry(i, journal, list) {
|
||||
vstruct_for_each(&i->j, entry) {
|
||||
if (entry->type == BCH_JSET_ENTRY_btree_root)
|
||||
continue;
|
||||
|
||||
if (entry->type == BCH_JSET_ENTRY_btree_keys &&
|
||||
!entry->u64s)
|
||||
continue;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int bch2_fs_recovery(struct bch_fs *c)
|
||||
{
|
||||
const char *err = "cannot allocate memory";
|
||||
struct bch_sb_field_clean *clean = NULL, *sb_clean = NULL;
|
||||
LIST_HEAD(journal);
|
||||
struct jset *j = NULL;
|
||||
unsigned i;
|
||||
int ret;
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
if (!bch2_sb_get_replicas(c->disk_sb.sb)) {
|
||||
bch_info(c, "building replicas info");
|
||||
set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
|
||||
}
|
||||
|
||||
if (c->sb.clean)
|
||||
sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
|
||||
if (sb_clean) {
|
||||
clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
|
||||
GFP_KERNEL);
|
||||
if (!clean) {
|
||||
ret = -ENOMEM;
|
||||
mutex_unlock(&c->sb_lock);
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
if (clean)
|
||||
bch_info(c, "recovering from clean shutdown, journal seq %llu",
|
||||
le64_to_cpu(clean->journal_seq));
|
||||
|
||||
if (!clean || !c->opts.nofsck) {
|
||||
ret = bch2_journal_read(c, &journal);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
j = &list_entry(journal.prev, struct journal_replay, list)->j;
|
||||
} else {
|
||||
ret = bch2_journal_set_seq(c,
|
||||
le64_to_cpu(clean->journal_seq),
|
||||
le64_to_cpu(clean->journal_seq));
|
||||
BUG_ON(ret);
|
||||
}
|
||||
|
||||
ret = verify_superblock_clean(c, clean, j);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
fsck_err_on(clean && !journal_empty(&journal), c,
|
||||
"filesystem marked clean but journal not empty");
|
||||
|
||||
if (clean) {
|
||||
c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
|
||||
c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);
|
||||
} else {
|
||||
c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock);
|
||||
c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock);
|
||||
}
|
||||
|
||||
for (i = 0; i < BTREE_ID_NR; i++) {
|
||||
unsigned level;
|
||||
struct bkey_i *k;
|
||||
|
||||
k = btree_root_find(c, clean, j, i, &level);
|
||||
if (!k)
|
||||
continue;
|
||||
|
||||
err = "invalid btree root pointer";
|
||||
if (IS_ERR(k))
|
||||
goto err;
|
||||
|
||||
err = "error reading btree root";
|
||||
if (bch2_btree_root_read(c, i, k, level)) {
|
||||
if (i != BTREE_ID_ALLOC)
|
||||
goto err;
|
||||
|
||||
mustfix_fsck_err(c, "error reading btree root");
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < BTREE_ID_NR; i++)
|
||||
if (!c->btree_roots[i].b)
|
||||
bch2_btree_root_alloc(c, i);
|
||||
|
||||
err = "error reading allocation information";
|
||||
ret = bch2_alloc_read(c, &journal);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
|
||||
|
||||
bch_verbose(c, "starting mark and sweep:");
|
||||
err = "error in recovery";
|
||||
ret = bch2_initial_gc(c, &journal);
|
||||
if (ret)
|
||||
goto err;
|
||||
bch_verbose(c, "mark and sweep done");
|
||||
|
||||
if (c->opts.noreplay)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* Mark dirty before journal replay, fsck:
|
||||
* XXX: after a clean shutdown, this could be done lazily only when fsck
|
||||
* finds an error
|
||||
*/
|
||||
bch2_fs_mark_clean(c, false);
|
||||
|
||||
/*
|
||||
* bch2_fs_journal_start() can't happen sooner, or btree_gc_finish()
|
||||
* will give spurious errors about oldest_gen > bucket_gen -
|
||||
* this is a hack but oh well.
|
||||
*/
|
||||
bch2_fs_journal_start(&c->journal);
|
||||
|
||||
err = "error starting allocator";
|
||||
ret = bch2_fs_allocator_start(c);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
bch_verbose(c, "starting journal replay:");
|
||||
err = "journal replay failed";
|
||||
ret = bch2_journal_replay(c, &journal);
|
||||
if (ret)
|
||||
goto err;
|
||||
bch_verbose(c, "journal replay done");
|
||||
|
||||
if (c->opts.norecovery)
|
||||
goto out;
|
||||
|
||||
err = "error in fsck";
|
||||
ret = bch2_fsck(c);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
if (enabled_qtypes(c)) {
|
||||
bch_verbose(c, "reading quotas:");
|
||||
ret = bch2_fs_quota_read(c);
|
||||
if (ret)
|
||||
goto err;
|
||||
bch_verbose(c, "quotas done");
|
||||
}
|
||||
|
||||
out:
|
||||
bch2_journal_entries_free(&journal);
|
||||
kfree(clean);
|
||||
return ret;
|
||||
err:
|
||||
fsck_err:
|
||||
BUG_ON(!ret);
|
||||
goto out;
|
||||
}
|
||||
|
||||
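/*
 * Creates a brand new filesystem: initial gc, journal buckets and btree
 * roots for every btree, then the root and lost+found inodes and the
 * lost+found dirent, before writing the first journal entry and marking the
 * superblock initialized and no longer clean.
 */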
int bch2_fs_initialize(struct bch_fs *c)
|
||||
{
|
||||
struct bch_inode_unpacked root_inode, lostfound_inode;
|
||||
struct bkey_inode_buf packed_inode;
|
||||
struct bch_hash_info root_hash_info;
|
||||
struct qstr lostfound = QSTR("lost+found");
|
||||
const char *err = "cannot allocate memory";
|
||||
struct bch_dev *ca;
|
||||
LIST_HEAD(journal);
|
||||
unsigned i;
|
||||
int ret;
|
||||
|
||||
bch_notice(c, "initializing new filesystem");
|
||||
|
||||
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
|
||||
|
||||
ret = bch2_initial_gc(c, &journal);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
err = "unable to allocate journal buckets";
|
||||
for_each_online_member(ca, c, i)
|
||||
if (bch2_dev_journal_alloc(ca)) {
|
||||
percpu_ref_put(&ca->io_ref);
|
||||
goto err;
|
||||
}
|
||||
|
||||
for (i = 0; i < BTREE_ID_NR; i++)
|
||||
bch2_btree_root_alloc(c, i);
|
||||
|
||||
/*
|
||||
* journal_res_get() will crash if called before this has
|
||||
* set up the journal.pin FIFO and journal.cur pointer:
|
||||
*/
|
||||
bch2_fs_journal_start(&c->journal);
|
||||
bch2_journal_set_replay_done(&c->journal);
|
||||
|
||||
err = "error starting allocator";
|
||||
ret = bch2_fs_allocator_start(c);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
bch2_inode_init(c, &root_inode, 0, 0,
|
||||
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
|
||||
root_inode.bi_inum = BCACHEFS_ROOT_INO;
|
||||
root_inode.bi_nlink++; /* lost+found */
|
||||
bch2_inode_pack(&packed_inode, &root_inode);
|
||||
|
||||
err = "error creating root directory";
|
||||
ret = bch2_btree_insert(c, BTREE_ID_INODES,
|
||||
&packed_inode.inode.k_i,
|
||||
NULL, NULL, NULL, 0);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
bch2_inode_init(c, &lostfound_inode, 0, 0,
|
||||
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0,
|
||||
&root_inode);
|
||||
lostfound_inode.bi_inum = BCACHEFS_ROOT_INO + 1;
|
||||
bch2_inode_pack(&packed_inode, &lostfound_inode);
|
||||
|
||||
err = "error creating lost+found";
|
||||
ret = bch2_btree_insert(c, BTREE_ID_INODES,
|
||||
&packed_inode.inode.k_i,
|
||||
NULL, NULL, NULL, 0);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
root_hash_info = bch2_hash_info_init(c, &root_inode);
|
||||
|
||||
ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR,
|
||||
&lostfound, lostfound_inode.bi_inum, NULL,
|
||||
BTREE_INSERT_NOFAIL);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
atomic_long_set(&c->nr_inodes, 2);
|
||||
|
||||
if (enabled_qtypes(c)) {
|
||||
ret = bch2_fs_quota_read(c);
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
|
||||
err = "error writing first journal entry";
|
||||
ret = bch2_journal_meta(&c->journal);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
|
||||
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
|
||||
|
||||
bch2_write_super(c);
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
return 0;
|
||||
err:
|
||||
BUG_ON(!ret);
|
||||
return ret;
|
||||
}
|
8
fs/bcachefs/recovery.h
Normal file
8
fs/bcachefs/recovery.h
Normal file
@ -0,0 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_RECOVERY_H
#define _BCACHEFS_RECOVERY_H

int bch2_fs_recovery(struct bch_fs *);
int bch2_fs_initialize(struct bch_fs *);

#endif /* _BCACHEFS_RECOVERY_H */

698
fs/bcachefs/replicas.c
Normal file
698
fs/bcachefs/replicas.c
Normal file
@ -0,0 +1,698 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "replicas.h"
|
||||
#include "super-io.h"
|
||||
|
||||
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
|
||||
struct bch_replicas_cpu *);
|
||||
|
||||
/* Replicas tracking - in memory: */
|
||||
|
||||
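/*
 * In-memory representation: each entry is a data type plus a bitmap of
 * member devices, packed into entry_size bytes; the table is kept in
 * eytzinger order so lookups can use eytzinger0_find().
 */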
#define for_each_cpu_replicas_entry(_r, _i) \
|
||||
for (_i = (_r)->entries; \
|
||||
(void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
|
||||
_i = (void *) (_i) + (_r)->entry_size)
|
||||
|
||||
static inline struct bch_replicas_cpu_entry *
|
||||
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
|
||||
{
|
||||
return (void *) r->entries + r->entry_size * i;
|
||||
}
|
||||
|
||||
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
|
||||
{
|
||||
eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
|
||||
}
|
||||
|
||||
static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
|
||||
unsigned dev)
|
||||
{
|
||||
return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0;
|
||||
}
|
||||
|
||||
static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
|
||||
unsigned dev)
|
||||
{
|
||||
e->devs[dev >> 3] |= 1 << (dev & 7);
|
||||
}
|
||||
|
||||
static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
|
||||
{
|
||||
return (r->entry_size -
|
||||
offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
|
||||
}
|
||||
|
||||
int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *r,
|
||||
char *buf, size_t size)
|
||||
{
|
||||
char *out = buf, *end = out + size;
|
||||
struct bch_replicas_cpu_entry *e;
|
||||
bool first = true;
|
||||
unsigned i;
|
||||
|
||||
for_each_cpu_replicas_entry(r, e) {
|
||||
bool first_e = true;
|
||||
|
||||
if (!first)
|
||||
out += scnprintf(out, end - out, " ");
|
||||
first = false;
|
||||
|
||||
out += scnprintf(out, end - out, "%u: [", e->data_type);
|
||||
|
||||
for (i = 0; i < replicas_dev_slots(r); i++)
|
||||
if (replicas_test_dev(e, i)) {
|
||||
if (!first_e)
|
||||
out += scnprintf(out, end - out, " ");
|
||||
first_e = false;
|
||||
out += scnprintf(out, end - out, "%u", i);
|
||||
}
|
||||
out += scnprintf(out, end - out, "]");
|
||||
}
|
||||
|
||||
return out - buf;
|
||||
}
|
||||
|
||||
static inline unsigned bkey_to_replicas(struct bkey_s_c_extent e,
|
||||
enum bch_data_type data_type,
|
||||
struct bch_replicas_cpu_entry *r,
|
||||
unsigned *max_dev)
|
||||
{
|
||||
const struct bch_extent_ptr *ptr;
|
||||
unsigned nr = 0;
|
||||
|
||||
BUG_ON(!data_type ||
|
||||
data_type == BCH_DATA_SB ||
|
||||
data_type >= BCH_DATA_NR);
|
||||
|
||||
memset(r, 0, sizeof(*r));
|
||||
r->data_type = data_type;
|
||||
|
||||
*max_dev = 0;
|
||||
|
||||
extent_for_each_ptr(e, ptr)
|
||||
if (!ptr->cached) {
|
||||
*max_dev = max_t(unsigned, *max_dev, ptr->dev);
|
||||
replicas_set_dev(r, ptr->dev);
|
||||
nr++;
|
||||
}
|
||||
return nr;
|
||||
}
|
||||
|
||||
static inline void devlist_to_replicas(struct bch_devs_list devs,
|
||||
enum bch_data_type data_type,
|
||||
struct bch_replicas_cpu_entry *r,
|
||||
unsigned *max_dev)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
BUG_ON(!data_type ||
|
||||
data_type == BCH_DATA_SB ||
|
||||
data_type >= BCH_DATA_NR);
|
||||
|
||||
memset(r, 0, sizeof(*r));
|
||||
r->data_type = data_type;
|
||||
|
||||
*max_dev = 0;
|
||||
|
||||
for (i = 0; i < devs.nr; i++) {
|
||||
*max_dev = max_t(unsigned, *max_dev, devs.devs[i]);
|
||||
replicas_set_dev(r, devs.devs[i]);
|
||||
}
|
||||
}
|
||||
|
||||
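/*
 * Returns a newly allocated copy of the table with new_entry appended,
 * widening entry_size if the new entry references a higher device index
 * than any existing one, then re-sorts; callers free the old table (via
 * kfree_rcu() once the new one is published).
 */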
static struct bch_replicas_cpu *
|
||||
cpu_replicas_add_entry(struct bch_replicas_cpu *old,
|
||||
struct bch_replicas_cpu_entry new_entry,
|
||||
unsigned max_dev)
|
||||
{
|
||||
struct bch_replicas_cpu *new;
|
||||
unsigned i, nr, entry_size;
|
||||
|
||||
entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
|
||||
DIV_ROUND_UP(max_dev + 1, 8);
|
||||
entry_size = max(entry_size, old->entry_size);
|
||||
nr = old->nr + 1;
|
||||
|
||||
new = kzalloc(sizeof(struct bch_replicas_cpu) +
|
||||
nr * entry_size, GFP_NOIO);
|
||||
if (!new)
|
||||
return NULL;
|
||||
|
||||
new->nr = nr;
|
||||
new->entry_size = entry_size;
|
||||
|
||||
for (i = 0; i < old->nr; i++)
|
||||
memcpy(cpu_replicas_entry(new, i),
|
||||
cpu_replicas_entry(old, i),
|
||||
min(new->entry_size, old->entry_size));
|
||||
|
||||
memcpy(cpu_replicas_entry(new, old->nr),
|
||||
&new_entry,
|
||||
new->entry_size);
|
||||
|
||||
bch2_cpu_replicas_sort(new);
|
||||
return new;
|
||||
}
|
||||
|
||||
static bool replicas_has_entry(struct bch_replicas_cpu *r,
|
||||
struct bch_replicas_cpu_entry search,
|
||||
unsigned max_dev)
|
||||
{
|
||||
return max_dev < replicas_dev_slots(r) &&
|
||||
eytzinger0_find(r->entries, r->nr,
|
||||
r->entry_size,
|
||||
memcmp, &search) < r->nr;
|
||||
}
|
||||
|
||||
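/*
 * Slow path for marking a replicas entry that isn't in the table yet: under
 * sb_lock, build new copies of the current table and (if active) the GC
 * table, persist the new superblock replicas section first, and only then
 * publish the new in-memory tables via rcu_assign_pointer().
 */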
noinline
|
||||
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
|
||||
struct bch_replicas_cpu_entry new_entry,
|
||||
unsigned max_dev)
|
||||
{
|
||||
struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r = NULL;
|
||||
int ret = -ENOMEM;
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
|
||||
old_gc = rcu_dereference_protected(c->replicas_gc,
|
||||
lockdep_is_held(&c->sb_lock));
|
||||
if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) {
|
||||
new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev);
|
||||
if (!new_gc)
|
||||
goto err;
|
||||
}
|
||||
|
||||
old_r = rcu_dereference_protected(c->replicas,
|
||||
lockdep_is_held(&c->sb_lock));
|
||||
if (!replicas_has_entry(old_r, new_entry, max_dev)) {
|
||||
new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev);
|
||||
if (!new_r)
|
||||
goto err;
|
||||
|
||||
ret = bch2_cpu_replicas_to_sb_replicas(c, new_r);
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
|
||||
/* allocations done, now commit: */
|
||||
|
||||
if (new_r)
|
||||
bch2_write_super(c);
|
||||
|
||||
/* don't update in memory replicas until changes are persistent */
|
||||
|
||||
if (new_gc) {
|
||||
rcu_assign_pointer(c->replicas_gc, new_gc);
|
||||
kfree_rcu(old_gc, rcu);
|
||||
}
|
||||
|
||||
if (new_r) {
|
||||
rcu_assign_pointer(c->replicas, new_r);
|
||||
kfree_rcu(old_r, rcu);
|
||||
}
|
||||
|
||||
mutex_unlock(&c->sb_lock);
|
||||
return 0;
|
||||
err:
|
||||
mutex_unlock(&c->sb_lock);
|
||||
kfree(new_gc);
|
||||
kfree(new_r);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int bch2_mark_replicas(struct bch_fs *c,
|
||||
enum bch_data_type data_type,
|
||||
struct bch_devs_list devs)
|
||||
{
|
||||
struct bch_replicas_cpu_entry search;
|
||||
struct bch_replicas_cpu *r, *gc_r;
|
||||
unsigned max_dev;
|
||||
bool marked;
|
||||
|
||||
if (!devs.nr)
|
||||
return 0;
|
||||
|
||||
BUG_ON(devs.nr >= BCH_REPLICAS_MAX);
|
||||
|
||||
devlist_to_replicas(devs, data_type, &search, &max_dev);
|
||||
|
||||
rcu_read_lock();
|
||||
r = rcu_dereference(c->replicas);
|
||||
gc_r = rcu_dereference(c->replicas_gc);
|
||||
marked = replicas_has_entry(r, search, max_dev) &&
|
||||
(!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev));
|
||||
rcu_read_unlock();
|
||||
|
||||
return likely(marked) ? 0
|
||||
: bch2_mark_replicas_slowpath(c, search, max_dev);
|
||||
}
|
||||
|
||||
int bch2_mark_bkey_replicas(struct bch_fs *c,
|
||||
enum bch_data_type data_type,
|
||||
struct bkey_s_c k)
|
||||
{
|
||||
struct bch_devs_list cached = bch2_bkey_cached_devs(k);
|
||||
unsigned i;
|
||||
int ret;
|
||||
|
||||
for (i = 0; i < cached.nr; i++)
|
||||
if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
|
||||
bch2_dev_list_single(cached.devs[i]))))
|
||||
return ret;
|
||||
|
||||
return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k));
|
||||
}
|
||||
|
||||
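/*
 * Replicas GC: gc_start() snapshots the current table minus the data types
 * being GC'd into replicas_gc; entries marked while GC runs are added to
 * both tables, and gc_end() persists the pruned GC table and installs it as
 * the authoritative one.
 */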
int bch2_replicas_gc_end(struct bch_fs *c, int ret)
|
||||
{
|
||||
struct bch_replicas_cpu *new_r, *old_r;
|
||||
|
||||
lockdep_assert_held(&c->replicas_gc_lock);
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
|
||||
new_r = rcu_dereference_protected(c->replicas_gc,
|
||||
lockdep_is_held(&c->sb_lock));
|
||||
rcu_assign_pointer(c->replicas_gc, NULL);
|
||||
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
|
||||
ret = -ENOSPC;
|
||||
goto err;
|
||||
}
|
||||
|
||||
bch2_write_super(c);
|
||||
|
||||
/* don't update in memory replicas until changes are persistent */
|
||||
|
||||
old_r = rcu_dereference_protected(c->replicas,
|
||||
lockdep_is_held(&c->sb_lock));
|
||||
|
||||
rcu_assign_pointer(c->replicas, new_r);
|
||||
kfree_rcu(old_r, rcu);
|
||||
out:
|
||||
mutex_unlock(&c->sb_lock);
|
||||
return ret;
|
||||
err:
|
||||
kfree_rcu(new_r, rcu);
|
||||
goto out;
|
||||
}
|
||||
|
||||
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
|
||||
{
|
||||
struct bch_replicas_cpu *dst, *src;
|
||||
struct bch_replicas_cpu_entry *e;
|
||||
|
||||
lockdep_assert_held(&c->replicas_gc_lock);
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
BUG_ON(c->replicas_gc);
|
||||
|
||||
src = rcu_dereference_protected(c->replicas,
|
||||
lockdep_is_held(&c->sb_lock));
|
||||
|
||||
dst = kzalloc(sizeof(struct bch_replicas_cpu) +
|
||||
src->nr * src->entry_size, GFP_NOIO);
|
||||
if (!dst) {
|
||||
mutex_unlock(&c->sb_lock);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
dst->nr = 0;
|
||||
dst->entry_size = src->entry_size;
|
||||
|
||||
for_each_cpu_replicas_entry(src, e)
|
||||
if (!((1 << e->data_type) & typemask))
|
||||
memcpy(cpu_replicas_entry(dst, dst->nr++),
|
||||
e, dst->entry_size);
|
||||
|
||||
bch2_cpu_replicas_sort(dst);
|
||||
|
||||
rcu_assign_pointer(c->replicas_gc, dst);
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Replicas tracking - superblock: */
|
||||
|
||||
static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
|
||||
unsigned *nr,
|
||||
unsigned *bytes,
|
||||
unsigned *max_dev)
|
||||
{
|
||||
struct bch_replicas_entry *i;
|
||||
unsigned j;
|
||||
|
||||
*nr = 0;
|
||||
*bytes = sizeof(*r);
|
||||
*max_dev = 0;
|
||||
|
||||
if (!r)
|
||||
return;
|
||||
|
||||
for_each_replicas_entry(r, i) {
|
||||
for (j = 0; j < i->nr; j++)
|
||||
*max_dev = max_t(unsigned, *max_dev, i->devs[j]);
|
||||
(*nr)++;
|
||||
}
|
||||
|
||||
*bytes = (void *) i - (void *) r;
|
||||
}
|
||||
|
||||
static struct bch_replicas_cpu *
|
||||
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
|
||||
{
|
||||
struct bch_replicas_cpu *cpu_r;
|
||||
unsigned i, nr, bytes, max_dev, entry_size;
|
||||
|
||||
bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
|
||||
|
||||
entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
|
||||
DIV_ROUND_UP(max_dev + 1, 8);
|
||||
|
||||
cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) +
|
||||
nr * entry_size, GFP_NOIO);
|
||||
if (!cpu_r)
|
||||
return NULL;
|
||||
|
||||
cpu_r->nr = nr;
|
||||
cpu_r->entry_size = entry_size;
|
||||
|
||||
if (nr) {
|
||||
struct bch_replicas_cpu_entry *dst =
|
||||
cpu_replicas_entry(cpu_r, 0);
|
||||
struct bch_replicas_entry *src = sb_r->entries;
|
||||
|
||||
while (dst < cpu_replicas_entry(cpu_r, nr)) {
|
||||
dst->data_type = src->data_type;
|
||||
for (i = 0; i < src->nr; i++)
|
||||
replicas_set_dev(dst, src->devs[i]);
|
||||
|
||||
src = replicas_entry_next(src);
|
||||
dst = (void *) dst + entry_size;
|
||||
}
|
||||
}
|
||||
|
||||
bch2_cpu_replicas_sort(cpu_r);
|
||||
return cpu_r;
|
||||
}
|
||||
|
||||
int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
|
||||
{
|
||||
struct bch_sb_field_replicas *sb_r;
|
||||
struct bch_replicas_cpu *cpu_r, *old_r;
|
||||
|
||||
sb_r = bch2_sb_get_replicas(c->disk_sb.sb);
|
||||
cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
|
||||
if (!cpu_r)
|
||||
return -ENOMEM;
|
||||
|
||||
old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock));
|
||||
rcu_assign_pointer(c->replicas, cpu_r);
|
||||
if (old_r)
|
||||
kfree_rcu(old_r, rcu);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
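/*
 * Inverse of __bch2_sb_replicas_to_cpu_replicas(): sizes the superblock
 * replicas section from the in-memory table (a variable-length entry per
 * replicas entry, one byte per member device) and rewrites it; returns
 * -ENOSPC if the superblock section can't be resized.
 */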
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
|
||||
struct bch_replicas_cpu *r)
|
||||
{
|
||||
struct bch_sb_field_replicas *sb_r;
|
||||
struct bch_replicas_entry *sb_e;
|
||||
struct bch_replicas_cpu_entry *e;
|
||||
size_t i, bytes;
|
||||
|
||||
bytes = sizeof(struct bch_sb_field_replicas);
|
||||
|
||||
for_each_cpu_replicas_entry(r, e) {
|
||||
bytes += sizeof(struct bch_replicas_entry);
|
||||
for (i = 0; i < r->entry_size - 1; i++)
|
||||
bytes += hweight8(e->devs[i]);
|
||||
}
|
||||
|
||||
sb_r = bch2_sb_resize_replicas(&c->disk_sb,
|
||||
DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
|
||||
if (!sb_r)
|
||||
return -ENOSPC;
|
||||
|
||||
memset(&sb_r->entries, 0,
|
||||
vstruct_end(&sb_r->field) -
|
||||
(void *) &sb_r->entries);
|
||||
|
||||
sb_e = sb_r->entries;
|
||||
for_each_cpu_replicas_entry(r, e) {
|
||||
sb_e->data_type = e->data_type;
|
||||
|
||||
for (i = 0; i < replicas_dev_slots(r); i++)
|
||||
if (replicas_test_dev(e, i))
|
||||
sb_e->devs[sb_e->nr++] = i;
|
||||
|
||||
sb_e = replicas_entry_next(sb_e);
|
||||
|
||||
BUG_ON((void *) sb_e > vstruct_end(&sb_r->field));
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f)
|
||||
{
|
||||
struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
|
||||
struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
|
||||
struct bch_replicas_cpu *cpu_r = NULL;
|
||||
struct bch_replicas_entry *e;
|
||||
const char *err;
|
||||
unsigned i;
|
||||
|
||||
for_each_replicas_entry(sb_r, e) {
|
||||
err = "invalid replicas entry: invalid data type";
|
||||
if (e->data_type >= BCH_DATA_NR)
|
||||
goto err;
|
||||
|
||||
err = "invalid replicas entry: no devices";
|
||||
if (!e->nr)
|
||||
goto err;
|
||||
|
||||
err = "invalid replicas entry: too many devices";
|
||||
if (e->nr >= BCH_REPLICAS_MAX)
|
||||
goto err;
|
||||
|
||||
err = "invalid replicas entry: invalid device";
|
||||
for (i = 0; i < e->nr; i++)
|
||||
if (!bch2_dev_exists(sb, mi, e->devs[i]))
|
||||
goto err;
|
||||
}
|
||||
|
||||
err = "cannot allocate memory";
|
||||
cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
|
||||
if (!cpu_r)
|
||||
goto err;
|
||||
|
||||
sort_cmp_size(cpu_r->entries,
|
||||
cpu_r->nr,
|
||||
cpu_r->entry_size,
|
||||
memcmp, NULL);
|
||||
|
||||
for (i = 0; i + 1 < cpu_r->nr; i++) {
|
||||
struct bch_replicas_cpu_entry *l =
|
||||
cpu_replicas_entry(cpu_r, i);
|
||||
struct bch_replicas_cpu_entry *r =
|
||||
cpu_replicas_entry(cpu_r, i + 1);
|
||||
|
||||
BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
|
||||
|
||||
err = "duplicate replicas entry";
|
||||
if (!memcmp(l, r, cpu_r->entry_size))
|
||||
goto err;
|
||||
}
|
||||
|
||||
err = NULL;
|
||||
err:
|
||||
kfree(cpu_r);
|
||||
return err;
|
||||
}
|
||||
|
||||
const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
|
||||
.validate = bch2_sb_validate_replicas,
|
||||
};
|
||||
|
||||
int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t size)
|
||||
{
|
||||
char *out = buf, *end = out + size;
|
||||
struct bch_replicas_entry *e;
|
||||
bool first = true;
|
||||
unsigned i;
|
||||
|
||||
if (!r) {
|
||||
out += scnprintf(out, end - out, "(no replicas section found)");
|
||||
return out - buf;
|
||||
}
|
||||
|
||||
for_each_replicas_entry(r, e) {
|
||||
if (!first)
|
||||
out += scnprintf(out, end - out, " ");
|
||||
first = false;
|
||||
|
||||
out += scnprintf(out, end - out, "%u: [", e->data_type);
|
||||
|
||||
for (i = 0; i < e->nr; i++)
|
||||
out += scnprintf(out, end - out,
|
||||
i ? " %u" : "%u", e->devs[i]);
|
||||
out += scnprintf(out, end - out, "]");
|
||||
}
|
||||
|
||||
return out - buf;
|
||||
}
|
||||
|
||||
/* Query replicas: */
|
||||
|
||||
bool bch2_replicas_marked(struct bch_fs *c,
|
||||
enum bch_data_type data_type,
|
||||
struct bch_devs_list devs)
|
||||
{
|
||||
struct bch_replicas_cpu_entry search;
|
||||
unsigned max_dev;
|
||||
bool ret;
|
||||
|
||||
if (!devs.nr)
|
||||
return true;
|
||||
|
||||
devlist_to_replicas(devs, data_type, &search, &max_dev);
|
||||
|
||||
rcu_read_lock();
|
||||
ret = replicas_has_entry(rcu_dereference(c->replicas),
|
||||
search, max_dev);
|
||||
rcu_read_unlock();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool bch2_bkey_replicas_marked(struct bch_fs *c,
|
||||
enum bch_data_type data_type,
|
||||
struct bkey_s_c k)
|
||||
{
|
||||
struct bch_devs_list cached = bch2_bkey_cached_devs(k);
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < cached.nr; i++)
|
||||
if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
|
||||
bch2_dev_list_single(cached.devs[i])))
|
||||
return false;
|
||||
|
||||
return bch2_replicas_marked(c, data_type, bch2_bkey_dirty_devs(k));
|
||||
}
|
||||
|
||||
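/*
 * For each data type this records the minimum number of online devices and
 * the maximum number of offline devices over all replicas entries of that
 * type - the worst case that bch2_have_enough_devs() then checks against
 * the degraded-mount flags.
 */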
struct replicas_status __bch2_replicas_status(struct bch_fs *c,
|
||||
struct bch_devs_mask online_devs)
|
||||
{
|
||||
struct bch_sb_field_members *mi;
|
||||
struct bch_replicas_cpu_entry *e;
|
||||
struct bch_replicas_cpu *r;
|
||||
unsigned i, dev, dev_slots, nr_online, nr_offline;
|
||||
struct replicas_status ret;
|
||||
|
||||
memset(&ret, 0, sizeof(ret));
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
|
||||
ret.replicas[i].nr_online = UINT_MAX;
|
||||
|
||||
mi = bch2_sb_get_members(c->disk_sb.sb);
|
||||
rcu_read_lock();
|
||||
|
||||
r = rcu_dereference(c->replicas);
|
||||
dev_slots = replicas_dev_slots(r);
|
||||
|
||||
for_each_cpu_replicas_entry(r, e) {
|
||||
if (e->data_type >= ARRAY_SIZE(ret.replicas))
|
||||
panic("e %p data_type %u\n", e, e->data_type);
|
||||
|
||||
nr_online = nr_offline = 0;
|
||||
|
||||
for (dev = 0; dev < dev_slots; dev++) {
|
||||
if (!replicas_test_dev(e, dev))
|
||||
continue;
|
||||
|
||||
BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, dev));
|
||||
|
||||
if (test_bit(dev, online_devs.d))
|
||||
nr_online++;
|
||||
else
|
||||
nr_offline++;
|
||||
}
|
||||
|
||||
ret.replicas[e->data_type].nr_online =
|
||||
min(ret.replicas[e->data_type].nr_online,
|
||||
nr_online);
|
||||
|
||||
ret.replicas[e->data_type].nr_offline =
|
||||
max(ret.replicas[e->data_type].nr_offline,
|
||||
nr_offline);
|
||||
}
|
||||
|
||||
rcu_read_unlock();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct replicas_status bch2_replicas_status(struct bch_fs *c)
|
||||
{
|
||||
return __bch2_replicas_status(c, bch2_online_devs(c));
|
||||
}
|
||||
|
||||
static bool have_enough_devs(struct replicas_status s,
|
||||
enum bch_data_type type,
|
||||
bool force_if_degraded,
|
||||
bool force_if_lost)
|
||||
{
|
||||
return (!s.replicas[type].nr_offline || force_if_degraded) &&
|
||||
(s.replicas[type].nr_online || force_if_lost);
|
||||
}
|
||||
|
||||
bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
|
||||
{
|
||||
return (have_enough_devs(s, BCH_DATA_JOURNAL,
|
||||
flags & BCH_FORCE_IF_METADATA_DEGRADED,
|
||||
flags & BCH_FORCE_IF_METADATA_LOST) &&
|
||||
have_enough_devs(s, BCH_DATA_BTREE,
|
||||
flags & BCH_FORCE_IF_METADATA_DEGRADED,
|
||||
flags & BCH_FORCE_IF_METADATA_LOST) &&
|
||||
have_enough_devs(s, BCH_DATA_USER,
|
||||
flags & BCH_FORCE_IF_DATA_DEGRADED,
|
||||
flags & BCH_FORCE_IF_DATA_LOST));
|
||||
}
|
||||
|
||||
unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
|
||||
{
|
||||
struct replicas_status s = bch2_replicas_status(c);
|
||||
|
||||
return meta
|
||||
? min(s.replicas[BCH_DATA_JOURNAL].nr_online,
|
||||
s.replicas[BCH_DATA_BTREE].nr_online)
|
||||
: s.replicas[BCH_DATA_USER].nr_online;
|
||||
}
|
||||
|
||||
unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
struct bch_replicas_cpu_entry *e;
|
||||
struct bch_replicas_cpu *r;
|
||||
unsigned ret = 0;
|
||||
|
||||
rcu_read_lock();
|
||||
r = rcu_dereference(c->replicas);
|
||||
|
||||
if (ca->dev_idx >= replicas_dev_slots(r))
|
||||
goto out;
|
||||
|
||||
for_each_cpu_replicas_entry(r, e)
|
||||
if (replicas_test_dev(e, ca->dev_idx))
|
||||
ret |= 1 << e->data_type;
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
|
||||
return ret;
|
||||
}
|