From 66927b89289974dab6d3b3cdd7706d0376034114 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 26 Aug 2024 15:11:38 -0400 Subject: [PATCH 01/61] bcachefs: Fix failure to return error in data_update_index_update() This fixes an assertion pop in io_write.c - if we don't return an error we're supposed to have completed all the btree updates. Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 65176d51b502..004894ad4147 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -337,6 +337,7 @@ restart_drop_extra_replicas: printbuf_exit(&buf); bch2_fatal_error(c); + ret = -EIO; goto out; } From d26935690c03fe8159d42358bed1c56252700cd1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 26 Aug 2024 19:11:00 -0400 Subject: [PATCH 02/61] bcachefs: Fix bch2_extents_match() false positive This was caught as a very rare nonce inconsistency, on systems with encryption and replication (and tiering, or some form of rebalance operation running): [Wed Jul 17 13:30:03 2024] about to insert invalid key in data update path [Wed Jul 17 13:30:03 2024] old: u64s 10 type extent 671283510:6392:U32_MAX len 16 ver 106595503: durability: 2 crc: c_size 8 size 16 offset 0 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 3:355968:104 gen 7 ptr: 4:513244:48 gen 6 rebalance: target hdd compression zstd [Wed Jul 17 13:30:03 2024] k: u64s 10 type extent 671283510:6400:U32_MAX len 16 ver 106595508: durability: 2 crc: c_size 8 size 16 offset 0 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 3:355968:112 gen 7 ptr: 4:513244:56 gen 6 rebalance: target hdd compression zstd [Wed Jul 17 13:30:03 2024] new: u64s 14 type extent 671283510:6392:U32_MAX len 8 ver 106595508: durability: 2 crc: c_size 8 size 16 offset 0 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 3:355968:112 gen 7 cached ptr: 4:513244:56 gen 6 cached rebalance: target hdd compression zstd crc: c_size 8 size 16 offset 8 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 1:10860085:32 gen 0 ptr: 0:17285918:408 gen 0 [Wed Jul 17 13:30:03 2024] bcachefs (cca5bc65-fe77-409d-a9fa-465a6e7f4eae): fatal error - emergency read only bch2_extents_match() was reporting true for extents that did not actually point to the same data. bch2_extent_match() iterates over pairs of pointers, looking for pointers that point to the same location on disk (with matching generation numbers). However one or both extents may have been trimmed (or merged) and they might not have the same disk offset: it corrects for this by subtracting the key offset and the checksum entry offset. However, this failed when an extent was immediately partially overwritten, and the new overwrite was allocated the next adjacent disk space. Normally, with compression off, this would never cause a bug, since the new extent would have to be immediately after the old extent for the pointer offsets to match, and the rebalance index update path is not looking for an extent outside the range of the extent it moved. However with compression enabled, extents take up less space on disk than they do in the btree index space - and spuriously matching after partial overwrite is possible. To fix this, add a secondary check, that strictly checks that the regions pointed to on disk overlap. 
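To make the failure mode concrete, here is a minimal standalone sketch (illustration only, not part of the patch: struct ext and its fields are simplified stand-ins for the real bkey/extent pointer fields, with the numbers taken from the log excerpt above). The offset-adjusted comparison alone reports a match even though the two extents occupy disjoint regions on disk; the added overlap check rejects it:

	/* Illustrative only: simplified stand-ins for the real fields. */
	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	struct ext {
		int64_t key_start;	 /* bkey_start_offset(k) */
		int64_t ptr_offset;	 /* p.ptr.offset */
		int64_t crc_offset;	 /* p.crc.offset */
		int64_t compressed_size; /* p.crc.compressed_size */
	};

	/* the original check: disk offset adjusted for where the extent starts */
	static bool adjusted_offsets_match(struct ext a, struct ext b)
	{
		return a.ptr_offset + a.crc_offset - a.key_start ==
		       b.ptr_offset + b.crc_offset - b.key_start;
	}

	/* the new secondary check: do the on-disk regions actually intersect? */
	static bool disk_regions_overlap(struct ext a, struct ext b)
	{
		return (a.ptr_offset >= b.ptr_offset &&
			a.ptr_offset <  b.ptr_offset + b.compressed_size) ||
		       (b.ptr_offset >= a.ptr_offset &&
			b.ptr_offset <  a.ptr_offset + a.compressed_size);
	}

	int main(void)
	{
		/* old extent: 16 sectors of keyspace, 8 compressed sectors at disk offset 104 */
		struct ext e_old = { .key_start = 6392, .ptr_offset = 104,
				     .crc_offset = 0,  .compressed_size = 8 };
		/* overwrite allocated in the next adjacent disk space, at offset 112 */
		struct ext e_new = { .key_start = 6400, .ptr_offset = 112,
				     .crc_offset = 0,  .compressed_size = 8 };

		/* 104 - 6392 == 112 - 6400 == -6288: spurious match */
		printf("adjusted offsets match: %d\n", adjusted_offsets_match(e_old, e_new));
		/* [104,112) and [112,120) are disjoint: correctly rejected */
		printf("disk regions overlap:   %d\n", disk_regions_overlap(e_old, e_new));
		return 0;
	}

Because the old extent covers 16 sectors of keyspace but only 8 compressed sectors on disk, the overwrite allocated immediately after it lands on the same adjusted offset; requiring the on-disk regions to actually intersect is what tells the two apart.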
https://github.com/koverstreet/bcachefs/issues/717 Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index e317df3644a1..eb31bda19544 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -929,8 +929,29 @@ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) if (p1.ptr.dev == p2.ptr.dev && p1.ptr.gen == p2.ptr.gen && + + /* + * This checks that the two pointers point + * to the same region on disk - adjusting + * for the difference in where the extents + * start, since one may have been trimmed: + */ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == - (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) + (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) && + + /* + * This additionally checks that the + * extents overlap on disk, since the + * previous check may trigger spuriously + * when one extent is immediately partially + * overwritten with another extent (so that + * on disk they are adjacent) and + * compression is in use: + */ + ((p1.ptr.offset >= p2.ptr.offset && + p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) || + (p2.ptr.offset >= p1.ptr.offset && + p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size))) return true; return false; From b1b6e3a9c9f73a0bb941fd7fb7b8470992d2a3a7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Aug 2024 11:18:21 -0400 Subject: [PATCH 03/61] inode: make __iget() a static inline bcachefs is switching to an rhashtable for vfs inodes instead of the standard inode.c hashtable, so we need this exported, or - a static inline makes more sense for a single atomic_inc(). Signed-off-by: Kent Overstreet --- fs/inode.c | 8 -------- include/linux/fs.h | 9 ++++++++- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 86670941884b..5e7dcdeedd4d 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -438,14 +438,6 @@ static void init_once(void *foo) inode_init_once(inode); } -/* - * inode->i_lock must be held - */ -void __iget(struct inode *inode) -{ - atomic_inc(&inode->i_count); -} - /* * get additional reference to inode; caller must already hold one. 
*/ diff --git a/include/linux/fs.h b/include/linux/fs.h index fd34b5755c0b..8fc4bad3b6ae 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3094,7 +3094,14 @@ static inline bool is_zero_ino(ino_t ino) return (u32)ino == 0; } -extern void __iget(struct inode * inode); +/* + * inode->i_lock must be held + */ +static inline void __iget(struct inode *inode) +{ + atomic_inc(&inode->i_count); +} + extern void iget_failed(struct inode *); extern void clear_inode(struct inode *); extern void __destroy_inode(struct inode *); From e5a638f4d57e98d10c5ba99d307a21a5b2b6680c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Jun 2024 21:41:01 -0400 Subject: [PATCH 04/61] bcachefs: switch to rhashtable for vfs inodes hash the standard vfs inode hash table suffers from painful lock contention - this is long overdue Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 2 +- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/fs-io-buffered.c | 2 +- fs/bcachefs/fs-io-direct.c | 2 +- fs/bcachefs/fs-io.c | 6 +- fs/bcachefs/fs-ioctl.c | 4 +- fs/bcachefs/fs.c | 188 ++++++++++++++++++++++------------ fs/bcachefs/fs.h | 18 ++-- fs/bcachefs/inode.c | 2 +- fs/bcachefs/subvolume_types.h | 3 +- fs/bcachefs/super.c | 2 + fs/bcachefs/xattr.c | 2 +- 12 files changed, 148 insertions(+), 84 deletions(-) diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 331a17f3f113..87f1be9d4db4 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -361,7 +361,7 @@ retry: bch2_trans_begin(trans); acl = _acl; - ret = bch2_subvol_is_ro_trans(trans, inode->ei_subvol) ?: + ret = bch2_subvol_is_ro_trans(trans, inode->ei_inum.subvol) ?: bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), BTREE_ITER_intent); if (ret) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 0c7086e00d18..3459b71206a7 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1023,6 +1023,7 @@ struct bch_fs { /* fs.c */ struct list_head vfs_inodes_list; struct mutex vfs_inodes_lock; + struct rhashtable vfs_inodes_table; /* VFS IO PATH - fs-io.c */ struct bio_set writepage_bioset; diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 184d03851676..975a63063629 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -486,7 +486,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, op->nr_replicas = nr_replicas; op->res.nr_replicas = nr_replicas; op->write_point = writepoint_hashed(inode->ei_last_dirtied); - op->subvol = inode->ei_subvol; + op->subvol = inode->ei_inum.subvol; op->pos = POS(inode->v.i_ino, sector); op->end_io = bch2_writepage_io_done; op->devs_need_flush = &inode->ei_devs_need_flush; diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index e246b1e05aa2..ee1c0325f313 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -500,7 +500,7 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio) dio->op.target = dio->op.opts.foreground_target; dio->op.write_point = writepoint_hashed((unsigned long) current); dio->op.nr_replicas = dio->op.opts.data_replicas; - dio->op.subvol = inode->ei_subvol; + dio->op.subvol = inode->ei_inum.subvol; dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); dio->op.devs_need_flush = &inode->ei_devs_need_flush; diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 77b85da30fb2..a5018cbdb376 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -267,7 +267,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, * XXX: we're doing two 
index lookups when we end up reading the * folio */ - ret = range_has_data(c, inode->ei_subvol, + ret = range_has_data(c, inode->ei_inum.subvol, POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)), POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS)); if (ret <= 0) @@ -618,7 +618,7 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, bch2_trans_begin(trans); ret = bch2_subvolume_get_snapshot(trans, - inode->ei_subvol, &snapshot); + inode->ei_inum.subvol, &snapshot); if (ret) goto bkey_err; @@ -823,7 +823,7 @@ static int quota_reserve_range(struct bch_inode_info *inode, retry: bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot); if (ret) goto err; diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 99c7fe987c74..405cf08bda34 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -100,7 +100,7 @@ static int bch2_ioc_setflags(struct bch_fs *c, } mutex_lock(&inode->ei_update_lock); - ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?: + ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: bch2_write_inode(c, inode, bch2_inode_flags_set, &s, ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); @@ -184,7 +184,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, } mutex_lock(&inode->ei_update_lock); - ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?: + ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: bch2_set_projid(c, inode, fa.fsx_projid) ?: bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, ATTR_CTIME); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 94c392abef65..85b697041f32 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -108,7 +108,7 @@ retry: goto retry; bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c, - "%s: inode %u:%llu not found when updating", + "%s: inode %llu:%llu not found when updating", bch2_err_str(ret), inode_inum(inode).subvol, inode_inum(inode).inum); @@ -152,42 +152,91 @@ int bch2_fs_quota_transfer(struct bch_fs *c, return ret; } -static int bch2_iget5_test(struct inode *vinode, void *p) +static bool subvol_inum_eq(subvol_inum a, subvol_inum b) { - struct bch_inode_info *inode = to_bch_ei(vinode); - subvol_inum *inum = p; - - return inode->ei_subvol == inum->subvol && - inode->ei_inode.bi_inum == inum->inum; + return a.subvol == b.subvol && a.inum == b.inum; } -static int bch2_iget5_set(struct inode *vinode, void *p) +static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg, + const void *obj) { - struct bch_inode_info *inode = to_bch_ei(vinode); - subvol_inum *inum = p; + const struct bch_inode_info *inode = obj; + const subvol_inum *v = arg->key; - inode->v.i_ino = inum->inum; - inode->ei_subvol = inum->subvol; - inode->ei_inode.bi_inum = inum->inum; - return 0; + return !subvol_inum_eq(inode->ei_inum, *v); } -static unsigned bch2_inode_hash(subvol_inum inum) +static const struct rhashtable_params bch2_vfs_inodes_params = { + .head_offset = offsetof(struct bch_inode_info, hash), + .key_offset = offsetof(struct bch_inode_info, ei_inum), + .key_len = sizeof(subvol_inum), + .obj_cmpfn = bch2_vfs_inode_cmp_fn, + .automatic_shrinking = true, +}; + +static void __wait_on_freeing_inode(struct inode *inode) { - return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL); + wait_queue_head_t *wq; + DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); + wq = bit_waitqueue(&inode->i_state, __I_NEW); + prepare_to_wait(wq, &wait.wq_entry, 
TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); + schedule(); + finish_wait(wq, &wait.wq_entry); } -static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode) +static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) { - subvol_inum inum = inode_inum(inode); - struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v, - bch2_inode_hash(inum), - bch2_iget5_test, - bch2_iget5_set, - &inum)); - BUG_ON(!old); + struct bch_inode_info *inode; +repeat: + inode = rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, + bch2_vfs_inodes_params); + if (inode) { + spin_lock(&inode->v.i_lock); + if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) { + spin_unlock(&inode->v.i_lock); + return NULL; + } + if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) { + __wait_on_freeing_inode(&inode->v); + goto repeat; + } + __iget(&inode->v); + spin_unlock(&inode->v.i_lock); + } + + return inode; +} + +static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode) +{ + spin_lock(&inode->v.i_lock); + bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags); + spin_unlock(&inode->v.i_lock); + + if (remove) { + int ret = rhashtable_remove_fast(&c->vfs_inodes_table, + &inode->hash, bch2_vfs_inodes_params); + BUG_ON(ret); + inode->v.i_hash.pprev = NULL; + } +} + +static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, struct bch_inode_info *inode) +{ + struct bch_inode_info *old = inode; + + set_bit(EI_INODE_HASHED, &inode->ei_flags); +retry: + if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table, + &inode->hash, + bch2_vfs_inodes_params))) { + old = bch2_inode_hash_find(c, inode->ei_inum); + if (!old) + goto retry; + + clear_bit(EI_INODE_HASHED, &inode->ei_flags); - if (unlikely(old != inode)) { /* * bcachefs doesn't use I_NEW; we have no use for it since we * only insert fully created inodes in the inode hash table. But @@ -203,16 +252,13 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino discard_new_inode(&inode->v); inode = old; } else { + inode_fake_hash(&inode->v); + + inode_sb_list_add(&inode->v); + mutex_lock(&c->vfs_inodes_lock); list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); mutex_unlock(&c->vfs_inodes_lock); - /* - * Again, I_NEW makes no sense for bcachefs. This is only needed - * for clearing I_NEW, but since the inode was already fully - * created and initialized we didn't actually want - * inode_insert5() to set it for us. 
- */ - unlock_new_inode(&inode->v); } return inode; @@ -277,11 +323,7 @@ static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans) struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) { - struct bch_inode_info *inode = - to_bch_ei(ilookup5_nowait(c->vfs_sb, - bch2_inode_hash(inum), - bch2_iget5_test, - &inum)); + struct bch_inode_info *inode = bch2_inode_hash_find(c, inum); if (inode) return &inode->v; @@ -295,7 +337,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans)); if (!ret) { bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); - inode = bch2_inode_insert(c, inode); + inode = bch2_inode_hash_insert(c, inode); } bch2_trans_put(trans); @@ -343,7 +385,7 @@ __bch2_create(struct mnt_idmap *idmap, retry: bch2_trans_begin(trans); - ret = bch2_subvol_is_ro_trans(trans, dir->ei_subvol) ?: + ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?: bch2_create_trans(trans, inode_inum(dir), &dir_u, &inode_u, !(flags & BCH_CREATE_TMPFILE) @@ -357,7 +399,7 @@ retry: if (unlikely(ret)) goto err_before_quota; - inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; + inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol; inum.inum = inode_u.bi_inum; ret = bch2_subvolume_get(trans, inum.subvol, true, @@ -388,7 +430,7 @@ err_before_quota: * bch2_trans_exit() and dropping locks, else we could race with another * thread pulling the inode in and modifying it: */ - inode = bch2_inode_insert(c, inode); + inode = bch2_inode_hash_insert(c, inode); bch2_trans_put(trans); err: posix_acl_release(default_acl); @@ -428,11 +470,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, if (ret) goto err; - struct bch_inode_info *inode = - to_bch_ei(ilookup5_nowait(c->vfs_sb, - bch2_inode_hash(inum), - bch2_iget5_test, - &inum)); + struct bch_inode_info *inode = bch2_inode_hash_find(c, inum); if (inode) goto out; @@ -462,7 +500,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, } bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); - inode = bch2_inode_insert(c, inode); + inode = bch2_inode_hash_insert(c, inode); out: bch2_trans_iter_exit(trans, &dirent_iter); printbuf_exit(&buf); @@ -549,8 +587,8 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir, lockdep_assert_held(&inode->v.i_rwsem); - ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?: - bch2_subvol_is_ro(c, inode->ei_subvol) ?: + ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?: + bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: __bch2_link(c, inode, dir, dentry); if (unlikely(ret)) return bch2_err_class(ret); @@ -606,7 +644,7 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) struct bch_inode_info *dir= to_bch_ei(vdir); struct bch_fs *c = dir->v.i_sb->s_fs_info; - int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?: + int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?: __bch2_unlink(vdir, dentry, false); return bch2_err_class(ret); } @@ -689,8 +727,8 @@ static int bch2_rename2(struct mnt_idmap *idmap, trans = bch2_trans_get(c); - ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_subvol) ?: - bch2_subvol_is_ro_trans(trans, dst_dir->ei_subvol); + ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?: + bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol); if (ret) goto err; @@ -891,7 +929,7 @@ static int bch2_getattr(struct mnt_idmap *idmap, stat->blksize = block_bytes(c); stat->blocks = inode->v.i_blocks; - stat->subvol = inode->ei_subvol; + stat->subvol = 
inode->ei_inum.subvol; stat->result_mask |= STATX_SUBVOL; if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) { @@ -933,7 +971,7 @@ static int bch2_setattr(struct mnt_idmap *idmap, lockdep_assert_held(&inode->v.i_rwsem); - ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?: + ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: setattr_prepare(idmap, dentry, iattr); if (ret) return ret; @@ -1045,7 +1083,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, retry: bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot); if (ret) goto err; @@ -1165,7 +1203,7 @@ static int bch2_open(struct inode *vinode, struct file *file) struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret = bch2_subvol_is_ro(c, inode->ei_subvol); + int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol); if (ret) return ret; } @@ -1297,8 +1335,8 @@ static int bcachefs_fid_valid(int fh_len, int fh_type) static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode) { return (struct bcachefs_fid) { - .inum = inode->ei_inode.bi_inum, - .subvol = inode->ei_subvol, + .inum = inode->ei_inum.inum, + .subvol = inode->ei_inum.subvol, .gen = inode->ei_inode.bi_generation, }; } @@ -1383,7 +1421,7 @@ static struct dentry *bch2_get_parent(struct dentry *child) struct bch_fs *c = inode->v.i_sb->s_fs_info; subvol_inum parent_inum = { .subvol = inode->ei_inode.bi_parent_subvol ?: - inode->ei_subvol, + inode->ei_inum.subvol, .inum = inode->ei_inode.bi_dir, }; @@ -1419,7 +1457,7 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child retry: bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot); if (ret) goto err; @@ -1450,8 +1488,7 @@ retry: if (ret) goto err; - if (target.subvol == inode->ei_subvol && - target.inum == inode->ei_inode.bi_inum) + if (subvol_inum_eq(target, inode->ei_inum)) goto found; } else { /* @@ -1472,8 +1509,7 @@ retry: if (ret) continue; - if (target.subvol == inode->ei_subvol && - target.inum == inode->ei_inode.bi_inum) + if (subvol_inum_eq(target, inode->ei_inum)) goto found; } } @@ -1510,7 +1546,9 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *bi, struct bch_subvolume *subvol) { - bch2_iget5_set(&inode->v, &inum); + inode->v.i_ino = inum.inum; + inode->ei_inum = inum; + inode->ei_inode.bi_inum = inum.inum; bch2_inode_update_after_write(trans, inode, bi, ~0); inode->v.i_blocks = bi->bi_sectors; @@ -1522,7 +1560,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, inode->ei_flags = 0; inode->ei_quota_reserved = 0; inode->ei_qid = bch_qid(bi); - inode->ei_subvol = inum.subvol; if (BCH_SUBVOLUME_SNAP(subvol)) set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); @@ -1590,6 +1627,12 @@ static void bch2_evict_inode(struct inode *vinode) struct bch_fs *c = vinode->i_sb->s_fs_info; struct bch_inode_info *inode = to_bch_ei(vinode); + /* + * evict() has waited for outstanding writeback, we'll do no more IO + * through this inode: it's safe to remove from VFS inode hashtable here + */ + bch2_inode_hash_remove(c, inode); + truncate_inode_pages_final(&inode->v.i_data); clear_inode(&inode->v); @@ -1631,7 +1674,7 @@ again: mutex_lock(&c->vfs_inodes_lock); list_for_each_entry(inode, 
&c->vfs_inodes_list, ei_vfs_inode_list) { - if (!snapshot_list_has_id(s, inode->ei_subvol)) + if (!snapshot_list_has_id(s, inode->ei_inum.subvol)) continue; if (!(inode->v.i_state & I_DONTCACHE) && @@ -2119,6 +2162,17 @@ static int bch2_init_fs_context(struct fs_context *fc) return 0; } +void bch2_fs_vfs_exit(struct bch_fs *c) +{ + if (c->vfs_inodes_table.tbl) + rhashtable_destroy(&c->vfs_inodes_table); +} + +int bch2_fs_vfs_init(struct bch_fs *c) +{ + return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params); +} + static struct file_system_type bcache_fs_type = { .owner = THIS_MODULE, .name = "bcachefs", diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index c3af7225ff69..f8f8878121c3 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -13,6 +13,9 @@ struct bch_inode_info { struct inode v; + struct rhash_head hash; + subvol_inum ei_inum; + struct list_head ei_vfs_inode_list; unsigned long ei_flags; @@ -24,8 +27,6 @@ struct bch_inode_info { struct mutex ei_quota_lock; struct bch_qid ei_qid; - u32 ei_subvol; - /* * When we've been doing nocow writes we'll need to issue flushes to the * underlying block devices @@ -50,10 +51,7 @@ struct bch_inode_info { static inline subvol_inum inode_inum(struct bch_inode_info *inode) { - return (subvol_inum) { - .subvol = inode->ei_subvol, - .inum = inode->ei_inode.bi_inum, - }; + return inode->ei_inum; } /* @@ -67,6 +65,7 @@ static inline subvol_inum inode_inum(struct bch_inode_info *inode) * those: */ #define EI_INODE_SNAPSHOT 1 +#define EI_INODE_HASHED 2 #define to_bch_ei(_inode) \ container_of_or_null(_inode, struct bch_inode_info, v) @@ -187,6 +186,9 @@ int __bch2_unlink(struct inode *, struct dentry *, bool); void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *); +void bch2_fs_vfs_exit(struct bch_fs *); +int bch2_fs_vfs_init(struct bch_fs *); + void bch2_vfs_exit(void); int bch2_vfs_init(void); @@ -196,6 +198,10 @@ int bch2_vfs_init(void); static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) {} + +static inline void bch2_fs_vfs_exit(struct bch_fs *c) {} +static inline int bch2_fs_vfs_init(struct bch_fs *c) { return 0; } + static inline void bch2_vfs_exit(void) {} static inline int bch2_vfs_init(void) { return 0; } diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 2be6be33afa3..6ac0ff7e074b 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -365,7 +365,7 @@ int bch2_inode_peek(struct btree_trans *trans, subvol_inum inum, unsigned flags) { int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags); - bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum); + bch_err_msg(trans->c, ret, "looking up inum %llu:%llu:", inum.subvol, inum.inum); return ret; } diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index 9b10c8947828..f2ec4277c2a5 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -30,7 +30,8 @@ struct snapshot_table { }; typedef struct { - u32 subvol; + /* we can't have padding in this struct: */ + u64 subvol; u64 inum; } subvol_inum; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index e7fa2de35014..6d0ce3d73450 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -543,6 +543,7 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_fs_fs_io_direct_exit(c); bch2_fs_fs_io_buffered_exit(c); bch2_fs_fsio_exit(c); + bch2_fs_vfs_exit(c); bch2_fs_ec_exit(c); bch2_fs_encryption_exit(c); bch2_fs_nocow_locking_exit(c); @@ -926,6 +927,7 @@ static struct bch_fs 
*bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_encryption_init(c) ?: bch2_fs_compress_init(c) ?: bch2_fs_ec_init(c) ?: + bch2_fs_vfs_init(c) ?: bch2_fs_fsio_init(c) ?: bch2_fs_fs_io_buffered_init(c) ?: bch2_fs_fs_io_direct_init(c); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 331f944d73dc..4b5898d3eda0 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -306,7 +306,7 @@ retry: bch2_trans_begin(trans); iter = (struct btree_iter) { NULL }; - ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot); if (ret) goto err; From d93e46b5826a52857133dd680fd11023ad3a3517 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Aug 2024 12:31:53 -0400 Subject: [PATCH 05/61] bcachefs: Fix deadlock in __wait_on_freeing_inode() We can't call __wait_on_freeing_inode() with btree locks held; we're waiting on another thread that's in evict(), and before it clears that bit it needs to write that inode to flush timestamps - deadlock. Fixing this involves a fair amount of re-jiggering to plumb a new transaction restart. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 70 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 22 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 85b697041f32..eca0b073a1cf 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -185,12 +185,12 @@ static void __wait_on_freeing_inode(struct inode *inode) finish_wait(wq, &wait.wq_entry); } -static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) +static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans, + subvol_inum inum) { struct bch_inode_info *inode; repeat: - inode = rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, - bch2_vfs_inodes_params); + inode = rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params); if (inode) { spin_lock(&inode->v.i_lock); if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) { @@ -198,7 +198,15 @@ repeat: return NULL; } if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) { - __wait_on_freeing_inode(&inode->v); + if (!trans) { + __wait_on_freeing_inode(&inode->v); + } else { + bch2_trans_unlock(trans); + __wait_on_freeing_inode(&inode->v); + int ret = bch2_trans_relock(trans); + if (ret) + return ERR_PTR(ret); + } goto repeat; } __iget(&inode->v); @@ -222,7 +230,9 @@ static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inod } } -static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, struct bch_inode_info *inode) +static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, + struct btree_trans *trans, + struct bch_inode_info *inode) { struct bch_inode_info *old = inode; @@ -231,7 +241,7 @@ retry: if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table, &inode->hash, bch2_vfs_inodes_params))) { - old = bch2_inode_hash_find(c, inode->ei_inum); + old = bch2_inode_hash_find(c, trans, inode->ei_inum); if (!old) goto retry; @@ -250,7 +260,7 @@ retry: */ set_nlink(&inode->v, 1); discard_new_inode(&inode->v); - inode = old; + return old; } else { inode_fake_hash(&inode->v); @@ -259,9 +269,8 @@ retry: mutex_lock(&c->vfs_inodes_lock); list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); mutex_unlock(&c->vfs_inodes_lock); + return inode; } - - return inode; } #define memalloc_flags_do(_flags, _do) \ @@ -321,9 +330,24 @@ static struct bch_inode_info *bch2_new_inode(struct btree_trans 
*trans) return inode; } +static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans, + subvol_inum inum, + struct bch_inode_unpacked *bi, + struct bch_subvolume *subvol) +{ + struct bch_inode_info *inode = bch2_new_inode(trans); + if (IS_ERR(inode)) + return inode; + + bch2_vfs_inode_init(trans, inum, inode, bi, subvol); + + return bch2_inode_hash_insert(trans->c, trans, inode); + +} + struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) { - struct bch_inode_info *inode = bch2_inode_hash_find(c, inum); + struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum); if (inode) return &inode->v; @@ -334,11 +358,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) int ret = lockrestart_do(trans, bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?: - PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans)); - if (!ret) { - bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); - inode = bch2_inode_hash_insert(c, inode); - } + PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); bch2_trans_put(trans); return ret ? ERR_PTR(ret) : &inode->v; @@ -429,8 +449,16 @@ err_before_quota: * we must insert the new inode into the inode cache before calling * bch2_trans_exit() and dropping locks, else we could race with another * thread pulling the inode in and modifying it: + * + * also, calling bch2_inode_hash_insert() without passing in the + * transaction object is sketchy - if we could ever end up in + * __wait_on_freeing_inode(), we'd risk deadlock. + * + * But that shouldn't be possible, since we still have the inode locked + * that we just created, and we _really_ can't take a transaction + * restart here. 
*/ - inode = bch2_inode_hash_insert(c, inode); + inode = bch2_inode_hash_insert(c, NULL, inode); bch2_trans_put(trans); err: posix_acl_release(default_acl); @@ -470,7 +498,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, if (ret) goto err; - struct bch_inode_info *inode = bch2_inode_hash_find(c, inum); + struct bch_inode_info *inode = bch2_inode_hash_find(c, trans, inum); if (inode) goto out; @@ -478,7 +506,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, struct bch_inode_unpacked inode_u; ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: - PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans)); + PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, "dirent to missing inode:\n %s", @@ -498,9 +526,6 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, ret = -ENOENT; goto err; } - - bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); - inode = bch2_inode_hash_insert(c, inode); out: bch2_trans_iter_exit(trans, &dirent_iter); printbuf_exit(&buf); @@ -1541,7 +1566,8 @@ static const struct export_operations bch_export_ops = { .get_name = bch2_get_name, }; -static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, +static void bch2_vfs_inode_init(struct btree_trans *trans, + subvol_inum inum, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, struct bch_subvolume *subvol) From d21ea4109244846ed6d77c1a532d89c2a5e1d44d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 Aug 2024 11:47:32 -0400 Subject: [PATCH 06/61] bcachefs: journal_entry_replicas_not_marked is now autofix errors that are known to always be safe to fix should be autofix: this should be most errors even at this point, but that will need some thorough review. note that errors are still logged in the superblock, so we'll still know that they happened. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-errors_format.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index d3a498617303..317602017ec7 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -23,7 +23,7 @@ enum bch_fsck_flags { x(jset_past_bucket_end, 9, 0) \ x(jset_seq_blacklisted, 10, 0) \ x(journal_entries_missing, 11, 0) \ - x(journal_entry_replicas_not_marked, 12, 0) \ + x(journal_entry_replicas_not_marked, 12, FSCK_AUTOFIX) \ x(journal_entry_past_jset_end, 13, 0) \ x(journal_entry_replicas_data_mismatch, 14, 0) \ x(journal_entry_bkey_u64s_0, 15, 0) \ From 6a3f1c99ce6893806d3df6c1381fd217f1e31c8e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Jun 2024 19:00:33 -0400 Subject: [PATCH 07/61] lib/generic-radix-tree.c: genradix_ptr_inlined() Provide an inlined fast path Signed-off-by: Kent Overstreet --- include/linux/generic-radix-tree.h | 75 ++++++++++++++++++++++++++++++ lib/generic-radix-tree.c | 64 +------------------------ 2 files changed, 76 insertions(+), 63 deletions(-) diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h index f3512fddf3d7..8a3e1e886d1c 100644 --- a/include/linux/generic-radix-tree.h +++ b/include/linux/generic-radix-tree.h @@ -48,6 +48,49 @@ struct genradix_root; #define GENRADIX_NODE_SHIFT 9 #define GENRADIX_NODE_SIZE (1U << GENRADIX_NODE_SHIFT) +#define GENRADIX_ARY (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *)) +#define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY) + +/* depth that's needed for a genradix that can address up to ULONG_MAX: */ +#define GENRADIX_MAX_DEPTH \ + DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT) + +#define GENRADIX_DEPTH_MASK \ + ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1)) + +static inline int genradix_depth_shift(unsigned depth) +{ + return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth; +} + +/* + * Returns size (of data, in bytes) that a tree of a given depth holds: + */ +static inline size_t genradix_depth_size(unsigned depth) +{ + return 1UL << genradix_depth_shift(depth); +} + +static inline unsigned genradix_root_to_depth(struct genradix_root *r) +{ + return (unsigned long) r & GENRADIX_DEPTH_MASK; +} + +static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r) +{ + return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK); +} + +struct genradix_node { + union { + /* Interior node: */ + struct genradix_node *children[GENRADIX_ARY]; + + /* Leaf: */ + u8 data[GENRADIX_NODE_SIZE]; + }; +}; + struct __genradix { struct genradix_root *root; }; @@ -128,6 +171,30 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size) #define __genradix_idx_to_offset(_radix, _idx) \ __idx_to_offset(_idx, __genradix_obj_size(_radix)) +static inline void *__genradix_ptr_inlined(struct __genradix *radix, size_t offset) +{ + struct genradix_root *r = READ_ONCE(radix->root); + struct genradix_node *n = genradix_root_to_node(r); + unsigned level = genradix_root_to_depth(r); + unsigned shift = genradix_depth_shift(level); + + if (unlikely(ilog2(offset) >= genradix_depth_shift(level))) + return NULL; + + while (n && shift > GENRADIX_NODE_SHIFT) { + shift -= GENRADIX_ARY_SHIFT; + n = n->children[offset >> shift]; + offset &= (1UL << shift) - 1; + } + + return n ? 
&n->data[offset] : NULL; +} + +#define genradix_ptr_inlined(_radix, _idx) \ + (__genradix_cast(_radix) \ + __genradix_ptr_inlined(&(_radix)->tree, \ + __genradix_idx_to_offset(_radix, _idx))) + void *__genradix_ptr(struct __genradix *, size_t); /** @@ -144,6 +211,14 @@ void *__genradix_ptr(struct __genradix *, size_t); void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t); +#define genradix_ptr_alloc_inlined(_radix, _idx, _gfp) \ + (__genradix_cast(_radix) \ + (__genradix_ptr_inlined(&(_radix)->tree, \ + __genradix_idx_to_offset(_radix, _idx)) ?: \ + __genradix_ptr_alloc(&(_radix)->tree, \ + __genradix_idx_to_offset(_radix, _idx), \ + _gfp))) + /** * genradix_ptr_alloc - get a pointer to a genradix entry, allocating it * if necessary diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c index fa692c86f069..4efae0663049 100644 --- a/lib/generic-radix-tree.c +++ b/lib/generic-radix-tree.c @@ -5,75 +5,13 @@ #include #include -#define GENRADIX_ARY (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *)) -#define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY) - -struct genradix_node { - union { - /* Interior node: */ - struct genradix_node *children[GENRADIX_ARY]; - - /* Leaf: */ - u8 data[GENRADIX_NODE_SIZE]; - }; -}; - -static inline int genradix_depth_shift(unsigned depth) -{ - return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth; -} - -/* - * Returns size (of data, in bytes) that a tree of a given depth holds: - */ -static inline size_t genradix_depth_size(unsigned depth) -{ - return 1UL << genradix_depth_shift(depth); -} - -/* depth that's needed for a genradix that can address up to ULONG_MAX: */ -#define GENRADIX_MAX_DEPTH \ - DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT) - -#define GENRADIX_DEPTH_MASK \ - ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1)) - -static inline unsigned genradix_root_to_depth(struct genradix_root *r) -{ - return (unsigned long) r & GENRADIX_DEPTH_MASK; -} - -static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r) -{ - return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK); -} - /* * Returns pointer to the specified byte @offset within @radix, or NULL if not * allocated */ void *__genradix_ptr(struct __genradix *radix, size_t offset) { - struct genradix_root *r = READ_ONCE(radix->root); - struct genradix_node *n = genradix_root_to_node(r); - unsigned level = genradix_root_to_depth(r); - - if (ilog2(offset) >= genradix_depth_shift(level)) - return NULL; - - while (1) { - if (!n) - return NULL; - if (!level) - break; - - level--; - - n = n->children[offset >> genradix_depth_shift(level)]; - offset &= genradix_depth_size(level) - 1; - } - - return &n->data[offset]; + return __genradix_ptr_inlined(radix, offset); } EXPORT_SYMBOL(__genradix_ptr); From 71b20e091f1e0c2f2fc419437c45925c9bb92bf5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 Aug 2024 23:14:44 -0400 Subject: [PATCH 08/61] lib/generic-radix-tree.c: add preallocation Signed-off-by: Kent Overstreet --- include/linux/generic-radix-tree.h | 38 +++++++++++++++++++++++++----- lib/generic-radix-tree.c | 16 ++++--------- 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h index 8a3e1e886d1c..5b51c3d582d6 100644 --- a/include/linux/generic-radix-tree.h +++ b/include/linux/generic-radix-tree.h @@ -41,6 +41,7 @@ #include #include #include +#include #include struct genradix_root; @@ -81,6 +82,10 @@ static inline struct genradix_node 
*genradix_root_to_node(struct genradix_root * return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK); } +struct __genradix { + struct genradix_root *root; +}; + struct genradix_node { union { /* Interior node: */ @@ -91,9 +96,15 @@ struct genradix_node { }; }; -struct __genradix { - struct genradix_root *root; -}; +static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask) +{ + return kzalloc(GENRADIX_NODE_SIZE, gfp_mask); +} + +static inline void genradix_free_node(struct genradix_node *node) +{ + kfree(node); +} /* * NOTE: currently, sizeof(_type) must not be larger than GENRADIX_NODE_SIZE: @@ -209,7 +220,8 @@ void *__genradix_ptr(struct __genradix *, size_t); __genradix_ptr(&(_radix)->tree, \ __genradix_idx_to_offset(_radix, _idx))) -void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t); +void *__genradix_ptr_alloc(struct __genradix *, size_t, + struct genradix_node **, gfp_t); #define genradix_ptr_alloc_inlined(_radix, _idx, _gfp) \ (__genradix_cast(_radix) \ @@ -217,7 +229,15 @@ void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t); __genradix_idx_to_offset(_radix, _idx)) ?: \ __genradix_ptr_alloc(&(_radix)->tree, \ __genradix_idx_to_offset(_radix, _idx), \ - _gfp))) + NULL, _gfp))) + +#define genradix_ptr_alloc_preallocated_inlined(_radix, _idx, _new_node, _gfp)\ + (__genradix_cast(_radix) \ + (__genradix_ptr_inlined(&(_radix)->tree, \ + __genradix_idx_to_offset(_radix, _idx)) ?: \ + __genradix_ptr_alloc(&(_radix)->tree, \ + __genradix_idx_to_offset(_radix, _idx), \ + _new_node, _gfp))) /** * genradix_ptr_alloc - get a pointer to a genradix entry, allocating it @@ -232,7 +252,13 @@ void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t); (__genradix_cast(_radix) \ __genradix_ptr_alloc(&(_radix)->tree, \ __genradix_idx_to_offset(_radix, _idx), \ - _gfp)) + NULL, _gfp)) + +#define genradix_ptr_alloc_preallocated(_radix, _idx, _new_node, _gfp)\ + (__genradix_cast(_radix) \ + __genradix_ptr_alloc(&(_radix)->tree, \ + __genradix_idx_to_offset(_radix, _idx), \ + _new_node, _gfp)) struct genradix_iter { size_t offset; diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c index 4efae0663049..79e067b51488 100644 --- a/lib/generic-radix-tree.c +++ b/lib/generic-radix-tree.c @@ -15,27 +15,21 @@ void *__genradix_ptr(struct __genradix *radix, size_t offset) } EXPORT_SYMBOL(__genradix_ptr); -static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask) -{ - return kzalloc(GENRADIX_NODE_SIZE, gfp_mask); -} - -static inline void genradix_free_node(struct genradix_node *node) -{ - kfree(node); -} - /* * Returns pointer to the specified byte @offset within @radix, allocating it if * necessary - newly allocated slots are always zeroed out: */ void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset, + struct genradix_node **preallocated, gfp_t gfp_mask) { struct genradix_root *v = READ_ONCE(radix->root); struct genradix_node *n, *new_node = NULL; unsigned level; + if (preallocated) + swap(new_node, *preallocated); + /* Increase tree depth if necessary: */ while (1) { struct genradix_root *r = v, *new_root; @@ -219,7 +213,7 @@ int __genradix_prealloc(struct __genradix *radix, size_t size, size_t offset; for (offset = 0; offset < size; offset += GENRADIX_NODE_SIZE) - if (!__genradix_ptr_alloc(radix, offset, gfp_mask)) + if (!__genradix_ptr_alloc(radix, offset, NULL, gfp_mask)) return -ENOMEM; return 0; From 24acdef9bed974fd139fa446efdd1043f7768605 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Jun 2024 20:47:03 -0400 
Subject: [PATCH 09/61] bcachefs: rcu_pending Generic data structure for explicitly tracking pending RCU items, allowing items to be dequeued (i.e. allocate from items pending freeing). Works with conventional RCU and SRCU, and possibly other RCU flavors in the future, meaning this can serve as a more generic replacement for SLAB_TYPESAFE_BY_RCU. Pending items are tracked in radix trees; if memory allocation fails, we fall back to linked lists. A rcu_pending is initialized with a callback, which is invoked when pending items's grace periods have expired. Two types of callback processing are handled specially: - RCU_PENDING_KVFREE_FN New backend for kvfree_rcu(). Slightly faster, and eliminates the synchronize_rcu() slowpath in kvfree_rcu_mightsleep() - instead, an rcu_head is allocated if we don't have one and can't use the radix tree TODO: - add a shrinker (as in the existing kvfree_rcu implementation) so that memory reclaim can free expired objects if callback processing isn't keeping up, and to expedite a grace period if we're under memory pressure and too much memory is stranded by RCU - add a counter for amount of memory pending - RCU_PENDING_CALL_RCU_FN Accelerated backend for call_rcu() - pending callbacks are tracked in a radix tree to eliminate linked list overhead. to serve as replacement backends for kvfree_rcu() and call_rcu(); these may be of interest to other uses (e.g. SLAB_TYPESAFE_BY_RCU users). Note: Internally, we're using a single rearming call_rcu() callback for notifications from the core RCU subsystem for notifications when objects are ready to be processed. Ideally we would be getting a callback every time a grace period completes for which we have objects, but that would require multiple rcu_heads in flight, and since the number of gp sequence numbers with uncompleted callbacks is not bounded, we can't do that yet. Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/rcu_pending.c | 603 ++++++++++++++++++++++++++++++++++++++ fs/bcachefs/rcu_pending.h | 25 ++ 3 files changed, 629 insertions(+) create mode 100644 fs/bcachefs/rcu_pending.c create mode 100644 fs/bcachefs/rcu_pending.h diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 0ab533a2b03b..56d20e219f59 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -69,6 +69,7 @@ bcachefs-y := \ printbuf.o \ quota.o \ rebalance.o \ + rcu_pending.o \ recovery.o \ recovery_passes.o \ reflink.o \ diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c new file mode 100644 index 000000000000..8f8d914d3998 --- /dev/null +++ b/fs/bcachefs/rcu_pending.c @@ -0,0 +1,603 @@ +// SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) "%s() " fmt "\n", __func__ + +#include +#include +#include +#include +#include +#include + +#include "rcu_pending.h" +#include "darray.h" +#include "util.h" + +#define static_array_for_each(_a, _i) \ + for (typeof(&(_a)[0]) _i = _a; \ + _i < (_a) + ARRAY_SIZE(_a); \ + _i++) + +enum rcu_pending_special { + RCU_PENDING_KVFREE = 1, + RCU_PENDING_CALL_RCU = 2, +}; + +#define RCU_PENDING_KVFREE_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE) +#define RCU_PENDING_CALL_RCU_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU) + +static inline unsigned long __get_state_synchronize_rcu(struct srcu_struct *ssp) +{ + return ssp + ? get_state_synchronize_srcu(ssp) + : get_state_synchronize_rcu(); +} + +static inline unsigned long __start_poll_synchronize_rcu(struct srcu_struct *ssp) +{ + return ssp + ? 
start_poll_synchronize_srcu(ssp) + : start_poll_synchronize_rcu(); +} + +static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, unsigned long cookie) +{ + return ssp + ? poll_state_synchronize_srcu(ssp, cookie) + : poll_state_synchronize_rcu(cookie); +} + +static inline void __rcu_barrier(struct srcu_struct *ssp) +{ + return ssp + ? srcu_barrier(ssp) + : rcu_barrier(); +} + +static inline void __call_rcu(struct srcu_struct *ssp, struct rcu_head *rhp, + rcu_callback_t func) +{ + if (ssp) + call_srcu(ssp, rhp, func); + else + call_rcu(rhp, func); +} + +struct rcu_pending_seq { + /* + * We're using a radix tree like a vector - we're just pushing elements + * onto the end; we're using a radix tree instead of an actual vector to + * avoid reallocation overhead + */ + GENRADIX(struct rcu_head *) objs; + size_t nr; + struct rcu_head **cursor; + unsigned long seq; +}; + +struct rcu_pending_list { + struct rcu_head *head; + struct rcu_head *tail; + unsigned long seq; +}; + +struct rcu_pending_pcpu { + struct rcu_pending *parent; + spinlock_t lock; + int cpu; + + /* + * We can't bound the number of unprocessed gp sequence numbers, and we + * can't efficiently merge radix trees for expired grace periods, so we + * need darray/vector: + */ + DARRAY_PREALLOCATED(struct rcu_pending_seq, 4) objs; + + /* Third entry is for expired objects: */ + struct rcu_pending_list lists[NUM_ACTIVE_RCU_POLL_OLDSTATE + 1]; + + struct rcu_head cb; + bool cb_armed; + struct work_struct work; +}; + +static bool __rcu_pending_has_pending(struct rcu_pending_pcpu *p) +{ + if (p->objs.nr) + return true; + + static_array_for_each(p->lists, i) + if (i->head) + return true; + + return false; +} + +static void rcu_pending_list_merge(struct rcu_pending_list *l1, + struct rcu_pending_list *l2) +{ + if (!l1->head) + l1->head = l2->head; + else + l1->tail->next = l2->head; + l1->tail = l2->tail; + + l2->head = l2->tail = NULL; +} + +static void rcu_pending_list_add(struct rcu_pending_list *l, + struct rcu_head *n) +{ + if (!l->head) + l->head = n; + else + l->tail->next = n; + l->tail = n; + n->next = NULL; +} + +static void merge_expired_lists(struct rcu_pending_pcpu *p) +{ + struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE]; + + for (struct rcu_pending_list *i = p->lists; i < expired; i++) + if (i->head && __poll_state_synchronize_rcu(p->parent->srcu, i->seq)) + rcu_pending_list_merge(expired, i); +} + +static noinline void __process_finished_items(struct rcu_pending *pending, + struct rcu_pending_pcpu *p, + unsigned long flags) +{ + struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE]; + struct rcu_pending_seq objs = {}; + struct rcu_head *list = NULL; + + if (p->objs.nr && + __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) { + objs = p->objs.data[0]; + darray_remove_item(&p->objs, p->objs.data); + } + + merge_expired_lists(p); + + list = expired->head; + expired->head = expired->tail = NULL; + + spin_unlock_irqrestore(&p->lock, flags); + + switch ((ulong) pending->process) { + case RCU_PENDING_KVFREE: + for (size_t i = 0; i < objs.nr; ) { + size_t nr_this_node = min(GENRADIX_NODE_SIZE / sizeof(void *), objs.nr - i); + + kfree_bulk(nr_this_node, (void **) genradix_ptr(&objs.objs, i)); + i += nr_this_node; + } + genradix_free(&objs.objs); + + while (list) { + struct rcu_head *obj = list; + list = obj->next; + + /* + * low bit of pointer indicates whether rcu_head needs + * to be freed - kvfree_rcu_mightsleep() + */ + BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 
0); + + void *ptr = (void *)(((unsigned long) obj->func) & ~1UL); + bool free_head = ((unsigned long) obj->func) & 1UL; + + kvfree(ptr); + if (free_head) + kfree(obj); + } + + break; + + case RCU_PENDING_CALL_RCU: + for (size_t i = 0; i < objs.nr; i++) { + struct rcu_head *obj = *genradix_ptr(&objs.objs, i); + obj->func(obj); + } + genradix_free(&objs.objs); + + while (list) { + struct rcu_head *obj = list; + list = obj->next; + obj->func(obj); + } + break; + + default: + for (size_t i = 0; i < objs.nr; i++) + pending->process(pending, *genradix_ptr(&objs.objs, i)); + genradix_free(&objs.objs); + + while (list) { + struct rcu_head *obj = list; + list = obj->next; + pending->process(pending, obj); + } + break; + } +} + +static bool process_finished_items(struct rcu_pending *pending, + struct rcu_pending_pcpu *p, + unsigned long flags) +{ + /* + * XXX: we should grab the gp seq once and avoid multiple function + * calls, this is called from __rcu_pending_enqueue() fastpath in + * may_sleep==true mode + */ + if ((p->objs.nr && __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) || + (p->lists[0].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[0].seq)) || + (p->lists[1].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[1].seq)) || + p->lists[2].head) { + __process_finished_items(pending, p, flags); + return true; + } + + return false; +} + +static void rcu_pending_work(struct work_struct *work) +{ + struct rcu_pending_pcpu *p = + container_of(work, struct rcu_pending_pcpu, work); + struct rcu_pending *pending = p->parent; + unsigned long flags; + + do { + spin_lock_irqsave(&p->lock, flags); + } while (process_finished_items(pending, p, flags)); + + spin_unlock_irqrestore(&p->lock, flags); +} + +static void rcu_pending_rcu_cb(struct rcu_head *rcu) +{ + struct rcu_pending_pcpu *p = container_of(rcu, struct rcu_pending_pcpu, cb); + + schedule_work_on(p->cpu, &p->work); + + unsigned long flags; + spin_lock_irqsave(&p->lock, flags); + if (__rcu_pending_has_pending(p)) + __call_rcu(p->parent->srcu, &p->cb, rcu_pending_rcu_cb); + else + p->cb_armed = false; + spin_unlock_irqrestore(&p->lock, flags); +} + +static __always_inline struct rcu_pending_seq * +get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq) +{ + darray_for_each_reverse(p->objs, objs) + if (objs->seq == seq) + return objs; + + if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC)) + return NULL; + + return &darray_last(p->objs); +} + +static noinline bool +rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq, + struct rcu_head *head, void *ptr, + unsigned long *flags) +{ + if (ptr) { + if (!head) { + /* + * kvfree_rcu_mightsleep(): we weren't passed an + * rcu_head, but we need one: use the low bit of the + * ponter to free to flag that the head needs to be + * freed as well: + */ + ptr = (void *)(((unsigned long) ptr)|1UL); + head = kmalloc(sizeof(*head), __GFP_NOWARN); + if (!head) { + spin_unlock_irqrestore(&p->lock, *flags); + head = kmalloc(sizeof(*head), GFP_KERNEL|__GFP_NOFAIL); + /* + * dropped lock, did GFP_KERNEL allocation, + * check for gp expiration + */ + if (unlikely(__poll_state_synchronize_rcu(p->parent->srcu, seq))) { + kvfree(--ptr); + kfree(head); + spin_lock_irqsave(&p->lock, *flags); + return false; + } + } + } + + head->func = ptr; + } +again: + for (struct rcu_pending_list *i = p->lists; + i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) { + if (i->seq == seq) { + rcu_pending_list_add(i, head); + return false; + } + 
} + + for (struct rcu_pending_list *i = p->lists; + i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) { + if (!i->head) { + i->seq = seq; + rcu_pending_list_add(i, head); + return true; + } + } + + merge_expired_lists(p); + goto again; +} + +/* + * __rcu_pending_enqueue: enqueue a pending RCU item, to be processed (via + * pending->pracess) once grace period elapses. + * + * Attempt to enqueue items onto a radix tree; if memory allocation fails, fall + * back to a linked list. + * + * - If @ptr is NULL, we're enqueuing an item for a generic @pending with a + * process callback + * + * - If @ptr and @head are both not NULL, we're kvfree_rcu() + * + * - If @ptr is not NULL and @head is, we're kvfree_rcu_mightsleep() + * + * - If @may_sleep is true, will do GFP_KERNEL memory allocations and process + * expired items. + */ +static __always_inline void +__rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, + void *ptr, bool may_sleep) +{ + + struct rcu_pending_pcpu *p; + struct rcu_pending_seq *objs; + struct genradix_node *new_node = NULL; + unsigned long seq, flags; + bool start_gp = false; + + BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN)); + + local_irq_save(flags); + p = this_cpu_ptr(pending->p); + spin_lock(&p->lock); + seq = __get_state_synchronize_rcu(pending->srcu); +restart: + if (may_sleep && + unlikely(process_finished_items(pending, p, flags))) + goto check_expired; + + /* + * In kvfree_rcu() mode, the radix tree is only for slab pointers so + * that we can do kfree_bulk() - vmalloc pointers always use the linked + * list: + */ + if (ptr && unlikely(is_vmalloc_addr(ptr))) + goto list_add; + + objs = get_object_radix(p, seq); + if (unlikely(!objs)) + goto list_add; + + if (unlikely(!objs->cursor)) { + /* + * New radix tree nodes must be added under @p->lock because the + * tree root is in a darray that can be resized (typically, + * genradix supports concurrent unlocked allocation of new + * nodes) - hence preallocation and the retry loop: + */ + objs->cursor = genradix_ptr_alloc_preallocated_inlined(&objs->objs, + objs->nr, &new_node, GFP_ATOMIC|__GFP_NOWARN); + if (unlikely(!objs->cursor)) { + if (may_sleep) { + spin_unlock_irqrestore(&p->lock, flags); + + gfp_t gfp = GFP_KERNEL; + if (!head) + gfp |= __GFP_NOFAIL; + + new_node = genradix_alloc_node(gfp); + if (!new_node) + may_sleep = false; + goto check_expired; + } +list_add: + start_gp = rcu_pending_enqueue_list(p, seq, head, ptr, &flags); + goto start_gp; + } + } + + *objs->cursor++ = ptr ?: head; + /* zero cursor if we hit the end of a radix tree node: */ + if (!(((ulong) objs->cursor) & (GENRADIX_NODE_SIZE - 1))) + objs->cursor = NULL; + start_gp = !objs->nr; + objs->nr++; +start_gp: + if (unlikely(start_gp)) { + /* + * We only have one callback (ideally, we would have one for + * every outstanding graceperiod) - so if our callback is + * already in flight, we may still have to start a grace period + * (since we used get_state() above, not start_poll()) + */ + if (!p->cb_armed) { + p->cb_armed = true; + __call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb); + } else { + __start_poll_synchronize_rcu(pending->srcu); + } + } + spin_unlock_irqrestore(&p->lock, flags); +free_node: + if (new_node) + genradix_free_node(new_node); + return; +check_expired: + if (unlikely(__poll_state_synchronize_rcu(pending->srcu, seq))) { + switch ((ulong) pending->process) { + case RCU_PENDING_KVFREE: + kvfree(ptr); + break; + case RCU_PENDING_CALL_RCU: + head->func(head); + break; + default: + 
pending->process(pending, head); + break; + } + goto free_node; + } + + local_irq_save(flags); + p = this_cpu_ptr(pending->p); + spin_lock(&p->lock); + goto restart; +} + +void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj) +{ + __rcu_pending_enqueue(pending, obj, NULL, true); +} + +static struct rcu_head *rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu *p) +{ + struct rcu_head *ret = NULL; + + spin_lock_irq(&p->lock); + darray_for_each(p->objs, objs) + if (objs->nr) { + ret = *genradix_ptr(&objs->objs, --objs->nr); + objs->cursor = NULL; + if (!objs->nr) + genradix_free(&objs->objs); + goto out; + } + + static_array_for_each(p->lists, i) + if (i->head) { + ret = i->head; + i->head = ret->next; + if (!i->head) + i->tail = NULL; + goto out; + } +out: + spin_unlock_irq(&p->lock); + + return ret; +} + +struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending) +{ + return rcu_pending_pcpu_dequeue(raw_cpu_ptr(pending->p)); +} + +struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending) +{ + struct rcu_head *ret = rcu_pending_dequeue(pending); + + if (ret) + return ret; + + int cpu; + for_each_possible_cpu(cpu) { + ret = rcu_pending_pcpu_dequeue(per_cpu_ptr(pending->p, cpu)); + if (ret) + break; + } + return ret; +} + +static bool rcu_pending_has_pending_or_armed(struct rcu_pending *pending) +{ + int cpu; + for_each_possible_cpu(cpu) { + struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); + spin_lock_irq(&p->lock); + if (__rcu_pending_has_pending(p) || p->cb_armed) { + spin_unlock_irq(&p->lock); + return true; + } + spin_unlock_irq(&p->lock); + } + + return false; +} + +void rcu_pending_exit(struct rcu_pending *pending) +{ + int cpu; + + if (!pending->p) + return; + + while (rcu_pending_has_pending_or_armed(pending)) { + __rcu_barrier(pending->srcu); + + for_each_possible_cpu(cpu) { + struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); + flush_work(&p->work); + } + } + + for_each_possible_cpu(cpu) { + struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); + flush_work(&p->work); + } + + for_each_possible_cpu(cpu) { + struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); + + static_array_for_each(p->lists, i) + WARN_ON(i->head); + WARN_ON(p->objs.nr); + darray_exit(&p->objs); + } + free_percpu(pending->p); +} + +/** + * rcu_pending_init: - initialize a rcu_pending + * + * @pending: Object to init + * @srcu: May optionally be used with an srcu_struct; if NULL, uses normal + * RCU flavor + * @process: Callback function invoked on objects once their RCU barriers + * have completed; if NULL, kvfree() is used. 
+ */ +int rcu_pending_init(struct rcu_pending *pending, + struct srcu_struct *srcu, + rcu_pending_process_fn process) +{ + pending->p = alloc_percpu(struct rcu_pending_pcpu); + if (!pending->p) + return -ENOMEM; + + int cpu; + for_each_possible_cpu(cpu) { + struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); + p->parent = pending; + p->cpu = cpu; + spin_lock_init(&p->lock); + darray_init(&p->objs); + INIT_WORK(&p->work, rcu_pending_work); + } + + pending->srcu = srcu; + pending->process = process; + + return 0; +} diff --git a/fs/bcachefs/rcu_pending.h b/fs/bcachefs/rcu_pending.h new file mode 100644 index 000000000000..a875c640da8d --- /dev/null +++ b/fs/bcachefs/rcu_pending.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_RCU_PENDING_H +#define _LINUX_RCU_PENDING_H + +struct rcu_pending; +typedef void (*rcu_pending_process_fn)(struct rcu_pending *, struct rcu_head *); + +struct rcu_pending_pcpu; + +struct rcu_pending { + struct rcu_pending_pcpu __percpu *p; + struct srcu_struct *srcu; + rcu_pending_process_fn process; +}; + +void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj); +struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending); +struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending); + +void rcu_pending_exit(struct rcu_pending *pending); +int rcu_pending_init(struct rcu_pending *pending, + struct srcu_struct *srcu, + rcu_pending_process_fn process); + +#endif /* _LINUX_RCU_PENDING_H */ From c011161205a927085231851e946635b0d1f8b7eb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Jun 2024 22:32:40 -0400 Subject: [PATCH 10/61] bcachefs: Rip out freelists from btree key cache Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 369 +++++----------------------- fs/bcachefs/btree_key_cache_types.h | 14 -- fs/bcachefs/btree_types.h | 4 +- 3 files changed, 57 insertions(+), 330 deletions(-) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index fda7998734cb..dfaeb0810c5e 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -79,14 +79,24 @@ static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) return true; } -static void bkey_cached_evict(struct btree_key_cache *c, +static bool bkey_cached_evict(struct btree_key_cache *c, struct bkey_cached *ck) { - BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash, - bch2_btree_key_cache_params)); - memset(&ck->key, ~0, sizeof(ck->key)); + bool ret = !rhashtable_remove_fast(&c->table, &ck->hash, + bch2_btree_key_cache_params); + if (ret) { + memset(&ck->key, ~0, sizeof(ck->key)); + atomic_long_dec(&c->nr_keys); + } - atomic_long_dec(&c->nr_keys); + return ret; +} + +static void __bkey_cached_free(struct rcu_head *rcu) +{ + struct bkey_cached *ck = container_of(rcu, struct bkey_cached, rcu); + + kmem_cache_free(bch2_key_cache, ck); } static void bkey_cached_free(struct btree_key_cache *bc, @@ -94,115 +104,14 @@ static void bkey_cached_free(struct btree_key_cache *bc, { struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); - BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); - - ck->btree_trans_barrier_seq = - start_poll_synchronize_srcu(&c->btree_trans_barrier); - - if (ck->c.lock.readers) { - list_move_tail(&ck->list, &bc->freed_pcpu); - bc->nr_freed_pcpu++; - } else { - list_move_tail(&ck->list, &bc->freed_nonpcpu); - bc->nr_freed_nonpcpu++; - } - atomic_long_inc(&bc->nr_freed); - kfree(ck->k); ck->k = NULL; ck->u64s = 0; six_unlock_write(&ck->c.lock); 
six_unlock_intent(&ck->c.lock); -} -#ifdef __KERNEL__ -static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc, - struct bkey_cached *ck) -{ - struct bkey_cached *pos; - - bc->nr_freed_nonpcpu++; - - list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) { - if (ULONG_CMP_GE(ck->btree_trans_barrier_seq, - pos->btree_trans_barrier_seq)) { - list_move(&ck->list, &pos->list); - return; - } - } - - list_move(&ck->list, &bc->freed_nonpcpu); -} -#endif - -static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, - struct bkey_cached *ck) -{ - BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); - - if (!ck->c.lock.readers) { -#ifdef __KERNEL__ - struct btree_key_cache_freelist *f; - bool freed = false; - - preempt_disable(); - f = this_cpu_ptr(bc->pcpu_freed); - - if (f->nr < ARRAY_SIZE(f->objs)) { - f->objs[f->nr++] = ck; - freed = true; - } - preempt_enable(); - - if (!freed) { - mutex_lock(&bc->lock); - preempt_disable(); - f = this_cpu_ptr(bc->pcpu_freed); - - while (f->nr > ARRAY_SIZE(f->objs) / 2) { - struct bkey_cached *ck2 = f->objs[--f->nr]; - - __bkey_cached_move_to_freelist_ordered(bc, ck2); - } - preempt_enable(); - - __bkey_cached_move_to_freelist_ordered(bc, ck); - mutex_unlock(&bc->lock); - } -#else - mutex_lock(&bc->lock); - list_move_tail(&ck->list, &bc->freed_nonpcpu); - bc->nr_freed_nonpcpu++; - mutex_unlock(&bc->lock); -#endif - } else { - mutex_lock(&bc->lock); - list_move_tail(&ck->list, &bc->freed_pcpu); - bc->nr_freed_pcpu++; - mutex_unlock(&bc->lock); - } -} - -static void bkey_cached_free_fast(struct btree_key_cache *bc, - struct bkey_cached *ck) -{ - struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); - - ck->btree_trans_barrier_seq = - start_poll_synchronize_srcu(&c->btree_trans_barrier); - - list_del_init(&ck->list); - atomic_long_inc(&bc->nr_freed); - - kfree(ck->k); - ck->k = NULL; - ck->u64s = 0; - - bkey_cached_move_to_freelist(bc, ck); - - six_unlock_write(&ck->c.lock); - six_unlock_intent(&ck->c.lock); + call_srcu(&c->btree_trans_barrier, &ck->rcu, __bkey_cached_free); } static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp) @@ -222,78 +131,10 @@ static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp) static struct bkey_cached * bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned key_u64s) { - struct bch_fs *c = trans->c; - struct btree_key_cache *bc = &c->btree_key_cache; - struct bkey_cached *ck = NULL; bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id); int ret; - if (!pcpu_readers) { -#ifdef __KERNEL__ - struct btree_key_cache_freelist *f; - - preempt_disable(); - f = this_cpu_ptr(bc->pcpu_freed); - if (f->nr) - ck = f->objs[--f->nr]; - preempt_enable(); - - if (!ck) { - mutex_lock(&bc->lock); - preempt_disable(); - f = this_cpu_ptr(bc->pcpu_freed); - - while (!list_empty(&bc->freed_nonpcpu) && - f->nr < ARRAY_SIZE(f->objs) / 2) { - ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); - list_del_init(&ck->list); - bc->nr_freed_nonpcpu--; - f->objs[f->nr++] = ck; - } - - ck = f->nr ? 
f->objs[--f->nr] : NULL; - preempt_enable(); - mutex_unlock(&bc->lock); - } -#else - mutex_lock(&bc->lock); - if (!list_empty(&bc->freed_nonpcpu)) { - ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); - list_del_init(&ck->list); - bc->nr_freed_nonpcpu--; - } - mutex_unlock(&bc->lock); -#endif - } else { - mutex_lock(&bc->lock); - if (!list_empty(&bc->freed_pcpu)) { - ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list); - list_del_init(&ck->list); - bc->nr_freed_pcpu--; - } - mutex_unlock(&bc->lock); - } - - if (ck) { - ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_); - if (unlikely(ret)) { - bkey_cached_move_to_freelist(bc, ck); - return ERR_PTR(ret); - } - - btree_path_cached_set(trans, path, ck, BTREE_NODE_INTENT_LOCKED); - - ret = bch2_btree_node_lock_write(trans, path, &ck->c); - if (unlikely(ret)) { - btree_node_unlock(trans, path, 0); - bkey_cached_move_to_freelist(bc, ck); - return ERR_PTR(ret); - } - - return ck; - } - - ck = allocate_dropping_locks(trans, ret, + struct bkey_cached *ck = allocate_dropping_locks(trans, ret, __bkey_cached_alloc(key_u64s, _gfp)); if (ret) { if (ck) @@ -305,7 +146,6 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k if (!ck) return NULL; - INIT_LIST_HEAD(&ck->list); bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0); ck->c.cached = true; @@ -322,21 +162,21 @@ bkey_cached_reuse(struct btree_key_cache *c) struct bkey_cached *ck; unsigned i; - mutex_lock(&c->lock); rcu_read_lock(); tbl = rht_dereference_rcu(c->table.tbl, &c->table); for (i = 0; i < tbl->size; i++) rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && bkey_cached_lock_for_evict(ck)) { - bkey_cached_evict(c, ck); - goto out; + if (bkey_cached_evict(c, ck)) + goto out; + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); } } ck = NULL; out: rcu_read_unlock(); - mutex_unlock(&c->lock); return ck; } @@ -415,7 +255,7 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path * path->uptodate = BTREE_ITER_UPTODATE; return 0; err: - bkey_cached_free_fast(bc, ck); + bkey_cached_free(bc, ck); mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); return ret; @@ -611,8 +451,12 @@ evict: } mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); - bkey_cached_evict(&c->btree_key_cache, ck); - bkey_cached_free_fast(&c->btree_key_cache, ck); + if (bkey_cached_evict(&c->btree_key_cache, ck)) { + bkey_cached_free(&c->btree_key_cache, ck); + } else { + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); + } } out: bch2_trans_iter_exit(trans, &b_iter); @@ -722,7 +566,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, } bkey_cached_evict(bc, ck); - bkey_cached_free_fast(bc, ck); + bkey_cached_free(bc, ck); mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); @@ -735,48 +579,14 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, struct bch_fs *c = shrink->private_data; struct btree_key_cache *bc = &c->btree_key_cache; struct bucket_table *tbl; - struct bkey_cached *ck, *t; + struct bkey_cached *ck; size_t scanned = 0, freed = 0, nr = sc->nr_to_scan; - unsigned start, flags; + unsigned iter, start; int srcu_idx; - mutex_lock(&bc->lock); - bc->requested_to_free += sc->nr_to_scan; - srcu_idx = srcu_read_lock(&c->btree_trans_barrier); - flags = memalloc_nofs_save(); - - /* - * Newest freed entries are at the end 
of the list - once we hit one - * that's too new to be freed, we can bail out: - */ - list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) { - if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, - ck->btree_trans_barrier_seq)) - break; - - list_del(&ck->list); - six_lock_exit(&ck->c.lock); - kmem_cache_free(bch2_key_cache, ck); - atomic_long_dec(&bc->nr_freed); - bc->nr_freed_nonpcpu--; - bc->freed++; - } - - list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) { - if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, - ck->btree_trans_barrier_seq)) - break; - - list_del(&ck->list); - six_lock_exit(&ck->c.lock); - kmem_cache_free(bch2_key_cache, ck); - atomic_long_dec(&bc->nr_freed); - bc->nr_freed_pcpu--; - bc->freed++; - } - rcu_read_lock(); + tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); /* @@ -792,17 +602,18 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, return SHRINK_STOP; } - if (bc->shrink_iter >= tbl->size) - bc->shrink_iter = 0; - start = bc->shrink_iter; + iter = bc->shrink_iter; + if (iter >= tbl->size) + iter = 0; + start = iter; do { struct rhash_head *pos, *next; - pos = rht_ptr_rcu(&tbl->buckets[bc->shrink_iter]); + pos = rht_ptr_rcu(&tbl->buckets[iter]); while (!rht_is_a_nulls(pos)) { - next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter); + next = rht_dereference_bucket_rcu(pos->next, tbl, iter); ck = container_of(pos, struct bkey_cached, hash); if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { @@ -812,29 +623,31 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, bc->skipped_accessed++; } else if (!bkey_cached_lock_for_evict(ck)) { bc->skipped_lock_fail++; - } else { - bkey_cached_evict(bc, ck); + } else if (bkey_cached_evict(bc, ck)) { bkey_cached_free(bc, ck); - bc->moved_to_freelist++; + bc->freed++; freed++; + } else { + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); } scanned++; if (scanned >= nr) - break; + goto out; pos = next; } - bc->shrink_iter++; - if (bc->shrink_iter >= tbl->size) - bc->shrink_iter = 0; - } while (scanned < nr && bc->shrink_iter != start); + iter++; + if (iter >= tbl->size) + iter = 0; + } while (scanned < nr && iter != start); +out: + bc->shrink_iter = iter; rcu_read_unlock(); - memalloc_nofs_restore(flags); srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); - mutex_unlock(&bc->lock); return freed; } @@ -862,18 +675,13 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) { struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); struct bucket_table *tbl; - struct bkey_cached *ck, *n; + struct bkey_cached *ck; struct rhash_head *pos; LIST_HEAD(items); unsigned i; -#ifdef __KERNEL__ - int cpu; -#endif shrinker_free(bc->shrink); - mutex_lock(&bc->lock); - /* * The loop is needed to guard against racing with rehash: */ @@ -892,44 +700,14 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) for (i = 0; i < tbl->size; i++) while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) { ck = container_of(pos, struct bkey_cached, hash); - bkey_cached_evict(bc, ck); - list_add(&ck->list, &items); + BUG_ON(!bkey_cached_evict(bc, ck)); + kfree(ck->k); + kmem_cache_free(bch2_key_cache, ck); } } rcu_read_unlock(); } -#ifdef __KERNEL__ - if (bc->pcpu_freed) { - for_each_possible_cpu(cpu) { - struct btree_key_cache_freelist *f = - per_cpu_ptr(bc->pcpu_freed, cpu); - - for (i = 0; i < f->nr; i++) { - ck = f->objs[i]; - list_add(&ck->list, &items); - } - } - } -#endif - - BUG_ON(list_count_nodes(&bc->freed_pcpu) != 
bc->nr_freed_pcpu); - BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu); - - list_splice(&bc->freed_pcpu, &items); - list_splice(&bc->freed_nonpcpu, &items); - - mutex_unlock(&bc->lock); - - list_for_each_entry_safe(ck, n, &items, list) { - cond_resched(); - - list_del(&ck->list); - kfree(ck->k); - six_lock_exit(&ck->c.lock); - kmem_cache_free(bch2_key_cache, ck); - } - if (atomic_long_read(&bc->nr_dirty) && !bch2_journal_error(&c->journal) && test_bit(BCH_FS_was_rw, &c->flags)) @@ -942,15 +720,10 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) if (bc->table_init_done) rhashtable_destroy(&bc->table); - - free_percpu(bc->pcpu_freed); } void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) { - mutex_init(&c->lock); - INIT_LIST_HEAD(&c->freed_pcpu); - INIT_LIST_HEAD(&c->freed_nonpcpu); } int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) @@ -958,12 +731,6 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); struct shrinker *shrink; -#ifdef __KERNEL__ - bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist); - if (!bc->pcpu_freed) - return -BCH_ERR_ENOMEM_fs_btree_cache_init; -#endif - if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params)) return -BCH_ERR_ENOMEM_fs_btree_cache_init; @@ -984,45 +751,19 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc) { - struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); - printbuf_tabstop_push(out, 24); printbuf_tabstop_push(out, 12); - unsigned flags = memalloc_nofs_save(); - mutex_lock(&bc->lock); prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys)); prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty)); - prt_printf(out, "freelist:\t%lu\r\n", atomic_long_read(&bc->nr_freed)); - prt_printf(out, "nonpcpu freelist:\t%zu\r\n", bc->nr_freed_nonpcpu); - prt_printf(out, "pcpu freelist:\t%zu\r\n", bc->nr_freed_pcpu); + prt_printf(out, "table size:\t%u\r\n", bc->table.tbl->size); prt_printf(out, "\nshrinker:\n"); prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free); prt_printf(out, "freed:\t%lu\r\n", bc->freed); - prt_printf(out, "moved_to_freelist:\t%lu\r\n", bc->moved_to_freelist); prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty); prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed); prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail); - - prt_printf(out, "srcu seq:\t%lu\r\n", get_state_synchronize_srcu(&c->btree_trans_barrier)); - - struct bkey_cached *ck; - unsigned iter = 0; - list_for_each_entry(ck, &bc->freed_nonpcpu, list) { - prt_printf(out, "freed_nonpcpu:\t%lu\r\n", ck->btree_trans_barrier_seq); - if (++iter > 10) - break; - } - - iter = 0; - list_for_each_entry(ck, &bc->freed_pcpu, list) { - prt_printf(out, "freed_pcpu:\t%lu\r\n", ck->btree_trans_barrier_seq); - if (++iter > 10) - break; - } - mutex_unlock(&bc->lock); - memalloc_flags_restore(flags); } void bch2_btree_key_cache_exit(void) diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h index 237e8bb3ac40..e026c65f54e1 100644 --- a/fs/bcachefs/btree_key_cache_types.h +++ b/fs/bcachefs/btree_key_cache_types.h @@ -2,33 +2,19 @@ #ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H #define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H -struct btree_key_cache_freelist { - struct bkey_cached *objs[16]; - unsigned nr; -}; - struct 
btree_key_cache { - struct mutex lock; struct rhashtable table; bool table_init_done; - struct list_head freed_pcpu; - size_t nr_freed_pcpu; - struct list_head freed_nonpcpu; - size_t nr_freed_nonpcpu; - struct shrinker *shrink; unsigned shrink_iter; - struct btree_key_cache_freelist __percpu *pcpu_freed; - atomic_long_t nr_freed; atomic_long_t nr_keys; atomic_long_t nr_dirty; /* shrinker stats */ unsigned long requested_to_free; unsigned long freed; - unsigned long moved_to_freelist; unsigned long skipped_dirty; unsigned long skipped_accessed; unsigned long skipped_lock_fail; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index b256b2a20a4f..bca56b6359e7 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -386,17 +386,17 @@ struct bkey_cached { struct btree_bkey_cached_common c; unsigned long flags; - unsigned long btree_trans_barrier_seq; u16 u64s; struct bkey_cached_key key; struct rhash_head hash; - struct list_head list; struct journal_entry_pin journal; u64 seq; struct bkey_i *k; + + struct rcu_head rcu; }; static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b) From 03ad7db3340c15857037e7ddd2c1b87b64ff059a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 13 Jun 2024 15:35:47 -0400 Subject: [PATCH 11/61] bcachefs: key cache can now allocate from pending btree_trans objects can hold the btree_trans_barrier srcu read lock for an extended amount of time (they shouldn't, but it's difficult to guarantee). the srcu barrier blocks memory reclaim, so to avoid too many stranded key cache items, this uses the new pending_rcu_items to allocate from pending items - like we did before, but now without a global lock on the key cache. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 58 ++++++++++++++++++++++------- fs/bcachefs/btree_key_cache_types.h | 6 +++ fs/bcachefs/btree_types.h | 1 - 3 files changed, 50 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index dfaeb0810c5e..3048adde9284 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -92,18 +92,18 @@ static bool bkey_cached_evict(struct btree_key_cache *c, return ret; } -static void __bkey_cached_free(struct rcu_head *rcu) +static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu) { + struct bch_fs *c = container_of(pending->srcu, struct bch_fs, btree_trans_barrier); struct bkey_cached *ck = container_of(rcu, struct bkey_cached, rcu); + this_cpu_dec(*c->btree_key_cache.nr_pending); kmem_cache_free(bch2_key_cache, ck); } static void bkey_cached_free(struct btree_key_cache *bc, struct bkey_cached *ck) { - struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); - kfree(ck->k); ck->k = NULL; ck->u64s = 0; @@ -111,7 +111,9 @@ static void bkey_cached_free(struct btree_key_cache *bc, six_unlock_write(&ck->c.lock); six_unlock_intent(&ck->c.lock); - call_srcu(&c->btree_trans_barrier, &ck->rcu, __bkey_cached_free); + bool pcpu_readers = ck->c.lock.readers != NULL; + rcu_pending_enqueue(&bc->pending[pcpu_readers], &ck->rcu); + this_cpu_inc(*bc->nr_pending); } static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp) @@ -131,10 +133,18 @@ static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp) static struct bkey_cached * bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned key_u64s) { + struct bch_fs *c = trans->c; + struct btree_key_cache *bc = &c->btree_key_cache; bool 
pcpu_readers = btree_uses_pcpu_readers(path->btree_id); int ret; - struct bkey_cached *ck = allocate_dropping_locks(trans, ret, + struct bkey_cached *ck = container_of_or_null( + rcu_pending_dequeue(&bc->pending[pcpu_readers]), + struct bkey_cached, rcu); + if (ck) + goto lock; + + ck = allocate_dropping_locks(trans, ret, __bkey_cached_alloc(key_u64s, _gfp)); if (ret) { if (ck) @@ -143,14 +153,19 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k return ERR_PTR(ret); } - if (!ck) - return NULL; + if (ck) { + bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0); + ck->c.cached = true; + goto lock; + } - bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0); - - ck->c.cached = true; - BUG_ON(!six_trylock_intent(&ck->c.lock)); - BUG_ON(!six_trylock_write(&ck->c.lock)); + ck = container_of_or_null(rcu_pending_dequeue_from_all(&bc->pending[pcpu_readers]), + struct bkey_cached, rcu); + if (ck) + goto lock; +lock: + six_lock_intent(&ck->c.lock, NULL, NULL); + six_lock_write(&ck->c.lock, NULL, NULL); return ck; } @@ -720,6 +735,11 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) if (bc->table_init_done) rhashtable_destroy(&bc->table); + + rcu_pending_exit(&bc->pending[0]); + rcu_pending_exit(&bc->pending[1]); + + free_percpu(bc->nr_pending); } void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) @@ -731,6 +751,14 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); struct shrinker *shrink; + bc->nr_pending = alloc_percpu(size_t); + if (!bc->nr_pending) + return -BCH_ERR_ENOMEM_fs_btree_cache_init; + + if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) || + rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free)) + return -BCH_ERR_ENOMEM_fs_btree_cache_init; + if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params)) return -BCH_ERR_ENOMEM_fs_btree_cache_init; @@ -757,13 +785,15 @@ void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache * prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys)); prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty)); prt_printf(out, "table size:\t%u\r\n", bc->table.tbl->size); - - prt_printf(out, "\nshrinker:\n"); + prt_newline(out); + prt_printf(out, "shrinker:\n"); prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free); prt_printf(out, "freed:\t%lu\r\n", bc->freed); prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty); prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed); prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail); + prt_newline(out); + prt_printf(out, "pending:\t%lu\r\n", per_cpu_sum(bc->nr_pending)); } void bch2_btree_key_cache_exit(void) diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h index e026c65f54e1..722f1ed10551 100644 --- a/fs/bcachefs/btree_key_cache_types.h +++ b/fs/bcachefs/btree_key_cache_types.h @@ -2,6 +2,8 @@ #ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H #define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H +#include "rcu_pending.h" + struct btree_key_cache { struct rhashtable table; bool table_init_done; @@ -9,6 +11,10 @@ struct btree_key_cache { struct shrinker *shrink; unsigned shrink_iter; + /* 0: non pcpu reader locks, 1: pcpu reader locks */ + struct rcu_pending pending[2]; + size_t __percpu *nr_pending; + atomic_long_t nr_keys; atomic_long_t nr_dirty; diff --git 
a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index bca56b6359e7..0df07929c545 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -395,7 +395,6 @@ struct bkey_cached { u64 seq; struct bkey_i *k; - struct rcu_head rcu; }; From d6931058c0f85a1861d4151758a786dd06d103bb Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 21 Aug 2024 13:43:03 -0700 Subject: [PATCH 12/61] bcachefs: Fix format specifier in bch2_btree_key_cache_to_text() When building for a 32-bit architecture, for which 'size_t' is 'unsigned int', there is a compiler warning due to use of '%lu': In file included from fs/bcachefs/vstructs.h:5, from fs/bcachefs/bcachefs_format.h:80, from fs/bcachefs/bcachefs.h:207, from fs/bcachefs/btree_key_cache.c:3: fs/bcachefs/btree_key_cache.c: In function 'bch2_btree_key_cache_to_text': fs/bcachefs/btree_key_cache.c:795:25: error: format '%lu' expects argument of type 'long unsigned int', but argument 3 has type 'size_t' {aka 'unsigned int'} [-Werror=format=] 795 | prt_printf(out, "pending:\t%lu\r\n", per_cpu_sum(bc->nr_pending)); | ^~~~~~~~~~~~~~~~~~~ fs/bcachefs/util.h:78:63: note: in definition of macro 'prt_printf' 78 | #define prt_printf(_out, ...) bch2_prt_printf(_out, __VA_ARGS__) | ^~~~~~~~~~~ fs/bcachefs/btree_key_cache.c:795:38: note: format string is defined here 795 | prt_printf(out, "pending:\t%lu\r\n", per_cpu_sum(bc->nr_pending)); | ~~^ | | | long unsigned int | %u cc1: all warnings being treated as errors Use the proper specifier, '%zu', to resolve the warning. Fixes: e447e49977b8 ("bcachefs: key cache can now allocate from pending") Signed-off-by: Nathan Chancellor Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 3048adde9284..2e49ca71194f 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -793,7 +793,7 @@ void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache * prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed); prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail); prt_newline(out); - prt_printf(out, "pending:\t%lu\r\n", per_cpu_sum(bc->nr_pending)); + prt_printf(out, "pending:\t%zu\r\n", per_cpu_sum(bc->nr_pending)); } void bch2_btree_key_cache_exit(void) From cea5247fae469b0089b8c3619c6fa96794ed5305 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 21 Aug 2024 18:29:22 +0200 Subject: [PATCH 13/61] bcachefs: Annotate struct bucket_array with __counted_by() Add the __counted_by compiler attribute to the flexible array member bucket to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE. 
Signed-off-by: Thorsten Blum Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index c9698cdf866f..a19460a1bb7c 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -24,7 +24,7 @@ struct bucket_array { u16 first_bucket; size_t nbuckets; size_t nbuckets_minus_first; - struct bucket b[]; + struct bucket b[] __counted_by(nbuckets); }; struct bucket_gens { From b6da5c9048ee5195712ab1bd7efa00260c4f83a5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Jul 2024 16:53:49 -0400 Subject: [PATCH 14/61] bcachefs: data_allowed is now an opts.h option need this so cmd_option in userspace can handle it Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.c | 9 +++++++++ fs/bcachefs/opts.h | 6 ++++++ fs/bcachefs/util.c | 2 +- fs/bcachefs/util.h | 2 +- 4 files changed, 17 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index e10fc1da71b1..8e2fd064b50f 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -230,6 +230,8 @@ const struct bch_option bch2_opt_table[] = { #define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \ .min = 0, .max = U64_MAX, \ .choices = _choices +#define OPT_BITFIELD(_choices) .type = BCH_OPT_BITFIELD, \ + .choices = _choices #define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn #define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ @@ -376,6 +378,13 @@ int bch2_opt_parse(struct bch_fs *c, *res = ret; break; + case BCH_OPT_BITFIELD: { + s64 v = bch2_read_flag_list(val, opt->choices); + if (v < 0) + return v; + *res = v; + break; + } case BCH_OPT_FN: ret = opt->fn.parse(c, val, res, err); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index cda1725702ea..1c7b68b25bb7 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -70,6 +70,7 @@ enum opt_type { BCH_OPT_BOOL, BCH_OPT_UINT, BCH_OPT_STR, + BCH_OPT_BITFIELD, BCH_OPT_FN, }; @@ -477,6 +478,11 @@ enum fsck_err_opts { BCH2_NO_SB_OPT, 1, \ "n", "Data written to this device will be considered\n"\ "to have already been replicated n times") \ + x(data_allowed, u8, \ + OPT_DEVICE, \ + OPT_BITFIELD(__bch2_data_types), \ + BCH2_NO_SB_OPT, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\ + "types", "Allowed data types for this device: journal, btree, and/or user")\ x(btree_node_prefetch, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 1b8554460af4..2acdfa783b5b 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -204,7 +204,7 @@ STRTO_H(strtoll, long long) STRTO_H(strtoull, unsigned long long) STRTO_H(strtou64, u64) -u64 bch2_read_flag_list(char *opt, const char * const list[]) +u64 bch2_read_flag_list(const char *opt, const char * const list[]) { u64 ret = 0; char *p, *s, *d = kstrdup(opt, GFP_KERNEL); diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 902b7f5406a2..fb02c1c36004 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -195,7 +195,7 @@ static inline int bch2_strtoul_h(const char *cp, long *res) bool bch2_is_zero(const void *, size_t); -u64 bch2_read_flag_list(char *, const char * const[]); +u64 bch2_read_flag_list(const char *, const char * const[]); void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned); void bch2_prt_u64_base2(struct printbuf *, u64); From f44deb432db5f66b7667ee8ebed464665ddf767e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Jul 2024 19:26:46 -0400 
Subject: [PATCH 15/61] bcachefs: bch2_opt_set_sb() can now set (some) device options Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.c | 42 +++++++++++++++++++++++++++++++++--------- fs/bcachefs/opts.h | 10 ++++++++-- fs/bcachefs/sysfs.c | 11 ++--------- 3 files changed, 43 insertions(+), 20 deletions(-) diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 8e2fd064b50f..2e6e58360789 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -617,10 +617,20 @@ int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) return 0; } -void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v) +struct bch_dev_sb_opt_set { + void (*set_sb)(struct bch_member *, u64); +}; + +static const struct bch_dev_sb_opt_set bch2_dev_sb_opt_setters [] = { +#define x(n, set) [Opt_##n] = { .set_sb = SET_##set }, + BCH_DEV_OPTS() +#undef x +}; + +void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, + const struct bch_option *opt, u64 v) { - if (opt->set_sb == SET_BCH2_NO_SB_OPT) - return; + enum bch_opt_id id = opt - bch2_opt_table; if (opt->flags & OPT_SB_FIELD_SECTORS) v >>= 9; @@ -628,16 +638,30 @@ void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v) if (opt->flags & OPT_SB_FIELD_ILOG2) v = ilog2(v); - opt->set_sb(sb, v); + if (opt->flags & OPT_FS) { + if (opt->set_sb != SET_BCH2_NO_SB_OPT) + opt->set_sb(sb, v); + } + + if ((opt->flags & OPT_DEVICE) && dev_idx >= 0) { + if (WARN(!bch2_member_exists(sb, dev_idx), + "tried to set device option %s on nonexistent device %i", + opt->attr.name, dev_idx)) + return; + + struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx); + + const struct bch_dev_sb_opt_set *set = bch2_dev_sb_opt_setters + id; + if (set->set_sb) + set->set_sb(m, v); + } } -void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v) +void bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca, + const struct bch_option *opt, u64 v) { - if (opt->set_sb == SET_BCH2_NO_SB_OPT) - return; - mutex_lock(&c->sb_lock); - __bch2_opt_set_sb(c->disk_sb.sb, opt, v); + __bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v); bch2_write_super(c); mutex_unlock(&c->sb_lock); } diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 1c7b68b25bb7..32f895dfb330 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -490,6 +490,10 @@ enum fsck_err_opts { NULL, "BTREE_ITER_prefetch casuse btree nodes to be\n"\ " prefetched sequentially") +#define BCH_DEV_OPTS() \ + x(discard, BCH_MEMBER_DISCARD) \ + x(data_allowed, BCH_MEMBER_DATA_ALLOWED) + struct bch_opts { #define x(_name, _bits, ...) 
unsigned _name##_defined:1; BCH_OPTS() @@ -569,8 +573,10 @@ void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id); int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); -void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64); -void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64); +void __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64); + +struct bch_dev; +void bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64); int bch2_opt_lookup(const char *); int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index f393023a3ae2..5869331b8387 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -674,7 +674,7 @@ STORE(bch2_fs_opts_dir) if (ret < 0) goto err; - bch2_opt_set_sb(c, opt, v); + bch2_opt_set_sb(c, NULL, opt, v); bch2_opt_set_by_id(&c->opts, id, v); if (v && @@ -826,14 +826,7 @@ STORE(bch2_dev) if (attr == &sysfs_discard) { bool v = strtoul_or_return(buf); - mutex_lock(&c->sb_lock); - mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - - if (v != BCH_MEMBER_DISCARD(mi)) { - SET_BCH_MEMBER_DISCARD(mi, v); - bch2_write_super(c); - } - mutex_unlock(&c->sb_lock); + bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_discard, v); } if (attr == &sysfs_durability) { From fcc50badf0971115e36e18bb5158eceb6d888f1b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 15 Jul 2024 19:54:51 -0400 Subject: [PATCH 16/61] bcachefs: Opt_durability can now be set via bch2_opt_set_sb() Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.c | 7 ++++++- fs/bcachefs/opts.h | 28 +++++++++++++++------------- fs/bcachefs/sysfs.c | 10 +--------- 3 files changed, 22 insertions(+), 23 deletions(-) diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 2e6e58360789..0770aebef6d8 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -623,7 +623,7 @@ struct bch_dev_sb_opt_set { static const struct bch_dev_sb_opt_set bch2_dev_sb_opt_setters [] = { #define x(n, set) [Opt_##n] = { .set_sb = SET_##set }, - BCH_DEV_OPTS() + BCH_DEV_OPT_SETTERS() #undef x }; @@ -638,6 +638,9 @@ void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, if (opt->flags & OPT_SB_FIELD_ILOG2) v = ilog2(v); + if (opt->flags & OPT_SB_FIELD_ONE_BIAS) + v++; + if (opt->flags & OPT_FS) { if (opt->set_sb != SET_BCH2_NO_SB_OPT) opt->set_sb(sb, v); @@ -654,6 +657,8 @@ void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, const struct bch_dev_sb_opt_set *set = bch2_dev_sb_opt_setters + id; if (set->set_sb) set->set_sb(m, v); + else + pr_err("option %s cannot be set via opt_set_sb()", opt->attr.name); } } diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 32f895dfb330..0138717e9f77 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -53,17 +53,18 @@ void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64); /* When can be set: */ enum opt_flags { - OPT_FS = (1 << 0), /* Filesystem option */ - OPT_DEVICE = (1 << 1), /* Device option */ - OPT_INODE = (1 << 2), /* Inode option */ - OPT_FORMAT = (1 << 3), /* May be specified at format time */ - OPT_MOUNT = (1 << 4), /* May be specified at mount time */ - OPT_RUNTIME = (1 << 5), /* May be specified at runtime */ - OPT_HUMAN_READABLE = (1 << 6), - OPT_MUST_BE_POW_2 = (1 << 7), /* Must be power of 2 */ - OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */ - OPT_SB_FIELD_ILOG2 = (1 << 9), /* Superblock field is ilog2 of actual value */ - 
OPT_HIDDEN = (1 << 10), + OPT_FS = BIT(0), /* Filesystem option */ + OPT_DEVICE = BIT(1), /* Device option */ + OPT_INODE = BIT(2), /* Inode option */ + OPT_FORMAT = BIT(3), /* May be specified at format time */ + OPT_MOUNT = BIT(4), /* May be specified at mount time */ + OPT_RUNTIME = BIT(5), /* May be specified at runtime */ + OPT_HUMAN_READABLE = BIT(6), + OPT_MUST_BE_POW_2 = BIT(7), /* Must be power of 2 */ + OPT_SB_FIELD_SECTORS = BIT(8), /* Superblock field is >> 9 of actual value */ + OPT_SB_FIELD_ILOG2 = BIT(9), /* Superblock field is ilog2 of actual value */ + OPT_SB_FIELD_ONE_BIAS = BIT(10), /* 0 means default value */ + OPT_HIDDEN = BIT(11), }; enum opt_type { @@ -473,7 +474,7 @@ enum fsck_err_opts { BCH2_NO_SB_OPT, 0, \ "size", "Size of filesystem on device") \ x(durability, u8, \ - OPT_DEVICE, \ + OPT_DEVICE|OPT_SB_FIELD_ONE_BIAS, \ OPT_UINT(0, BCH_REPLICAS_MAX), \ BCH2_NO_SB_OPT, 1, \ "n", "Data written to this device will be considered\n"\ @@ -490,8 +491,9 @@ enum fsck_err_opts { NULL, "BTREE_ITER_prefetch casuse btree nodes to be\n"\ " prefetched sequentially") -#define BCH_DEV_OPTS() \ +#define BCH_DEV_OPT_SETTERS() \ x(discard, BCH_MEMBER_DISCARD) \ + x(durability, BCH_MEMBER_DURABILITY) \ x(data_allowed, BCH_MEMBER_DATA_ALLOWED) struct bch_opts { diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 5869331b8387..ce321e2097c0 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -821,7 +821,6 @@ STORE(bch2_dev) { struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); struct bch_fs *c = ca->fs; - struct bch_member *mi; if (attr == &sysfs_discard) { bool v = strtoul_or_return(buf); @@ -832,14 +831,7 @@ STORE(bch2_dev) if (attr == &sysfs_durability) { u64 v = strtoul_or_return(buf); - mutex_lock(&c->sb_lock); - mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - - if (v + 1 != BCH_MEMBER_DURABILITY(mi)) { - SET_BCH_MEMBER_DURABILITY(mi, v + 1); - bch2_write_super(c); - } - mutex_unlock(&c->sb_lock); + bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_durability, v); } if (attr == &sysfs_label) { From d4dc0698e13d422ae64fda369f98b9ec13bd29c3 Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Tue, 16 Jul 2024 10:58:16 +0800 Subject: [PATCH 17/61] bcachefs: allocate inode by using alloc_inode_sb() The inode allocation is supposed to use alloc_inode_sb(), so convert kmem_cache_alloc() to alloc_inode_sb(). It will also fix [1] to avoid the NULL pointer dereference BUG in list_lru_add() when CONFIG_MEMCG is enabled. 
Links: [1]: https://lore.kernel.org/all/20589721-46c0-4344-b2ef-6ab48bbe2ea5@linux.dev/ [2]: https://lore.kernel.org/all/7db60e36-9c96-4938-a28d-a9745e287386@linux.dev/ Fixes: 86d81ec5f5f0 ("bcachefs: Mark bch_inode_info as SLAB_ACCOUNT") Signed-off-by: Youling Tang Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index eca0b073a1cf..61dafc544e16 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -288,7 +288,8 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c) { - struct bch_inode_info *inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS); + struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb, + bch2_inode_cache, GFP_NOFS); if (!inode) return NULL; From 5ae1bfd89997026ce99448023714c5d3d79480de Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Wed, 3 Jul 2024 15:09:55 +0800 Subject: [PATCH 18/61] bcachefs: Mark bch_inode_info as SLAB_ACCOUNT After commit 230e9fc28604 ("slab: add SLAB_ACCOUNT flag"), we need to mark the inode cache as SLAB_ACCOUNT, similar to commit 5d097056c9a0 ("kmemcg: account for certain kmem allocations to memcg") Signed-off-by: Youling Tang Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 61dafc544e16..f3bb20429271 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -2220,7 +2220,8 @@ int __init bch2_vfs_init(void) { int ret = -ENOMEM; - bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT); + bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT | + SLAB_ACCOUNT); if (!bch2_inode_cache) goto err; From c2a531b9d745fcac2e4d1010f3bf1f80ec69ff5a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 16 Jul 2024 17:23:10 -0400 Subject: [PATCH 19/61] bcachefs: Add check for btree_path ref overflow Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 24 ++++++++++++------------ fs/bcachefs/btree_iter.h | 26 ++++++++++++++++++-------- fs/bcachefs/btree_update.c | 2 +- 3 files changed, 31 insertions(+), 21 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 2e84d22e17bd..95afa35833bb 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1010,9 +1010,9 @@ retry_all: * the same position: */ if (trans->paths[idx].uptodate) { - __btree_path_get(&trans->paths[idx], false); + __btree_path_get(trans, &trans->paths[idx], false); ret = bch2_btree_path_traverse_one(trans, idx, 0, _THIS_IP_); - __btree_path_put(&trans->paths[idx], false); + __btree_path_put(trans, &trans->paths[idx], false); if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || bch2_err_matches(ret, ENOMEM)) @@ -1225,7 +1225,7 @@ static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_i { btree_path_idx_t new = btree_path_alloc(trans, src); btree_path_copy(trans, trans->paths + new, trans->paths + src); - __btree_path_get(trans->paths + new, intent); + __btree_path_get(trans, trans->paths + new, intent); #ifdef TRACK_PATH_ALLOCATED trans->paths[new].ip_allocated = ip; #endif @@ -1236,7 +1236,7 @@ __flatten btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans, btree_path_idx_t path, bool intent, unsigned long ip) { - __btree_path_put(trans->paths + path, intent); + __btree_path_put(trans, trans->paths + path, intent); path = btree_path_clone(trans, path, intent, ip); 
trans->paths[path].preserve = false; return path; @@ -1361,7 +1361,7 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in { struct btree_path *path = trans->paths + path_idx, *dup; - if (!__btree_path_put(path, intent)) + if (!__btree_path_put(trans, path, intent)) return; dup = path->preserve @@ -1392,7 +1392,7 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path, bool intent) { - if (!__btree_path_put(trans->paths + path, intent)) + if (!__btree_path_put(trans, trans->paths + path, intent)) return; __bch2_path_free(trans, path); @@ -1716,14 +1716,14 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, trans->paths[path_pos].cached == cached && trans->paths[path_pos].btree_id == btree_id && trans->paths[path_pos].level == level) { - __btree_path_get(trans->paths + path_pos, intent); + __btree_path_get(trans, trans->paths + path_pos, intent); path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip); path = trans->paths + path_idx; } else { path_idx = btree_path_alloc(trans, path_pos); path = trans->paths + path_idx; - __btree_path_get(path, intent); + __btree_path_get(trans, path, intent); path->pos = pos; path->btree_id = btree_id; path->cached = cached; @@ -2326,7 +2326,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e * advance, same as on exit for iter->path, but only up * to snapshot */ - __btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_intent); + __btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent); iter->update_path = iter->path; iter->update_path = bch2_btree_path_set_pos(trans, @@ -2911,9 +2911,9 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) dst->ip_allocated = _RET_IP_; #endif if (src->path) - __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_intent); + __btree_path_get(trans, trans->paths + src->path, src->flags & BTREE_ITER_intent); if (src->update_path) - __btree_path_get(trans->paths + src->update_path, src->flags & BTREE_ITER_intent); + __btree_path_get(trans, trans->paths + src->update_path, src->flags & BTREE_ITER_intent); dst->key_cache_path = 0; } @@ -3237,7 +3237,7 @@ void bch2_trans_put(struct btree_trans *trans) bch2_trans_unlock(trans); trans_for_each_update(trans, i) - __btree_path_put(trans->paths + i->path, true); + __btree_path_put(trans, trans->paths + i->path, true); trans->nr_updates = 0; check_btree_paths_leaked(trans); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 222b7ce8a901..928993118983 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -6,6 +6,12 @@ #include "btree_types.h" #include "trace.h" +void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); +void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t); +void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); +void bch2_dump_trans_updates(struct btree_trans *); +void bch2_dump_trans_paths_updates(struct btree_trans *); + static inline int __bkey_err(const struct bkey *k) { return PTR_ERR_OR_ZERO(k); @@ -13,16 +19,26 @@ static inline int __bkey_err(const struct bkey *k) #define bkey_err(_k) __bkey_err((_k).k) -static inline void __btree_path_get(struct btree_path *path, bool intent) +static inline void __btree_path_get(struct btree_trans *trans, struct btree_path *path, bool intent) { + unsigned idx 
= path - trans->paths; + + EBUG_ON(!test_bit(idx, trans->paths_allocated)); + if (unlikely(path->ref == U8_MAX)) { + bch2_dump_trans_paths_updates(trans); + panic("path %u refcount overflow\n", idx); + } + path->ref++; path->intent_ref += intent; } -static inline bool __btree_path_put(struct btree_path *path, bool intent) +static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) { + EBUG_ON(!test_bit(path - trans->paths, trans->paths_allocated)); EBUG_ON(!path->ref); EBUG_ON(!path->intent_ref && intent); + path->intent_ref -= intent; return --path->ref == 0; } @@ -894,12 +910,6 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, _ret; \ }) -void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); -void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t); -void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); -void bch2_dump_trans_updates(struct btree_trans *); -void bch2_dump_trans_paths_updates(struct btree_trans *); - struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned); void bch2_trans_put(struct btree_trans *); diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index d6f6df10dcc3..57610c7465b3 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -449,7 +449,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, } } - __btree_path_get(trans->paths + i->path, true); + __btree_path_get(trans, trans->paths + i->path, true); /* * If a key is present in the key cache, it must also exist in the From b7e94521e76b07c65d45a0e8e69d306143296775 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 10 Aug 2022 19:57:46 -0400 Subject: [PATCH 20/61] bcachefs: Btree path tracepoints Fastpath tracepoints, rarely needed, only enabled with CONFIG_BCACHEFS_PATH_TRACEPOINTS. Signed-off-by: Kent Overstreet --- fs/bcachefs/Kconfig | 7 + fs/bcachefs/btree_iter.c | 38 ++- fs/bcachefs/btree_iter.h | 2 + fs/bcachefs/btree_locking.h | 6 +- fs/bcachefs/btree_update.c | 10 +- fs/bcachefs/btree_update_interior.c | 2 +- fs/bcachefs/trace.h | 465 +++++++++++++++++++++++++++- 7 files changed, 508 insertions(+), 22 deletions(-) diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 5cdfef3b551a..5bac803ea367 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -87,6 +87,13 @@ config BCACHEFS_SIX_OPTIMISTIC_SPIN is held by another thread, spin for a short while, as long as the thread owning the lock is running. +config BCACHEFS_PATH_TRACEPOINTS + bool "Extra btree_path tracepoints" + depends on BCACHEFS_FS + help + Enable extra tracepoints for debugging btree_path operations; we don't + normally want these enabled because they happen at very high rates. 
+ config MEAN_AND_VARIANCE_UNIT_TEST tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 95afa35833bb..c71efbdb3f07 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1131,6 +1131,8 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, if (unlikely(!trans->srcu_held)) bch2_trans_srcu_lock(trans); + trace_btree_path_traverse_start(trans, path); + /* * Ensure we obey path->should_be_locked: if it's set, we can't unlock * and re-traverse the path without a transaction restart: @@ -1194,6 +1196,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, out_uptodate: path->uptodate = BTREE_ITER_UPTODATE; + trace_btree_path_traverse_end(trans, path); out: if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted) panic("ret %s (%i) trans->restarted %s (%i)\n", @@ -1236,8 +1239,10 @@ __flatten btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans, btree_path_idx_t path, bool intent, unsigned long ip) { + struct btree_path *old = trans->paths + path; __btree_path_put(trans, trans->paths + path, intent); path = btree_path_clone(trans, path, intent, ip); + trace_btree_path_clone(trans, old, trans->paths + path); trans->paths[path].preserve = false; return path; } @@ -1252,6 +1257,8 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, bch2_trans_verify_not_in_restart(trans); EBUG_ON(!trans->paths[path_idx].ref); + trace_btree_path_set_pos(trans, trans->paths + path_idx, &new_pos); + path_idx = bch2_btree_path_make_mut(trans, path_idx, intent, ip); struct btree_path *path = trans->paths + path_idx; @@ -1368,6 +1375,8 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in ? have_path_at_pos(trans, path) : have_node_at_pos(trans, path); + trace_btree_path_free(trans, path_idx, dup); + if (!dup && !(!path->preserve && !is_btree_node(path, path->level))) return; @@ -1421,8 +1430,8 @@ void __noreturn bch2_trans_unlocked_error(struct btree_trans *trans) noinline __cold void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) { - prt_printf(buf, "transaction updates for %s journal seq %llu\n", - trans->fn, trans->journal_res.seq); + prt_printf(buf, "%u transaction updates for %s journal seq %llu\n", + trans->nr_updates, trans->fn, trans->journal_res.seq); printbuf_indent_add(buf, 2); trans_for_each_update(trans, i) { @@ -1464,7 +1473,7 @@ static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_tra { struct btree_path *path = trans->paths + path_idx; - prt_printf(out, "path: idx %2u ref %u:%u %c %c %c btree=%s l=%u pos ", + prt_printf(out, "path: idx %3u ref %u:%u %c %c %c btree=%s l=%u pos ", path_idx, path->ref, path->intent_ref, path->preserve ? 'P' : ' ', path->should_be_locked ? 
'S' : ' ', @@ -1716,6 +1725,8 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, trans->paths[path_pos].cached == cached && trans->paths[path_pos].btree_id == btree_id && trans->paths[path_pos].level == level) { + trace_btree_path_get(trans, trans->paths + path_pos, &pos); + __btree_path_get(trans, trans->paths + path_pos, intent); path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip); path = trans->paths + path_idx; @@ -1738,6 +1749,8 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, path->ip_allocated = ip; #endif trans->paths_sorted = false; + + trace_btree_path_alloc(trans, path); } if (!(flags & BTREE_ITER_nopreserve)) @@ -1857,7 +1870,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter) struct btree_path *path = btree_iter_path(trans, iter); if (btree_path_node(path, path->level)) - btree_path_set_should_be_locked(path); + btree_path_set_should_be_locked(trans, path); return 0; } @@ -1889,7 +1902,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); - btree_path_set_should_be_locked(btree_iter_path(trans, iter)); + btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); out: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -1983,7 +1996,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); - btree_path_set_should_be_locked(btree_iter_path(trans, iter)); + btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); EBUG_ON(btree_iter_path(trans, iter)->uptodate); out: bch2_btree_iter_verify_entry_exit(iter); @@ -2155,7 +2168,7 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos if (unlikely(ret)) return bkey_s_c_err(ret); - btree_path_set_should_be_locked(trans->paths + iter->key_cache_path); + btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u); if (k.k && !bkey_err(k)) { @@ -2199,7 +2212,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp goto out; } - btree_path_set_should_be_locked(path); + btree_path_set_should_be_locked(trans, path); k = btree_path_level_peek_all(trans->c, l, &iter->k); @@ -2382,14 +2395,14 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); - btree_path_set_should_be_locked(btree_iter_path(trans, iter)); + btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); out_no_locked: if (iter->update_path) { ret = bch2_btree_path_relock(trans, trans->paths + iter->update_path, _THIS_IP_); if (unlikely(ret)) k = bkey_s_c_err(ret); else - btree_path_set_should_be_locked(trans->paths + iter->update_path); + btree_path_set_should_be_locked(trans, trans->paths + iter->update_path); } if (!(iter->flags & BTREE_ITER_all_snapshots)) @@ -2511,6 +2524,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) iter->flags & BTREE_ITER_intent, _THIS_IP_); path = btree_iter_path(trans, iter); + trace_btree_path_save_pos(trans, path, trans->paths + saved_path); saved_k = *k.k; saved_v = k.v; } @@ -2527,7 +2541,7 @@ got_key: continue; } - btree_path_set_should_be_locked(path); + btree_path_set_should_be_locked(trans, path); 
break; } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) { /* Advance to previous leaf node: */ @@ -2685,7 +2699,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } } out: - btree_path_set_should_be_locked(btree_iter_path(trans, iter)); + btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); out_no_locked: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 928993118983..172507b4dead 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -31,6 +31,7 @@ static inline void __btree_path_get(struct btree_trans *trans, struct btree_path path->ref++; path->intent_ref += intent; + trace_btree_path_get_ll(trans, path); } static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) @@ -39,6 +40,7 @@ static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path EBUG_ON(!path->ref); EBUG_ON(!path->intent_ref && intent); + trace_btree_path_put_ll(trans, path); path->intent_ref -= intent; return --path->ref == 0; } diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 11a64ead8685..dd0a2a1aa2bc 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -228,6 +228,9 @@ static inline int __btree_node_lock_nopath(struct btree_trans *trans, bch2_six_check_for_deadlock, trans, ip); WRITE_ONCE(trans->locking, NULL); WRITE_ONCE(trans->locking_wait.start_time, 0); + + if (!ret) + trace_btree_path_lock(trans, _THIS_IP_, b); return ret; } @@ -400,12 +403,13 @@ static inline int bch2_btree_path_upgrade(struct btree_trans *trans, /* misc: */ -static inline void btree_path_set_should_be_locked(struct btree_path *path) +static inline void btree_path_set_should_be_locked(struct btree_trans *trans, struct btree_path *path) { EBUG_ON(!btree_node_locked(path, path->level)); EBUG_ON(path->uptodate); path->should_be_locked = true; + trace_btree_path_should_be_locked(trans, path); } static inline void __btree_path_set_level_up(struct btree_trans *trans, diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 57610c7465b3..514df618548e 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -374,7 +374,7 @@ static noinline int flush_new_cached_update(struct btree_trans *trans, i->key_cache_already_flushed = true; i->flags |= BTREE_TRIGGER_norun; - btree_path_set_should_be_locked(btree_path); + btree_path_set_should_be_locked(trans, btree_path); ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip); out: bch2_path_put(trans, path_idx, true); @@ -422,7 +422,9 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, break; } - if (!cmp && i < trans->updates + trans->nr_updates) { + bool overwrite = !cmp && i < trans->updates + trans->nr_updates; + + if (overwrite) { EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); bch2_path_put(trans, i->path, true); @@ -451,6 +453,8 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, __btree_path_get(trans, trans->paths + i->path, true); + trace_update_by_path(trans, path, i, overwrite); + /* * If a key is present in the key cache, it must also exist in the * btree - this is necessary for cache coherency. 
When iterating over @@ -498,7 +502,7 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); } - btree_path_set_should_be_locked(trans->paths + iter->key_cache_path); + btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); } return 0; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 8fd112026e7a..396d052474bb 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1981,7 +1981,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, if (ret) goto err; - btree_path_set_should_be_locked(trans->paths + sib_path); + btree_path_set_should_be_locked(trans, trans->paths + sib_path); m = trans->paths[sib_path].l[level].b; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index c62f00322d1e..5597b9d6297f 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -3,7 +3,6 @@ #define TRACE_SYSTEM bcachefs #if !defined(_TRACE_BCACHEFS_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_BCACHEFS_H #include @@ -558,6 +557,7 @@ TRACE_EVENT(btree_path_relock_fail, __field(unsigned long, caller_ip ) __field(u8, btree_id ) __field(u8, level ) + __field(u8, path_idx) TRACE_BPOS_entries(pos) __array(char, node, 24 ) __field(u8, self_read_count ) @@ -575,7 +575,8 @@ TRACE_EVENT(btree_path_relock_fail, strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->btree_id = path->btree_id; - __entry->level = path->level; + __entry->level = level; + __entry->path_idx = path - trans->paths; TRACE_BPOS_assign(pos, path->pos); c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level); @@ -588,7 +589,7 @@ TRACE_EVENT(btree_path_relock_fail, c = six_lock_counts(&path->l[level].b->c.lock); __entry->read_count = c.n[SIX_LOCK_read]; __entry->intent_count = c.n[SIX_LOCK_intent]; - scnprintf(__entry->node, sizeof(__entry->node), "%px", b); + scnprintf(__entry->node, sizeof(__entry->node), "%px", &b->c); } __entry->iter_lock_seq = path->l[level].lock_seq; __entry->node_lock_seq = is_btree_node(path, level) @@ -596,9 +597,10 @@ TRACE_EVENT(btree_path_relock_fail, : 0; ), - TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u", + TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u", __entry->trans_fn, (void *) __entry->caller_ip, + __entry->path_idx, bch2_btree_id_str(__entry->btree_id), __entry->pos_inode, __entry->pos_offset, @@ -625,6 +627,7 @@ TRACE_EVENT(btree_path_upgrade_fail, __field(unsigned long, caller_ip ) __field(u8, btree_id ) __field(u8, level ) + __field(u8, path_idx) TRACE_BPOS_entries(pos) __field(u8, locked ) __field(u8, self_read_count ) @@ -642,6 +645,7 @@ TRACE_EVENT(btree_path_upgrade_fail, __entry->caller_ip = caller_ip; __entry->btree_id = path->btree_id; __entry->level = level; + __entry->path_idx = path - trans->paths; TRACE_BPOS_assign(pos, path->pos); __entry->locked = btree_node_locked(path, level); @@ -657,9 +661,10 @@ TRACE_EVENT(btree_path_upgrade_fail, : 0; ), - TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u", + TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u", __entry->trans_fn, (void *) __entry->caller_ip, + __entry->path_idx, 
bch2_btree_id_str(__entry->btree_id), __entry->pos_inode, __entry->pos_offset, @@ -1438,6 +1443,456 @@ TRACE_EVENT(error_downcast, TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip) ); +#ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS + +TRACE_EVENT(update_by_path, + TP_PROTO(struct btree_trans *trans, struct btree_path *path, + struct btree_insert_entry *i, bool overwrite), + TP_ARGS(trans, path, i, overwrite), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(btree_path_idx_t, path_idx ) + __field(u8, btree_id ) + TRACE_BPOS_entries(pos) + __field(u8, overwrite ) + __field(btree_path_idx_t, update_idx ) + __field(btree_path_idx_t, nr_updates ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->path_idx = path - trans->paths; + __entry->btree_id = path->btree_id; + TRACE_BPOS_assign(pos, path->pos); + __entry->overwrite = overwrite; + __entry->update_idx = i - trans->updates; + __entry->nr_updates = trans->nr_updates; + ), + + TP_printk("%s path %3u btree %s pos %llu:%llu:%u overwrite %u update %u/%u", + __entry->trans_fn, + __entry->path_idx, + bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, + __entry->overwrite, + __entry->update_idx, + __entry->nr_updates) +); + +TRACE_EVENT(btree_path_lock, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_bkey_cached_common *b), + TP_ARGS(trans, caller_ip, b), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + __field(u8, level ) + __array(char, node, 24 ) + __field(u32, lock_seq ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = b->btree_id; + __entry->level = b->level; + + scnprintf(__entry->node, sizeof(__entry->node), "%px", b); + __entry->lock_seq = six_lock_seq(&b->lock); + ), + + TP_printk("%s %pS\nbtree %s level %u node %s lock seq %u", + __entry->trans_fn, + (void *) __entry->caller_ip, + bch2_btree_id_str(__entry->btree_id), + __entry->level, + __entry->node, + __entry->lock_seq) +); + +DECLARE_EVENT_CLASS(btree_path_ev, + TP_PROTO(struct btree_trans *trans, struct btree_path *path), + TP_ARGS(trans, path), + + TP_STRUCT__entry( + __field(u16, idx ) + __field(u8, ref ) + __field(u8, btree_id ) + TRACE_BPOS_entries(pos) + ), + + TP_fast_assign( + __entry->idx = path - trans->paths; + __entry->ref = path->ref; + __entry->btree_id = path->btree_id; + TRACE_BPOS_assign(pos, path->pos); + ), + + TP_printk("path %3u ref %u btree %s pos %llu:%llu:%u", + __entry->idx, __entry->ref, + bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot) +); + +DEFINE_EVENT(btree_path_ev, btree_path_get_ll, + TP_PROTO(struct btree_trans *trans, struct btree_path *path), + TP_ARGS(trans, path) +); + +DEFINE_EVENT(btree_path_ev, btree_path_put_ll, + TP_PROTO(struct btree_trans *trans, struct btree_path *path), + TP_ARGS(trans, path) +); + +DEFINE_EVENT(btree_path_ev, btree_path_should_be_locked, + TP_PROTO(struct btree_trans *trans, struct btree_path *path), + TP_ARGS(trans, path) +); + +TRACE_EVENT(btree_path_alloc, + TP_PROTO(struct btree_trans *trans, struct btree_path *path), + TP_ARGS(trans, path), + + TP_STRUCT__entry( + __field(btree_path_idx_t, idx ) + __field(u8, locks_want ) + __field(u8, btree_id ) + TRACE_BPOS_entries(pos) + ), + + TP_fast_assign( + __entry->idx = path - 
trans->paths; + __entry->locks_want = path->locks_want; + __entry->btree_id = path->btree_id; + TRACE_BPOS_assign(pos, path->pos); + ), + + TP_printk("path %3u btree %s locks_want %u pos %llu:%llu:%u", + __entry->idx, + bch2_btree_id_str(__entry->btree_id), + __entry->locks_want, + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot) +); + +TRACE_EVENT(btree_path_get, + TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos), + TP_ARGS(trans, path, new_pos), + + TP_STRUCT__entry( + __field(btree_path_idx_t, idx ) + __field(u8, ref ) + __field(u8, preserve ) + __field(u8, locks_want ) + __field(u8, btree_id ) + TRACE_BPOS_entries(old_pos) + TRACE_BPOS_entries(new_pos) + ), + + TP_fast_assign( + __entry->idx = path - trans->paths; + __entry->ref = path->ref; + __entry->preserve = path->preserve; + __entry->locks_want = path->locks_want; + __entry->btree_id = path->btree_id; + TRACE_BPOS_assign(old_pos, path->pos); + TRACE_BPOS_assign(new_pos, *new_pos); + ), + + TP_printk(" path %3u ref %u preserve %u btree %s locks_want %u pos %llu:%llu:%u -> %llu:%llu:%u", + __entry->idx, + __entry->ref, + __entry->preserve, + bch2_btree_id_str(__entry->btree_id), + __entry->locks_want, + __entry->old_pos_inode, + __entry->old_pos_offset, + __entry->old_pos_snapshot, + __entry->new_pos_inode, + __entry->new_pos_offset, + __entry->new_pos_snapshot) +); + +DECLARE_EVENT_CLASS(btree_path_clone, + TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new), + TP_ARGS(trans, path, new), + + TP_STRUCT__entry( + __field(btree_path_idx_t, idx ) + __field(u8, new_idx ) + __field(u8, btree_id ) + __field(u8, ref ) + __field(u8, preserve ) + TRACE_BPOS_entries(pos) + ), + + TP_fast_assign( + __entry->idx = path - trans->paths; + __entry->new_idx = new - trans->paths; + __entry->btree_id = path->btree_id; + __entry->ref = path->ref; + __entry->preserve = path->preserve; + TRACE_BPOS_assign(pos, path->pos); + ), + + TP_printk(" path %3u ref %u preserve %u btree %s %llu:%llu:%u -> %u", + __entry->idx, + __entry->ref, + __entry->preserve, + bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, + __entry->new_idx) +); + +DEFINE_EVENT(btree_path_clone, btree_path_clone, + TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new), + TP_ARGS(trans, path, new) +); + +DEFINE_EVENT(btree_path_clone, btree_path_save_pos, + TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new), + TP_ARGS(trans, path, new) +); + +DECLARE_EVENT_CLASS(btree_path_traverse, + TP_PROTO(struct btree_trans *trans, + struct btree_path *path), + TP_ARGS(trans, path), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(btree_path_idx_t, idx ) + __field(u8, ref ) + __field(u8, preserve ) + __field(u8, should_be_locked ) + __field(u8, btree_id ) + __field(u8, level ) + TRACE_BPOS_entries(pos) + __field(u8, locks_want ) + __field(u8, nodes_locked ) + __array(char, node0, 24 ) + __array(char, node1, 24 ) + __array(char, node2, 24 ) + __array(char, node3, 24 ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + + __entry->idx = path - trans->paths; + __entry->ref = path->ref; + __entry->preserve = path->preserve; + __entry->btree_id = path->btree_id; + __entry->level = path->level; + TRACE_BPOS_assign(pos, path->pos); + + __entry->locks_want = path->locks_want; + __entry->nodes_locked = path->nodes_locked; + struct btree *b = 
path->l[0].b; + if (IS_ERR(b)) + strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); + else + scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c); + b = path->l[1].b; + if (IS_ERR(b)) + strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); + else + scnprintf(__entry->node1, sizeof(__entry->node0), "%px", &b->c); + b = path->l[2].b; + if (IS_ERR(b)) + strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); + else + scnprintf(__entry->node2, sizeof(__entry->node0), "%px", &b->c); + b = path->l[3].b; + if (IS_ERR(b)) + strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); + else + scnprintf(__entry->node3, sizeof(__entry->node0), "%px", &b->c); + ), + + TP_printk("%s\npath %3u ref %u preserve %u btree %s %llu:%llu:%u level %u locks_want %u\n" + "locks %u %u %u %u node %s %s %s %s", + __entry->trans_fn, + __entry->idx, + __entry->ref, + __entry->preserve, + bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, + __entry->level, + __entry->locks_want, + (__entry->nodes_locked >> 6) & 3, + (__entry->nodes_locked >> 4) & 3, + (__entry->nodes_locked >> 2) & 3, + (__entry->nodes_locked >> 0) & 3, + __entry->node3, + __entry->node2, + __entry->node1, + __entry->node0) +); + +DEFINE_EVENT(btree_path_traverse, btree_path_traverse_start, + TP_PROTO(struct btree_trans *trans, + struct btree_path *path), + TP_ARGS(trans, path) +); + +DEFINE_EVENT(btree_path_traverse, btree_path_traverse_end, + TP_PROTO(struct btree_trans *trans, struct btree_path *path), + TP_ARGS(trans, path) +); + +TRACE_EVENT(btree_path_set_pos, + TP_PROTO(struct btree_trans *trans, + struct btree_path *path, + struct bpos *new_pos), + TP_ARGS(trans, path, new_pos), + + TP_STRUCT__entry( + __field(btree_path_idx_t, idx ) + __field(u8, ref ) + __field(u8, preserve ) + __field(u8, btree_id ) + TRACE_BPOS_entries(old_pos) + TRACE_BPOS_entries(new_pos) + __field(u8, locks_want ) + __field(u8, nodes_locked ) + __array(char, node0, 24 ) + __array(char, node1, 24 ) + __array(char, node2, 24 ) + __array(char, node3, 24 ) + ), + + TP_fast_assign( + __entry->idx = path - trans->paths; + __entry->ref = path->ref; + __entry->preserve = path->preserve; + __entry->btree_id = path->btree_id; + TRACE_BPOS_assign(old_pos, path->pos); + TRACE_BPOS_assign(new_pos, *new_pos); + + __entry->nodes_locked = path->nodes_locked; + struct btree *b = path->l[0].b; + if (IS_ERR(b)) + strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); + else + scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c); + b = path->l[1].b; + if (IS_ERR(b)) + strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); + else + scnprintf(__entry->node1, sizeof(__entry->node0), "%px", &b->c); + b = path->l[2].b; + if (IS_ERR(b)) + strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); + else + scnprintf(__entry->node2, sizeof(__entry->node0), "%px", &b->c); + b = path->l[3].b; + if (IS_ERR(b)) + strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); + else + scnprintf(__entry->node3, sizeof(__entry->node0), "%px", &b->c); + ), + + TP_printk("\npath %3u ref %u preserve %u btree %s %llu:%llu:%u -> %llu:%llu:%u\n" + "locks %u %u %u %u node %s %s %s %s", + __entry->idx, + __entry->ref, + __entry->preserve, + bch2_btree_id_str(__entry->btree_id), + __entry->old_pos_inode, + __entry->old_pos_offset, + __entry->old_pos_snapshot, + __entry->new_pos_inode, + 
__entry->new_pos_offset, + __entry->new_pos_snapshot, + (__entry->nodes_locked >> 6) & 3, + (__entry->nodes_locked >> 4) & 3, + (__entry->nodes_locked >> 2) & 3, + (__entry->nodes_locked >> 0) & 3, + __entry->node3, + __entry->node2, + __entry->node1, + __entry->node0) +); + +TRACE_EVENT(btree_path_free, + TP_PROTO(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup), + TP_ARGS(trans, path, dup), + + TP_STRUCT__entry( + __field(btree_path_idx_t, idx ) + __field(u8, preserve ) + __field(u8, should_be_locked) + __field(s8, dup ) + __field(u8, dup_locked ) + ), + + TP_fast_assign( + __entry->idx = path; + __entry->preserve = trans->paths[path].preserve; + __entry->should_be_locked = trans->paths[path].should_be_locked; + __entry->dup = dup ? dup - trans->paths : -1; + __entry->dup_locked = dup ? btree_node_locked(dup, dup->level) : 0; + ), + + TP_printk(" path %3u %c %c dup %2i locked %u", __entry->idx, + __entry->preserve ? 'P' : ' ', + __entry->should_be_locked ? 'S' : ' ', + __entry->dup, + __entry->dup_locked) +); + +TRACE_EVENT(btree_path_free_trans_begin, + TP_PROTO(btree_path_idx_t path), + TP_ARGS(path), + + TP_STRUCT__entry( + __field(btree_path_idx_t, idx ) + ), + + TP_fast_assign( + __entry->idx = path; + ), + + TP_printk(" path %3u", __entry->idx) +); + +#else /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */ +#ifndef _TRACE_BCACHEFS_H + +static inline void trace_update_by_path(struct btree_trans *trans, struct btree_path *path, + struct btree_insert_entry *i, bool overwrite) {} +static inline void trace_btree_path_lock(struct btree_trans *trans, unsigned long caller_ip, struct btree_bkey_cached_common *b) {} +static inline void trace_btree_path_get_ll(struct btree_trans *trans, struct btree_path *path) {} +static inline void trace_btree_path_put_ll(struct btree_trans *trans, struct btree_path *path) {} +static inline void trace_btree_path_should_be_locked(struct btree_trans *trans, struct btree_path *path) {} +static inline void trace_btree_path_alloc(struct btree_trans *trans, struct btree_path *path) {} +static inline void trace_btree_path_get(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {} +static inline void trace_btree_path_clone(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {} +static inline void trace_btree_path_save_pos(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {} +static inline void trace_btree_path_traverse_start(struct btree_trans *trans, struct btree_path *path) {} +static inline void trace_btree_path_traverse_end(struct btree_trans *trans, struct btree_path *path) {} +static inline void trace_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {} +static inline void trace_btree_path_free(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup) {} +static inline void trace_btree_path_free_trans_begin(btree_path_idx_t path) {} + +#endif +#endif /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */ + +#define _TRACE_BCACHEFS_H #endif /* _TRACE_BCACHEFS_H */ /* This part must be outside protection */ From e6dc5839a5067214a469e7c7d080b22010233700 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 17 Jul 2024 11:42:11 -0400 Subject: [PATCH 21/61] bcachefs: kill bch2_btree_iter_peek_and_restart() dead code Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 1 + fs/bcachefs/btree_iter.h | 14 -------------- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/fs/bcachefs/btree_iter.c 
b/fs/bcachefs/btree_iter.c index c71efbdb3f07..bfe9f0c1e1be 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2726,6 +2726,7 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) return bch2_btree_iter_peek_slot(iter); } +/* Obsolete, but still used by rust wrapper in -tools */ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter) { struct bkey_s_c k; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 172507b4dead..2ad90f40325a 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -832,20 +832,6 @@ transaction_restart: \ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); -static inline struct bkey_s_c -__bch2_btree_iter_peek_and_restart(struct btree_trans *trans, - struct btree_iter *iter, unsigned flags) -{ - struct bkey_s_c k; - - while (btree_trans_too_many_iters(trans) || - (k = bch2_btree_iter_peek_type(iter, flags), - bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) - bch2_trans_begin(trans); - - return k; -} - #define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \ _start, _end, _flags, _k, _ret) \ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ From d338bc2d8f41bcda244c22ccfcc0c23f06f7f5f5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 17 Jul 2024 11:47:01 -0400 Subject: [PATCH 22/61] bcachefs: bchfs_read(): call trans_begin() on every loop iter Same as the recent change for __bch2_read(); also, kill now unnecessary btree_trans_too_many_iters() calls. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-buffered.c | 39 +++++++++++++----------------------- fs/bcachefs/io_read.c | 4 ---- 2 files changed, 14 insertions(+), 29 deletions(-) diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 975a63063629..f5cff824b79f 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -151,7 +151,6 @@ static void bchfs_read(struct btree_trans *trans, struct bkey_buf sk; int flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE; - u32 snapshot; int ret = 0; rbio->c = c; @@ -159,29 +158,23 @@ static void bchfs_read(struct btree_trans *trans, rbio->subvol = inum.subvol; bch2_bkey_buf_init(&sk); -retry: bch2_trans_begin(trans); - iter = (struct btree_iter) { NULL }; - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), + POS(inum.inum, rbio->bio.bi_iter.bi_sector), BTREE_ITER_slots); while (1) { struct bkey_s_c k; unsigned bytes, sectors, offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; - /* - * read_extent -> io_time_reset may cause a transaction restart - * without returning an error, we need to check for that here: - */ - ret = bch2_trans_relock(trans); + bch2_trans_begin(trans); + + u32 snapshot; + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) - break; + goto err; + + bch2_btree_iter_set_snapshot(&iter, snapshot); bch2_btree_iter_set_pos(&iter, POS(inum.inum, rbio->bio.bi_iter.bi_sector)); @@ -189,7 +182,7 @@ retry: k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) - break; + goto err; offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); @@ -200,7 +193,7 @@ retry: ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, &sk); if (ret) - break; + goto err; k = bkey_i_to_s_c(sk.k); @@ -210,7 +203,7 @@ retry: ret = 
readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, extent_partial_reads_expensive(k)); if (ret) - break; + goto err; } bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; @@ -229,17 +222,13 @@ retry: swap(rbio->bio.bi_iter.bi_size, bytes); bio_advance(&rbio->bio, bytes); - - ret = btree_trans_too_many_iters(trans); - if (ret) +err: + if (ret && + !bch2_err_matches(ret, BCH_ERR_transaction_restart)) break; } -err: bch2_trans_iter_exit(trans, &iter); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - if (ret) { bch_err_inum_offset_ratelimited(c, iter.pos.inode, diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 7ee3b75480df..808017873d07 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -1214,10 +1214,6 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, swap(bvec_iter.bi_size, bytes); bio_advance_iter(&rbio->bio, &bvec_iter, bytes); - - ret = btree_trans_too_many_iters(trans); - if (ret) - goto err; err: if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && From acae3555f2bd080ae9f1f8c8bc7fe96fb992d83d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 17 Jul 2024 11:50:54 -0400 Subject: [PATCH 23/61] bcachefs: bch2_fiemap(): call trans_begin() on every loop iter Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index f3bb20429271..7a9c164cbfa3 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1090,7 +1090,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct bkey_buf cur, prev; unsigned offset_into_extent, sectors; bool have_extent = false; - u32 snapshot; int ret = 0; ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); @@ -1106,21 +1105,30 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_bkey_buf_init(&cur); bch2_bkey_buf_init(&prev); trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot); - if (ret) - goto err; bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(ei->v.i_ino, start, snapshot), 0); + POS(ei->v.i_ino, start), 0); - while (!(ret = btree_trans_too_many_iters(trans)) && - (k = bch2_btree_iter_peek_upto(&iter, end)).k && - !(ret = bkey_err(k))) { + while (true) { enum btree_id data_btree = BTREE_ID_extents; + bch2_trans_begin(trans); + + u32 snapshot; + ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot); + if (ret) + goto err; + + bch2_btree_iter_set_snapshot(&iter, snapshot); + + k = bch2_btree_iter_peek_upto(&iter, end); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k) + break; + if (!bkey_extent_is_data(k.k) && k.k->type != KEY_TYPE_reservation) { bch2_btree_iter_advance(&iter); @@ -1164,16 +1172,12 @@ retry: bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, iter.pos.offset + sectors)); - - ret = bch2_trans_relock(trans); - if (ret) +err: + if (ret && + !bch2_err_matches(ret, BCH_ERR_transaction_restart)) break; } - start = iter.pos.offset; bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; if (!ret && have_extent) { bch2_trans_unlock(trans); From 58ba437dc53028774116e5d857d5760216496552 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 17 Jul 2024 12:59:51 -0400 Subject: [PATCH 24/61] bcachefs: for_each_btree_key_in_subvolume_upto() New helper for 
looping over keys in a given subvolume Signed-off-by: Kent Overstreet --- fs/bcachefs/subvolume.h | 45 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index a8299ba2cab2..e62f876541fe 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -31,6 +31,51 @@ int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); int bch2_subvol_is_ro_trans(struct btree_trans *, u32); int bch2_subvol_is_ro(struct bch_fs *, u32); +static inline struct bkey_s_c +bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos end, + u32 subvolid, unsigned flags) +{ + u32 snapshot; + int ret = bch2_subvolume_get_snapshot(iter->trans, subvolid, &snapshot); + if (ret) + return bkey_s_c_err(ret); + + bch2_btree_iter_set_snapshot(iter, snapshot); + return bch2_btree_iter_peek_upto_type(iter, end, flags); +} + +#define for_each_btree_key_in_subvolume_upto_continue(_trans, _iter, \ + _end, _subvolid, _flags, _k, _do) \ +({ \ + struct bkey_s_c _k; \ + int _ret3 = 0; \ + \ + do { \ + _ret3 = lockrestart_do(_trans, ({ \ + (_k) = bch2_btree_iter_peek_in_subvolume_upto_type(&(_iter), \ + _end, _subvolid, (_flags)); \ + if (!(_k).k) \ + break; \ + \ + bkey_err(_k) ?: (_do); \ + })); \ + } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \ + \ + bch2_trans_iter_exit((_trans), &(_iter)); \ + _ret3; \ +}) + +#define for_each_btree_key_in_subvolume_upto(_trans, _iter, _btree_id, \ + _start, _end, _subvolid, _flags, _k, _do) \ +({ \ + struct btree_iter _iter; \ + bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + \ + for_each_btree_key_in_subvolume_upto_continue(_trans, _iter, \ + _end, _subvolid, _flags, _k, _do); \ +}) + int bch2_delete_dead_snapshots(struct bch_fs *); void bch2_delete_dead_snapshots_async(struct bch_fs *); From 0e1f4e9d9a88b4f4dd55161fb8d1ec725f4f6101 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 17 Jul 2024 13:24:28 -0400 Subject: [PATCH 25/61] bcachefs: bch2_readdir() -> for_each_btree_key_in_subvolume_upto Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 66 ++++++++++++-------------------------------- 1 file changed, 17 insertions(+), 49 deletions(-) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 32bfdf19289a..84dd4a879d98 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -552,62 +552,30 @@ static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subv int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - subvol_inum target; - u32 snapshot; struct bkey_buf sk; - int ret; - bch2_bkey_buf_init(&sk); -retry: - bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; + int ret = bch2_trans_run(c, + for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_dirents, + POS(inum.inum, ctx->pos), + POS(inum.inum, U64_MAX), + inum.subvol, 0, k, ({ + if (k.k->type != KEY_TYPE_dirent) + continue; - for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents, - SPOS(inum.inum, ctx->pos, snapshot), - POS(inum.inum, U64_MAX), 0, k, ret) { - if (k.k->type != KEY_TYPE_dirent) - continue; + /* dir_emit() can fault and block: */ + bch2_bkey_buf_reassemble(&sk, c, k); + struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k); - /* dir_emit() can fault and block: */ - bch2_bkey_buf_reassemble(&sk, c, k); - 
struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k); + subvol_inum target; + int ret2 = bch2_dirent_read_target(trans, inum, dirent, &target); + if (ret2 > 0) + continue; - ret = bch2_dirent_read_target(trans, inum, dirent, &target); - if (ret < 0) - break; - if (ret) - continue; + ret2 ?: drop_locks_do(trans, bch2_dir_emit(ctx, dirent, target)); + }))); - /* - * read_target looks up subvolumes, we can overflow paths if the - * directory has many subvolumes in it - * - * XXX: btree_trans_too_many_iters() is something we'd like to - * get rid of, and there's no good reason to be using it here - * except that we don't yet have a for_each_btree_key() helper - * that does subvolume_get_snapshot(). - */ - ret = drop_locks_do(trans, - bch2_dir_emit(ctx, dirent, target)) ?: - btree_trans_too_many_iters(trans); - if (ret) { - ret = ret < 0 ? ret : 0; - break; - } - } - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); bch2_bkey_buf_exit(&sk, c); - return ret; + return ret < 0 ? ret : 0; } From d01ee6d02c0eb52513d4294b2bc267b2b25bdf47 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 17 Jul 2024 13:24:28 -0400 Subject: [PATCH 26/61] bcachefs: bch2_xattr_list() -> for_each_btree_key_in_subvolume_upto Signed-off-by: Kent Overstreet --- fs/bcachefs/xattr.c | 55 ++++++++++----------------------------------- 1 file changed, 12 insertions(+), 43 deletions(-) diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 4b5898d3eda0..9f810fb36ba1 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -295,54 +295,23 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { struct bch_fs *c = dentry->d_sb->s_fs_info; struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; u64 offset = 0, inum = inode->ei_inode.bi_inum; - u32 snapshot; - int ret; -retry: - bch2_trans_begin(trans); - iter = (struct btree_iter) { NULL }; - ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot); - if (ret) - goto err; + int ret = bch2_trans_run(c, + for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_xattrs, + POS(inum, offset), + POS(inum, U64_MAX), + inode->ei_inum.subvol, 0, k, ({ + if (k.k->type != KEY_TYPE_xattr) + continue; - for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_xattrs, - SPOS(inum, offset, snapshot), - POS(inum, U64_MAX), 0, k, ret) { - if (k.k->type != KEY_TYPE_xattr) - continue; + bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf); + }))) ?: + bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false) ?: + bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true); - ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf); - if (ret) - break; - } - - offset = iter.pos.offset; - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); - - if (ret) - goto out; - - ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false); - if (ret) - goto out; - - ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true); - if (ret) - goto out; - - return buf.used; -out: - return bch2_err_class(ret); + return ret ? 
bch2_err_class(ret) : buf.used; } static int bch2_xattr_get_handler(const struct xattr_handler *handler, From 45aab77b799a45240edf3b122d36ae1d8a248e18 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 17 Jul 2024 13:26:54 -0400 Subject: [PATCH 27/61] bcachefs: bch2_seek_data() -> for_each_btree_key_in_subvolume_upto Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 41 ++++++++++++----------------------------- 1 file changed, 12 insertions(+), 29 deletions(-) diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index a5018cbdb376..2c49051308a2 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -942,42 +942,25 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; subvol_inum inum = inode_inum(inode); u64 isize, next_data = MAX_LFS_FILESIZE; - u32 snapshot; - int ret; isize = i_size_read(&inode->v); if (offset >= isize) return -ENXIO; - trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - - for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, - SPOS(inode->v.i_ino, offset >> 9, snapshot), - POS(inode->v.i_ino, U64_MAX), - 0, k, ret) { - if (bkey_extent_is_data(k.k)) { - next_data = max(offset, bkey_start_offset(k.k) << 9); - break; - } else if (k.k->p.offset >> 9 > isize) - break; - } - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); + int ret = bch2_trans_run(c, + for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, + POS(inode->v.i_ino, offset >> 9), + POS(inode->v.i_ino, U64_MAX), + inum.subvol, 0, k, ({ + if (bkey_extent_is_data(k.k)) { + next_data = max(offset, bkey_start_offset(k.k) << 9); + break; + } else if (k.k->p.offset >> 9 > isize) + break; + 0; + }))); if (ret) return ret; From 0d0687f9158eae0d0a013dd154f3a0e1be0c9c76 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 17 Jul 2024 13:28:23 -0400 Subject: [PATCH 28/61] bcachefs: bch2_seek_hole() -> for_each_btree_key_in_subvolume_upto Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 54 ++++++++++++++++----------------------------- 1 file changed, 19 insertions(+), 35 deletions(-) diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 2c49051308a2..b0ecc756d766 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -978,50 +978,34 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; subvol_inum inum = inode_inum(inode); u64 isize, next_hole = MAX_LFS_FILESIZE; - u32 snapshot; - int ret; isize = i_size_read(&inode->v); if (offset >= isize) return -ENXIO; - trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, - SPOS(inode->v.i_ino, offset >> 9, snapshot), - BTREE_ITER_slots, k, ret) { - if (k.k->p.inode != inode->v.i_ino) { - next_hole = bch2_seek_pagecache_hole(&inode->v, - offset, MAX_LFS_FILESIZE, 0, false); - break; - } else if (!bkey_extent_is_data(k.k)) { - next_hole = bch2_seek_pagecache_hole(&inode->v, - max(offset, 
bkey_start_offset(k.k) << 9), - k.k->p.offset << 9, 0, false); - - if (next_hole < k.k->p.offset << 9) + int ret = bch2_trans_run(c, + for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, + POS(inode->v.i_ino, offset >> 9), + POS(inode->v.i_ino, U64_MAX), + inum.subvol, BTREE_ITER_slots, k, ({ + if (k.k->p.inode != inode->v.i_ino) { + next_hole = bch2_seek_pagecache_hole(&inode->v, + offset, MAX_LFS_FILESIZE, 0, false); break; - } else { - offset = max(offset, bkey_start_offset(k.k) << 9); - } - } - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; + } else if (!bkey_extent_is_data(k.k)) { + next_hole = bch2_seek_pagecache_hole(&inode->v, + max(offset, bkey_start_offset(k.k) << 9), + k.k->p.offset << 9, 0, false); - bch2_trans_put(trans); + if (next_hole < k.k->p.offset << 9) + break; + } else { + offset = max(offset, bkey_start_offset(k.k) << 9); + } + 0; + }))); if (ret) return ret; From 0cf1808c3d999bad83e194c8c4a14a1215c36d92 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 17 Jul 2024 13:30:23 -0400 Subject: [PATCH 29/61] bcachefs: range_has_data() -> for_each_btree_key_in_subvolume_upto Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 29 +++++------------------------ 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index b0ecc756d766..4359fde86603 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -221,30 +221,11 @@ static inline int range_has_data(struct bch_fs *c, u32 subvol, struct bpos start, struct bpos end) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, subvol, &start.snapshot); - if (ret) - goto err; - - for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, start, end, 0, k, ret) - if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) { - ret = 1; - break; - } - start = iter.pos; - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); - return ret; + return bch2_trans_run(c, + for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, start, end, + subvol, 0, k, ({ + bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k); + }))); } static int __bch2_truncate_folio(struct bch_inode_info *inode, From f26185feac8d28c6f827e9557902ec89a5585a40 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 17 Jul 2024 13:34:35 -0400 Subject: [PATCH 30/61] bcachefs: bch2_folio_set() -> for_each_btree_key_in_subvolume_upto Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-pagecache.c | 78 +++++++++++++---------------------- 1 file changed, 29 insertions(+), 49 deletions(-) diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index a9cc5cad9cc9..af3a24546aa3 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -182,18 +182,11 @@ static void __bch2_folio_set(struct folio *folio, int bch2_folio_set(struct bch_fs *c, subvol_inum inum, struct folio **fs, unsigned nr_folios) { - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bch_folio *s; u64 offset = folio_sector(fs[0]); - unsigned folio_idx; - u32 snapshot; bool need_set = false; - int ret; - for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) { - s = bch2_folio_create(fs[folio_idx], GFP_KERNEL); + for (unsigned folio_idx = 0; 
folio_idx < nr_folios; folio_idx++) { + struct bch_folio *s = bch2_folio_create(fs[folio_idx], GFP_KERNEL); if (!s) return -ENOMEM; @@ -203,53 +196,40 @@ int bch2_folio_set(struct bch_fs *c, subvol_inum inum, if (!need_set) return 0; - folio_idx = 0; - trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); + unsigned folio_idx = 0; - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; + return bch2_trans_run(c, + for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, + POS(inum.inum, offset), + POS(inum.inum, U64_MAX), + inum.subvol, BTREE_ITER_slots, k, ({ + unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); + unsigned state = bkey_to_sector_state(k); - for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, - SPOS(inum.inum, offset, snapshot), - BTREE_ITER_slots, k, ret) { - unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); - unsigned state = bkey_to_sector_state(k); + while (folio_idx < nr_folios) { + struct folio *folio = fs[folio_idx]; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); + unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - + folio_start; + unsigned folio_len = min(k.k->p.offset, folio_end) - + folio_offset - folio_start; - while (folio_idx < nr_folios) { - struct folio *folio = fs[folio_idx]; - u64 folio_start = folio_sector(folio); - u64 folio_end = folio_end_sector(folio); - unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - - folio_start; - unsigned folio_len = min(k.k->p.offset, folio_end) - - folio_offset - folio_start; + BUG_ON(k.k->p.offset < folio_start); + BUG_ON(bkey_start_offset(k.k) > folio_end); - BUG_ON(k.k->p.offset < folio_start); - BUG_ON(bkey_start_offset(k.k) > folio_end); + if (!bch2_folio(folio)->uptodate) + __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); - if (!bch2_folio(folio)->uptodate) - __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); + if (k.k->p.offset < folio_end) + break; + folio_idx++; + } - if (k.k->p.offset < folio_end) + if (folio_idx == nr_folios) break; - folio_idx++; - } - - if (folio_idx == nr_folios) - break; - } - - offset = iter.pos.offset; - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - bch2_trans_put(trans); - - return ret; + 0; + }))); } void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) From a054c03dcac3128d6d9330f270a64d64117cf813 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 17 Jul 2024 13:24:28 -0400 Subject: [PATCH 31/61] bcachefs: quota_reserve_range() -> for_each_btree_key_in_subvolume_upto Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 46 ++++++++++++++------------------------------- 1 file changed, 14 insertions(+), 32 deletions(-) diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 4359fde86603..71d0fa387509 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -794,41 +794,23 @@ static int quota_reserve_range(struct bch_inode_info *inode, u64 start, u64 end) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - u32 snapshot; u64 sectors = end - start; - u64 pos = start; - int ret; -retry: - bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot); - if (ret) - goto err; + int ret = bch2_trans_run(c, + for_each_btree_key_in_subvolume_upto(trans, iter, + BTREE_ID_extents, + 
POS(inode->v.i_ino, start), + POS(inode->v.i_ino, end - 1), + inode->ei_inum.subvol, 0, k, ({ + if (bkey_extent_is_allocation(k.k)) { + u64 s = min(end, k.k->p.offset) - + max(start, bkey_start_offset(k.k)); + BUG_ON(s > sectors); + sectors -= s; + } - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(inode->v.i_ino, pos, snapshot), 0); - - while (!(ret = btree_trans_too_many_iters(trans)) && - (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k && - !(ret = bkey_err(k))) { - if (bkey_extent_is_allocation(k.k)) { - u64 s = min(end, k.k->p.offset) - - max(start, bkey_start_offset(k.k)); - BUG_ON(s > sectors); - sectors -= s; - } - bch2_btree_iter_advance(&iter); - } - pos = iter.pos.offset; - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); + 0; + }))); return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true); }
From a785c5668a80f774cc053500c5336a8696ef97de Mon Sep 17 00:00:00 2001 From: Julian Sun Date: Sun, 21 Jul 2024 08:43:24 -0400 Subject: [PATCH 32/61] bcachefs: remove the unused macro definition The macro bch2_kthread_wait_event_ioclock_timeout is no longer used, so let's remove it. This patch has passed compilation testing. Signed-off-by: Julian Sun Signed-off-by: Kent Overstreet --- fs/bcachefs/clock.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h index 85c975dfbcfe..82c79c8baf92 100644 --- a/fs/bcachefs/clock.h +++ b/fs/bcachefs/clock.h @@ -20,15 +20,6 @@ static inline void bch2_increment_clock(struct bch_fs *c, u64 sectors, void bch2_io_clock_schedule_timeout(struct io_clock *, u64); -#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\ -({ \ - long __ret = timeout; \ - might_sleep(); \ - if (!___wait_cond_timeout(condition)) \ - __ret = __wait_event_timeout(wq, condition, timeout); \ - __ret; \ -}) - void bch2_io_timers_to_text(struct printbuf *, struct io_clock *); void bch2_io_clock_exit(struct io_clock *);
From f9eac8473646d0374502994f00c73b7c10517a69 Mon Sep 17 00:00:00 2001 From: Julian Sun Date: Sun, 21 Jul 2024 08:44:24 -0400 Subject: [PATCH 33/61] bcachefs: fix macro definition allocate_dropping_locks_errcode The macro allocate_dropping_locks_errcode accepts a parameter _trans, but it was not used; instead the variable trans was used directly, which may be a local variable inside the function that calls the macro. Signed-off-by: Julian Sun Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 2ad90f40325a..140c7b831233 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -872,7 +872,7 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); \ if (bch2_err_matches(_ret, ENOMEM)) { \ _gfp = GFP_KERNEL; \ - _ret = drop_locks_do(trans, _do); \ + _ret = drop_locks_do(_trans, _do); \ } \ _ret; \ })
From 19c68587a372f57fbb27c86d4331d862727bf0a9 Mon Sep 17 00:00:00 2001 From: Julian Sun Date: Sun, 21 Jul 2024 08:45:47 -0400 Subject: [PATCH 34/61] bcachefs: fix macro definition allocate_dropping_locks The macro allocate_dropping_locks accepts a parameter _trans, but it was not used; instead the variable trans was used directly, which may be a local variable inside the function that calls the macro.
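For example (a hypothetical caller sketched only to illustrate the bug class; "struct foo" and the local name "t" are made up for the example), a function whose transaction variable is not named trans would either fail to compile or silently capture an unrelated trans that happened to be in scope:

	struct btree_trans *t = bch2_trans_get(c);
	int ret;
	/* _do is evaluated with the macro-local _gfp in scope */
	struct foo *p = allocate_dropping_locks(t, ret,
				kzalloc(sizeof(*p), _gfp));

After the fix the macro body references _trans, so the expansion always operates on the transaction the caller actually passed in (t above).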
Signed-off-by: Julian Sun Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 140c7b831233..6d87e57745da 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -885,7 +885,7 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); _ret = 0; \ if (unlikely(!_p)) { \ _gfp = GFP_KERNEL; \ - _ret = drop_locks_do(trans, ((_p = _do), 0)); \ + _ret = drop_locks_do(_trans, ((_p = _do), 0)); \ } \ _p; \ }) From e84f1802da69d23cfd62322d00775398e84baef1 Mon Sep 17 00:00:00 2001 From: Julian Sun Date: Sun, 21 Jul 2024 08:55:20 -0400 Subject: [PATCH 35/61] bcachefs: remove the unused parameter in macro bkey_crc_next In the macro definition of bkey_crc_next, five parameters were accepted, but only four of them were used. Let's remove the unused one. The patch has only passed compilation tests, but it should be fine. Signed-off-by: Julian Sun Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 709dd83183be..8e1ba46fb0ba 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -357,7 +357,7 @@ out: \ __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ _ptr, _entry) -#define bkey_crc_next(_k, _start, _end, _crc, _iter) \ +#define bkey_crc_next(_k, _end, _crc, _iter) \ ({ \ __bkey_extent_entry_for_each_from(_iter, _end, _iter) \ if (extent_entry_is_crc(_iter)) { \ @@ -372,7 +372,7 @@ out: \ #define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \ for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \ (_iter) = (_start); \ - bkey_crc_next(_k, _start, _end, _crc, _iter); \ + bkey_crc_next(_k, _end, _crc, _iter); \ (_iter) = extent_entry_next(_iter)) #define bkey_for_each_crc(_k, _p, _crc, _iter) \ From c0f0035d846e3239ced72091d74e90c674ec90aa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Jul 2024 23:39:49 -0400 Subject: [PATCH 36/61] bcachefs: Move rebalance_status out of sysfs/internal Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index ce321e2097c0..4d63bfea4f8f 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -515,6 +515,7 @@ struct attribute *bch2_fs_files[] = { &sysfs_btree_write_stats, &sysfs_promote_whole_extents, + &sysfs_rebalance_status, &sysfs_compression_stats, @@ -614,7 +615,6 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_copy_gc_wait, &sysfs_rebalance_enabled, - &sysfs_rebalance_status, sysfs_pd_controller_files(rebalance), &sysfs_moving_ctxts, From 2e1175f8cc6f474e467d29fd23b3e524c9a835bb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Jul 2024 23:56:04 -0400 Subject: [PATCH 37/61] bcachefs: promote_whole_extents is now a normal option Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/bcachefs_format.h | 2 ++ fs/bcachefs/io_read.c | 2 +- fs/bcachefs/opts.h | 5 +++++ fs/bcachefs/super-io.c | 3 +++ fs/bcachefs/super.c | 1 - fs/bcachefs/sysfs.c | 6 ------ 7 files changed, 11 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 3459b71206a7..d43bbdbac7ec 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1086,7 +1086,6 @@ struct bch_fs { u64 __percpu *counters; unsigned copy_gc_enabled:1; - bool promote_whole_extents; struct 
bch2_time_stats times[BCH_TIME_STAT_NR]; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 14ce726bf5a3..8c4addddd07e 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -795,6 +795,8 @@ LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62); LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63); +LE64_BITMASK(BCH_SB_PROMOTE_WHOLE_EXTENTS, + struct bch_sb, flags[0], 63, 64); LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1], 4, 8); diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 808017873d07..ce27ba1f0b54 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -286,7 +286,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, */ bool promote_full = (failed || *read_full || - READ_ONCE(c->promote_whole_extents)); + READ_ONCE(c->opts.promote_whole_extents)); /* data might have to be decompressed in the write path: */ unsigned sectors = promote_full ? max(pick->crc.compressed_size, pick->crc.live_size) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 0138717e9f77..3d83bcdca69d 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -265,6 +265,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "Enable inline data extents") \ + x(promote_whole_extents, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_PROMOTE_WHOLE_EXTENTS, true, \ + NULL, "Promote whole extents, instead of just part being read")\ x(acl, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index c8c2ccbdfbb5..2a72782d6d17 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -418,6 +418,9 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2 && !BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb)) SET_BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb, 30); + + if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2) + SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true); } for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 6d0ce3d73450..d8adf465f8b8 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -811,7 +811,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->copy_gc_enabled = 1; c->rebalance.enabled = 1; - c->promote_whole_extents = true; c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 4d63bfea4f8f..9f8ca6a52245 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -219,7 +219,6 @@ read_attribute(copy_gc_wait); rw_attribute(rebalance_enabled); sysfs_pd_controller_attribute(rebalance); read_attribute(rebalance_status); -rw_attribute(promote_whole_extents); read_attribute(new_stripes); @@ -347,8 +346,6 @@ SHOW(bch2_fs) if (attr == &sysfs_rebalance_status) bch2_rebalance_status_to_text(out, c); - sysfs_print(promote_whole_extents, c->promote_whole_extents); - /* Debugging: */ if (attr == &sysfs_journal_debug) @@ -436,8 +433,6 @@ STORE(bch2_fs) sysfs_pd_controller_store(rebalance, &c->rebalance.pd); - sysfs_strtoul(promote_whole_extents, c->promote_whole_extents); - /* Debugging: */ if 
(!test_bit(BCH_FS_started, &c->flags)) @@ -514,7 +509,6 @@ struct attribute *bch2_fs_files[] = { &sysfs_btree_cache_size, &sysfs_btree_write_stats, - &sysfs_promote_whole_extents, &sysfs_rebalance_status, &sysfs_compression_stats, From ac22f3aa5b14d2d64feb43ae356dda2dd1b9292e Mon Sep 17 00:00:00 2001 From: Xiaxi Shen Date: Wed, 7 Aug 2024 00:10:05 -0700 Subject: [PATCH 38/61] bcachefs: Fix a spelling error in docs Signed-off-by: Xiaxi Shen Signed-off-by: Kent Overstreet --- Documentation/filesystems/bcachefs/CodingStyle.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/filesystems/bcachefs/CodingStyle.rst b/Documentation/filesystems/bcachefs/CodingStyle.rst index 0c45829a4899..01de555e21d8 100644 --- a/Documentation/filesystems/bcachefs/CodingStyle.rst +++ b/Documentation/filesystems/bcachefs/CodingStyle.rst @@ -175,7 +175,7 @@ errors in our thinking by running our code and seeing what happens. If your time is being wasted because your tools are bad or too slow - don't accept it, fix it. -Put effort into your documentation, commmit messages, and code comments - but +Put effort into your documentation, commit messages, and code comments - but don't go overboard. A good commit message is wonderful - but if the information was important enough to go in a commit message, ask yourself if it would be even better as a code comment. From 5eb2d33109f9cb5ccfb088e58ab69a18f672a90f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Aug 2024 15:44:57 -0400 Subject: [PATCH 39/61] bcachefs: trivial open_bucket_add_buckets() cleanup Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 8563c2d26847..084b03b8c6e9 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1022,9 +1022,6 @@ static int __open_bucket_add_buckets(struct btree_trans *trans, open_bucket_for_each(c, ptrs, ob, i) __clear_bit(ob->dev, devs.d); - if (erasure_code && ec_open_bucket(c, ptrs)) - return 0; - ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs, nr_replicas, nr_effective, have_cache, erasure_code, flags); @@ -1079,7 +1076,7 @@ static int open_bucket_add_buckets(struct btree_trans *trans, { int ret; - if (erasure_code) { + if (erasure_code && !ec_open_bucket(trans->c, ptrs)) { ret = __open_bucket_add_buckets(trans, ptrs, wp, devs_have, target, erasure_code, nr_replicas, nr_effective, have_cache, From 8628ecc8a03752097eaf489172284e79c838d364 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Aug 2024 11:40:47 -0400 Subject: [PATCH 40/61] bcachefs: bch2_sb_nr_devices() factoring out a helper Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-members.c | 9 +++++++++ fs/bcachefs/sb-members.h | 2 ++ fs/bcachefs/super-io.c | 9 ++------- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 39196f2a4197..b4ea6490d71c 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -464,3 +464,12 @@ void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k) __bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c)); } } + +unsigned bch2_sb_nr_devices(const struct bch_sb *sb) +{ + unsigned nr = 0; + + for (unsigned i = 0; i < sb->nr_devices; i++) + nr += bch2_member_exists((struct bch_sb *) sb, i); + return nr; +} diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index dd93192ec065..f307f2857603 
100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -307,6 +307,8 @@ static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev) return false; } +unsigned bch2_sb_nr_devices(const struct bch_sb *); + static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) { return (struct bch_member_cpu) { diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 2a72782d6d17..d86d5dae54c9 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1295,15 +1295,9 @@ void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, bool print_layout, unsigned fields) { - u64 fields_have = 0; - unsigned nr_devices = 0; - if (!out->nr_tabstops) printbuf_tabstop_push(out, 44); - for (int i = 0; i < sb->nr_devices; i++) - nr_devices += bch2_member_exists(sb, i); - prt_printf(out, "External UUID:\t"); pr_uuid(out, sb->user_uuid.b); prt_newline(out); @@ -1359,9 +1353,10 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, prt_newline(out); prt_printf(out, "Clean:\t%llu\n", BCH_SB_CLEAN(sb)); - prt_printf(out, "Devices:\t%u\n", nr_devices); + prt_printf(out, "Devices:\t%u\n", bch2_sb_nr_devices(sb)); prt_printf(out, "Sections:\t"); + u64 fields_have = 0; vstruct_for_each(sb, f) fields_have |= 1 << le32_to_cpu(f->type); prt_bitflags(out, bch2_sb_fields, fields_have); From fb150ae595ab2ada2b173d442e24596bb17ce9de Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Sun, 11 Aug 2024 00:52:24 +0800 Subject: [PATCH 41/61] bcachefs: Remove unused parameter of bkey_mantissa The idx parameter of bkey_mantissa became unused since commit b904a7991802 ("bcachefs: Go back to 16 bit mantissa bkey floats"), so remove it. Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 575e1d0b6eeb..61ccb56b26f5 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -585,8 +585,7 @@ static unsigned rw_aux_tree_bsearch(struct btree *b, } static inline unsigned bkey_mantissa(const struct bkey_packed *k, - const struct bkey_float *f, - unsigned idx) + const struct bkey_float *f) { u64 v; @@ -668,7 +667,7 @@ static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t, EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED); f->exponent = shift; - mantissa = bkey_mantissa(m, f, j); + mantissa = bkey_mantissa(m, f); /* * If we've got garbage bits, set them to all 1s - it's legal for the @@ -1133,7 +1132,7 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, goto slowpath; l = f->mantissa; - r = bkey_mantissa(packed_search, f, n); + r = bkey_mantissa(packed_search, f); if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n)) goto slowpath; From bf8ed7f7f6a7283c887ccdbc9ea3af59cebf78fa Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Sun, 11 Aug 2024 00:52:25 +0800 Subject: [PATCH 42/61] bcachefs: Remove unused parameter of bkey_mantissa_bits_dropped The idx parameter of bkey_mantissa_bits_dropped is unused, remove it. 
Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 61ccb56b26f5..45e8ab5c3c5c 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -1097,8 +1097,7 @@ static inline void prefetch_four_cachelines(void *p) } static inline bool bkey_mantissa_bits_dropped(const struct btree *b, - const struct bkey_float *f, - unsigned idx) + const struct bkey_float *f) { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits; @@ -1134,7 +1133,7 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, l = f->mantissa; r = bkey_mantissa(packed_search, f); - if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n)) + if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f)) goto slowpath; n = n * 2 + (l < r); From 1420e8e753c2d14533ae2ac3c40dfa217c9df539 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Sat, 10 Aug 2024 23:51:40 +0800 Subject: [PATCH 43/61] bcachefs: Remove dead code in __build_ro_aux_tree This logic is no longer useful since commit 3ce8b463e3e0 ("bcachefs: kill bset_tree->max_key"), so remove it. Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 45e8ab5c3c5c..59158eb6d41f 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -753,9 +753,6 @@ retry: EBUG_ON(tree_to_bkey(b, t, j) != k); } - while (k != btree_bkey_last(b, t)) - prev = k, k = bkey_p_next(k); - if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) { bkey_init(&min_key.k); min_key.k.p = b->data->min_key; From 8a5264ff128f71377acd4babd91fded59995d7b9 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Sun, 11 Aug 2024 00:11:46 +0800 Subject: [PATCH 44/61] bcachefs: Convert open-coded extra computation to helper This patch replaces the open-coded extra computation with the eytzinger1_extra() helper. Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 59158eb6d41f..00a821f617a5 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -732,7 +732,7 @@ retry: return; } - t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; + t->extra = eytzinger1_extra(t->size - 1); /* First we figure out where the first key in each cacheline is */ eytzinger1_for_each(j, t->size - 1) { From 00f6b3450c804daa0e11a6c01570bfc63b67563c Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Mon, 12 Aug 2024 16:06:09 +0800 Subject: [PATCH 45/61] bcachefs: Minimize the search range used to calculate the mantissa When the search key's mantissa is larger than node i's, we know that the search key is larger than the first key of the cacheline corresponding to node i, so when calculating the mantissas of the nodes to the right of node i, the left end of the search range can be the first key of node i. With the search range minimized, the mantissa being calculated has more useful bits, reducing slow path comparisons. This also means we can now remove all the prev array stuff.
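As a rough illustration of the effect, here is a standalone userspace toy model (not bcachefs code: the names mantissa() and fls64_() are made up here, and anchoring the 16-bit window at the first differing bit of the range endpoints is a simplification of what make_bfloat() does with the packed key bits):

```
/*
 * Toy model: 64-bit keys, and a 16-bit mantissa window anchored at the
 * highest bit where the two range endpoints differ.
 */
#include <stdio.h>
#include <stdint.h>

#define MANTISSA_BITS	16

static unsigned fls64_(uint64_t v)		/* highest set bit, 0-based */
{
	unsigned r = 0;
	while (v >>= 1)
		r++;
	return r;
}

/* Mantissa of @key for a search range with endpoints @lo and @hi */
static unsigned mantissa(uint64_t key, uint64_t lo, uint64_t hi)
{
	unsigned top = fls64_(lo ^ hi);		/* endpoints assumed to differ */
	unsigned shift = top >= MANTISSA_BITS - 1 ? top - (MANTISSA_BITS - 1) : 0;

	return (key >> shift) & ((1U << MANTISSA_BITS) - 1);
}

int main(void)
{
	uint64_t a = 0x123456789000ULL, b = a + 0x40;	/* two nearby keys */

	/* Whole-key range: both keys truncate to the same mantissa */
	printf("wide:   %#x %#x\n", mantissa(a, 0, ~0ULL), mantissa(b, 0, ~0ULL));

	/* Narrowed range around the keys: the mantissas now differ */
	printf("narrow: %#x %#x\n", mantissa(a, a - 0x100, b + 0x100),
				    mantissa(b, a - 0x100, b + 0x100));
	return 0;
}
```

With the whole-key range both nearby keys collapse onto the same 16-bit mantissa and the lookup would have to fall back to the slow path; with the narrowed range their mantissas differ and the fast path can tell them apart.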
Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 00a821f617a5..43b40f6b8c0a 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -616,7 +616,7 @@ static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t, struct bkey_packed *m = tree_to_bkey(b, t, j); struct bkey_packed *l = is_power_of_2(j) ? min_key - : tree_to_prev_bkey(b, t, j >> ffs(j)); + : tree_to_bkey(b, t, j >> ffs(j)); struct bkey_packed *r = is_power_of_2(j + 1) ? max_key : tree_to_bkey(b, t, j >> (ffz(j) + 1)); From 39041c223f01a389bff283ea40dc7812e7d5282a Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Mon, 12 Aug 2024 17:04:04 +0800 Subject: [PATCH 46/61] bcachefs: Remove the prev array stuff After reducing the search range when building the aux tree, the prev array stuff is no longer useful, so remove it. Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 34 ++++------------------------------ 1 file changed, 4 insertions(+), 30 deletions(-) diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 43b40f6b8c0a..0ccc1d962428 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -304,11 +304,6 @@ struct bkey_float { }; #define BKEY_MANTISSA_BITS 16 -static unsigned bkey_float_byte_offset(unsigned idx) -{ - return idx * sizeof(struct bkey_float); -} - struct ro_aux_tree { u8 nothing[0]; struct bkey_float f[]; @@ -328,8 +323,7 @@ static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) return t->aux_data_offset; case BSET_RO_AUX_TREE: return t->aux_data_offset + - DIV_ROUND_UP(t->size * sizeof(struct bkey_float) + - t->size * sizeof(u8), 8); + DIV_ROUND_UP(t->size * sizeof(struct bkey_float), 8); case BSET_RW_AUX_TREE: return t->aux_data_offset + DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8); @@ -360,14 +354,6 @@ static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b, return __aux_tree_base(b, t); } -static u8 *ro_aux_tree_prev(const struct btree *b, - const struct bset_tree *t) -{ - EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); - - return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size); -} - static struct bkey_float *bkey_float(const struct btree *b, const struct bset_tree *t, unsigned idx) @@ -479,15 +465,6 @@ static inline struct bkey_packed *tree_to_bkey(const struct btree *b, bkey_float(b, t, j)->key_offset); } -static struct bkey_packed *tree_to_prev_bkey(const struct btree *b, - const struct bset_tree *t, - unsigned j) -{ - unsigned prev_u64s = ro_aux_tree_prev(b, t)[j]; - - return (void *) ((u64 *) tree_to_bkey(b, t, j)->_data - prev_u64s); -} - static struct rw_aux_tree *rw_aux_tree(const struct btree *b, const struct bset_tree *t) { @@ -689,8 +666,7 @@ static unsigned __bset_tree_capacity(struct btree *b, const struct bset_tree *t) static unsigned bset_ro_tree_capacity(struct btree *b, const struct bset_tree *t) { - return __bset_tree_capacity(b, t) / - (sizeof(struct bkey_float) + sizeof(u8)); + return __bset_tree_capacity(b, t) / sizeof(struct bkey_float); } static unsigned bset_rw_tree_capacity(struct btree *b, const struct bset_tree *t) @@ -719,7 +695,7 @@ static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) { - struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); + struct bkey_packed *k = btree_bkey_first(b, t); struct bkey_i min_key, max_key; unsigned 
cacheline = 1; @@ -737,7 +713,7 @@ retry: /* First we figure out where the first key in each cacheline is */ eytzinger1_for_each(j, t->size - 1) { while (bkey_to_cacheline(b, t, k) < cacheline) - prev = k, k = bkey_p_next(k); + k = bkey_p_next(k); if (k >= btree_bkey_last(b, t)) { /* XXX: this path sucks */ @@ -745,11 +721,9 @@ retry: goto retry; } - ro_aux_tree_prev(b, t)[j] = prev->u64s; bkey_float(b, t, j)->key_offset = bkey_to_cacheline_offset(b, t, cacheline++, k); - EBUG_ON(tree_to_prev_bkey(b, t, j) != prev); EBUG_ON(tree_to_bkey(b, t, j) != k); } From ab43d59dd12dc022f5525cc959d594767ee6f0cc Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Wed, 14 Aug 2024 22:20:07 +0800 Subject: [PATCH 47/61] bcachefs: Remove unused parameter The iter parameter here is unused, so remove it. Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 1 - fs/bcachefs/bset.h | 4 ++-- fs/bcachefs/btree_trans_commit.c | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 0ccc1d962428..1b66b2c7e018 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -969,7 +969,6 @@ static void bch2_bset_fix_lookup_table(struct btree *b, } void bch2_bset_insert(struct btree *b, - struct btree_node_iter *iter, struct bkey_packed *where, struct bkey_i *insert, unsigned clobber_u64s) diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 5c6c7a14fa0f..6953d55b72cc 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -270,8 +270,8 @@ void bch2_bset_init_first(struct btree *, struct bset *); void bch2_bset_init_next(struct btree *, struct btree_node_entry *); void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); -void bch2_bset_insert(struct btree *, struct btree_node_iter *, - struct bkey_packed *, struct bkey_i *, unsigned); +void bch2_bset_insert(struct btree *, struct bkey_packed *, struct bkey_i *, + unsigned); void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned); /* Bkey utility code */ diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index a0101d9c5d83..91884da4e30a 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -214,7 +214,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); overwrite: - bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); + bch2_bset_insert(b, k, insert, clobber_u64s); new_u64s = k->u64s; fix_iter: if (clobber_u64s != new_u64s) From 7daf52dfc54a3f4fe003280a9dcacd3a7179d081 Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Thu, 15 Aug 2024 16:57:43 +0800 Subject: [PATCH 48/61] bcachefs: drop unused posix acl handlers Remove struct nop_posix_acl_{access,default} for bcachefs, which doesn't depend on these xattr handlers in its inode->i_op->listxattr() method in any way. There's nothing more to do than to simply remove the handlers. They've been effectively unused ever since we introduced the new posix acl api. See [1] for details.
Link [1]: https://patchwork.kernel.org/project/linux-fsdevel/cover/20230125-fs-acl-remove-generic-xattr-handlers-v3-0-f760cc58967d@kernel.org/ Signed-off-by: Youling Tang Signed-off-by: Kent Overstreet --- fs/bcachefs/xattr.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 9f810fb36ba1..7b3d9fdae244 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -601,10 +601,6 @@ static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { const struct xattr_handler *bch2_xattr_handlers[] = { &bch_xattr_user_handler, -#ifdef CONFIG_BCACHEFS_POSIX_ACL - &nop_posix_acl_access, - &nop_posix_acl_default, -#endif &bch_xattr_trusted_handler, &bch_xattr_security_handler, #ifndef NO_BCACHEFS_FS From a58b204f2690c77f433f6ca86366b8d5f6d2744d Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Thu, 15 Aug 2024 16:57:44 +0800 Subject: [PATCH 49/61] bcachefs: Simplify bch2_xattr_emit() implementation Use helper functions to make code more readable. Similar to commit a5488f29835c ("fs: simplify ->listxattr() implementation") Signed-off-by: Youling Tang Signed-off-by: Kent Overstreet --- fs/bcachefs/xattr.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 7b3d9fdae244..56c8d3fe55a4 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -250,17 +250,27 @@ static int __bch2_xattr_emit(const char *prefix, return 0; } +static inline const char *bch2_xattr_prefix(unsigned type, struct dentry *dentry) +{ + const struct xattr_handler *handler = bch2_xattr_type_to_handler(type); + + if (!xattr_handler_can_list(handler, dentry)) + return NULL; + + return xattr_prefix(handler); +} + static int bch2_xattr_emit(struct dentry *dentry, const struct bch_xattr *xattr, struct xattr_buf *buf) { - const struct xattr_handler *handler = - bch2_xattr_type_to_handler(xattr->x_type); + const char *prefix; - return handler && (!handler->list || handler->list(dentry)) - ? __bch2_xattr_emit(handler->prefix ?: handler->name, - xattr->x_name, xattr->x_name_len, buf) - : 0; + prefix = bch2_xattr_prefix(xattr->x_type, dentry); + if (!prefix) + return 0; + + return __bch2_xattr_emit(prefix, xattr->x_name, xattr->x_name_len, buf); } static int bch2_xattr_list_bcachefs(struct bch_fs *c, From 5e4aebc996c20471e087b474657e51b3d8ed68e3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 Aug 2024 15:11:20 -0400 Subject: [PATCH 50/61] bcachefs: Drop memalloc_nofs_save() in bch2_btree_node_mem_alloc() It's really not needed: the only locks used here are the btree cache lock, which we drop for GFP_WAIT allocations, and btree node locks - but we also drop those for GFP_WAIT allocations. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index e52a06d3418c..662f0f79b7af 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -671,9 +671,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea : &bc->freed_nonpcpu; struct btree *b, *b2; u64 start_time = local_clock(); - unsigned flags; - flags = memalloc_nofs_save(); mutex_lock(&bc->lock); /* @@ -745,8 +743,6 @@ out: bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], start_time); - memalloc_nofs_restore(flags); - int ret = bch2_trans_relock(trans); if (unlikely(ret)) { bch2_btree_node_to_freelist(c, b); @@ -781,7 +777,6 @@ err: } mutex_unlock(&bc->lock); - memalloc_nofs_restore(flags); return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc); } From b2cb0efa285e962312c7230a582d9830494a6ea2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 Aug 2024 15:33:38 -0400 Subject: [PATCH 51/61] bcachefs: bch2_time_stats_reset() Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 9 ++++++++- fs/bcachefs/time_stats.c | 14 ++++++++++++++ fs/bcachefs/time_stats.h | 3 ++- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 9f8ca6a52245..4a373581b684 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -233,7 +233,7 @@ write_attribute(perf_test); #define x(_name) \ static struct attribute sysfs_time_stat_##_name = \ - { .name = #_name, .mode = 0444 }; + { .name = #_name, .mode = 0644 }; BCH_TIME_STATS() #undef x @@ -722,6 +722,13 @@ SHOW(bch2_fs_time_stats) STORE(bch2_fs_time_stats) { + struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); + +#define x(name) \ + if (attr == &sysfs_time_stat_##name) \ + bch2_time_stats_reset(&c->times[BCH_TIME_##name]); + BCH_TIME_STATS() +#undef x return size; } SYSFS_OPS(bch2_fs_time_stats); diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c index 4508e9dcbee2..3fe82757f93a 100644 --- a/fs/bcachefs/time_stats.c +++ b/fs/bcachefs/time_stats.c @@ -151,6 +151,20 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) } } +void bch2_time_stats_reset(struct bch2_time_stats *stats) +{ + spin_lock_irq(&stats->lock); + unsigned offset = offsetof(struct bch2_time_stats, min_duration); + memset((void *) stats + offset, 0, sizeof(*stats) - offset); + + if (stats->buffer) { + int cpu; + for_each_possible_cpu(cpu) + per_cpu_ptr(stats->buffer, cpu)->nr = 0; + } + spin_unlock_irq(&stats->lock); +} + void bch2_time_stats_exit(struct bch2_time_stats *stats) { free_percpu(stats->buffer); diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h index 5df61403744b..dc6493f7bbab 100644 --- a/fs/bcachefs/time_stats.h +++ b/fs/bcachefs/time_stats.h @@ -70,6 +70,7 @@ struct time_stat_buffer { struct bch2_time_stats { spinlock_t lock; bool have_quantiles; + struct time_stat_buffer __percpu *buffer; /* all fields are in nanoseconds */ u64 min_duration; u64 max_duration; @@ -87,7 +88,6 @@ struct bch2_time_stats { struct mean_and_variance_weighted duration_stats_weighted; struct mean_and_variance_weighted freq_stats_weighted; - struct time_stat_buffer __percpu *buffer; }; struct bch2_time_stats_quantiles { @@ -142,6 +142,7 @@ static inline bool track_event_change(struct bch2_time_stats *stats, bool v) return false; } +void bch2_time_stats_reset(struct bch2_time_stats *); void bch2_time_stats_exit(struct 
bch2_time_stats *); void bch2_time_stats_init(struct bch2_time_stats *); From c056baab980f9a1160fe39be14a80c83995392d5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 20 Aug 2024 05:10:11 +0100 Subject: [PATCH 52/61] bcachefs: Do not check folio_has_private() folio_has_private() is an attractive nuisance; filesystem authors generally don't realise that it actually checks two flags (one of which is never set by bcachefs). There's no need to check the private flag at all; for folios owned by bcachefs, we know that folio->private is NULL when the private flag is clear and non-NULL when the private flag is set. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-pagecache.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h index fd7d692c087e..fad911cf5068 100644 --- a/fs/bcachefs/fs-io-pagecache.h +++ b/fs/bcachefs/fs-io-pagecache.h @@ -99,9 +99,7 @@ static inline void bch2_folio_release(struct folio *folio) static inline struct bch_folio *__bch2_folio(struct folio *folio) { - return folio_has_private(folio) - ? (struct bch_folio *) folio_get_private(folio) - : NULL; + return folio_get_private(folio); } static inline struct bch_folio *bch2_folio(struct folio *folio) From 029aef1f61f9a2a2f2a038474715d1b8cbef1bf1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 30 Jun 2024 09:25:56 -0400 Subject: [PATCH 53/61] bcachefs: Assert that we don't lock nodes when !trans->locked We rely on the trans->locked to know if a trans has nodes locked for assertions about deadlocks; there can't be more than one trans in the same process that is locked. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_locking.h | 7 +++---- fs/bcachefs/btree_update_interior.c | 24 ++++++++++++------------ fs/bcachefs/btree_update_interior.h | 2 ++ 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index dd0a2a1aa2bc..7c07f9fa9add 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -218,14 +218,12 @@ static inline int __btree_node_lock_nopath(struct btree_trans *trans, bool lock_may_not_fail, unsigned long ip) { - int ret; - trans->lock_may_not_fail = lock_may_not_fail; trans->lock_must_abort = false; trans->locking = b; - ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait, - bch2_six_check_for_deadlock, trans, ip); + int ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait, + bch2_six_check_for_deadlock, trans, ip); WRITE_ONCE(trans->locking, NULL); WRITE_ONCE(trans->locking_wait.start_time, 0); @@ -284,6 +282,7 @@ static inline int btree_node_lock(struct btree_trans *trans, int ret = 0; EBUG_ON(level >= BTREE_MAX_DEPTH); + bch2_trans_verify_not_unlocked(trans); if (likely(six_trylock_type(&b->lock, type)) || btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) || diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 396d052474bb..1433aefb47fd 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -731,6 +731,18 @@ static void btree_update_nodes_written(struct btree_update *as) bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, "%s", bch2_err_str(ret)); err: + /* + * Ensure transaction is unlocked before using btree_node_lock_nopath() + * (the use of which is always suspect, we need to work on removing this + * in the future) + * + * It should be, 
but bch2_path_get_unlocked_mut() -> bch2_path_get() + * calls bch2_path_upgrade(), before we call path_make_mut(), so we may + * rarely end up with a locked path besides the one we have here: + */ + bch2_trans_unlock(trans); + bch2_trans_begin(trans); + /* * We have to be careful because another thread might be getting ready * to free as->b and calling btree_update_reparent() on us - we'll @@ -750,18 +762,6 @@ err: * we're in journal error state: */ - /* - * Ensure transaction is unlocked before using - * btree_node_lock_nopath() (the use of which is always suspect, - * we need to work on removing this in the future) - * - * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get() - * calls bch2_path_upgrade(), before we call path_make_mut(), so - * we may rarely end up with a locked path besides the one we - * have here: - */ - bch2_trans_unlock(trans); - bch2_trans_begin(trans); btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans, as->btree_id, b->c.level, b->key.k.p); struct btree_path *path = trans->paths + path_idx; diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 02c6ecada97c..10f400957f21 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -159,6 +159,8 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, unsigned level, unsigned flags) { + bch2_trans_verify_not_unlocked(trans); + return bch2_foreground_maybe_merge_sibling(trans, path, level, flags, btree_prev_sib) ?: bch2_foreground_maybe_merge_sibling(trans, path, level, flags, From cf9450abcc26f010c0f45c8ebe43473753ea65ca Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Thu, 15 Aug 2024 23:40:53 +0800 Subject: [PATCH 54/61] bcachefs: Refactor bch2_bset_fix_lookup_table bch2_bset_fix_lookup_table is too complicated to be easily understood, and the comment "l now > where" there is also incorrect when where == t->end_offset. This patch therefore refactors the function; the idea is that when where >= rw_aux_tree(b, t)[t->size - 1].offset, we don't need to adjust the rw aux tree.
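The observation can be seen in a much-simplified standalone model (assumptions: the lookup table is modelled as a bare array of offsets and fix_offsets() is made up for this sketch; the real function additionally repoints or drops entries that fall inside the clobbered range):

```
/*
 * Simplified model of the offset fixup: after inserting "shift" u64s at
 * offset "where", only table entries pointing at or past the insertion
 * point move.  If where is at or past the last entry's offset, no existing
 * entry needs adjusting - the case the refactor handles up front.
 */
#include <stdio.h>

static void fix_offsets(unsigned *tbl, unsigned size, unsigned where, unsigned shift)
{
	for (unsigned i = 0; i < size; i++)
		if (tbl[i] >= where)
			tbl[i] += shift;
}

int main(void)
{
	unsigned tbl[] = { 0, 8, 16, 24 };	/* offsets of each cacheline's first key */
	unsigned n = sizeof(tbl) / sizeof(tbl[0]);

	fix_offsets(tbl, n, 20, 3);	/* insert before the last entry: it moves */
	fix_offsets(tbl, n, 40, 5);	/* insert past the last entry: nothing moves */

	for (unsigned i = 0; i < n; i++)
		printf("%u ", tbl[i]);
	printf("\n");			/* prints: 0 8 16 27 */
	return 0;
}
```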
Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/bset.c | 130 ++++++++++++++++++++++++--------------------- 1 file changed, 69 insertions(+), 61 deletions(-) diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 1b66b2c7e018..d1f6092624d8 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -885,66 +885,18 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, /* Insert */ -static void bch2_bset_fix_lookup_table(struct btree *b, - struct bset_tree *t, - struct bkey_packed *_where, - unsigned clobber_u64s, - unsigned new_u64s) +static void rw_aux_tree_insert_entry(struct btree *b, + struct bset_tree *t, + unsigned idx) { - int shift = new_u64s - clobber_u64s; - unsigned l, j, where = __btree_node_key_to_offset(b, _where); - - EBUG_ON(bset_has_ro_aux_tree(t)); - - if (!bset_has_rw_aux_tree(t)) - return; - - /* returns first entry >= where */ - l = rw_aux_tree_bsearch(b, t, where); - - if (!l) /* never delete first entry */ - l++; - else if (l < t->size && - where < t->end_offset && - rw_aux_tree(b, t)[l].offset == where) - rw_aux_tree_set(b, t, l++, _where); - - /* l now > where */ - - for (j = l; - j < t->size && - rw_aux_tree(b, t)[j].offset < where + clobber_u64s; - j++) - ; - - if (j < t->size && - rw_aux_tree(b, t)[j].offset + shift == - rw_aux_tree(b, t)[l - 1].offset) - j++; - - memmove(&rw_aux_tree(b, t)[l], - &rw_aux_tree(b, t)[j], - (void *) &rw_aux_tree(b, t)[t->size] - - (void *) &rw_aux_tree(b, t)[j]); - t->size -= j - l; - - for (j = l; j < t->size; j++) - rw_aux_tree(b, t)[j].offset += shift; - - EBUG_ON(l < t->size && - rw_aux_tree(b, t)[l].offset == - rw_aux_tree(b, t)[l - 1].offset); + EBUG_ON(!idx || idx > t->size); + struct bkey_packed *start = rw_aux_to_bkey(b, t, idx - 1); + struct bkey_packed *end = idx < t->size + ? rw_aux_to_bkey(b, t, idx) + : btree_bkey_last(b, t); if (t->size < bset_rw_tree_capacity(b, t) && - (l < t->size - ? rw_aux_tree(b, t)[l].offset - : t->end_offset) - - rw_aux_tree(b, t)[l - 1].offset > - L1_CACHE_BYTES / sizeof(u64)) { - struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1); - struct bkey_packed *end = l < t->size - ? 
rw_aux_to_bkey(b, t, l) - : btree_bkey_last(b, t); + (void *) end - (void *) start > L1_CACHE_BYTES) { struct bkey_packed *k = start; while (1) { @@ -953,17 +905,73 @@ static void bch2_bset_fix_lookup_table(struct btree *b, break; if ((void *) k - (void *) start >= L1_CACHE_BYTES) { - memmove(&rw_aux_tree(b, t)[l + 1], - &rw_aux_tree(b, t)[l], + memmove(&rw_aux_tree(b, t)[idx + 1], + &rw_aux_tree(b, t)[idx], (void *) &rw_aux_tree(b, t)[t->size] - - (void *) &rw_aux_tree(b, t)[l]); + (void *) &rw_aux_tree(b, t)[idx]); t->size++; - rw_aux_tree_set(b, t, l, k); + rw_aux_tree_set(b, t, idx, k); break; } } } +} +static void bch2_bset_fix_lookup_table(struct btree *b, + struct bset_tree *t, + struct bkey_packed *_where, + unsigned clobber_u64s, + unsigned new_u64s) +{ + int shift = new_u64s - clobber_u64s; + unsigned idx, j, where = __btree_node_key_to_offset(b, _where); + + EBUG_ON(bset_has_ro_aux_tree(t)); + + if (!bset_has_rw_aux_tree(t)) + return; + + if (where > rw_aux_tree(b, t)[t->size - 1].offset) { + rw_aux_tree_insert_entry(b, t, t->size); + goto verify; + } + + /* returns first entry >= where */ + idx = rw_aux_tree_bsearch(b, t, where); + + if (rw_aux_tree(b, t)[idx].offset == where) { + if (!idx) { /* never delete first entry */ + idx++; + } else if (where < t->end_offset) { + rw_aux_tree_set(b, t, idx++, _where); + } else { + EBUG_ON(where != t->end_offset); + rw_aux_tree_insert_entry(b, t, --t->size); + goto verify; + } + } + + EBUG_ON(idx < t->size && rw_aux_tree(b, t)[idx].offset <= where); + if (idx < t->size && + rw_aux_tree(b, t)[idx].offset + shift == + rw_aux_tree(b, t)[idx - 1].offset) { + memmove(&rw_aux_tree(b, t)[idx], + &rw_aux_tree(b, t)[idx + 1], + (void *) &rw_aux_tree(b, t)[t->size] - + (void *) &rw_aux_tree(b, t)[idx + 1]); + t->size -= 1; + } + + for (j = idx; j < t->size; j++) + rw_aux_tree(b, t)[j].offset += shift; + + EBUG_ON(idx < t->size && + rw_aux_tree(b, t)[idx].offset == + rw_aux_tree(b, t)[idx - 1].offset); + + rw_aux_tree_insert_entry(b, t, idx); + +verify: bch2_bset_verify_rw_aux_tree(b, t); bset_aux_tree_verify(b); } From 6e9852d9ee4a6a36ab90be96c61025a7674d2e3d Mon Sep 17 00:00:00 2001 From: Chen Yufan Date: Thu, 22 Aug 2024 10:57:31 +0800 Subject: [PATCH 55/61] bcachefs: Convert to use jiffies macros Use jiffies macros instead of using jiffies directly to handle wraparound. 
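The failure mode is easy to reproduce outside the kernel; below is a userspace sketch, where sketch_time_after() is a stand-in for the real time_after() macro:

```
/* Userspace sketch, not kernel code: why wraparound breaks "deadline < now" */
#include <stdio.h>
#include <limits.h>

/* Same idea as include/linux/jiffies.h: compare the signed difference */
#define sketch_time_after(a, b)	((long)((b) - (a)) < 0)

int main(void)
{
	unsigned long last_updated = ULONG_MAX - 5;	/* just before the counter wraps */
	unsigned long deadline = last_updated + 10;	/* wraps around to 4 */

	unsigned long now1 = last_updated + 3;		/* only 3 ticks elapsed */
	unsigned long now2 = last_updated + 13;		/* 13 ticks elapsed, overdue */

	printf("naive, not yet due: %d (wrongly claims expired)\n", deadline < now1);
	printf("macro, not yet due: %d\n", sketch_time_after(now1, deadline));
	printf("macro, overdue:     %d\n", sketch_time_after(now2, deadline));
	return 0;
}
```

time_before() and time_after_eq(), used in the other hunks of this patch, follow the same signed-difference pattern.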
Signed-off-by: Chen Yufan Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 3 ++- fs/bcachefs/journal_io.c | 3 ++- fs/bcachefs/thread_with_file.c | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index ba46f1c1d78a..b54ce7f88927 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -30,6 +30,7 @@ #include #include #include +#include static void bch2_discard_one_bucket_fast(struct bch_dev *, u64); @@ -2183,7 +2184,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, * freespace/need_discard/need_gc_gens btrees as needed: */ while (1) { - if (last_updated + HZ * 10 < jiffies) { + if (time_after(jiffies, last_updated + HZ * 10)) { bch_info(ca, "%s: currently at %llu/%llu", __func__, iter.pos.offset, ca->mi.nbuckets); last_updated = jiffies; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 7664b68e6a15..32b886feb2ca 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1950,7 +1950,8 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * if (error || w->noflush || (!w->must_flush && - (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && + time_before(jiffies, j->last_flush_write + + msecs_to_jiffies(c->opts.journal_flush_delay)) && test_bit(JOURNAL_may_skip_flush, &j->flags))) { w->noflush = true; SET_JSET_NO_FLUSH(w->data, true); diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index 0807ce9b171a..fb3442a7c67f 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -387,7 +387,7 @@ again: seen = buf->buf.nr; char *n = memchr(buf->buf.data, '\n', seen); - if (!n && timeout != MAX_SCHEDULE_TIMEOUT && jiffies >= until) { + if (!n && timeout != MAX_SCHEDULE_TIMEOUT && time_after_eq(jiffies, until)) { spin_unlock(&buf->lock); return -ETIME; } From 9b617df38895a1bbbfb380422b8368c364dbca67 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 Aug 2024 03:50:22 -0400 Subject: [PATCH 56/61] bcachefs: darray: convert to alloc_hooks() better memory allocation profiling support Signed-off-by: Kent Overstreet --- fs/bcachefs/darray.c | 4 ++-- fs/bcachefs/darray.h | 24 +++++++++--------------- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c index b7d223f85873..4f06cd8bbbe1 100644 --- a/fs/bcachefs/darray.c +++ b/fs/bcachefs/darray.c @@ -4,12 +4,12 @@ #include #include "darray.h" -int __bch2_darray_resize(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp) +int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp) { if (new_size > d->size) { new_size = roundup_pow_of_two(new_size); - void *data = kvmalloc_array(new_size, element_size, gfp); + void *data = kvmalloc_array_noprof(new_size, element_size, gfp); if (!data) return -ENOMEM; diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h index 4b340d13caac..8f4c3f0665c4 100644 --- a/fs/bcachefs/darray.h +++ b/fs/bcachefs/darray.h @@ -22,29 +22,23 @@ struct { \ typedef DARRAY(char) darray_char; typedef DARRAY(char *) darray_str; -int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t); +int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); -static inline int __darray_resize(darray_char *d, size_t element_size, - size_t new_size, gfp_t gfp) -{ - return unlikely(new_size > d->size) - ? 
__bch2_darray_resize(d, element_size, new_size, gfp) - : 0; -} +#define __bch2_darray_resize(...) alloc_hooks(__bch2_darray_resize_noprof(__VA_ARGS__)) + +#define __darray_resize(_d, _element_size, _new_size, _gfp) \ + (unlikely((_new_size) > (_d)->size) \ + ? __bch2_darray_resize((_d), (_element_size), (_new_size), (_gfp))\ + : 0) #define darray_resize_gfp(_d, _new_size, _gfp) \ - unlikely(__darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp)) + __darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp) #define darray_resize(_d, _new_size) \ darray_resize_gfp(_d, _new_size, GFP_KERNEL) -static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more, gfp_t gfp) -{ - return __darray_resize(d, t_size, d->nr + more, gfp); -} - #define darray_make_room_gfp(_d, _more, _gfp) \ - __darray_make_room((darray_char *) (_d), sizeof((_d)->data[0]), (_more), _gfp) + darray_resize_gfp((_d), (_d)->nr + (_more), _gfp) #define darray_make_room(_d, _more) \ darray_make_room_gfp(_d, _more, GFP_KERNEL) From 3a1294051d818a39dc753513a6b5173b09b59667 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 Aug 2024 18:21:31 -0400 Subject: [PATCH 57/61] bcachefs: rcu_pending now works in userspace Signed-off-by: Kent Overstreet --- fs/bcachefs/rcu_pending.c | 55 ++++++++++++++++++++++++++++++++++++--- fs/bcachefs/rcu_pending.h | 2 ++ 2 files changed, 53 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c index 8f8d914d3998..40a20192eee8 100644 --- a/fs/bcachefs/rcu_pending.c +++ b/fs/bcachefs/rcu_pending.c @@ -115,24 +115,40 @@ static bool __rcu_pending_has_pending(struct rcu_pending_pcpu *p) static void rcu_pending_list_merge(struct rcu_pending_list *l1, struct rcu_pending_list *l2) { +#ifdef __KERNEL__ if (!l1->head) l1->head = l2->head; else l1->tail->next = l2->head; - l1->tail = l2->tail; +#else + if (!l1->head) + l1->head = l2->head; + else + l1->tail->next.next = (void *) l2->head; +#endif + l1->tail = l2->tail; l2->head = l2->tail = NULL; } static void rcu_pending_list_add(struct rcu_pending_list *l, struct rcu_head *n) { +#ifdef __KERNEL__ if (!l->head) l->head = n; else l->tail->next = n; l->tail = n; n->next = NULL; +#else + if (!l->head) + l->head = n; + else + l->tail->next.next = (void *) n; + l->tail = n; + n->next.next = NULL; +#endif } static void merge_expired_lists(struct rcu_pending_pcpu *p) @@ -144,6 +160,19 @@ static void merge_expired_lists(struct rcu_pending_pcpu *p) rcu_pending_list_merge(expired, i); } +#ifndef __KERNEL__ +static inline void kfree_bulk(size_t nr, void ** p) +{ + while (nr--) + kfree(*p); +} + +#define local_irq_save(flags) \ +do { \ + flags = 0; \ +} while (0) +#endif + static noinline void __process_finished_items(struct rcu_pending *pending, struct rcu_pending_pcpu *p, unsigned long flags) @@ -177,7 +206,11 @@ static noinline void __process_finished_items(struct rcu_pending *pending, while (list) { struct rcu_head *obj = list; +#ifdef __KERNEL__ list = obj->next; +#else + list = (void *) obj->next.next; +#endif /* * low bit of pointer indicates whether rcu_head needs @@ -204,7 +237,11 @@ static noinline void __process_finished_items(struct rcu_pending *pending, while (list) { struct rcu_head *obj = list; +#ifdef __KERNEL__ list = obj->next; +#else + list = (void *) obj->next.next; +#endif obj->func(obj); } break; @@ -216,7 +253,11 @@ static noinline void __process_finished_items(struct rcu_pending *pending, while (list) { struct rcu_head *obj = list; +#ifdef 
__KERNEL__ list = obj->next; +#else + list = (void *) obj->next.next; +#endif pending->process(pending, obj); } break; @@ -265,11 +306,13 @@ static void rcu_pending_rcu_cb(struct rcu_head *rcu) unsigned long flags; spin_lock_irqsave(&p->lock, flags); - if (__rcu_pending_has_pending(p)) + if (__rcu_pending_has_pending(p)) { + spin_unlock_irqrestore(&p->lock, flags); __call_rcu(p->parent->srcu, &p->cb, rcu_pending_rcu_cb); - else + } else { p->cb_armed = false; - spin_unlock_irqrestore(&p->lock, flags); + spin_unlock_irqrestore(&p->lock, flags); + } } static __always_inline struct rcu_pending_seq * @@ -489,7 +532,11 @@ static struct rcu_head *rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu *p) static_array_for_each(p->lists, i) if (i->head) { ret = i->head; +#ifdef __KERNEL__ i->head = ret->next; +#else + i->head = (void *) ret->next.next; +#endif if (!i->head) i->tail = NULL; goto out; diff --git a/fs/bcachefs/rcu_pending.h b/fs/bcachefs/rcu_pending.h index a875c640da8d..71a2f4ddaade 100644 --- a/fs/bcachefs/rcu_pending.h +++ b/fs/bcachefs/rcu_pending.h @@ -2,6 +2,8 @@ #ifndef _LINUX_RCU_PENDING_H #define _LINUX_RCU_PENDING_H +#include + struct rcu_pending; typedef void (*rcu_pending_process_fn)(struct rcu_pending *, struct rcu_head *); From 926ed267d5e37fa38b2f40d890ae4db5b5d02f4c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 24 Aug 2024 11:38:21 -0400 Subject: [PATCH 58/61] bcachefs: Switch gc bucket array to a genradix A user with a 30 tb device is overflowing the INT_MAX limit on vmalloc allocations... Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/btree_gc.c | 18 ++++-------------- fs/bcachefs/buckets.h | 15 +-------------- fs/bcachefs/buckets_types.h | 8 -------- 4 files changed, 6 insertions(+), 37 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index d43bbdbac7ec..d74b8e547ee5 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -542,7 +542,7 @@ struct bch_dev { * gc_gens_lock, for device resize - holding any is sufficient for * access: Or rcu_read_lock(), but only for dev_ptr_stale(): */ - struct bucket_array __rcu *buckets_gc; + GENRADIX(struct bucket) buckets_gc; struct bucket_gens __rcu *bucket_gens; u8 *oldest_gen; unsigned long *buckets_nouse; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index eb3002c4eae7..120ffd68ab0a 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -753,10 +753,8 @@ static void bch2_gc_free(struct bch_fs *c) genradix_free(&c->reflink_gc_table); genradix_free(&c->gc_stripes); - for_each_member_device(c, ca) { - kvfree(rcu_dereference_protected(ca->buckets_gc, 1)); - ca->buckets_gc = NULL; - } + for_each_member_device(c, ca) + genradix_free(&ca->buckets_gc); } static int bch2_gc_start(struct bch_fs *c) @@ -910,20 +908,12 @@ static int bch2_gc_alloc_start(struct bch_fs *c) int ret = 0; for_each_member_device(c, ca) { - struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) + - ca->mi.nbuckets * sizeof(struct bucket), - GFP_KERNEL|__GFP_ZERO); - if (!buckets) { + ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL); + if (ret) { bch2_dev_put(ca); ret = -BCH_ERR_ENOMEM_gc_alloc_start; break; } - - buckets->first_bucket = ca->mi.first_bucket; - buckets->nbuckets = ca->mi.nbuckets; - buckets->nbuckets_minus_first = - buckets->nbuckets - buckets->first_bucket; - rcu_assign_pointer(ca->buckets_gc, buckets); } bch_err_fn(c, ret); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index edbdffd508fc..e2cb7b24b220 
100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -80,22 +80,9 @@ static inline void bucket_lock(struct bucket *b) TASK_UNINTERRUPTIBLE); } -static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca) -{ - return rcu_dereference_check(ca->buckets_gc, - !ca->fs || - percpu_rwsem_is_held(&ca->fs->mark_lock) || - lockdep_is_held(&ca->fs->state_lock) || - lockdep_is_held(&ca->bucket_lock)); -} - static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) { - struct bucket_array *buckets = gc_bucket_array(ca); - - if (b - buckets->first_bucket >= buckets->nbuckets_minus_first) - return NULL; - return buckets->b + b; + return genradix_ptr(&ca->buckets_gc, b); } static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index a19460a1bb7c..28bd09a253c8 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -19,14 +19,6 @@ struct bucket { u32 stripe_sectors; } __aligned(sizeof(long)); -struct bucket_array { - struct rcu_head rcu; - u16 first_bucket; - size_t nbuckets; - size_t nbuckets_minus_first; - struct bucket b[] __counted_by(nbuckets); -}; - struct bucket_gens { struct rcu_head rcu; u16 first_bucket; From 46e6d838cf3171ba73a251138acc9060771ee731 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sat, 24 Aug 2024 15:57:41 +0200 Subject: [PATCH 59/61] bcachefs: Annotate struct bch_xattr with __counted_by() Add the __counted_by compiler attribute to the flexible array member x_name to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE. Signed-off-by: Thorsten Blum Signed-off-by: Kent Overstreet --- fs/bcachefs/xattr_format.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h index e9f810539552..c7916011ef34 100644 --- a/fs/bcachefs/xattr_format.h +++ b/fs/bcachefs/xattr_format.h @@ -13,7 +13,7 @@ struct bch_xattr { __u8 x_type; __u8 x_name_len; __le16 x_val_len; - __u8 x_name[]; + __u8 x_name[] __counted_by(x_name_len); } __packed __aligned(8); #endif /* _BCACHEFS_XATTR_FORMAT_H */ From ff8ec216d806d8421e017e9e10a27bd93fbeb5da Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Sat, 24 Aug 2024 09:27:24 +0800 Subject: [PATCH 60/61] bcachefs: support idmap mounts We enable idmapped mounts for bcachefs. Here, we just pass down the user_namespace argument from the VFS methods to the relevant helpers. The idmap test in bcachefs is as following: ``` 1. losetup /dev/loop1 bcachefs.img 2. ./bcachefs format /dev/loop1 3. mount -t bcachefs /dev/loop1 /mnt/bcachefs/ 4. ./mount-idmapped --map-mount b:0:1000:1 /mnt/bcachefs /mnt/idmapped1/ ll /mnt/bcachefs total 2 drwx------. 2 root root 0 Jun 14 14:10 lost+found -rw-r--r--. 1 root root 1945 Jun 14 14:12 profile ll /mnt/idmapped1/ total 2 drwx------. 2 1000 1000 0 Jun 14 14:10 lost+found -rw-r--r--. 
1 1000 1000 1945 Jun 14 14:12 profile Signed-off-by: Hongbo Li Reviewed-by: Christian Brauner Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 50 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 7a9c164cbfa3..4414a9f0c36e 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -380,6 +380,8 @@ __bch2_create(struct mnt_idmap *idmap, subvol_inum inum; struct bch_subvolume subvol; u64 journal_seq = 0; + kuid_t kuid; + kgid_t kgid; int ret; /* @@ -406,13 +408,15 @@ __bch2_create(struct mnt_idmap *idmap, retry: bch2_trans_begin(trans); + kuid = mapped_fsuid(idmap, i_user_ns(&dir->v)); + kgid = mapped_fsgid(idmap, i_user_ns(&dir->v)); ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?: bch2_create_trans(trans, inode_inum(dir), &dir_u, &inode_u, !(flags & BCH_CREATE_TMPFILE) ? &dentry->d_name : NULL, - from_kuid(i_user_ns(&dir->v), current_fsuid()), - from_kgid(i_user_ns(&dir->v), current_fsgid()), + from_kuid(i_user_ns(&dir->v), kuid), + from_kgid(i_user_ns(&dir->v), kgid), mode, rdev, default_acl, acl, snapshot_src, flags) ?: bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, @@ -835,11 +839,17 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap, { struct bch_fs *c = inode->v.i_sb->s_fs_info; unsigned int ia_valid = attr->ia_valid; + kuid_t kuid; + kgid_t kgid; - if (ia_valid & ATTR_UID) - bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid); - if (ia_valid & ATTR_GID) - bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid); + if (ia_valid & ATTR_UID) { + kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid); + bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid); + } + if (ia_valid & ATTR_GID) { + kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid); + bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid); + } if (ia_valid & ATTR_SIZE) bi->bi_size = attr->ia_size; @@ -854,11 +864,11 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap, if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; kgid_t gid = ia_valid & ATTR_GID - ? attr->ia_gid + ? 
kgid : inode->v.i_gid; - if (!in_group_p(gid) && - !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID)) + if (!in_group_or_capable(idmap, &inode->v, + make_vfsgid(idmap, i_user_ns(&inode->v), gid))) mode &= ~S_ISGID; bi->bi_mode = mode; } @@ -874,17 +884,23 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap, struct btree_iter inode_iter = { NULL }; struct bch_inode_unpacked inode_u; struct posix_acl *acl = NULL; + kuid_t kuid; + kgid_t kgid; int ret; mutex_lock(&inode->ei_update_lock); qid = inode->ei_qid; - if (attr->ia_valid & ATTR_UID) - qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid); + if (attr->ia_valid & ATTR_UID) { + kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid); + qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid); + } - if (attr->ia_valid & ATTR_GID) - qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid); + if (attr->ia_valid & ATTR_GID) { + kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid); + qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid); + } ret = bch2_fs_quota_transfer(c, inode, qid, ~0, KEY_TYPE_QUOTA_PREALLOC); @@ -940,13 +956,15 @@ static int bch2_getattr(struct mnt_idmap *idmap, { struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry)); struct bch_fs *c = inode->v.i_sb->s_fs_info; + vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v); stat->dev = inode->v.i_sb->s_dev; stat->ino = inode->v.i_ino; stat->mode = inode->v.i_mode; stat->nlink = inode->v.i_nlink; - stat->uid = inode->v.i_uid; - stat->gid = inode->v.i_gid; + stat->uid = vfsuid_into_kuid(vfsuid); + stat->gid = vfsgid_into_kgid(vfsgid); stat->rdev = inode->v.i_rdev; stat->size = i_size_read(&inode->v); stat->atime = inode_get_atime(&inode->v); @@ -2209,7 +2227,7 @@ static struct file_system_type bcache_fs_type = { .name = "bcachefs", .init_fs_context = bch2_init_fs_context, .kill_sb = bch2_kill_sb, - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("bcachefs"); From b0e4d11e0214048872065fd82680e88ed1d9f379 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 26 Aug 2024 12:11:36 +0200 Subject: [PATCH 61/61] bcachefs: Annotate bch_replicas_entry_{v0,v1} with __counted_by() Add the __counted_by compiler attribute to the flexible array members devs to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE. Increment nr_devs before adding a new device to the devs array and adjust the array indexes accordingly. Add a helper macro for adding a new device. In bch2_journal_read(), explicitly set nr_devs to 0. 
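Why the increment has to happen before the store can be seen in a standalone userspace sketch (struct entry and entry_add_dev() are made-up stand-ins for bch_replicas_entry_v1 and replicas_entry_add_dev(); the __counted_by fallback is only so the sketch builds with compilers that lack the attribute):

```
#include <stdio.h>
#include <stdlib.h>

/* Use the counted_by attribute where the compiler supports it */
#ifndef __counted_by
# if defined(__has_attribute)
#  if __has_attribute(__counted_by__)
#   define __counted_by(m)	__attribute__((__counted_by__(m)))
#  endif
# endif
#endif
#ifndef __counted_by
# define __counted_by(m)
#endif

struct entry {
	unsigned char		nr_devs;
	unsigned char		devs[] __counted_by(nr_devs);
};

/*
 * Bump the counter first: with __counted_by, bounds checkers treat devs[]
 * as having exactly nr_devs elements, so "devs[nr_devs++] = d" would index
 * one element past the array's checked size at the time of the access.
 */
#define entry_add_dev(e, d) ({			\
	(e)->nr_devs++;				\
	(e)->devs[(e)->nr_devs - 1] = (d);	\
})

int main(void)
{
	struct entry *e = calloc(1, sizeof(*e) + 4);	/* room for 4 devices */

	if (!e)
		return 1;

	entry_add_dev(e, 3);
	entry_add_dev(e, 7);

	for (unsigned i = 0; i < e->nr_devs; i++)
		printf("dev %u\n", e->devs[i]);

	free(e);
	return 0;
}
```

With the attribute in effect, CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE take nr_devs as the array's size when the element is written, which is why the helper increments the counter first.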
Signed-off-by: Thorsten Blum Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 2 +- fs/bcachefs/journal_io.c | 3 ++- fs/bcachefs/replicas.c | 6 +++--- fs/bcachefs/replicas_format.h | 9 +++++++-- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index a2274429e7f4..86d1c26384bd 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -740,7 +740,7 @@ static int __trigger_extent(struct btree_trans *trans, return ret; } else if (!p.has_ec) { *replicas_sectors += disk_sectors; - acc_replicas_key.replicas.devs[acc_replicas_key.replicas.nr_devs++] = p.ptr.dev; + replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev); } else { ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags); if (ret) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 32b886feb2ca..30460bce04be 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1353,6 +1353,7 @@ int bch2_journal_read(struct bch_fs *c, genradix_for_each(&c->journal_entries, radix_iter, _i) { struct bch_replicas_padded replicas = { .e.data_type = BCH_DATA_journal, + .e.nr_devs = 0, .e.nr_required = 1, }; @@ -1379,7 +1380,7 @@ int bch2_journal_read(struct bch_fs *c, goto err; darray_for_each(i->ptrs, ptr) - replicas.e.devs[replicas.e.nr_devs++] = ptr->dev; + replicas_entry_add_dev(&replicas.e, ptr->dev); bch2_replicas_entry_sort(&replicas.e); diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 12b1d28b7eb4..e0880cb79345 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -122,7 +122,7 @@ static void extent_to_replicas(struct bkey_s_c k, continue; if (!p.has_ec) - r->devs[r->nr_devs++] = p.ptr.dev; + replicas_entry_add_dev(r, p.ptr.dev); else r->nr_required = 0; } @@ -139,7 +139,7 @@ static void stripe_to_replicas(struct bkey_s_c k, for (ptr = s.v->ptrs; ptr < s.v->ptrs + s.v->nr_blocks; ptr++) - r->devs[r->nr_devs++] = ptr->dev; + replicas_entry_add_dev(r, ptr->dev); } void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e, @@ -180,7 +180,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e, e->nr_required = 1; darray_for_each(devs, i) - e->devs[e->nr_devs++] = *i; + replicas_entry_add_dev(e, *i); bch2_replicas_entry_sort(e); } diff --git a/fs/bcachefs/replicas_format.h b/fs/bcachefs/replicas_format.h index b97208195d06..b7eff904acdb 100644 --- a/fs/bcachefs/replicas_format.h +++ b/fs/bcachefs/replicas_format.h @@ -5,7 +5,7 @@ struct bch_replicas_entry_v0 { __u8 data_type; __u8 nr_devs; - __u8 devs[]; + __u8 devs[] __counted_by(nr_devs); } __packed; struct bch_sb_field_replicas_v0 { @@ -17,7 +17,7 @@ struct bch_replicas_entry_v1 { __u8 data_type; __u8 nr_devs; __u8 nr_required; - __u8 devs[]; + __u8 devs[] __counted_by(nr_devs); } __packed; struct bch_sb_field_replicas { @@ -28,4 +28,9 @@ struct bch_sb_field_replicas { #define replicas_entry_bytes(_i) \ (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) +#define replicas_entry_add_dev(e, d) ({ \ + (e)->nr_devs++; \ + (e)->devs[(e)->nr_devs - 1] = (d); \ +}) + #endif /* _BCACHEFS_REPLICAS_FORMAT_H */