bcachefs: Don't block on journal reservation with btree locks held

Fixes a deadlock between the allocator thread, when it first starts up,
and journal replay

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet 2018-11-18 21:35:59 -05:00 committed by Kent Overstreet
parent 129550c4d0
commit f1a79365a7
4 changed files with 108 additions and 59 deletions

View File

@@ -638,12 +638,12 @@ static void btree_update_wait_on_journal(struct closure *cl)
int ret;
ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
if (ret < 0)
goto err;
if (!ret) {
if (ret == -EAGAIN) {
continue_at(cl, btree_update_wait_on_journal, system_wq);
return;
}
if (ret < 0)
goto err;
bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
err:

View File

@@ -344,19 +344,35 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
trans_for_each_entry(trans, i)
BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK);
u64s = 0;
trans_for_each_entry(trans, i)
u64s += jset_u64s(i->k->k.u64s);
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)
? bch2_journal_res_get(&c->journal,
&trans->journal_res,
u64s, u64s)
: 0;
if (ret)
return ret;
if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
u64s = 0;
trans_for_each_entry(trans, i)
u64s += jset_u64s(i->k->k.u64s);
while ((ret = bch2_journal_res_get(&c->journal,
&trans->journal_res, u64s,
JOURNAL_RES_GET_NONBLOCK)) == -EAGAIN) {
struct btree_iter *iter = trans->entries[0].iter;
bch2_btree_iter_unlock(iter);
ret = bch2_journal_res_get(&c->journal,
&trans->journal_res, u64s,
JOURNAL_RES_GET_CHECK);
if (ret)
return ret;
if (!bch2_btree_iter_relock(iter)) {
trans_restart(" (iter relock after journal res get blocked)");
return -EINTR;
}
}
if (ret)
return ret;
}
multi_lock_write(c, trans);

View File

@@ -335,15 +335,14 @@ u64 bch2_inode_journal_seq(struct journal *j, u64 inode)
}
static int __journal_res_get(struct journal *j, struct journal_res *res,
unsigned u64s_min, unsigned u64s_max)
unsigned flags)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *buf;
int ret;
retry:
ret = journal_res_get_fast(j, res, u64s_min, u64s_max);
if (ret)
return ret;
if (journal_res_get_fast(j, res, flags))
return 0;
spin_lock(&j->lock);
/*
@@ -351,10 +350,9 @@ retry:
* that just did journal_entry_open() and call journal_entry_close()
* unnecessarily
*/
ret = journal_res_get_fast(j, res, u64s_min, u64s_max);
if (ret) {
if (journal_res_get_fast(j, res, flags)) {
spin_unlock(&j->lock);
return 1;
return 0;
}
/*
@@ -377,7 +375,12 @@ retry:
spin_unlock(&j->lock);
return -EROFS;
case JOURNAL_ENTRY_INUSE:
/* haven't finished writing out the previous one: */
/*
* The current journal entry is still open, but we failed to get
* a journal reservation because there's not enough space in it,
* and we can't close it and start another because we haven't
* finished writing out the previous entry:
*/
spin_unlock(&j->lock);
trace_journal_entry_full(c);
goto blocked;
@@ -408,7 +411,7 @@ retry:
blocked:
if (!j->res_get_blocked_start)
j->res_get_blocked_start = local_clock() ?: 1;
return 0;
return -EAGAIN;
}
/*
@@ -422,14 +425,14 @@ blocked:
* btree node write locks.
*/
int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
unsigned u64s_min, unsigned u64s_max)
unsigned flags)
{
int ret;
wait_event(j->wait,
(ret = __journal_res_get(j, res, u64s_min,
u64s_max)));
return ret < 0 ? ret : 0;
(ret = __journal_res_get(j, res, flags)) != -EAGAIN ||
(flags & JOURNAL_RES_GET_NONBLOCK));
return ret;
}
u64 bch2_journal_last_unwritten_seq(struct journal *j)
@@ -453,28 +456,55 @@ u64 bch2_journal_last_unwritten_seq(struct journal *j)
* btree root - every journal entry contains the roots of all the btrees, so it
* doesn't need to bother with getting a journal reservation
*/
int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *parent)
int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl)
{
int ret;
struct bch_fs *c = container_of(j, struct bch_fs, journal);
bool need_reclaim = false;
retry:
spin_lock(&j->lock);
BUG_ON(seq > journal_cur_seq(j));
if (seq < journal_cur_seq(j) ||
journal_entry_is_open(j)) {
spin_unlock(&j->lock);
return 1;
return 0;
}
if (journal_cur_seq(j) < seq) {
switch (journal_buf_switch(j, false)) {
case JOURNAL_ENTRY_ERROR:
spin_unlock(&j->lock);
return -EROFS;
case JOURNAL_ENTRY_INUSE:
/* haven't finished writing out the previous one: */
trace_journal_entry_full(c);
goto blocked;
case JOURNAL_ENTRY_CLOSED:
break;
case JOURNAL_UNLOCKED:
goto retry;
}
}
BUG_ON(journal_cur_seq(j) < seq);
if (!journal_entry_open(j)) {
need_reclaim = true;
goto blocked;
}
ret = journal_entry_open(j);
if (!ret)
closure_wait(&j->async_wait, parent);
spin_unlock(&j->lock);
if (!ret)
bch2_journal_reclaim_work(&j->reclaim_work.work);
return 0;
blocked:
if (!j->res_get_blocked_start)
j->res_get_blocked_start = local_clock() ?: 1;
return ret;
closure_wait(&j->async_wait, cl);
spin_unlock(&j->lock);
if (need_reclaim)
bch2_journal_reclaim_work(&j->reclaim_work.work);
return -EAGAIN;
}
static int journal_seq_error(struct journal *j, u64 seq)
@@ -594,11 +624,10 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
void bch2_journal_meta_async(struct journal *j, struct closure *parent)
{
struct journal_res res;
unsigned u64s = jset_u64s(0);
memset(&res, 0, sizeof(res));
bch2_journal_res_get(j, &res, u64s, u64s);
bch2_journal_res_get(j, &res, jset_u64s(0), 0);
bch2_journal_res_put(j, &res);
bch2_journal_flush_seq_async(j, res.seq, parent);
@@ -607,12 +636,11 @@ void bch2_journal_meta_async(struct journal *j, struct closure *parent)
int bch2_journal_meta(struct journal *j)
{
struct journal_res res;
unsigned u64s = jset_u64s(0);
int ret;
memset(&res, 0, sizeof(res));
ret = bch2_journal_res_get(j, &res, u64s, u64s);
ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
if (ret)
return ret;

View File

@@ -272,12 +272,14 @@ static inline void bch2_journal_res_put(struct journal *j,
}
int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
unsigned, unsigned);
unsigned);
#define JOURNAL_RES_GET_NONBLOCK (1 << 0)
#define JOURNAL_RES_GET_CHECK (1 << 1)
static inline int journal_res_get_fast(struct journal *j,
struct journal_res *res,
unsigned u64s_min,
unsigned u64s_max)
unsigned flags)
{
union journal_res_state old, new;
u64 v = atomic64_read(&j->reservations.counter);
@@ -289,42 +291,45 @@ static inline int journal_res_get_fast(struct journal *j,
* Check if there is still room in the current journal
* entry:
*/
if (old.cur_entry_offset + u64s_min > j->cur_entry_u64s)
if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
return 0;
res->offset = old.cur_entry_offset;
res->u64s = min(u64s_max, j->cur_entry_u64s -
old.cur_entry_offset);
if (flags & JOURNAL_RES_GET_CHECK)
return 1;
journal_state_inc(&new);
new.cur_entry_offset += res->u64s;
journal_state_inc(&new);
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
res->ref = true;
res->idx = new.idx;
res->seq = le64_to_cpu(j->buf[res->idx].data->seq);
res->ref = true;
res->idx = old.idx;
res->offset = old.cur_entry_offset;
res->seq = le64_to_cpu(j->buf[old.idx].data->seq);
return 1;
}
static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res,
unsigned u64s_min, unsigned u64s_max)
unsigned u64s, unsigned flags)
{
int ret;
EBUG_ON(res->ref);
EBUG_ON(u64s_max < u64s_min);
EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
if (journal_res_get_fast(j, res, u64s_min, u64s_max))
res->u64s = u64s;
if (journal_res_get_fast(j, res, flags))
goto out;
ret = bch2_journal_res_get_slowpath(j, res, u64s_min, u64s_max);
ret = bch2_journal_res_get_slowpath(j, res, flags);
if (ret)
return ret;
out:
lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_);
EBUG_ON(!res->ref);
if (!(flags & JOURNAL_RES_GET_CHECK)) {
lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_);
EBUG_ON(!res->ref);
}
return 0;
}