bcachefs: Really don't hold btree locks while btree IOs are in flight

This is something we've attempted to stick to for quite some time, as it
helps guarantee filesystem latency - but there are a few remaining paths
that this patch fixes.

This is also necessary for an upcoming patch to update btree pointers
after every btree write - since the btree write completion path will now
be doing btree operations.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Kent Overstreet, 2021-07-10 23:03:15 -04:00 (committed by Kent Overstreet)
commit 19d5432445, parent e3a67bdb6e
5 changed files with 116 additions and 62 deletions

fs/bcachefs/btree_cache.c

@@ -187,6 +187,17 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
 	int ret = 0;
 
 	lockdep_assert_held(&bc->lock);
+wait_on_io:
+	if (b->flags & ((1U << BTREE_NODE_dirty)|
+			(1U << BTREE_NODE_read_in_flight)|
+			(1U << BTREE_NODE_write_in_flight))) {
+		if (!flush)
+			return -ENOMEM;
+
+		/* XXX: waiting on IO with btree cache lock held */
+		bch2_btree_node_wait_on_read(b);
+		bch2_btree_node_wait_on_write(b);
+	}
 
 	if (!six_trylock_intent(&b->c.lock))
 		return -ENOMEM;
@@ -194,25 +205,26 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
 	if (!six_trylock_write(&b->c.lock))
 		goto out_unlock_intent;
 
+	/* recheck under lock */
+	if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
+			(1U << BTREE_NODE_write_in_flight))) {
+		if (!flush)
+			goto out_unlock;
+		six_unlock_write(&b->c.lock);
+		six_unlock_intent(&b->c.lock);
+		goto wait_on_io;
+	}
+
 	if (btree_node_noevict(b))
 		goto out_unlock;
 
 	if (!btree_node_may_write(b))
 		goto out_unlock;
 
-	if (btree_node_dirty(b) &&
-	    test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
-		goto out_unlock;
-
-	if (btree_node_dirty(b) ||
-	    btree_node_write_in_flight(b) ||
-	    btree_node_read_in_flight(b)) {
-		if (!flush)
+	if (btree_node_dirty(b)) {
+		if (!flush ||
+		    test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
 			goto out_unlock;
-
-		wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
-			       TASK_UNINTERRUPTIBLE);
-
 		/*
 		 * Using the underscore version because we don't want to compact
 		 * bsets after the write, since this node is about to be evicted
@@ -224,8 +236,9 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
 		else
 			__bch2_btree_node_write(c, b);
 
-		/* wait for any in flight btree write */
-		btree_node_wait_on_io(b);
+		six_unlock_write(&b->c.lock);
+		six_unlock_intent(&b->c.lock);
+		goto wait_on_io;
 	}
 out:
 	if (b->hash_val && !ret)
@@ -581,6 +594,7 @@ got_node:
 	}
 
 	BUG_ON(btree_node_hashed(b));
 	BUG_ON(btree_node_dirty(b));
+	BUG_ON(btree_node_write_in_flight(b));
 out:
 	b->flags = 0;
@@ -634,6 +648,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
 {
 	struct btree_cache *bc = &c->btree_cache;
 	struct btree *b;
+	u32 seq;
 
 	BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
 	/*
@@ -663,31 +678,31 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
 		return NULL;
 	}
 
 	set_btree_node_read_in_flight(b);
 
+	six_unlock_write(&b->c.lock);
+	seq = b->c.lock.state.seq;
+	six_unlock_intent(&b->c.lock);
+
 	/* Unlock before doing IO: */
 	if (iter && sync)
 		bch2_trans_unlock(iter->trans);
 
 	bch2_btree_node_read(c, b, sync);
-	six_unlock_write(&b->c.lock);
 
-	if (!sync) {
-		six_unlock_intent(&b->c.lock);
+	if (!sync)
 		return NULL;
-	}
 
 	/*
 	 * XXX: this will probably always fail because btree_iter_relock()
 	 * currently fails for iterators that aren't pointed at a valid btree
 	 * node
 	 */
-	if (iter && !bch2_trans_relock(iter->trans)) {
-		six_unlock_intent(&b->c.lock);
+	if (iter && !bch2_trans_relock(iter->trans))
 		return ERR_PTR(-EINTR);
-	}
 
-	if (lock_type == SIX_LOCK_read)
-		six_lock_downgrade(&b->c.lock);
+	if (!six_relock_type(&b->c.lock, lock_type, seq))
+		return ERR_PTR(-EINTR);
 
 	return b;
 }
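
The hunk above is the heart of the patch: rather than holding the intent lock across the read, bch2_btree_node_fill() now saves the six lock's sequence number, drops the lock entirely for the duration of the IO, and re-takes it afterwards with six_relock_type(), which succeeds only if the sequence number is unchanged - i.e. no one else write-locked the node in the interim. In outline (a condensed sketch of the pattern, not the verbatim function; do_io() stands in for bch2_btree_node_read()):

	u32 seq = b->c.lock.state.seq;	/* remember the lock's generation */
	six_unlock_intent(&b->c.lock);	/* no btree locks held across IO */

	do_io(b);

	/* fails if anyone took a write lock while we weren't holding it: */
	if (!six_relock_type(&b->c.lock, lock_type, seq))
		return ERR_PTR(-EINTR);	/* caller restarts the transaction */
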
@@ -831,11 +846,12 @@ lock_node:
 	}
 
 	if (unlikely(btree_node_read_in_flight(b))) {
+		u32 seq = b->c.lock.state.seq;
+
 		six_unlock_type(&b->c.lock, lock_type);
 		bch2_trans_unlock(iter->trans);
 
-		wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
-			       TASK_UNINTERRUPTIBLE);
+		bch2_btree_node_wait_on_read(b);
 
 		/*
 		 * XXX: check if this always fails - btree_iter_relock()
@@ -844,7 +860,9 @@ lock_node:
 		 */
 		if (iter && !bch2_trans_relock(iter->trans))
 			return ERR_PTR(-EINTR);
-		goto retry;
+
+		if (!six_relock_type(&b->c.lock, lock_type, seq))
+			goto retry;
 	}
 
 	prefetch(b->aux_data);
@@ -923,8 +941,7 @@ lock_node:
 	}
 
 	/* XXX: waiting on IO with btree locks held: */
-	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
-		       TASK_UNINTERRUPTIBLE);
+	__bch2_btree_node_wait_on_read(b);
 
 	prefetch(b->aux_data);
@@ -979,16 +996,24 @@ void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k)
 	b = btree_cache_find(bc, k);
 	if (!b)
 		return;
+wait_on_io:
+	/* not allowed to wait on io with btree locks held: */
+
+	/* XXX we're called from btree_gc which will be holding other btree
+	 * nodes locked
+	 * */
+	__bch2_btree_node_wait_on_read(b);
+	__bch2_btree_node_wait_on_write(b);
 
 	six_lock_intent(&b->c.lock, NULL, NULL);
 	six_lock_write(&b->c.lock, NULL, NULL);
 
-	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
-		       TASK_UNINTERRUPTIBLE);
-	__bch2_btree_node_write(c, b);
-
-	/* wait for any in flight btree write */
-	btree_node_wait_on_io(b);
+	if (btree_node_dirty(b)) {
+		__bch2_btree_node_write(c, b);
+		six_unlock_write(&b->c.lock);
+		six_unlock_intent(&b->c.lock);
+		goto wait_on_io;
+	}
 
 	BUG_ON(btree_node_dirty(b));

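__btree_node_reclaim() and bch2_btree_node_evict() now share the same shape: wait for in-flight IO with no btree locks held, take the locks, then recheck, because new IO may have started (or the node may have been re-dirtied) in the window between waiting and locking; if so, drop the locks and wait again. Distilled to pseudocode (the helper names here are hypothetical, for illustration only):

wait_on_io:
	wait_for_node_io(b);		/* no btree locks held while sleeping */

	lock_node(b);
	if (node_needs_write(b)) {	/* raced: state changed before we locked */
		start_node_write(b);
		unlock_node(b);
		goto wait_on_io;	/* wait for the new IO, again unlocked */
	}

	/* node is clean and idle: safe to evict/reclaim */
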
fs/bcachefs/btree_io.c

@@ -22,6 +22,44 @@
 #include <linux/sched/mm.h>
 
+void bch2_btree_node_io_unlock(struct btree *b)
+{
+	EBUG_ON(!btree_node_write_in_flight(b));
+
+	clear_btree_node_write_in_flight(b);
+	wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
+}
+
+void bch2_btree_node_io_lock(struct btree *b)
+{
+	wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
+			    TASK_UNINTERRUPTIBLE);
+}
+
+void __bch2_btree_node_wait_on_read(struct btree *b)
+{
+	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+		       TASK_UNINTERRUPTIBLE);
+}
+
+void __bch2_btree_node_wait_on_write(struct btree *b)
+{
+	wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
+		       TASK_UNINTERRUPTIBLE);
+}
+
+void bch2_btree_node_wait_on_read(struct btree *b)
+{
+	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+		       TASK_UNINTERRUPTIBLE);
+}
+
+void bch2_btree_node_wait_on_write(struct btree *b)
+{
+	wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
+		       TASK_UNINTERRUPTIBLE);
+}
+
 static void verify_no_dups(struct btree *b,
 			   struct bkey_packed *start,
 			   struct bkey_packed *end)
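
These helpers, moved out of line from btree_io.h (see below), pair the BTREE_NODE_read_in_flight and BTREE_NODE_write_in_flight flag bits with the kernel's bit-waitqueue API: the IO path sets the bit, the completion path clears it and wakes waiters, and anyone needing the IO finished sleeps on the bit itself rather than on the node's six lock. Roughly, the three pieces fit together as follows (a composite assembled from calls in this patch, not one contiguous function):

	/* IO submission path: */
	set_btree_node_write_in_flight(b);	/* sets the flag bit */

	/* IO completion path (bch2_btree_node_io_unlock() above): */
	clear_btree_node_write_in_flight(b);
	wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);

	/* a waiter, with no btree locks held: */
	wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
		       TASK_UNINTERRUPTIBLE);
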
@@ -432,7 +470,8 @@ void bch2_btree_init_next(struct btree_trans *trans,
 	EBUG_ON(iter && iter->l[b->c.level].b != b);
 	BUG_ON(bset_written(b, bset(b, &b->set[1])));
 
-	if (b->nsets == MAX_BSETS) {
+	if (b->nsets == MAX_BSETS &&
+	    !btree_node_write_in_flight(b)) {
 		unsigned log_u64s[] = {
 			ilog2(bset_u64s(&b->set[0])),
 			ilog2(bset_u64s(&b->set[1])),
@@ -1402,8 +1441,6 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
 	btree_pos_to_text(&PBUF(buf), c, b);
 	trace_btree_read(c, b);
 
-	set_btree_node_read_in_flight(b);
-
 	if (bch2_verify_all_btree_replicas &&
 	    !btree_node_read_all_replicas(c, b, sync))
 		return;
@@ -1480,6 +1517,8 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
 	bkey_copy(&b->key, k);
 	BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
 
+	set_btree_node_read_in_flight(b);
+
 	bch2_btree_node_read(c, b, true);
 
 	if (btree_node_read_error(b)) {
@@ -1525,7 +1564,7 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
 	struct btree_write *w = btree_prev_write(b);
 
 	bch2_btree_complete_write(c, b, w);
-	btree_node_io_unlock(b);
+	bch2_btree_node_io_unlock(b);
 }
 
 static void bch2_btree_node_write_error(struct bch_fs *c,
@@ -1707,6 +1746,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
 	bool validate_before_checksum = false;
 	void *data;
 
+	BUG_ON(btree_node_write_in_flight(b));
+
 	if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
 		return;
@@ -1734,7 +1775,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
 		 * XXX waiting on btree writes with btree locks held -
 		 * this can deadlock, and we hit the write error path
 		 */
-		btree_node_wait_on_io(b);
+		bch2_btree_node_wait_on_write(b);
 		continue;
 	}
fs/bcachefs/btree_io.h

@@ -52,24 +52,12 @@ struct btree_write_bio {
 	struct bch_write_bio	wbio;
 };
 
-static inline void btree_node_io_unlock(struct btree *b)
-{
-	EBUG_ON(!btree_node_write_in_flight(b));
-	clear_btree_node_write_in_flight(b);
-	wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
-}
-
-static inline void btree_node_io_lock(struct btree *b)
-{
-	wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
-			    TASK_UNINTERRUPTIBLE);
-}
-
-static inline void btree_node_wait_on_io(struct btree *b)
-{
-	wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
-		       TASK_UNINTERRUPTIBLE);
-}
+void bch2_btree_node_io_unlock(struct btree *);
+void bch2_btree_node_io_lock(struct btree *);
+void __bch2_btree_node_wait_on_read(struct btree *);
+void __bch2_btree_node_wait_on_write(struct btree *);
+void bch2_btree_node_wait_on_read(struct btree *);
+void bch2_btree_node_wait_on_write(struct btree *);
 
 static inline bool btree_node_may_write(struct btree *b)
 {
@@ -169,7 +157,7 @@ static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
 	}
 
 	six_unlock_type(&b->c.lock, lock_held);
-	btree_node_wait_on_io(b);
+	bch2_btree_node_wait_on_write(b);
 	btree_node_lock_type(c, b, lock_held);
 }

fs/bcachefs/btree_update_interior.c

@@ -567,7 +567,7 @@ static void btree_update_nodes_written(struct btree_update *as)
 		six_unlock_read(&old->c.lock);
 
 		if (seq == as->old_nodes_seq[i])
-			btree_node_wait_on_io(old);
+			bch2_btree_node_wait_on_write(old);
 	}
 
 	/*
fs/bcachefs/debug.c

@@ -133,7 +133,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
 	if (c->opts.nochanges)
 		return;
 
-	btree_node_io_lock(b);
+	bch2_btree_node_io_lock(b);
 	mutex_lock(&c->verify_lock);
 
 	if (!c->verify_ondisk) {
@@ -176,7 +176,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
 	}
 out:
 	mutex_unlock(&c->verify_lock);
-	btree_node_io_unlock(b);
+	bch2_btree_node_io_unlock(b);
 }
 
 #ifdef CONFIG_DEBUG_FS