bcachefs: Improve bch2_btree_iter_traverse_all()
By changing it to upgrade iterators to intent locks to avoid lock restarts, we can simplify __bch2_btree_node_lock() quite a bit. This fixes a probable bug where it could potentially drop a lock on an unrelated error but still succeed, instead of causing a transaction restart.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
parent 0ef107859b
commit 2527dd9158
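Editor's note: the core of the change is the new pass in __btree_iter_traverse_all() (see the hunks below): iterators are sorted into lock order, then walked backwards so that locks_want propagates to earlier iterators, upgrading them to intent locks up front instead of restarting mid-traversal. A rough user-space model of that pass, with deliberately simplified stand-in types (struct iter, iter_lock_cmp(), and the direct locks_want assignment are illustrative reductions, not the kernel API):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Hypothetical reduction of a btree iterator: just the fields the
 * upgrade pass looks at. Not the kernel's struct btree_iter. */
struct iter {
	int		btree_id;
	uint64_t	pos;
	unsigned	locks_want;	/* levels that should hold intent locks */
};

/* Stand-in for btree_iter_lock_cmp(): btree id, then position. */
static int iter_lock_cmp(const void *l, const void *r)
{
	const struct iter *a = l, *b = r;

	if (a->btree_id != b->btree_id)
		return a->btree_id < b->btree_id ? -1 : 1;
	return a->pos < b->pos ? -1 : a->pos > b->pos;
}

int main(void)
{
	struct iter iters[] = {
		{ .btree_id = 0, .pos = 20, .locks_want = 0 },
		{ .btree_id = 1, .pos = 5,  .locks_want = 2 },
		{ .btree_id = 0, .pos = 10, .locks_want = 0 },
	};
	int nr = sizeof(iters) / sizeof(iters[0]), i;

	qsort(iters, nr, sizeof(iters[0]), iter_lock_cmp);

	/*
	 * Walk backwards through lock order, as the patch does: if a
	 * later iterator wants intent locks, earlier iterators upgrade
	 * too (to the same locks_want on the same btree, to at least 1
	 * otherwise), so traversal won't have to drop and re-take
	 * locks - the restart this commit is eliminating.
	 */
	for (i = nr - 2; i >= 0; --i) {
		struct iter *i1 = &iters[i], *i2 = &iters[i + 1];

		if (i1->btree_id == i2->btree_id &&
		    i1->locks_want < i2->locks_want)
			i1->locks_want = i2->locks_want; /* models __bch2_btree_iter_upgrade() */
		else if (!i1->locks_want && i2->locks_want)
			i1->locks_want = 1;
	}

	for (i = 0; i < nr; i++)
		printf("btree %d pos %llu -> locks_want %u\n",
		       iters[i].btree_id,
		       (unsigned long long) iters[i].pos,
		       iters[i].locks_want);
	return 0;
}

Here the intent-hungry iterator on btree 1 drags both btree-0 iterators, which precede it in lock order, up to locks_want 1 before traversal starts.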
fs/bcachefs/btree_iter.c

@@ -268,13 +268,8 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
 		 */
 		if (type == SIX_LOCK_intent &&
 		    linked->nodes_locked != linked->nodes_intent_locked) {
-			linked->locks_want = max_t(unsigned,
-					linked->locks_want,
-					__fls(linked->nodes_locked) + 1);
-			if (!btree_iter_get_locks(linked, true, false)) {
-				deadlock_iter = linked;
-				reason = 1;
-			}
+			deadlock_iter = linked;
+			reason = 1;
 		}
 
 		if (linked->btree_id != iter->btree_id) {
@@ -303,14 +298,8 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
 		 * we're about to lock, it must have the ancestors locked too:
 		 */
 		if (level > __fls(linked->nodes_locked)) {
-			linked->locks_want =
-				max(level + 1, max_t(unsigned,
-						     linked->locks_want,
-						     iter->locks_want));
-			if (!btree_iter_get_locks(linked, true, false)) {
-				deadlock_iter = linked;
-				reason = 5;
-			}
+			deadlock_iter = linked;
+			reason = 5;
 		}
 
 		/* Must lock btree nodes in key order: */
@@ -320,26 +309,17 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
 			deadlock_iter = linked;
 			reason = 7;
 		}
-
-		/*
-		 * Recheck if this is a node we already have locked - since one
-		 * of the get_locks() calls might've successfully
-		 * upgraded/relocked it:
-		 */
-		if (linked->l[level].b == b &&
-		    btree_node_locked_type(linked, level) >= type) {
-			six_lock_increment(&b->c.lock, type);
-			return true;
-		}
 	}
 
 	if (unlikely(deadlock_iter)) {
 		trace_trans_restart_would_deadlock(iter->trans->ip, ip,
-				reason,
+				trans->in_traverse_all, reason,
 				deadlock_iter->btree_id,
 				btree_iter_type(deadlock_iter),
+				&deadlock_iter->real_pos,
 				iter->btree_id,
-				btree_iter_type(iter));
+				btree_iter_type(iter),
+				&pos);
 		return false;
 	}
 
@@ -407,29 +387,11 @@ bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace)
 bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
 			       unsigned new_locks_want)
 {
-	struct btree_iter *linked;
-
 	EBUG_ON(iter->locks_want >= new_locks_want);
 
 	iter->locks_want = new_locks_want;
 
-	if (btree_iter_get_locks(iter, true, true))
-		return true;
-
-	/*
-	 * Ancestor nodes must be locked before child nodes, so set locks_want
-	 * on iterators that might lock ancestors before us to avoid getting
-	 * -EINTR later:
-	 */
-	trans_for_each_iter(iter->trans, linked)
-		if (linked != iter &&
-		    linked->btree_id == iter->btree_id &&
-		    linked->locks_want < new_locks_want) {
-			linked->locks_want = new_locks_want;
-			btree_iter_get_locks(linked, true, false);
-		}
-
-	return false;
+	return btree_iter_get_locks(iter, true, true);
 }
 
 void __bch2_btree_iter_downgrade(struct btree_iter *iter,
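Editor's note: with the propagation loop gone, __bch2_btree_iter_upgrade() no longer touches sibling iterators; it just raises its own locks_want, tries once, and lets the caller restart, after which __btree_iter_traverse_all() re-propagates locks_want for everyone. A minimal model of the new contract (get_locks_model() and its max_grantable field are made-up stubs, not the kernel's btree_iter_get_locks()):

#include <stdbool.h>
#include <assert.h>
#include <stdio.h>

struct iter_model {
	unsigned locks_want;
	unsigned max_grantable;	/* stub state: levels we pretend can be locked */
};

/* Made-up stub for btree_iter_get_locks(iter, true, true). */
static bool get_locks_model(struct iter_model *iter)
{
	return iter->locks_want <= iter->max_grantable;
}

/* After the patch: bump locks_want, try once, report the result. */
static bool upgrade_model(struct iter_model *iter, unsigned new_locks_want)
{
	assert(iter->locks_want < new_locks_want);
	iter->locks_want = new_locks_want;
	return get_locks_model(iter);
}

int main(void)
{
	struct iter_model it = { .locks_want = 1, .max_grantable = 1 };

	/* Fails here; in the kernel this surfaces as a transaction
	 * restart rather than an in-place sibling-iterator fixup. */
	printf("upgrade to 2: %s\n", upgrade_model(&it, 2) ? "ok" : "restart");
	return 0;
}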
@@ -1192,7 +1154,8 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret)
 	struct bch_fs *c = trans->c;
 	struct btree_iter *iter;
 	u8 sorted[BTREE_ITER_MAX];
-	unsigned i, nr_sorted = 0;
+	int i, nr_sorted = 0;
+	bool relock_fail;
 
 	if (trans->in_traverse_all)
 		return -EINTR;
@@ -1200,15 +1163,36 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret)
 	trans->in_traverse_all = true;
 retry_all:
 	nr_sorted = 0;
+	relock_fail = false;
 
-	trans_for_each_iter(trans, iter)
+	trans_for_each_iter(trans, iter) {
+		if (!bch2_btree_iter_relock(iter, true))
+			relock_fail = true;
 		sorted[nr_sorted++] = iter->idx;
+	}
+
+	if (!relock_fail) {
+		trans->in_traverse_all = false;
+		return 0;
+	}
 
 #define btree_iter_cmp_by_idx(_l, _r)				\
 		btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r])
 
 	bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
 #undef btree_iter_cmp_by_idx
 
+	for (i = nr_sorted - 2; i >= 0; --i) {
+		struct btree_iter *iter1 = trans->iters + sorted[i];
+		struct btree_iter *iter2 = trans->iters + sorted[i + 1];
+
+		if (iter1->btree_id == iter2->btree_id &&
+		    iter1->locks_want < iter2->locks_want)
+			__bch2_btree_iter_upgrade(iter1, iter2->locks_want);
+		else if (!iter1->locks_want && iter2->locks_want)
+			__bch2_btree_iter_upgrade(iter1, 1);
+	}
+
 	bch2_trans_unlock(trans);
 	cond_resched();
 
@@ -1258,6 +1242,8 @@ out:
 	bch2_btree_cache_cannibalize_unlock(c);
 
 	trans->in_traverse_all = false;
+
+	trace_trans_traverse_all(trans->ip);
 	return ret;
 }
 
@@ -2210,7 +2196,8 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
 	if (!(flags & TRANS_RESET_NOUNLOCK))
 		bch2_trans_cond_resched(trans);
 
-	if (!(flags & TRANS_RESET_NOTRAVERSE))
+	if (!(flags & TRANS_RESET_NOTRAVERSE) &&
+	    trans->iters_linked)
 		bch2_btree_iter_traverse_all(trans);
 }
 
fs/bcachefs/btree_iter.h

@@ -187,7 +187,7 @@ static inline int btree_iter_lock_cmp(const struct btree_iter *l,
 {
 	return   cmp_int(l->btree_id, r->btree_id) ?:
 		-cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?:
-		 bkey_cmp(l->pos, r->pos);
+		 bkey_cmp(l->real_pos, r->real_pos);
 }
 
 /*
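Editor's note: btree_iter_lock_cmp() now orders by real_pos, the position the iterator is actually locked at, rather than pos, the position the caller asked for; since traverse_all now takes locks in exactly this order, sorting on the wrong position could take locks out of order. A standalone model of the comparison chain (cmp_int() is re-declared locally since it's a kernel macro, the structs are reductions, and bpos_cmp() is a plain lexicographic compare that may be stricter than the era's bkey_cmp(); builds with GCC/Clang thanks to the kernel-style `?:` extension):

#include <stdint.h>
#include <stdio.h>

/* Local re-declaration of the kernel's cmp_int(): -1/0/1 ordering. */
#define cmp_int(l, r)	(((l) > (r)) - ((l) < (r)))

/* Reduced stand-ins for struct bpos / struct btree_iter. */
struct bpos_model {
	uint64_t	inode;
	uint64_t	offset;
	uint32_t	snapshot;
};

struct iter_model {
	int			btree_id;
	int			is_cached;
	struct bpos_model	real_pos;	/* where the iterator is actually locked */
};

static int bpos_cmp(struct bpos_model l, struct bpos_model r)
{
	return  cmp_int(l.inode,    r.inode) ?:
		cmp_int(l.offset,   r.offset) ?:
		cmp_int(l.snapshot, r.snapshot);
}

/* Mirrors btree_iter_lock_cmp() after the patch: btree id first,
 * cached iterators sorting ahead of real ones (note the negation),
 * then real_pos instead of the requested pos. */
static int iter_lock_cmp(const struct iter_model *l, const struct iter_model *r)
{
	return  cmp_int(l->btree_id, r->btree_id) ?:
	       -cmp_int(l->is_cached, r->is_cached) ?:
		bpos_cmp(l->real_pos, r->real_pos);
}

int main(void)
{
	struct iter_model a = { 0, 0, { 4096, 0, 0 } };
	struct iter_model b = { 0, 0, { 4096, 8, 0 } };

	printf("a vs b: %d\n", iter_lock_cmp(&a, &b));	/* -1: a locks first */
	return 0;
}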
include/trace/events/bcachefs.h

@@ -561,43 +561,70 @@ DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused,
 TRACE_EVENT(trans_restart_would_deadlock,
 	TP_PROTO(unsigned long	trans_ip,
 		 unsigned long	caller_ip,
+		 bool		in_traverse_all,
 		 unsigned	reason,
 		 enum btree_id	have_btree_id,
 		 unsigned	have_iter_type,
+		 struct bpos	*have_pos,
 		 enum btree_id	want_btree_id,
-		 unsigned	want_iter_type),
-	TP_ARGS(trans_ip, caller_ip, reason,
-		have_btree_id, have_iter_type,
-		want_btree_id, want_iter_type),
+		 unsigned	want_iter_type,
+		 struct bpos	*want_pos),
+	TP_ARGS(trans_ip, caller_ip, in_traverse_all, reason,
+		have_btree_id, have_iter_type, have_pos,
+		want_btree_id, want_iter_type, want_pos),
 
 	TP_STRUCT__entry(
 		__field(unsigned long,		trans_ip	)
 		__field(unsigned long,		caller_ip	)
+		__field(u8,			in_traverse_all	)
 		__field(u8,			reason		)
 		__field(u8,			have_btree_id	)
 		__field(u8,			have_iter_type	)
 		__field(u8,			want_btree_id	)
 		__field(u8,			want_iter_type	)
+
+		__field(u64,			have_pos_inode	)
+		__field(u64,			have_pos_offset	)
+		__field(u32,			have_pos_snapshot)
+		__field(u32,			want_pos_snapshot)
+		__field(u64,			want_pos_inode	)
+		__field(u64,			want_pos_offset	)
 	),
 
 	TP_fast_assign(
 		__entry->trans_ip		= trans_ip;
 		__entry->caller_ip		= caller_ip;
+		__entry->in_traverse_all	= in_traverse_all;
 		__entry->reason			= reason;
 		__entry->have_btree_id		= have_btree_id;
 		__entry->have_iter_type		= have_iter_type;
 		__entry->want_btree_id		= want_btree_id;
 		__entry->want_iter_type		= want_iter_type;
+
+		__entry->have_pos_inode		= have_pos->inode;
+		__entry->have_pos_offset	= have_pos->offset;
+		__entry->have_pos_snapshot	= have_pos->snapshot;
+
+		__entry->want_pos_inode	= want_pos->inode;
+		__entry->want_pos_offset	= want_pos->offset;
+		__entry->want_pos_snapshot	= want_pos->snapshot;
 	),
 
-	TP_printk("%pS %pS because %u have %u:%u want %u:%u",
+	TP_printk("%pS %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u",
 		  (void *) __entry->trans_ip,
 		  (void *) __entry->caller_ip,
+		  __entry->in_traverse_all,
 		  __entry->reason,
 		  __entry->have_btree_id,
 		  __entry->have_iter_type,
+		  __entry->have_pos_inode,
+		  __entry->have_pos_offset,
+		  __entry->have_pos_snapshot,
 		  __entry->want_btree_id,
-		  __entry->want_iter_type)
+		  __entry->want_iter_type,
+		  __entry->want_pos_inode,
+		  __entry->want_pos_offset,
+		  __entry->want_pos_snapshot)
 );
 
 TRACE_EVENT(trans_restart_iters_realloced,
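Editor's note: one detail in the new TP_STRUCT__entry is that want_pos_snapshot is declared next to have_pos_snapshot, between the u64 position fields, presumably so the two u32s pair up and the trace record carries no padding holes. A small demonstration of the effect (the struct names are invented; only the field layout mirrors the event):

#include <stdio.h>
#include <stdint.h>

/* Layout as in the patch: the two u32s sit together between the u64s. */
struct packed_like_event {
	uint64_t have_pos_inode;
	uint64_t have_pos_offset;
	uint32_t have_pos_snapshot;
	uint32_t want_pos_snapshot;	/* fills what would otherwise be padding */
	uint64_t want_pos_inode;
	uint64_t want_pos_offset;
};

/* Naive declaration order, for comparison. */
struct naive_order {
	uint64_t have_pos_inode;
	uint64_t have_pos_offset;
	uint32_t have_pos_snapshot;	/* 4 bytes of padding follow here */
	uint64_t want_pos_inode;
	uint64_t want_pos_offset;
	uint32_t want_pos_snapshot;	/* and 4 more at the end */
};

int main(void)
{
	printf("paired u32s: %zu bytes, naive order: %zu bytes\n",
	       sizeof(struct packed_like_event), sizeof(struct naive_order));
	return 0;
}

On a typical LP64 target this prints 40 vs 48 bytes per record.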
@@ -689,6 +716,11 @@ DEFINE_EVENT(transaction_restart, trans_restart_traverse,
 	TP_ARGS(ip)
 );
 
+DEFINE_EVENT(transaction_restart, trans_traverse_all,
+	TP_PROTO(unsigned long ip),
+	TP_ARGS(ip)
+);
+
 DECLARE_EVENT_CLASS(node_lock_fail,
 	TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
 	TP_ARGS(level, iter_seq, node, node_seq),