From 31403dca5bb1e55ea0ea6ad1264b81fa8c9a3768 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 11:11:22 -0500 Subject: [PATCH] bcachefs: optimize __bch2_trans_get(), kill DEBUG_TRANSACTIONS - Some tweaks to greatly reduce locking overhead for the list of btree transactions, so that it can always be enabled: leave btree_trans objects on the list when they're on the percpu single item freelist, and only check for duplicates in the same process when CONFIG_BCACHEFS_DEBUG is enabled - don't zero out the full btree_trans() unless we allocated it from the mempool Signed-off-by: Kent Overstreet --- fs/bcachefs/Kconfig | 8 -- fs/bcachefs/btree_iter.c | 166 ++++++++++++++++++------------------ fs/bcachefs/btree_locking.c | 3 +- fs/bcachefs/btree_types.h | 11 ++- fs/bcachefs/debug.c | 18 ++-- 5 files changed, 100 insertions(+), 106 deletions(-) diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 7d3ac9556c98..5cdfef3b551a 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -50,14 +50,6 @@ config BCACHEFS_POSIX_ACL depends on BCACHEFS_FS select FS_POSIX_ACL -config BCACHEFS_DEBUG_TRANSACTIONS - bool "bcachefs runtime info" - depends on BCACHEFS_FS - help - This makes the list of running btree transactions available in debugfs. - - This is a highly useful debugging feature but does add a small amount of overhead. - config BCACHEFS_DEBUG bool "bcachefs debugging" depends on BCACHEFS_FS diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 818676b97436..c35a262f6081 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2714,6 +2714,7 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) { + struct bch_fs *c = trans->c; unsigned new_top = trans->mem_top + size; unsigned old_bytes = trans->mem_bytes; unsigned new_bytes = roundup_pow_of_two(new_top); @@ -2721,17 +2722,19 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) void *new_mem; void *p; - trans->mem_max = max(trans->mem_max, new_top); - WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); + struct btree_transaction_stats *s = btree_trans_stats(trans); + if (s) + s->max_mem = max(s->max_mem, new_bytes); + new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); if (unlikely(!new_mem)) { bch2_trans_unlock(trans); new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL); if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { - new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); + new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); new_bytes = BTREE_TRANS_MEM_MAX; kfree(trans->mem); } @@ -2751,7 +2754,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) trans->mem_bytes = new_bytes; if (old_bytes) { - trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); + trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); } @@ -2860,25 +2863,6 @@ u32 bch2_trans_begin(struct btree_trans *trans) return trans->restart_count; } -static struct btree_trans *bch2_trans_alloc(struct bch_fs *c) -{ - struct btree_trans *trans; - - if (IS_ENABLED(__KERNEL__)) { - trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL); - if (trans) - return trans; - } - - trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS); - /* - * paths need to be zeroed, bch2_check_for_deadlock looks at - * paths in other threads - */ - memset(&trans->paths, 0, sizeof(trans->paths)); - return trans; -} - const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; unsigned bch2_trans_get_fn_idx(const char *fn) @@ -2900,22 +2884,55 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) __acquires(&c->btree_trans_barrier) { struct btree_trans *trans; - struct btree_transaction_stats *s; - trans = bch2_trans_alloc(c); + if (IS_ENABLED(__KERNEL__)) { + trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL); + if (trans) { + memset(trans, 0, offsetof(struct btree_trans, updates)); + goto got_trans; + } + } + trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS); memset(trans, 0, sizeof(*trans)); + closure_init_stack(&trans->ref); + + seqmutex_lock(&c->btree_trans_lock); + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + struct btree_trans *pos; + pid_t pid = current->pid; + + trans->locking_wait.task = current; + + list_for_each_entry(pos, &c->btree_trans_list, list) { + struct task_struct *pos_task = READ_ONCE(pos->locking_wait.task); + /* + * We'd much prefer to be stricter here and completely + * disallow multiple btree_trans in the same thread - + * but the data move path calls bch2_write when we + * already have a btree_trans initialized. + */ + BUG_ON(pos_task && + pid == pos_task->pid && + bch2_trans_locked(pos)); + + if (pos_task && pid < pos_task->pid) { + list_add_tail(&trans->list, &pos->list); + goto list_add_done; + } + } + } + list_add_tail(&trans->list, &c->btree_trans_list); +list_add_done: + seqmutex_unlock(&c->btree_trans_lock); +got_trans: trans->c = c; - trans->fn = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns) - ? bch2_btree_transaction_fns[fn_idx] : NULL; trans->last_begin_time = local_clock(); trans->fn_idx = fn_idx; trans->locking_wait.task = current; trans->journal_replay_not_finished = unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) && atomic_inc_not_zero(&c->journal_keys.ref); - closure_init_stack(&trans->ref); - trans->paths_allocated = trans->_paths_allocated; trans->sorted = trans->_sorted; trans->paths = trans->_paths; @@ -2924,21 +2941,19 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) trans->paths_allocated[0] = 1; - s = btree_trans_stats(trans); - if (s && s->max_mem) { - unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); + if (fn_idx < BCH_TRANSACTIONS_NR) { + trans->fn = bch2_btree_transaction_fns[fn_idx]; - trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL); + struct btree_transaction_stats *s = &c->btree_transaction_stats[fn_idx]; - if (!unlikely(trans->mem)) { - trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); - trans->mem_bytes = BTREE_TRANS_MEM_MAX; - } else { - trans->mem_bytes = expected_mem_bytes; + if (s->max_mem) { + unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); + + trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL); + if (likely(trans->mem)) + trans->mem_bytes = expected_mem_bytes; } - } - if (s) { trans->nr_paths_max = s->nr_max_paths; trans->journal_entries_size = s->journal_entries_size; } @@ -2946,31 +2961,6 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); trans->srcu_lock_time = jiffies; trans->srcu_held = true; - - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { - struct btree_trans *pos; - - seqmutex_lock(&c->btree_trans_lock); - list_for_each_entry(pos, &c->btree_trans_list, list) { - /* - * We'd much prefer to be stricter here and completely - * disallow multiple btree_trans in the same thread - - * but the data move path calls bch2_write when we - * already have a btree_trans initialized. - */ - BUG_ON(trans->locking_wait.task->pid == pos->locking_wait.task->pid && - bch2_trans_locked(pos)); - - if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) { - list_add_tail(&trans->list, &pos->list); - goto list_add_done; - } - } - list_add_tail(&trans->list, &c->btree_trans_list); -list_add_done: - seqmutex_unlock(&c->btree_trans_lock); - } - return trans; } @@ -3001,24 +2991,13 @@ void bch2_trans_put(struct btree_trans *trans) __releases(&c->btree_trans_barrier) { struct bch_fs *c = trans->c; - struct btree_transaction_stats *s = btree_trans_stats(trans); bch2_trans_unlock(trans); - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { - seqmutex_lock(&c->btree_trans_lock); - list_del(&trans->list); - seqmutex_unlock(&c->btree_trans_lock); - } - - closure_sync(&trans->ref); - - if (s) - s->max_mem = max(s->max_mem, trans->mem_max); - trans_for_each_update(trans, i) __btree_path_put(trans->paths + i->path, true); - trans->nr_updates = 0; + trans->nr_updates = 0; + trans->locking_wait.task = NULL; check_btree_paths_leaked(trans); @@ -3047,8 +3026,16 @@ void bch2_trans_put(struct btree_trans *trans) /* Userspace doesn't have a real percpu implementation: */ if (IS_ENABLED(__KERNEL__)) trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans); - if (trans) + + if (trans) { + closure_sync(&trans->ref); + + seqmutex_lock(&c->btree_trans_lock); + list_del(&trans->list); + seqmutex_unlock(&c->btree_trans_lock); + mempool_free(trans, &c->btree_trans_pool); + } } static void __maybe_unused @@ -3146,15 +3133,26 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) struct btree_trans *trans; int cpu; + if (c->btree_trans_bufs) + for_each_possible_cpu(cpu) { + struct btree_trans *trans = + per_cpu_ptr(c->btree_trans_bufs, cpu)->trans; + + if (trans) { + closure_sync(&trans->ref); + + seqmutex_lock(&c->btree_trans_lock); + list_del(&trans->list); + seqmutex_unlock(&c->btree_trans_lock); + } + kfree(trans); + } + free_percpu(c->btree_trans_bufs); + trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list); if (trans) panic("%s leaked btree_trans\n", trans->fn); - if (c->btree_trans_bufs) - for_each_possible_cpu(cpu) - kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans); - free_percpu(c->btree_trans_bufs); - for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); s++) { diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index d68e1211029d..1ed8327a9fa2 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -95,9 +95,10 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g) struct trans_waiting_for_lock *i; for (i = g->g; i != g->g + g->nr; i++) { + struct task_struct *task = i->trans->locking_wait.task; if (i != g->g) prt_str(out, "<- "); - prt_printf(out, "%u ", i->trans->locking_wait.task->pid); + prt_printf(out, "%u ", task ?task->pid : 0); } prt_newline(out); } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 28bac4ce20e1..3baf688177c4 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -386,7 +386,6 @@ struct btree_trans { void *mem; unsigned mem_top; - unsigned mem_max; unsigned mem_bytes; btree_path_idx_t nr_sorted; @@ -413,8 +412,6 @@ struct btree_trans { unsigned long srcu_lock_time; const char *fn; - struct closure ref; - struct list_head list; struct btree_bkey_cached_common *locking; struct six_lock_waiter locking_wait; int srcu_idx; @@ -424,7 +421,6 @@ struct btree_trans { u16 journal_entries_size; struct jset_entry *journal_entries; - struct btree_insert_entry updates[BTREE_ITER_MAX]; struct btree_trans_commit_hook *hooks; struct journal_entry_pin *journal_pin; @@ -435,6 +431,13 @@ struct btree_trans { unsigned extra_disk_res; /* XXX kill */ struct replicas_delta_list *fs_usage_deltas; + /* Entries before this are zeroed out on every bch2_trans_get() call */ + + struct btree_insert_entry updates[BTREE_ITER_MAX]; + + struct list_head list; + struct closure ref; + unsigned long _paths_allocated[BITS_TO_LONGS(BTREE_ITER_MAX)]; struct btree_trans_paths trans_paths; struct btree_path _paths[BTREE_ITER_MAX]; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index bbf6fce3a789..c0b4d9057f29 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -592,7 +592,6 @@ static const struct file_operations cached_btree_nodes_ops = { .read = bch2_cached_btree_nodes_read, }; -#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { @@ -608,7 +607,9 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, restart: seqmutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { - if (trans->locking_wait.task->pid <= i->iter) + struct task_struct *task = READ_ONCE(trans->locking_wait.task); + + if (!task || task->pid <= i->iter) continue; closure_get(&trans->ref); @@ -626,11 +627,11 @@ restart: prt_printf(&i->buf, "backtrace:"); prt_newline(&i->buf); printbuf_indent_add(&i->buf, 2); - bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task); + bch2_prt_task_backtrace(&i->buf, task); printbuf_indent_sub(&i->buf, 2); prt_newline(&i->buf); - i->iter = trans->locking_wait.task->pid; + i->iter = task->pid; closure_put(&trans->ref); @@ -654,7 +655,6 @@ static const struct file_operations btree_transactions_ops = { .release = bch2_dump_release, .read = bch2_btree_transactions_read, }; -#endif /* CONFIG_BCACHEFS_DEBUG_TRANSACTIONS */ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) @@ -811,7 +811,9 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, restart: seqmutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { - if (trans->locking_wait.task->pid <= i->iter) + struct task_struct *task = READ_ONCE(trans->locking_wait.task); + + if (!task || task->pid <= i->iter) continue; closure_get(&trans->ref); @@ -826,7 +828,7 @@ restart: bch2_check_for_deadlock(trans, &i->buf); - i->iter = trans->locking_wait.task->pid; + i->iter = task->pid; closure_put(&trans->ref); @@ -873,10 +875,8 @@ void bch2_fs_debug_init(struct bch_fs *c) debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir, c->btree_debug, &cached_btree_nodes_ops); -#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir, c->btree_debug, &btree_transactions_ops); -#endif debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, c->btree_debug, &journal_pins_ops);