Merge branch 'for-next' of git://evilpiepirate.org/bcachefs.git

Stephen Rothwell 2024-08-29 08:13:55 +10:00
commit 7d0f928e8b
69 changed files with 2151 additions and 1181 deletions


@ -175,7 +175,7 @@ errors in our thinking by running our code and seeing what happens. If your
time is being wasted because your tools are bad or too slow - don't accept it,
fix it.
Put effort into your documentation, commmit messages, and code comments - but
Put effort into your documentation, commit messages, and code comments - but
don't go overboard. A good commit message is wonderful - but if the information
was important enough to go in a commit message, ask yourself if it would be
even better as a code comment.


@ -87,6 +87,13 @@ config BCACHEFS_SIX_OPTIMISTIC_SPIN
is held by another thread, spin for a short while, as long as the
thread owning the lock is running.
config BCACHEFS_PATH_TRACEPOINTS
bool "Extra btree_path tracepoints"
depends on BCACHEFS_FS
help
Enable extra tracepoints for debugging btree_path operations; we don't
normally want these enabled because they happen at very high rates.
config MEAN_AND_VARIANCE_UNIT_TEST
tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
depends on KUNIT
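
Aside, not from this series: a Kconfig bool like BCACHEFS_PATH_TRACEPOINTS is commonly consumed from C via IS_ENABLED(), so the high-rate tracepoints the help text warns about can compile away entirely when the option is off. A minimal hedged sketch; the helper name and the pr_debug() stand-in are hypothetical, not taken from the diff:

#include <linux/kconfig.h>
#include <linux/printk.h>

/* Hypothetical helper, for illustration only; a real tracepoint call would
 * replace the pr_debug(). */
static inline void maybe_trace_btree_path_op(const char *what)
{
        /* IS_ENABLED() folds to a compile-time constant, so this branch
         * (and the format string) are dropped when the option is =n. */
        if (IS_ENABLED(CONFIG_BCACHEFS_PATH_TRACEPOINTS))
                pr_debug("btree_path %s\n", what);
}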


@ -69,6 +69,7 @@ bcachefs-y := \
printbuf.o \
quota.o \
rebalance.o \
rcu_pending.o \
recovery.o \
recovery_passes.o \
reflink.o \


@ -361,7 +361,7 @@ retry:
bch2_trans_begin(trans);
acl = _acl;
ret = bch2_subvol_is_ro_trans(trans, inode->ei_subvol) ?:
ret = bch2_subvol_is_ro_trans(trans, inode->ei_inum.subvol) ?:
bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
BTREE_ITER_intent);
if (ret)


@ -30,6 +30,7 @@
#include <linux/rcupdate.h>
#include <linux/sched/task.h>
#include <linux/sort.h>
#include <linux/jiffies.h>
static void bch2_discard_one_bucket_fast(struct bch_dev *, u64);
@ -2183,7 +2184,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
* freespace/need_discard/need_gc_gens btrees as needed:
*/
while (1) {
if (last_updated + HZ * 10 < jiffies) {
if (time_after(jiffies, last_updated + HZ * 10)) {
bch_info(ca, "%s: currently at %llu/%llu",
__func__, iter.pos.offset, ca->mi.nbuckets);
last_updated = jiffies;
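
Aside: the hunk above swaps an open-coded jiffies comparison for time_after(), which stays correct across jiffies wraparound. A minimal sketch of the pattern; the wrapper function is hypothetical, while time_after() and HZ are the standard kernel helpers:

#include <linux/jiffies.h>
#include <linux/types.h>

/* Hypothetical helper: has at least ten seconds passed since last_updated? */
static bool ten_seconds_elapsed(unsigned long last_updated)
{
        /*
         * time_after(a, b) compares using signed arithmetic, so it keeps
         * working when the jiffies counter wraps; the open-coded
         * "last_updated + HZ * 10 < jiffies" form it replaces does not.
         */
        return time_after(jiffies, last_updated + HZ * 10);
}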


@ -1022,9 +1022,6 @@ static int __open_bucket_add_buckets(struct btree_trans *trans,
open_bucket_for_each(c, ptrs, ob, i)
__clear_bit(ob->dev, devs.d);
if (erasure_code && ec_open_bucket(c, ptrs))
return 0;
ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
nr_replicas, nr_effective,
have_cache, erasure_code, flags);
@ -1079,7 +1076,7 @@ static int open_bucket_add_buckets(struct btree_trans *trans,
{
int ret;
if (erasure_code) {
if (erasure_code && !ec_open_bucket(trans->c, ptrs)) {
ret = __open_bucket_add_buckets(trans, ptrs, wp,
devs_have, target, erasure_code,
nr_replicas, nr_effective, have_cache,


@ -542,7 +542,7 @@ struct bch_dev {
* gc_gens_lock, for device resize - holding any is sufficient for
* access: Or rcu_read_lock(), but only for dev_ptr_stale():
*/
struct bucket_array __rcu *buckets_gc;
GENRADIX(struct bucket) buckets_gc;
struct bucket_gens __rcu *bucket_gens;
u8 *oldest_gen;
unsigned long *buckets_nouse;
@ -1023,6 +1023,7 @@ struct bch_fs {
/* fs.c */
struct list_head vfs_inodes_list;
struct mutex vfs_inodes_lock;
struct rhashtable vfs_inodes_table;
/* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset;
@ -1085,7 +1086,6 @@ struct bch_fs {
u64 __percpu *counters;
unsigned copy_gc_enabled:1;
bool promote_whole_extents;
struct bch2_time_stats times[BCH_TIME_STAT_NR];


@ -795,6 +795,8 @@ LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61);
LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62);
LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63);
LE64_BITMASK(BCH_SB_PROMOTE_WHOLE_EXTENTS,
struct bch_sb, flags[0], 63, 64);
LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4);
LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1], 4, 8);


@ -304,11 +304,6 @@ struct bkey_float {
};
#define BKEY_MANTISSA_BITS 16
static unsigned bkey_float_byte_offset(unsigned idx)
{
return idx * sizeof(struct bkey_float);
}
struct ro_aux_tree {
u8 nothing[0];
struct bkey_float f[];
@ -328,8 +323,7 @@ static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
return t->aux_data_offset;
case BSET_RO_AUX_TREE:
return t->aux_data_offset +
DIV_ROUND_UP(t->size * sizeof(struct bkey_float) +
t->size * sizeof(u8), 8);
DIV_ROUND_UP(t->size * sizeof(struct bkey_float), 8);
case BSET_RW_AUX_TREE:
return t->aux_data_offset +
DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
@ -360,14 +354,6 @@ static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
return __aux_tree_base(b, t);
}
static u8 *ro_aux_tree_prev(const struct btree *b,
const struct bset_tree *t)
{
EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size);
}
static struct bkey_float *bkey_float(const struct btree *b,
const struct bset_tree *t,
unsigned idx)
@ -479,15 +465,6 @@ static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
bkey_float(b, t, j)->key_offset);
}
static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
const struct bset_tree *t,
unsigned j)
{
unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];
return (void *) ((u64 *) tree_to_bkey(b, t, j)->_data - prev_u64s);
}
static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
const struct bset_tree *t)
{
@ -585,8 +562,7 @@ static unsigned rw_aux_tree_bsearch(struct btree *b,
}
static inline unsigned bkey_mantissa(const struct bkey_packed *k,
const struct bkey_float *f,
unsigned idx)
const struct bkey_float *f)
{
u64 v;
@ -617,7 +593,7 @@ static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t,
struct bkey_packed *m = tree_to_bkey(b, t, j);
struct bkey_packed *l = is_power_of_2(j)
? min_key
: tree_to_prev_bkey(b, t, j >> ffs(j));
: tree_to_bkey(b, t, j >> ffs(j));
struct bkey_packed *r = is_power_of_2(j + 1)
? max_key
: tree_to_bkey(b, t, j >> (ffz(j) + 1));
@ -668,7 +644,7 @@ static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t,
EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
f->exponent = shift;
mantissa = bkey_mantissa(m, f, j);
mantissa = bkey_mantissa(m, f);
/*
* If we've got garbage bits, set them to all 1s - it's legal for the
@ -690,8 +666,7 @@ static unsigned __bset_tree_capacity(struct btree *b, const struct bset_tree *t)
static unsigned bset_ro_tree_capacity(struct btree *b, const struct bset_tree *t)
{
return __bset_tree_capacity(b, t) /
(sizeof(struct bkey_float) + sizeof(u8));
return __bset_tree_capacity(b, t) / sizeof(struct bkey_float);
}
static unsigned bset_rw_tree_capacity(struct btree *b, const struct bset_tree *t)
@ -720,7 +695,7 @@ static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
{
struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
struct bkey_packed *k = btree_bkey_first(b, t);
struct bkey_i min_key, max_key;
unsigned cacheline = 1;
@ -733,12 +708,12 @@ retry:
return;
}
t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
t->extra = eytzinger1_extra(t->size - 1);
/* First we figure out where the first key in each cacheline is */
eytzinger1_for_each(j, t->size - 1) {
while (bkey_to_cacheline(b, t, k) < cacheline)
prev = k, k = bkey_p_next(k);
k = bkey_p_next(k);
if (k >= btree_bkey_last(b, t)) {
/* XXX: this path sucks */
@ -746,17 +721,12 @@ retry:
goto retry;
}
ro_aux_tree_prev(b, t)[j] = prev->u64s;
bkey_float(b, t, j)->key_offset =
bkey_to_cacheline_offset(b, t, cacheline++, k);
EBUG_ON(tree_to_prev_bkey(b, t, j) != prev);
EBUG_ON(tree_to_bkey(b, t, j) != k);
}
while (k != btree_bkey_last(b, t))
prev = k, k = bkey_p_next(k);
if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) {
bkey_init(&min_key.k);
min_key.k.p = b->data->min_key;
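
Aside on the eytzinger1_extra() substitution above: assuming the refactor is behavior-preserving, the helper must expand to the same arithmetic as the open-coded expression it replaces. A hedged sketch of that equivalence; the definition below is an assumption, not shown in this diff:

#include <linux/log2.h>

/* Assumed definition; with nr = t->size - 1 this is exactly the old
 * "(t->size - rounddown_pow_of_two(t->size - 1)) << 1". */
static inline unsigned eytzinger1_extra(unsigned nr)
{
        return (nr + 1 - rounddown_pow_of_two(nr)) << 1;
}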
@ -915,66 +885,18 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
/* Insert */
static void bch2_bset_fix_lookup_table(struct btree *b,
struct bset_tree *t,
struct bkey_packed *_where,
unsigned clobber_u64s,
unsigned new_u64s)
static void rw_aux_tree_insert_entry(struct btree *b,
struct bset_tree *t,
unsigned idx)
{
int shift = new_u64s - clobber_u64s;
unsigned l, j, where = __btree_node_key_to_offset(b, _where);
EBUG_ON(bset_has_ro_aux_tree(t));
if (!bset_has_rw_aux_tree(t))
return;
/* returns first entry >= where */
l = rw_aux_tree_bsearch(b, t, where);
if (!l) /* never delete first entry */
l++;
else if (l < t->size &&
where < t->end_offset &&
rw_aux_tree(b, t)[l].offset == where)
rw_aux_tree_set(b, t, l++, _where);
/* l now > where */
for (j = l;
j < t->size &&
rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
j++)
;
if (j < t->size &&
rw_aux_tree(b, t)[j].offset + shift ==
rw_aux_tree(b, t)[l - 1].offset)
j++;
memmove(&rw_aux_tree(b, t)[l],
&rw_aux_tree(b, t)[j],
(void *) &rw_aux_tree(b, t)[t->size] -
(void *) &rw_aux_tree(b, t)[j]);
t->size -= j - l;
for (j = l; j < t->size; j++)
rw_aux_tree(b, t)[j].offset += shift;
EBUG_ON(l < t->size &&
rw_aux_tree(b, t)[l].offset ==
rw_aux_tree(b, t)[l - 1].offset);
EBUG_ON(!idx || idx > t->size);
struct bkey_packed *start = rw_aux_to_bkey(b, t, idx - 1);
struct bkey_packed *end = idx < t->size
? rw_aux_to_bkey(b, t, idx)
: btree_bkey_last(b, t);
if (t->size < bset_rw_tree_capacity(b, t) &&
(l < t->size
? rw_aux_tree(b, t)[l].offset
: t->end_offset) -
rw_aux_tree(b, t)[l - 1].offset >
L1_CACHE_BYTES / sizeof(u64)) {
struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
struct bkey_packed *end = l < t->size
? rw_aux_to_bkey(b, t, l)
: btree_bkey_last(b, t);
(void *) end - (void *) start > L1_CACHE_BYTES) {
struct bkey_packed *k = start;
while (1) {
@ -983,23 +905,78 @@ static void bch2_bset_fix_lookup_table(struct btree *b,
break;
if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
memmove(&rw_aux_tree(b, t)[l + 1],
&rw_aux_tree(b, t)[l],
memmove(&rw_aux_tree(b, t)[idx + 1],
&rw_aux_tree(b, t)[idx],
(void *) &rw_aux_tree(b, t)[t->size] -
(void *) &rw_aux_tree(b, t)[l]);
(void *) &rw_aux_tree(b, t)[idx]);
t->size++;
rw_aux_tree_set(b, t, l, k);
rw_aux_tree_set(b, t, idx, k);
break;
}
}
}
}
static void bch2_bset_fix_lookup_table(struct btree *b,
struct bset_tree *t,
struct bkey_packed *_where,
unsigned clobber_u64s,
unsigned new_u64s)
{
int shift = new_u64s - clobber_u64s;
unsigned idx, j, where = __btree_node_key_to_offset(b, _where);
EBUG_ON(bset_has_ro_aux_tree(t));
if (!bset_has_rw_aux_tree(t))
return;
if (where > rw_aux_tree(b, t)[t->size - 1].offset) {
rw_aux_tree_insert_entry(b, t, t->size);
goto verify;
}
/* returns first entry >= where */
idx = rw_aux_tree_bsearch(b, t, where);
if (rw_aux_tree(b, t)[idx].offset == where) {
if (!idx) { /* never delete first entry */
idx++;
} else if (where < t->end_offset) {
rw_aux_tree_set(b, t, idx++, _where);
} else {
EBUG_ON(where != t->end_offset);
rw_aux_tree_insert_entry(b, t, --t->size);
goto verify;
}
}
EBUG_ON(idx < t->size && rw_aux_tree(b, t)[idx].offset <= where);
if (idx < t->size &&
rw_aux_tree(b, t)[idx].offset + shift ==
rw_aux_tree(b, t)[idx - 1].offset) {
memmove(&rw_aux_tree(b, t)[idx],
&rw_aux_tree(b, t)[idx + 1],
(void *) &rw_aux_tree(b, t)[t->size] -
(void *) &rw_aux_tree(b, t)[idx + 1]);
t->size -= 1;
}
for (j = idx; j < t->size; j++)
rw_aux_tree(b, t)[j].offset += shift;
EBUG_ON(idx < t->size &&
rw_aux_tree(b, t)[idx].offset ==
rw_aux_tree(b, t)[idx - 1].offset);
rw_aux_tree_insert_entry(b, t, idx);
verify:
bch2_bset_verify_rw_aux_tree(b, t);
bset_aux_tree_verify(b);
}
void bch2_bset_insert(struct btree *b,
struct btree_node_iter *iter,
struct bkey_packed *where,
struct bkey_i *insert,
unsigned clobber_u64s)
@ -1098,8 +1075,7 @@ static inline void prefetch_four_cachelines(void *p)
}
static inline bool bkey_mantissa_bits_dropped(const struct btree *b,
const struct bkey_float *f,
unsigned idx)
const struct bkey_float *f)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits;
@ -1133,9 +1109,9 @@ static struct bkey_packed *bset_search_tree(const struct btree *b,
goto slowpath;
l = f->mantissa;
r = bkey_mantissa(packed_search, f, n);
r = bkey_mantissa(packed_search, f);
if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n))
if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f))
goto slowpath;
n = n * 2 + (l < r);


@ -270,8 +270,8 @@ void bch2_bset_init_first(struct btree *, struct bset *);
void bch2_bset_init_next(struct btree *, struct btree_node_entry *);
void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
void bch2_bset_insert(struct btree *, struct btree_node_iter *,
struct bkey_packed *, struct bkey_i *, unsigned);
void bch2_bset_insert(struct btree *, struct bkey_packed *, struct bkey_i *,
unsigned);
void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned);
/* Bkey utility code */


@ -671,9 +671,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
: &bc->freed_nonpcpu;
struct btree *b, *b2;
u64 start_time = local_clock();
unsigned flags;
flags = memalloc_nofs_save();
mutex_lock(&bc->lock);
/*
@ -745,8 +743,6 @@ out:
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
start_time);
memalloc_nofs_restore(flags);
int ret = bch2_trans_relock(trans);
if (unlikely(ret)) {
bch2_btree_node_to_freelist(c, b);
@ -781,7 +777,6 @@ err:
}
mutex_unlock(&bc->lock);
memalloc_nofs_restore(flags);
return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
}


@ -753,10 +753,8 @@ static void bch2_gc_free(struct bch_fs *c)
genradix_free(&c->reflink_gc_table);
genradix_free(&c->gc_stripes);
for_each_member_device(c, ca) {
kvfree(rcu_dereference_protected(ca->buckets_gc, 1));
ca->buckets_gc = NULL;
}
for_each_member_device(c, ca)
genradix_free(&ca->buckets_gc);
}
static int bch2_gc_start(struct bch_fs *c)
@ -910,20 +908,12 @@ static int bch2_gc_alloc_start(struct bch_fs *c)
int ret = 0;
for_each_member_device(c, ca) {
struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket),
GFP_KERNEL|__GFP_ZERO);
if (!buckets) {
ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL);
if (ret) {
bch2_dev_put(ca);
ret = -BCH_ERR_ENOMEM_gc_alloc_start;
break;
}
buckets->first_bucket = ca->mi.first_bucket;
buckets->nbuckets = ca->mi.nbuckets;
buckets->nbuckets_minus_first =
buckets->nbuckets - buckets->first_bucket;
rcu_assign_pointer(ca->buckets_gc, buckets);
}
bch_err_fn(c, ret);


@ -1010,9 +1010,9 @@ retry_all:
* the same position:
*/
if (trans->paths[idx].uptodate) {
__btree_path_get(&trans->paths[idx], false);
__btree_path_get(trans, &trans->paths[idx], false);
ret = bch2_btree_path_traverse_one(trans, idx, 0, _THIS_IP_);
__btree_path_put(&trans->paths[idx], false);
__btree_path_put(trans, &trans->paths[idx], false);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
bch2_err_matches(ret, ENOMEM))
@ -1131,6 +1131,8 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
if (unlikely(!trans->srcu_held))
bch2_trans_srcu_lock(trans);
trace_btree_path_traverse_start(trans, path);
/*
* Ensure we obey path->should_be_locked: if it's set, we can't unlock
* and re-traverse the path without a transaction restart:
@ -1194,6 +1196,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
out_uptodate:
path->uptodate = BTREE_ITER_UPTODATE;
trace_btree_path_traverse_end(trans, path);
out:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
panic("ret %s (%i) trans->restarted %s (%i)\n",
@ -1225,7 +1228,7 @@ static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_i
{
btree_path_idx_t new = btree_path_alloc(trans, src);
btree_path_copy(trans, trans->paths + new, trans->paths + src);
__btree_path_get(trans->paths + new, intent);
__btree_path_get(trans, trans->paths + new, intent);
#ifdef TRACK_PATH_ALLOCATED
trans->paths[new].ip_allocated = ip;
#endif
@ -1236,8 +1239,10 @@ __flatten
btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans,
btree_path_idx_t path, bool intent, unsigned long ip)
{
__btree_path_put(trans->paths + path, intent);
struct btree_path *old = trans->paths + path;
__btree_path_put(trans, trans->paths + path, intent);
path = btree_path_clone(trans, path, intent, ip);
trace_btree_path_clone(trans, old, trans->paths + path);
trans->paths[path].preserve = false;
return path;
}
@ -1252,6 +1257,8 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
bch2_trans_verify_not_in_restart(trans);
EBUG_ON(!trans->paths[path_idx].ref);
trace_btree_path_set_pos(trans, trans->paths + path_idx, &new_pos);
path_idx = bch2_btree_path_make_mut(trans, path_idx, intent, ip);
struct btree_path *path = trans->paths + path_idx;
@ -1361,13 +1368,15 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in
{
struct btree_path *path = trans->paths + path_idx, *dup;
if (!__btree_path_put(path, intent))
if (!__btree_path_put(trans, path, intent))
return;
dup = path->preserve
? have_path_at_pos(trans, path)
: have_node_at_pos(trans, path);
trace_btree_path_free(trans, path_idx, dup);
if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
return;
@ -1392,7 +1401,7 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in
static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path,
bool intent)
{
if (!__btree_path_put(trans->paths + path, intent))
if (!__btree_path_put(trans, trans->paths + path, intent))
return;
__bch2_path_free(trans, path);
@ -1421,8 +1430,8 @@ void __noreturn bch2_trans_unlocked_error(struct btree_trans *trans)
noinline __cold
void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
{
prt_printf(buf, "transaction updates for %s journal seq %llu\n",
trans->fn, trans->journal_res.seq);
prt_printf(buf, "%u transaction updates for %s journal seq %llu\n",
trans->nr_updates, trans->fn, trans->journal_res.seq);
printbuf_indent_add(buf, 2);
trans_for_each_update(trans, i) {
@ -1464,7 +1473,7 @@ static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_tra
{
struct btree_path *path = trans->paths + path_idx;
prt_printf(out, "path: idx %2u ref %u:%u %c %c %c btree=%s l=%u pos ",
prt_printf(out, "path: idx %3u ref %u:%u %c %c %c btree=%s l=%u pos ",
path_idx, path->ref, path->intent_ref,
path->preserve ? 'P' : ' ',
path->should_be_locked ? 'S' : ' ',
@ -1716,14 +1725,16 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
trans->paths[path_pos].cached == cached &&
trans->paths[path_pos].btree_id == btree_id &&
trans->paths[path_pos].level == level) {
__btree_path_get(trans->paths + path_pos, intent);
trace_btree_path_get(trans, trans->paths + path_pos, &pos);
__btree_path_get(trans, trans->paths + path_pos, intent);
path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
path = trans->paths + path_idx;
} else {
path_idx = btree_path_alloc(trans, path_pos);
path = trans->paths + path_idx;
__btree_path_get(path, intent);
__btree_path_get(trans, path, intent);
path->pos = pos;
path->btree_id = btree_id;
path->cached = cached;
@ -1738,6 +1749,8 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
path->ip_allocated = ip;
#endif
trans->paths_sorted = false;
trace_btree_path_alloc(trans, path);
}
if (!(flags & BTREE_ITER_nopreserve))
@ -1857,7 +1870,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
struct btree_path *path = btree_iter_path(trans, iter);
if (btree_path_node(path, path->level))
btree_path_set_should_be_locked(path);
btree_path_set_should_be_locked(trans, path);
return 0;
}
@ -1889,7 +1902,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
btree_path_set_should_be_locked(btree_iter_path(trans, iter));
btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
out:
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
@ -1983,7 +1996,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
btree_path_set_should_be_locked(btree_iter_path(trans, iter));
btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
EBUG_ON(btree_iter_path(trans, iter)->uptodate);
out:
bch2_btree_iter_verify_entry_exit(iter);
@ -2155,7 +2168,7 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
if (unlikely(ret))
return bkey_s_c_err(ret);
btree_path_set_should_be_locked(trans->paths + iter->key_cache_path);
btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u);
if (k.k && !bkey_err(k)) {
@ -2199,7 +2212,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
goto out;
}
btree_path_set_should_be_locked(path);
btree_path_set_should_be_locked(trans, path);
k = btree_path_level_peek_all(trans->c, l, &iter->k);
@ -2326,7 +2339,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
* advance, same as on exit for iter->path, but only up
* to snapshot
*/
__btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
__btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
iter->update_path = iter->path;
iter->update_path = bch2_btree_path_set_pos(trans,
@ -2382,14 +2395,14 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
btree_path_set_should_be_locked(btree_iter_path(trans, iter));
btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
out_no_locked:
if (iter->update_path) {
ret = bch2_btree_path_relock(trans, trans->paths + iter->update_path, _THIS_IP_);
if (unlikely(ret))
k = bkey_s_c_err(ret);
else
btree_path_set_should_be_locked(trans->paths + iter->update_path);
btree_path_set_should_be_locked(trans, trans->paths + iter->update_path);
}
if (!(iter->flags & BTREE_ITER_all_snapshots))
@ -2511,6 +2524,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
iter->flags & BTREE_ITER_intent,
_THIS_IP_);
path = btree_iter_path(trans, iter);
trace_btree_path_save_pos(trans, path, trans->paths + saved_path);
saved_k = *k.k;
saved_v = k.v;
}
@ -2527,7 +2541,7 @@ got_key:
continue;
}
btree_path_set_should_be_locked(path);
btree_path_set_should_be_locked(trans, path);
break;
} else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) {
/* Advance to previous leaf node: */
@ -2685,7 +2699,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
}
}
out:
btree_path_set_should_be_locked(btree_iter_path(trans, iter));
btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
out_no_locked:
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
@ -2712,6 +2726,7 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
return bch2_btree_iter_peek_slot(iter);
}
/* Obsolete, but still used by rust wrapper in -tools */
struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter)
{
struct bkey_s_c k;
@ -2911,9 +2926,9 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
dst->ip_allocated = _RET_IP_;
#endif
if (src->path)
__btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_intent);
__btree_path_get(trans, trans->paths + src->path, src->flags & BTREE_ITER_intent);
if (src->update_path)
__btree_path_get(trans->paths + src->update_path, src->flags & BTREE_ITER_intent);
__btree_path_get(trans, trans->paths + src->update_path, src->flags & BTREE_ITER_intent);
dst->key_cache_path = 0;
}
@ -3237,7 +3252,7 @@ void bch2_trans_put(struct btree_trans *trans)
bch2_trans_unlock(trans);
trans_for_each_update(trans, i)
__btree_path_put(trans->paths + i->path, true);
__btree_path_put(trans, trans->paths + i->path, true);
trans->nr_updates = 0;
check_btree_paths_leaked(trans);


@ -6,6 +6,12 @@
#include "btree_types.h"
#include "trace.h"
void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t);
void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
void bch2_dump_trans_updates(struct btree_trans *);
void bch2_dump_trans_paths_updates(struct btree_trans *);
static inline int __bkey_err(const struct bkey *k)
{
return PTR_ERR_OR_ZERO(k);
@ -13,16 +19,28 @@ static inline int __bkey_err(const struct bkey *k)
#define bkey_err(_k) __bkey_err((_k).k)
static inline void __btree_path_get(struct btree_path *path, bool intent)
static inline void __btree_path_get(struct btree_trans *trans, struct btree_path *path, bool intent)
{
unsigned idx = path - trans->paths;
EBUG_ON(!test_bit(idx, trans->paths_allocated));
if (unlikely(path->ref == U8_MAX)) {
bch2_dump_trans_paths_updates(trans);
panic("path %u refcount overflow\n", idx);
}
path->ref++;
path->intent_ref += intent;
trace_btree_path_get_ll(trans, path);
}
static inline bool __btree_path_put(struct btree_path *path, bool intent)
static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
{
EBUG_ON(!test_bit(path - trans->paths, trans->paths_allocated));
EBUG_ON(!path->ref);
EBUG_ON(!path->intent_ref && intent);
trace_btree_path_put_ll(trans, path);
path->intent_ref -= intent;
return --path->ref == 0;
}
@ -814,20 +832,6 @@ transaction_restart: \
struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
static inline struct bkey_s_c
__bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
struct btree_iter *iter, unsigned flags)
{
struct bkey_s_c k;
while (btree_trans_too_many_iters(trans) ||
(k = bch2_btree_iter_peek_type(iter, flags),
bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
bch2_trans_begin(trans);
return k;
}
#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \
_start, _end, _flags, _k, _ret) \
for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
@ -868,7 +872,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
\
if (bch2_err_matches(_ret, ENOMEM)) { \
_gfp = GFP_KERNEL; \
_ret = drop_locks_do(trans, _do); \
_ret = drop_locks_do(_trans, _do); \
} \
_ret; \
})
@ -881,7 +885,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
_ret = 0; \
if (unlikely(!_p)) { \
_gfp = GFP_KERNEL; \
_ret = drop_locks_do(trans, ((_p = _do), 0)); \
_ret = drop_locks_do(_trans, ((_p = _do), 0)); \
} \
_p; \
})
@ -894,12 +898,6 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
_ret; \
})
void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t);
void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
void bch2_dump_trans_updates(struct btree_trans *);
void bch2_dump_trans_paths_updates(struct btree_trans *);
struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned);
void bch2_trans_put(struct btree_trans *);


@ -79,130 +79,41 @@ static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
return true;
}
static void bkey_cached_evict(struct btree_key_cache *c,
static bool bkey_cached_evict(struct btree_key_cache *c,
struct bkey_cached *ck)
{
BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
bch2_btree_key_cache_params));
memset(&ck->key, ~0, sizeof(ck->key));
bool ret = !rhashtable_remove_fast(&c->table, &ck->hash,
bch2_btree_key_cache_params);
if (ret) {
memset(&ck->key, ~0, sizeof(ck->key));
atomic_long_dec(&c->nr_keys);
}
atomic_long_dec(&c->nr_keys);
return ret;
}
static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu)
{
struct bch_fs *c = container_of(pending->srcu, struct bch_fs, btree_trans_barrier);
struct bkey_cached *ck = container_of(rcu, struct bkey_cached, rcu);
this_cpu_dec(*c->btree_key_cache.nr_pending);
kmem_cache_free(bch2_key_cache, ck);
}
static void bkey_cached_free(struct btree_key_cache *bc,
struct bkey_cached *ck)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
ck->btree_trans_barrier_seq =
start_poll_synchronize_srcu(&c->btree_trans_barrier);
if (ck->c.lock.readers) {
list_move_tail(&ck->list, &bc->freed_pcpu);
bc->nr_freed_pcpu++;
} else {
list_move_tail(&ck->list, &bc->freed_nonpcpu);
bc->nr_freed_nonpcpu++;
}
atomic_long_inc(&bc->nr_freed);
kfree(ck->k);
ck->k = NULL;
ck->u64s = 0;
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
}
#ifdef __KERNEL__
static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
struct bkey_cached *ck)
{
struct bkey_cached *pos;
bc->nr_freed_nonpcpu++;
list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
pos->btree_trans_barrier_seq)) {
list_move(&ck->list, &pos->list);
return;
}
}
list_move(&ck->list, &bc->freed_nonpcpu);
}
#endif
static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
struct bkey_cached *ck)
{
BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
if (!ck->c.lock.readers) {
#ifdef __KERNEL__
struct btree_key_cache_freelist *f;
bool freed = false;
preempt_disable();
f = this_cpu_ptr(bc->pcpu_freed);
if (f->nr < ARRAY_SIZE(f->objs)) {
f->objs[f->nr++] = ck;
freed = true;
}
preempt_enable();
if (!freed) {
mutex_lock(&bc->lock);
preempt_disable();
f = this_cpu_ptr(bc->pcpu_freed);
while (f->nr > ARRAY_SIZE(f->objs) / 2) {
struct bkey_cached *ck2 = f->objs[--f->nr];
__bkey_cached_move_to_freelist_ordered(bc, ck2);
}
preempt_enable();
__bkey_cached_move_to_freelist_ordered(bc, ck);
mutex_unlock(&bc->lock);
}
#else
mutex_lock(&bc->lock);
list_move_tail(&ck->list, &bc->freed_nonpcpu);
bc->nr_freed_nonpcpu++;
mutex_unlock(&bc->lock);
#endif
} else {
mutex_lock(&bc->lock);
list_move_tail(&ck->list, &bc->freed_pcpu);
bc->nr_freed_pcpu++;
mutex_unlock(&bc->lock);
}
}
static void bkey_cached_free_fast(struct btree_key_cache *bc,
struct bkey_cached *ck)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
ck->btree_trans_barrier_seq =
start_poll_synchronize_srcu(&c->btree_trans_barrier);
list_del_init(&ck->list);
atomic_long_inc(&bc->nr_freed);
kfree(ck->k);
ck->k = NULL;
ck->u64s = 0;
bkey_cached_move_to_freelist(bc, ck);
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
bool pcpu_readers = ck->c.lock.readers != NULL;
rcu_pending_enqueue(&bc->pending[pcpu_readers], &ck->rcu);
this_cpu_inc(*bc->nr_pending);
}
static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp)
@ -224,74 +135,14 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k
{
struct bch_fs *c = trans->c;
struct btree_key_cache *bc = &c->btree_key_cache;
struct bkey_cached *ck = NULL;
bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
int ret;
if (!pcpu_readers) {
#ifdef __KERNEL__
struct btree_key_cache_freelist *f;
preempt_disable();
f = this_cpu_ptr(bc->pcpu_freed);
if (f->nr)
ck = f->objs[--f->nr];
preempt_enable();
if (!ck) {
mutex_lock(&bc->lock);
preempt_disable();
f = this_cpu_ptr(bc->pcpu_freed);
while (!list_empty(&bc->freed_nonpcpu) &&
f->nr < ARRAY_SIZE(f->objs) / 2) {
ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
list_del_init(&ck->list);
bc->nr_freed_nonpcpu--;
f->objs[f->nr++] = ck;
}
ck = f->nr ? f->objs[--f->nr] : NULL;
preempt_enable();
mutex_unlock(&bc->lock);
}
#else
mutex_lock(&bc->lock);
if (!list_empty(&bc->freed_nonpcpu)) {
ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
list_del_init(&ck->list);
bc->nr_freed_nonpcpu--;
}
mutex_unlock(&bc->lock);
#endif
} else {
mutex_lock(&bc->lock);
if (!list_empty(&bc->freed_pcpu)) {
ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
list_del_init(&ck->list);
bc->nr_freed_pcpu--;
}
mutex_unlock(&bc->lock);
}
if (ck) {
ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
if (unlikely(ret)) {
bkey_cached_move_to_freelist(bc, ck);
return ERR_PTR(ret);
}
btree_path_cached_set(trans, path, ck, BTREE_NODE_INTENT_LOCKED);
ret = bch2_btree_node_lock_write(trans, path, &ck->c);
if (unlikely(ret)) {
btree_node_unlock(trans, path, 0);
bkey_cached_move_to_freelist(bc, ck);
return ERR_PTR(ret);
}
return ck;
}
struct bkey_cached *ck = container_of_or_null(
rcu_pending_dequeue(&bc->pending[pcpu_readers]),
struct bkey_cached, rcu);
if (ck)
goto lock;
ck = allocate_dropping_locks(trans, ret,
__bkey_cached_alloc(key_u64s, _gfp));
@ -302,15 +153,19 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k
return ERR_PTR(ret);
}
if (!ck)
return NULL;
if (ck) {
bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
ck->c.cached = true;
goto lock;
}
INIT_LIST_HEAD(&ck->list);
bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
ck->c.cached = true;
BUG_ON(!six_trylock_intent(&ck->c.lock));
BUG_ON(!six_trylock_write(&ck->c.lock));
ck = container_of_or_null(rcu_pending_dequeue_from_all(&bc->pending[pcpu_readers]),
struct bkey_cached, rcu);
if (ck)
goto lock;
lock:
six_lock_intent(&ck->c.lock, NULL, NULL);
six_lock_write(&ck->c.lock, NULL, NULL);
return ck;
}
@ -322,21 +177,21 @@ bkey_cached_reuse(struct btree_key_cache *c)
struct bkey_cached *ck;
unsigned i;
mutex_lock(&c->lock);
rcu_read_lock();
tbl = rht_dereference_rcu(c->table.tbl, &c->table);
for (i = 0; i < tbl->size; i++)
rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
bkey_cached_lock_for_evict(ck)) {
bkey_cached_evict(c, ck);
goto out;
if (bkey_cached_evict(c, ck))
goto out;
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
}
}
ck = NULL;
out:
rcu_read_unlock();
mutex_unlock(&c->lock);
return ck;
}
@ -415,7 +270,7 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *
path->uptodate = BTREE_ITER_UPTODATE;
return 0;
err:
bkey_cached_free_fast(bc, ck);
bkey_cached_free(bc, ck);
mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
return ret;
@ -611,8 +466,12 @@ evict:
}
mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
bkey_cached_evict(&c->btree_key_cache, ck);
bkey_cached_free_fast(&c->btree_key_cache, ck);
if (bkey_cached_evict(&c->btree_key_cache, ck)) {
bkey_cached_free(&c->btree_key_cache, ck);
} else {
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
}
}
out:
bch2_trans_iter_exit(trans, &b_iter);
@ -722,7 +581,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
}
bkey_cached_evict(bc, ck);
bkey_cached_free_fast(bc, ck);
bkey_cached_free(bc, ck);
mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
@ -735,48 +594,14 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
struct bch_fs *c = shrink->private_data;
struct btree_key_cache *bc = &c->btree_key_cache;
struct bucket_table *tbl;
struct bkey_cached *ck, *t;
struct bkey_cached *ck;
size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
unsigned start, flags;
unsigned iter, start;
int srcu_idx;
mutex_lock(&bc->lock);
bc->requested_to_free += sc->nr_to_scan;
srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
flags = memalloc_nofs_save();
/*
* Newest freed entries are at the end of the list - once we hit one
* that's too new to be freed, we can bail out:
*/
list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
break;
list_del(&ck->list);
six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
bc->nr_freed_nonpcpu--;
bc->freed++;
}
list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
break;
list_del(&ck->list);
six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
bc->nr_freed_pcpu--;
bc->freed++;
}
rcu_read_lock();
tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
/*
@ -792,17 +617,18 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
return SHRINK_STOP;
}
if (bc->shrink_iter >= tbl->size)
bc->shrink_iter = 0;
start = bc->shrink_iter;
iter = bc->shrink_iter;
if (iter >= tbl->size)
iter = 0;
start = iter;
do {
struct rhash_head *pos, *next;
pos = rht_ptr_rcu(&tbl->buckets[bc->shrink_iter]);
pos = rht_ptr_rcu(&tbl->buckets[iter]);
while (!rht_is_a_nulls(pos)) {
next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
next = rht_dereference_bucket_rcu(pos->next, tbl, iter);
ck = container_of(pos, struct bkey_cached, hash);
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
@ -812,29 +638,31 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
bc->skipped_accessed++;
} else if (!bkey_cached_lock_for_evict(ck)) {
bc->skipped_lock_fail++;
} else {
bkey_cached_evict(bc, ck);
} else if (bkey_cached_evict(bc, ck)) {
bkey_cached_free(bc, ck);
bc->moved_to_freelist++;
bc->freed++;
freed++;
} else {
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
}
scanned++;
if (scanned >= nr)
break;
goto out;
pos = next;
}
bc->shrink_iter++;
if (bc->shrink_iter >= tbl->size)
bc->shrink_iter = 0;
} while (scanned < nr && bc->shrink_iter != start);
iter++;
if (iter >= tbl->size)
iter = 0;
} while (scanned < nr && iter != start);
out:
bc->shrink_iter = iter;
rcu_read_unlock();
memalloc_nofs_restore(flags);
srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
mutex_unlock(&bc->lock);
return freed;
}
@ -862,18 +690,13 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
struct bucket_table *tbl;
struct bkey_cached *ck, *n;
struct bkey_cached *ck;
struct rhash_head *pos;
LIST_HEAD(items);
unsigned i;
#ifdef __KERNEL__
int cpu;
#endif
shrinker_free(bc->shrink);
mutex_lock(&bc->lock);
/*
* The loop is needed to guard against racing with rehash:
*/
@ -892,44 +715,14 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
for (i = 0; i < tbl->size; i++)
while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) {
ck = container_of(pos, struct bkey_cached, hash);
bkey_cached_evict(bc, ck);
list_add(&ck->list, &items);
BUG_ON(!bkey_cached_evict(bc, ck));
kfree(ck->k);
kmem_cache_free(bch2_key_cache, ck);
}
}
rcu_read_unlock();
}
#ifdef __KERNEL__
if (bc->pcpu_freed) {
for_each_possible_cpu(cpu) {
struct btree_key_cache_freelist *f =
per_cpu_ptr(bc->pcpu_freed, cpu);
for (i = 0; i < f->nr; i++) {
ck = f->objs[i];
list_add(&ck->list, &items);
}
}
}
#endif
BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);
list_splice(&bc->freed_pcpu, &items);
list_splice(&bc->freed_nonpcpu, &items);
mutex_unlock(&bc->lock);
list_for_each_entry_safe(ck, n, &items, list) {
cond_resched();
list_del(&ck->list);
kfree(ck->k);
six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
}
if (atomic_long_read(&bc->nr_dirty) &&
!bch2_journal_error(&c->journal) &&
test_bit(BCH_FS_was_rw, &c->flags))
@ -943,14 +736,14 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
if (bc->table_init_done)
rhashtable_destroy(&bc->table);
free_percpu(bc->pcpu_freed);
rcu_pending_exit(&bc->pending[0]);
rcu_pending_exit(&bc->pending[1]);
free_percpu(bc->nr_pending);
}
void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
mutex_init(&c->lock);
INIT_LIST_HEAD(&c->freed_pcpu);
INIT_LIST_HEAD(&c->freed_nonpcpu);
}
int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
@ -958,11 +751,13 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
struct shrinker *shrink;
#ifdef __KERNEL__
bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
if (!bc->pcpu_freed)
bc->nr_pending = alloc_percpu(size_t);
if (!bc->nr_pending)
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) ||
rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free))
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
#endif
if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
@ -984,45 +779,21 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
printbuf_tabstop_push(out, 24);
printbuf_tabstop_push(out, 12);
unsigned flags = memalloc_nofs_save();
mutex_lock(&bc->lock);
prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys));
prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty));
prt_printf(out, "freelist:\t%lu\r\n", atomic_long_read(&bc->nr_freed));
prt_printf(out, "nonpcpu freelist:\t%zu\r\n", bc->nr_freed_nonpcpu);
prt_printf(out, "pcpu freelist:\t%zu\r\n", bc->nr_freed_pcpu);
prt_printf(out, "\nshrinker:\n");
prt_printf(out, "table size:\t%u\r\n", bc->table.tbl->size);
prt_newline(out);
prt_printf(out, "shrinker:\n");
prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free);
prt_printf(out, "freed:\t%lu\r\n", bc->freed);
prt_printf(out, "moved_to_freelist:\t%lu\r\n", bc->moved_to_freelist);
prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty);
prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed);
prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail);
prt_printf(out, "srcu seq:\t%lu\r\n", get_state_synchronize_srcu(&c->btree_trans_barrier));
struct bkey_cached *ck;
unsigned iter = 0;
list_for_each_entry(ck, &bc->freed_nonpcpu, list) {
prt_printf(out, "freed_nonpcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
if (++iter > 10)
break;
}
iter = 0;
list_for_each_entry(ck, &bc->freed_pcpu, list) {
prt_printf(out, "freed_pcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
if (++iter > 10)
break;
}
mutex_unlock(&bc->lock);
memalloc_flags_restore(flags);
prt_newline(out);
prt_printf(out, "pending:\t%zu\r\n", per_cpu_sum(bc->nr_pending));
}
void bch2_btree_key_cache_exit(void)


@ -2,33 +2,25 @@
#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
struct btree_key_cache_freelist {
struct bkey_cached *objs[16];
unsigned nr;
};
#include "rcu_pending.h"
struct btree_key_cache {
struct mutex lock;
struct rhashtable table;
bool table_init_done;
struct list_head freed_pcpu;
size_t nr_freed_pcpu;
struct list_head freed_nonpcpu;
size_t nr_freed_nonpcpu;
struct shrinker *shrink;
unsigned shrink_iter;
struct btree_key_cache_freelist __percpu *pcpu_freed;
atomic_long_t nr_freed;
/* 0: non pcpu reader locks, 1: pcpu reader locks */
struct rcu_pending pending[2];
size_t __percpu *nr_pending;
atomic_long_t nr_keys;
atomic_long_t nr_dirty;
/* shrinker stats */
unsigned long requested_to_free;
unsigned long freed;
unsigned long moved_to_freelist;
unsigned long skipped_dirty;
unsigned long skipped_accessed;
unsigned long skipped_lock_fail;


@ -218,16 +218,17 @@ static inline int __btree_node_lock_nopath(struct btree_trans *trans,
bool lock_may_not_fail,
unsigned long ip)
{
int ret;
trans->lock_may_not_fail = lock_may_not_fail;
trans->lock_must_abort = false;
trans->locking = b;
ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait,
bch2_six_check_for_deadlock, trans, ip);
int ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait,
bch2_six_check_for_deadlock, trans, ip);
WRITE_ONCE(trans->locking, NULL);
WRITE_ONCE(trans->locking_wait.start_time, 0);
if (!ret)
trace_btree_path_lock(trans, _THIS_IP_, b);
return ret;
}
@ -281,6 +282,7 @@ static inline int btree_node_lock(struct btree_trans *trans,
int ret = 0;
EBUG_ON(level >= BTREE_MAX_DEPTH);
bch2_trans_verify_not_unlocked(trans);
if (likely(six_trylock_type(&b->lock, type)) ||
btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) ||
@ -400,12 +402,13 @@ static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
/* misc: */
static inline void btree_path_set_should_be_locked(struct btree_path *path)
static inline void btree_path_set_should_be_locked(struct btree_trans *trans, struct btree_path *path)
{
EBUG_ON(!btree_node_locked(path, path->level));
EBUG_ON(path->uptodate);
path->should_be_locked = true;
trace_btree_path_should_be_locked(trans, path);
}
static inline void __btree_path_set_level_up(struct btree_trans *trans,


@ -214,7 +214,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans,
k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
overwrite:
bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
bch2_bset_insert(b, k, insert, clobber_u64s);
new_u64s = k->u64s;
fix_iter:
if (clobber_u64s != new_u64s)


@ -386,17 +386,16 @@ struct bkey_cached {
struct btree_bkey_cached_common c;
unsigned long flags;
unsigned long btree_trans_barrier_seq;
u16 u64s;
struct bkey_cached_key key;
struct rhash_head hash;
struct list_head list;
struct journal_entry_pin journal;
u64 seq;
struct bkey_i *k;
struct rcu_head rcu;
};
static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)


@ -374,7 +374,7 @@ static noinline int flush_new_cached_update(struct btree_trans *trans,
i->key_cache_already_flushed = true;
i->flags |= BTREE_TRIGGER_norun;
btree_path_set_should_be_locked(btree_path);
btree_path_set_should_be_locked(trans, btree_path);
ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip);
out:
bch2_path_put(trans, path_idx, true);
@ -422,7 +422,9 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
break;
}
if (!cmp && i < trans->updates + trans->nr_updates) {
bool overwrite = !cmp && i < trans->updates + trans->nr_updates;
if (overwrite) {
EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
bch2_path_put(trans, i->path, true);
@ -449,7 +451,9 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
}
}
__btree_path_get(trans->paths + i->path, true);
__btree_path_get(trans, trans->paths + i->path, true);
trace_update_by_path(trans, path, i, overwrite);
/*
* If a key is present in the key cache, it must also exist in the
@ -498,7 +502,7 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
}
btree_path_set_should_be_locked(trans->paths + iter->key_cache_path);
btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
}
return 0;


@ -731,6 +731,18 @@ static void btree_update_nodes_written(struct btree_update *as)
bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
"%s", bch2_err_str(ret));
err:
/*
* Ensure transaction is unlocked before using btree_node_lock_nopath()
* (the use of which is always suspect, we need to work on removing this
* in the future)
*
* It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get()
* calls bch2_path_upgrade(), before we call path_make_mut(), so we may
* rarely end up with a locked path besides the one we have here:
*/
bch2_trans_unlock(trans);
bch2_trans_begin(trans);
/*
* We have to be careful because another thread might be getting ready
* to free as->b and calling btree_update_reparent() on us - we'll
@ -750,18 +762,6 @@ err:
* we're in journal error state:
*/
/*
* Ensure transaction is unlocked before using
* btree_node_lock_nopath() (the use of which is always suspect,
* we need to work on removing this in the future)
*
* It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get()
* calls bch2_path_upgrade(), before we call path_make_mut(), so
* we may rarely end up with a locked path besides the one we
* have here:
*/
bch2_trans_unlock(trans);
bch2_trans_begin(trans);
btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans,
as->btree_id, b->c.level, b->key.k.p);
struct btree_path *path = trans->paths + path_idx;
@ -1981,7 +1981,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
if (ret)
goto err;
btree_path_set_should_be_locked(trans->paths + sib_path);
btree_path_set_should_be_locked(trans, trans->paths + sib_path);
m = trans->paths[sib_path].l[level].b;


@ -159,6 +159,8 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
unsigned level,
unsigned flags)
{
bch2_trans_verify_not_unlocked(trans);
return bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
btree_prev_sib) ?:
bch2_foreground_maybe_merge_sibling(trans, path, level, flags,


@ -740,7 +740,7 @@ static int __trigger_extent(struct btree_trans *trans,
return ret;
} else if (!p.has_ec) {
*replicas_sectors += disk_sectors;
acc_replicas_key.replicas.devs[acc_replicas_key.replicas.nr_devs++] = p.ptr.dev;
replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev);
} else {
ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
if (ret)


@ -80,22 +80,9 @@ static inline void bucket_lock(struct bucket *b)
TASK_UNINTERRUPTIBLE);
}
static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca)
{
return rcu_dereference_check(ca->buckets_gc,
!ca->fs ||
percpu_rwsem_is_held(&ca->fs->mark_lock) ||
lockdep_is_held(&ca->fs->state_lock) ||
lockdep_is_held(&ca->bucket_lock));
}
static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
{
struct bucket_array *buckets = gc_bucket_array(ca);
if (b - buckets->first_bucket >= buckets->nbuckets_minus_first)
return NULL;
return buckets->b + b;
return genradix_ptr(&ca->buckets_gc, b);
}
static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)


@ -19,14 +19,6 @@ struct bucket {
u32 stripe_sectors;
} __aligned(sizeof(long));
struct bucket_array {
struct rcu_head rcu;
u16 first_bucket;
size_t nbuckets;
size_t nbuckets_minus_first;
struct bucket b[];
};
struct bucket_gens {
struct rcu_head rcu;
u16 first_bucket;


@ -20,15 +20,6 @@ static inline void bch2_increment_clock(struct bch_fs *c, u64 sectors,
void bch2_io_clock_schedule_timeout(struct io_clock *, u64);
#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\
({ \
long __ret = timeout; \
might_sleep(); \
if (!___wait_cond_timeout(condition)) \
__ret = __wait_event_timeout(wq, condition, timeout); \
__ret; \
})
void bch2_io_timers_to_text(struct printbuf *, struct io_clock *);
void bch2_io_clock_exit(struct io_clock *);


@ -4,12 +4,12 @@
#include <linux/slab.h>
#include "darray.h"
int __bch2_darray_resize(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
{
if (new_size > d->size) {
new_size = roundup_pow_of_two(new_size);
void *data = kvmalloc_array(new_size, element_size, gfp);
void *data = kvmalloc_array_noprof(new_size, element_size, gfp);
if (!data)
return -ENOMEM;


@ -22,29 +22,23 @@ struct { \
typedef DARRAY(char) darray_char;
typedef DARRAY(char *) darray_str;
int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t);
int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
static inline int __darray_resize(darray_char *d, size_t element_size,
size_t new_size, gfp_t gfp)
{
return unlikely(new_size > d->size)
? __bch2_darray_resize(d, element_size, new_size, gfp)
: 0;
}
#define __bch2_darray_resize(...) alloc_hooks(__bch2_darray_resize_noprof(__VA_ARGS__))
#define __darray_resize(_d, _element_size, _new_size, _gfp) \
(unlikely((_new_size) > (_d)->size) \
? __bch2_darray_resize((_d), (_element_size), (_new_size), (_gfp))\
: 0)
#define darray_resize_gfp(_d, _new_size, _gfp) \
unlikely(__darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp))
__darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp)
#define darray_resize(_d, _new_size) \
darray_resize_gfp(_d, _new_size, GFP_KERNEL)
static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more, gfp_t gfp)
{
return __darray_resize(d, t_size, d->nr + more, gfp);
}
#define darray_make_room_gfp(_d, _more, _gfp) \
__darray_make_room((darray_char *) (_d), sizeof((_d)->data[0]), (_more), _gfp)
darray_resize_gfp((_d), (_d)->nr + (_more), _gfp)
#define darray_make_room(_d, _more) \
darray_make_room_gfp(_d, _more, GFP_KERNEL)


@ -337,6 +337,7 @@ restart_drop_extra_replicas:
printbuf_exit(&buf);
bch2_fatal_error(c);
ret = -EIO;
goto out;
}


@ -552,62 +552,30 @@ static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subv
int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
{
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
subvol_inum target;
u32 snapshot;
struct bkey_buf sk;
int ret;
bch2_bkey_buf_init(&sk);
retry:
bch2_trans_begin(trans);
ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
goto err;
int ret = bch2_trans_run(c,
for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_dirents,
POS(inum.inum, ctx->pos),
POS(inum.inum, U64_MAX),
inum.subvol, 0, k, ({
if (k.k->type != KEY_TYPE_dirent)
continue;
for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
SPOS(inum.inum, ctx->pos, snapshot),
POS(inum.inum, U64_MAX), 0, k, ret) {
if (k.k->type != KEY_TYPE_dirent)
continue;
/* dir_emit() can fault and block: */
bch2_bkey_buf_reassemble(&sk, c, k);
struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k);
/* dir_emit() can fault and block: */
bch2_bkey_buf_reassemble(&sk, c, k);
struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k);
subvol_inum target;
int ret2 = bch2_dirent_read_target(trans, inum, dirent, &target);
if (ret2 > 0)
continue;
ret = bch2_dirent_read_target(trans, inum, dirent, &target);
if (ret < 0)
break;
if (ret)
continue;
ret2 ?: drop_locks_do(trans, bch2_dir_emit(ctx, dirent, target));
})));
/*
* read_target looks up subvolumes, we can overflow paths if the
* directory has many subvolumes in it
*
* XXX: btree_trans_too_many_iters() is something we'd like to
* get rid of, and there's no good reason to be using it here
* except that we don't yet have a for_each_btree_key() helper
* that does subvolume_get_snapshot().
*/
ret = drop_locks_do(trans,
bch2_dir_emit(ctx, dirent, target)) ?:
btree_trans_too_many_iters(trans);
if (ret) {
ret = ret < 0 ? ret : 0;
break;
}
}
bch2_trans_iter_exit(trans, &iter);
err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_put(trans);
bch2_bkey_buf_exit(&sk, c);
return ret;
return ret < 0 ? ret : 0;
}


@ -929,8 +929,29 @@ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
if (p1.ptr.dev == p2.ptr.dev &&
p1.ptr.gen == p2.ptr.gen &&
/*
* This checks that the two pointers point
* to the same region on disk - adjusting
* for the difference in where the extents
* start, since one may have been trimmed:
*/
(s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
(s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
(s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) &&
/*
* This additionally checks that the
* extents overlap on disk, since the
* previous check may trigger spuriously
* when one extent is immediately partially
* overwritten with another extent (so that
* on disk they are adjacent) and
* compression is in use:
*/
((p1.ptr.offset >= p2.ptr.offset &&
p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) ||
(p2.ptr.offset >= p1.ptr.offset &&
p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size)))
return true;
return false;
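
Aside: a standalone sketch of the new on-disk overlap check described in the comments above, written on plain integers; the function and parameter names are hypothetical stand-ins for ptr.offset and crc.compressed_size:

#include <stdbool.h>
#include <stdint.h>

/* True if [off1, off1 + size1) and [off2, off2 + size2) share any sector,
 * mirroring the condition added to bch2_extents_match(). */
static bool dev_ranges_overlap(uint64_t off1, uint64_t size1,
                               uint64_t off2, uint64_t size2)
{
        return (off1 >= off2 && off1 < off2 + size2) ||
               (off2 >= off1 && off2 < off1 + size1);
}

For the adjacent-but-not-overlapping case the comment calls out (one extent immediately followed on disk by another), e.g. dev_ranges_overlap(0, 8, 8, 8) is false, so the spurious match is rejected.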


@ -357,7 +357,7 @@ out: \
__bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \
_ptr, _entry)
#define bkey_crc_next(_k, _start, _end, _crc, _iter) \
#define bkey_crc_next(_k, _end, _crc, _iter) \
({ \
__bkey_extent_entry_for_each_from(_iter, _end, _iter) \
if (extent_entry_is_crc(_iter)) { \
@ -372,7 +372,7 @@ out: \
#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \
for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \
(_iter) = (_start); \
bkey_crc_next(_k, _start, _end, _crc, _iter); \
bkey_crc_next(_k, _end, _crc, _iter); \
(_iter) = extent_entry_next(_iter))
#define bkey_for_each_crc(_k, _p, _crc, _iter) \


@ -151,7 +151,6 @@ static void bchfs_read(struct btree_trans *trans,
struct bkey_buf sk;
int flags = BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE;
u32 snapshot;
int ret = 0;
rbio->c = c;
@ -159,29 +158,23 @@ static void bchfs_read(struct btree_trans *trans,
rbio->subvol = inum.subvol;
bch2_bkey_buf_init(&sk);
retry:
bch2_trans_begin(trans);
iter = (struct btree_iter) { NULL };
ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
goto err;
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
POS(inum.inum, rbio->bio.bi_iter.bi_sector),
BTREE_ITER_slots);
while (1) {
struct bkey_s_c k;
unsigned bytes, sectors, offset_into_extent;
enum btree_id data_btree = BTREE_ID_extents;
/*
* read_extent -> io_time_reset may cause a transaction restart
* without returning an error, we need to check for that here:
*/
ret = bch2_trans_relock(trans);
bch2_trans_begin(trans);
u32 snapshot;
ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
break;
goto err;
bch2_btree_iter_set_snapshot(&iter, snapshot);
bch2_btree_iter_set_pos(&iter,
POS(inum.inum, rbio->bio.bi_iter.bi_sector));
@ -189,7 +182,7 @@ retry:
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
break;
goto err;
offset_into_extent = iter.pos.offset -
bkey_start_offset(k.k);
@ -200,7 +193,7 @@ retry:
ret = bch2_read_indirect_extent(trans, &data_btree,
&offset_into_extent, &sk);
if (ret)
break;
goto err;
k = bkey_i_to_s_c(sk.k);
@ -210,7 +203,7 @@ retry:
ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
extent_partial_reads_expensive(k));
if (ret)
break;
goto err;
}
bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
@ -229,17 +222,13 @@ retry:
swap(rbio->bio.bi_iter.bi_size, bytes);
bio_advance(&rbio->bio, bytes);
ret = btree_trans_too_many_iters(trans);
if (ret)
err:
if (ret &&
!bch2_err_matches(ret, BCH_ERR_transaction_restart))
break;
}
err:
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
if (ret) {
bch_err_inum_offset_ratelimited(c,
iter.pos.inode,
@ -486,7 +475,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
op->nr_replicas = nr_replicas;
op->res.nr_replicas = nr_replicas;
op->write_point = writepoint_hashed(inode->ei_last_dirtied);
op->subvol = inode->ei_subvol;
op->subvol = inode->ei_inum.subvol;
op->pos = POS(inode->v.i_ino, sector);
op->end_io = bch2_writepage_io_done;
op->devs_need_flush = &inode->ei_devs_need_flush;

View File

@ -500,7 +500,7 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
dio->op.target = dio->op.opts.foreground_target;
dio->op.write_point = writepoint_hashed((unsigned long) current);
dio->op.nr_replicas = dio->op.opts.data_replicas;
dio->op.subvol = inode->ei_subvol;
dio->op.subvol = inode->ei_inum.subvol;
dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
dio->op.devs_need_flush = &inode->ei_devs_need_flush;

View File

@ -182,18 +182,11 @@ static void __bch2_folio_set(struct folio *folio,
int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
struct folio **fs, unsigned nr_folios)
{
struct btree_trans *trans;
struct btree_iter iter;
struct bkey_s_c k;
struct bch_folio *s;
u64 offset = folio_sector(fs[0]);
unsigned folio_idx;
u32 snapshot;
bool need_set = false;
int ret;
for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
s = bch2_folio_create(fs[folio_idx], GFP_KERNEL);
for (unsigned folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
struct bch_folio *s = bch2_folio_create(fs[folio_idx], GFP_KERNEL);
if (!s)
return -ENOMEM;
@ -203,53 +196,40 @@ int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
if (!need_set)
return 0;
folio_idx = 0;
trans = bch2_trans_get(c);
retry:
bch2_trans_begin(trans);
unsigned folio_idx = 0;
ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
goto err;
return bch2_trans_run(c,
for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents,
POS(inum.inum, offset),
POS(inum.inum, U64_MAX),
inum.subvol, BTREE_ITER_slots, k, ({
unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
unsigned state = bkey_to_sector_state(k);
for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
SPOS(inum.inum, offset, snapshot),
BTREE_ITER_slots, k, ret) {
unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
unsigned state = bkey_to_sector_state(k);
while (folio_idx < nr_folios) {
struct folio *folio = fs[folio_idx];
u64 folio_start = folio_sector(folio);
u64 folio_end = folio_end_sector(folio);
unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) -
folio_start;
unsigned folio_len = min(k.k->p.offset, folio_end) -
folio_offset - folio_start;
while (folio_idx < nr_folios) {
struct folio *folio = fs[folio_idx];
u64 folio_start = folio_sector(folio);
u64 folio_end = folio_end_sector(folio);
unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) -
folio_start;
unsigned folio_len = min(k.k->p.offset, folio_end) -
folio_offset - folio_start;
BUG_ON(k.k->p.offset < folio_start);
BUG_ON(bkey_start_offset(k.k) > folio_end);
BUG_ON(k.k->p.offset < folio_start);
BUG_ON(bkey_start_offset(k.k) > folio_end);
if (!bch2_folio(folio)->uptodate)
__bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
if (!bch2_folio(folio)->uptodate)
__bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
if (k.k->p.offset < folio_end)
break;
folio_idx++;
}
if (k.k->p.offset < folio_end)
if (folio_idx == nr_folios)
break;
folio_idx++;
}
if (folio_idx == nr_folios)
break;
}
offset = iter.pos.offset;
bch2_trans_iter_exit(trans, &iter);
err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_put(trans);
return ret;
0;
})));
}
void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)

View File

@ -99,9 +99,7 @@ static inline void bch2_folio_release(struct folio *folio)
static inline struct bch_folio *__bch2_folio(struct folio *folio)
{
return folio_has_private(folio)
? (struct bch_folio *) folio_get_private(folio)
: NULL;
return folio_get_private(folio);
}
static inline struct bch_folio *bch2_folio(struct folio *folio)

View File

@ -221,30 +221,11 @@ static inline int range_has_data(struct bch_fs *c, u32 subvol,
struct bpos start,
struct bpos end)
{
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
retry:
bch2_trans_begin(trans);
ret = bch2_subvolume_get_snapshot(trans, subvol, &start.snapshot);
if (ret)
goto err;
for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, start, end, 0, k, ret)
if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) {
ret = 1;
break;
}
start = iter.pos;
bch2_trans_iter_exit(trans, &iter);
err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_put(trans);
return ret;
return bch2_trans_run(c,
for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, start, end,
subvol, 0, k, ({
bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k);
})));
}
static int __bch2_truncate_folio(struct bch_inode_info *inode,
@ -267,7 +248,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode,
* XXX: we're doing two index lookups when we end up reading the
* folio
*/
ret = range_has_data(c, inode->ei_subvol,
ret = range_has_data(c, inode->ei_inum.subvol,
POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)),
POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS));
if (ret <= 0)
@ -618,7 +599,7 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
bch2_trans_begin(trans);
ret = bch2_subvolume_get_snapshot(trans,
inode->ei_subvol, &snapshot);
inode->ei_inum.subvol, &snapshot);
if (ret)
goto bkey_err;
@ -813,41 +794,23 @@ static int quota_reserve_range(struct bch_inode_info *inode,
u64 start, u64 end)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
u32 snapshot;
u64 sectors = end - start;
u64 pos = start;
int ret;
retry:
bch2_trans_begin(trans);
ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
if (ret)
goto err;
int ret = bch2_trans_run(c,
for_each_btree_key_in_subvolume_upto(trans, iter,
BTREE_ID_extents,
POS(inode->v.i_ino, start),
POS(inode->v.i_ino, end - 1),
inode->ei_inum.subvol, 0, k, ({
if (bkey_extent_is_allocation(k.k)) {
u64 s = min(end, k.k->p.offset) -
max(start, bkey_start_offset(k.k));
BUG_ON(s > sectors);
sectors -= s;
}
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
SPOS(inode->v.i_ino, pos, snapshot), 0);
while (!(ret = btree_trans_too_many_iters(trans)) &&
(k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k &&
!(ret = bkey_err(k))) {
if (bkey_extent_is_allocation(k.k)) {
u64 s = min(end, k.k->p.offset) -
max(start, bkey_start_offset(k.k));
BUG_ON(s > sectors);
sectors -= s;
}
bch2_btree_iter_advance(&iter);
}
pos = iter.pos.offset;
bch2_trans_iter_exit(trans, &iter);
err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_put(trans);
0;
})));
return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true);
}
@ -942,42 +905,25 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct btree_trans *trans;
struct btree_iter iter;
struct bkey_s_c k;
subvol_inum inum = inode_inum(inode);
u64 isize, next_data = MAX_LFS_FILESIZE;
u32 snapshot;
int ret;
isize = i_size_read(&inode->v);
if (offset >= isize)
return -ENXIO;
trans = bch2_trans_get(c);
retry:
bch2_trans_begin(trans);
ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
goto err;
for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents,
SPOS(inode->v.i_ino, offset >> 9, snapshot),
POS(inode->v.i_ino, U64_MAX),
0, k, ret) {
if (bkey_extent_is_data(k.k)) {
next_data = max(offset, bkey_start_offset(k.k) << 9);
break;
} else if (k.k->p.offset >> 9 > isize)
break;
}
bch2_trans_iter_exit(trans, &iter);
err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_put(trans);
int ret = bch2_trans_run(c,
for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents,
POS(inode->v.i_ino, offset >> 9),
POS(inode->v.i_ino, U64_MAX),
inum.subvol, 0, k, ({
if (bkey_extent_is_data(k.k)) {
next_data = max(offset, bkey_start_offset(k.k) << 9);
break;
} else if (k.k->p.offset >> 9 > isize)
break;
0;
})));
if (ret)
return ret;
@ -995,50 +941,34 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct btree_trans *trans;
struct btree_iter iter;
struct bkey_s_c k;
subvol_inum inum = inode_inum(inode);
u64 isize, next_hole = MAX_LFS_FILESIZE;
u32 snapshot;
int ret;
isize = i_size_read(&inode->v);
if (offset >= isize)
return -ENXIO;
trans = bch2_trans_get(c);
retry:
bch2_trans_begin(trans);
ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
goto err;
for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
SPOS(inode->v.i_ino, offset >> 9, snapshot),
BTREE_ITER_slots, k, ret) {
if (k.k->p.inode != inode->v.i_ino) {
next_hole = bch2_seek_pagecache_hole(&inode->v,
offset, MAX_LFS_FILESIZE, 0, false);
break;
} else if (!bkey_extent_is_data(k.k)) {
next_hole = bch2_seek_pagecache_hole(&inode->v,
max(offset, bkey_start_offset(k.k) << 9),
k.k->p.offset << 9, 0, false);
if (next_hole < k.k->p.offset << 9)
int ret = bch2_trans_run(c,
for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents,
POS(inode->v.i_ino, offset >> 9),
POS(inode->v.i_ino, U64_MAX),
inum.subvol, BTREE_ITER_slots, k, ({
if (k.k->p.inode != inode->v.i_ino) {
next_hole = bch2_seek_pagecache_hole(&inode->v,
offset, MAX_LFS_FILESIZE, 0, false);
break;
} else {
offset = max(offset, bkey_start_offset(k.k) << 9);
}
}
bch2_trans_iter_exit(trans, &iter);
err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
} else if (!bkey_extent_is_data(k.k)) {
next_hole = bch2_seek_pagecache_hole(&inode->v,
max(offset, bkey_start_offset(k.k) << 9),
k.k->p.offset << 9, 0, false);
bch2_trans_put(trans);
if (next_hole < k.k->p.offset << 9)
break;
} else {
offset = max(offset, bkey_start_offset(k.k) << 9);
}
0;
})));
if (ret)
return ret;

View File

@ -100,7 +100,7 @@ static int bch2_ioc_setflags(struct bch_fs *c,
}
mutex_lock(&inode->ei_update_lock);
ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
bch2_write_inode(c, inode, bch2_inode_flags_set, &s,
ATTR_CTIME);
mutex_unlock(&inode->ei_update_lock);
@ -184,7 +184,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c,
}
mutex_lock(&inode->ei_update_lock);
ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
bch2_set_projid(c, inode, fa.fsx_projid) ?:
bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
ATTR_CTIME);

View File

@ -108,7 +108,7 @@ retry:
goto retry;
bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
"%s: inode %u:%llu not found when updating",
"%s: inode %llu:%llu not found when updating",
bch2_err_str(ret),
inode_inum(inode).subvol,
inode_inum(inode).inum);
@ -152,42 +152,101 @@ int bch2_fs_quota_transfer(struct bch_fs *c,
return ret;
}
static int bch2_iget5_test(struct inode *vinode, void *p)
static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
{
struct bch_inode_info *inode = to_bch_ei(vinode);
subvol_inum *inum = p;
return inode->ei_subvol == inum->subvol &&
inode->ei_inode.bi_inum == inum->inum;
return a.subvol == b.subvol && a.inum == b.inum;
}
static int bch2_iget5_set(struct inode *vinode, void *p)
static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg,
const void *obj)
{
struct bch_inode_info *inode = to_bch_ei(vinode);
subvol_inum *inum = p;
const struct bch_inode_info *inode = obj;
const subvol_inum *v = arg->key;
inode->v.i_ino = inum->inum;
inode->ei_subvol = inum->subvol;
inode->ei_inode.bi_inum = inum->inum;
return 0;
return !subvol_inum_eq(inode->ei_inum, *v);
}
static unsigned bch2_inode_hash(subvol_inum inum)
static const struct rhashtable_params bch2_vfs_inodes_params = {
.head_offset = offsetof(struct bch_inode_info, hash),
.key_offset = offsetof(struct bch_inode_info, ei_inum),
.key_len = sizeof(subvol_inum),
.obj_cmpfn = bch2_vfs_inode_cmp_fn,
.automatic_shrinking = true,
};
static void __wait_on_freeing_inode(struct inode *inode)
{
return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
wait_queue_head_t *wq;
DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
wq = bit_waitqueue(&inode->i_state, __I_NEW);
prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
spin_unlock(&inode->i_lock);
schedule();
finish_wait(wq, &wait.wq_entry);
}
static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode)
static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans,
subvol_inum inum)
{
subvol_inum inum = inode_inum(inode);
struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v,
bch2_inode_hash(inum),
bch2_iget5_test,
bch2_iget5_set,
&inum));
BUG_ON(!old);
struct bch_inode_info *inode;
repeat:
inode = rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
if (inode) {
spin_lock(&inode->v.i_lock);
if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) {
spin_unlock(&inode->v.i_lock);
return NULL;
}
if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) {
if (!trans) {
__wait_on_freeing_inode(&inode->v);
} else {
bch2_trans_unlock(trans);
__wait_on_freeing_inode(&inode->v);
int ret = bch2_trans_relock(trans);
if (ret)
return ERR_PTR(ret);
}
goto repeat;
}
__iget(&inode->v);
spin_unlock(&inode->v.i_lock);
}
return inode;
}
static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode)
{
spin_lock(&inode->v.i_lock);
bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags);
spin_unlock(&inode->v.i_lock);
if (remove) {
int ret = rhashtable_remove_fast(&c->vfs_inodes_table,
&inode->hash, bch2_vfs_inodes_params);
BUG_ON(ret);
inode->v.i_hash.pprev = NULL;
}
}
static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
struct btree_trans *trans,
struct bch_inode_info *inode)
{
struct bch_inode_info *old = inode;
set_bit(EI_INODE_HASHED, &inode->ei_flags);
retry:
if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table,
&inode->hash,
bch2_vfs_inodes_params))) {
old = bch2_inode_hash_find(c, trans, inode->ei_inum);
if (!old)
goto retry;
clear_bit(EI_INODE_HASHED, &inode->ei_flags);
if (unlikely(old != inode)) {
/*
* bcachefs doesn't use I_NEW; we have no use for it since we
* only insert fully created inodes in the inode hash table. But
@ -201,21 +260,17 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino
*/
set_nlink(&inode->v, 1);
discard_new_inode(&inode->v);
inode = old;
return old;
} else {
inode_fake_hash(&inode->v);
inode_sb_list_add(&inode->v);
mutex_lock(&c->vfs_inodes_lock);
list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
mutex_unlock(&c->vfs_inodes_lock);
/*
* Again, I_NEW makes no sense for bcachefs. This is only needed
* for clearing I_NEW, but since the inode was already fully
* created and initialized we didn't actually want
* inode_insert5() to set it for us.
*/
unlock_new_inode(&inode->v);
return inode;
}
return inode;
}
#define memalloc_flags_do(_flags, _do) \
@ -233,7 +288,8 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
{
struct bch_inode_info *inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb,
bch2_inode_cache, GFP_NOFS);
if (!inode)
return NULL;
@ -275,13 +331,24 @@ static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
return inode;
}
static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans,
subvol_inum inum,
struct bch_inode_unpacked *bi,
struct bch_subvolume *subvol)
{
struct bch_inode_info *inode = bch2_new_inode(trans);
if (IS_ERR(inode))
return inode;
bch2_vfs_inode_init(trans, inum, inode, bi, subvol);
return bch2_inode_hash_insert(trans->c, trans, inode);
}
struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
{
struct bch_inode_info *inode =
to_bch_ei(ilookup5_nowait(c->vfs_sb,
bch2_inode_hash(inum),
bch2_iget5_test,
&inum));
struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum);
if (inode)
return &inode->v;
@ -292,11 +359,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
int ret = lockrestart_do(trans,
bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
if (!ret) {
bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
inode = bch2_inode_insert(c, inode);
}
PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
bch2_trans_put(trans);
return ret ? ERR_PTR(ret) : &inode->v;
@ -317,6 +380,8 @@ __bch2_create(struct mnt_idmap *idmap,
subvol_inum inum;
struct bch_subvolume subvol;
u64 journal_seq = 0;
kuid_t kuid;
kgid_t kgid;
int ret;
/*
@ -343,13 +408,15 @@ __bch2_create(struct mnt_idmap *idmap,
retry:
bch2_trans_begin(trans);
ret = bch2_subvol_is_ro_trans(trans, dir->ei_subvol) ?:
kuid = mapped_fsuid(idmap, i_user_ns(&dir->v));
kgid = mapped_fsgid(idmap, i_user_ns(&dir->v));
ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?:
bch2_create_trans(trans,
inode_inum(dir), &dir_u, &inode_u,
!(flags & BCH_CREATE_TMPFILE)
? &dentry->d_name : NULL,
from_kuid(i_user_ns(&dir->v), current_fsuid()),
from_kgid(i_user_ns(&dir->v), current_fsgid()),
from_kuid(i_user_ns(&dir->v), kuid),
from_kgid(i_user_ns(&dir->v), kgid),
mode, rdev,
default_acl, acl, snapshot_src, flags) ?:
bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
@ -357,7 +424,7 @@ retry:
if (unlikely(ret))
goto err_before_quota;
inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol;
inum.inum = inode_u.bi_inum;
ret = bch2_subvolume_get(trans, inum.subvol, true,
@ -387,8 +454,16 @@ err_before_quota:
* we must insert the new inode into the inode cache before calling
* bch2_trans_exit() and dropping locks, else we could race with another
* thread pulling the inode in and modifying it:
*
* also, calling bch2_inode_hash_insert() without passing in the
* transaction object is sketchy - if we could ever end up in
* __wait_on_freeing_inode(), we'd risk deadlock.
*
* But that shouldn't be possible, since we still have the inode locked
* that we just created, and we _really_ can't take a transaction
* restart here.
*/
inode = bch2_inode_insert(c, inode);
inode = bch2_inode_hash_insert(c, NULL, inode);
bch2_trans_put(trans);
err:
posix_acl_release(default_acl);
@ -428,11 +503,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
if (ret)
goto err;
struct bch_inode_info *inode =
to_bch_ei(ilookup5_nowait(c->vfs_sb,
bch2_inode_hash(inum),
bch2_iget5_test,
&inum));
struct bch_inode_info *inode = bch2_inode_hash_find(c, trans, inum);
if (inode)
goto out;
@ -440,7 +511,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
struct bch_inode_unpacked inode_u;
ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
c, "dirent to missing inode:\n %s",
@ -460,9 +531,6 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
ret = -ENOENT;
goto err;
}
bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
inode = bch2_inode_insert(c, inode);
out:
bch2_trans_iter_exit(trans, &dirent_iter);
printbuf_exit(&buf);
@ -549,8 +617,8 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
lockdep_assert_held(&inode->v.i_rwsem);
ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
bch2_subvol_is_ro(c, inode->ei_subvol) ?:
ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
__bch2_link(c, inode, dir, dentry);
if (unlikely(ret))
return bch2_err_class(ret);
@ -606,7 +674,7 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
struct bch_inode_info *dir = to_bch_ei(vdir);
struct bch_fs *c = dir->v.i_sb->s_fs_info;
int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
__bch2_unlink(vdir, dentry, false);
return bch2_err_class(ret);
}
@ -689,8 +757,8 @@ static int bch2_rename2(struct mnt_idmap *idmap,
trans = bch2_trans_get(c);
ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_subvol) ?:
bch2_subvol_is_ro_trans(trans, dst_dir->ei_subvol);
ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?:
bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol);
if (ret)
goto err;
@ -771,11 +839,17 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap,
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
unsigned int ia_valid = attr->ia_valid;
kuid_t kuid;
kgid_t kgid;
if (ia_valid & ATTR_UID)
bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
if (ia_valid & ATTR_GID)
bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
if (ia_valid & ATTR_UID) {
kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid);
}
if (ia_valid & ATTR_GID) {
kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid);
}
if (ia_valid & ATTR_SIZE)
bi->bi_size = attr->ia_size;
@ -790,11 +864,11 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap,
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
kgid_t gid = ia_valid & ATTR_GID
? attr->ia_gid
? kgid
: inode->v.i_gid;
if (!in_group_p(gid) &&
!capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID))
if (!in_group_or_capable(idmap, &inode->v,
make_vfsgid(idmap, i_user_ns(&inode->v), gid)))
mode &= ~S_ISGID;
bi->bi_mode = mode;
}
@ -810,17 +884,23 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap,
struct btree_iter inode_iter = { NULL };
struct bch_inode_unpacked inode_u;
struct posix_acl *acl = NULL;
kuid_t kuid;
kgid_t kgid;
int ret;
mutex_lock(&inode->ei_update_lock);
qid = inode->ei_qid;
if (attr->ia_valid & ATTR_UID)
qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
if (attr->ia_valid & ATTR_UID) {
kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid);
}
if (attr->ia_valid & ATTR_GID)
qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
if (attr->ia_valid & ATTR_GID) {
kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid);
}
ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
KEY_TYPE_QUOTA_PREALLOC);
@ -876,13 +956,15 @@ static int bch2_getattr(struct mnt_idmap *idmap,
{
struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
struct bch_fs *c = inode->v.i_sb->s_fs_info;
vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v);
vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v);
stat->dev = inode->v.i_sb->s_dev;
stat->ino = inode->v.i_ino;
stat->mode = inode->v.i_mode;
stat->nlink = inode->v.i_nlink;
stat->uid = inode->v.i_uid;
stat->gid = inode->v.i_gid;
stat->uid = vfsuid_into_kuid(vfsuid);
stat->gid = vfsgid_into_kgid(vfsgid);
stat->rdev = inode->v.i_rdev;
stat->size = i_size_read(&inode->v);
stat->atime = inode_get_atime(&inode->v);
@ -891,7 +973,7 @@ static int bch2_getattr(struct mnt_idmap *idmap,
stat->blksize = block_bytes(c);
stat->blocks = inode->v.i_blocks;
stat->subvol = inode->ei_subvol;
stat->subvol = inode->ei_inum.subvol;
stat->result_mask |= STATX_SUBVOL;
if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) {
@ -933,7 +1015,7 @@ static int bch2_setattr(struct mnt_idmap *idmap,
lockdep_assert_held(&inode->v.i_rwsem);
ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
setattr_prepare(idmap, dentry, iattr);
if (ret)
return ret;
@ -1026,7 +1108,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
struct bkey_buf cur, prev;
unsigned offset_into_extent, sectors;
bool have_extent = false;
u32 snapshot;
int ret = 0;
ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
@ -1042,21 +1123,30 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
bch2_bkey_buf_init(&cur);
bch2_bkey_buf_init(&prev);
trans = bch2_trans_get(c);
retry:
bch2_trans_begin(trans);
ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot);
if (ret)
goto err;
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
SPOS(ei->v.i_ino, start, snapshot), 0);
POS(ei->v.i_ino, start), 0);
while (!(ret = btree_trans_too_many_iters(trans)) &&
(k = bch2_btree_iter_peek_upto(&iter, end)).k &&
!(ret = bkey_err(k))) {
while (true) {
enum btree_id data_btree = BTREE_ID_extents;
bch2_trans_begin(trans);
u32 snapshot;
ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot);
if (ret)
goto err;
bch2_btree_iter_set_snapshot(&iter, snapshot);
k = bch2_btree_iter_peek_upto(&iter, end);
ret = bkey_err(k);
if (ret)
goto err;
if (!k.k)
break;
if (!bkey_extent_is_data(k.k) &&
k.k->type != KEY_TYPE_reservation) {
bch2_btree_iter_advance(&iter);
@ -1100,16 +1190,12 @@ retry:
bch2_btree_iter_set_pos(&iter,
POS(iter.pos.inode, iter.pos.offset + sectors));
ret = bch2_trans_relock(trans);
if (ret)
err:
if (ret &&
!bch2_err_matches(ret, BCH_ERR_transaction_restart))
break;
}
start = iter.pos.offset;
bch2_trans_iter_exit(trans, &iter);
err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
if (!ret && have_extent) {
bch2_trans_unlock(trans);
@ -1165,7 +1251,7 @@ static int bch2_open(struct inode *vinode, struct file *file)
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
int ret = bch2_subvol_is_ro(c, inode->ei_subvol);
int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol);
if (ret)
return ret;
}
@ -1297,8 +1383,8 @@ static int bcachefs_fid_valid(int fh_len, int fh_type)
static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
{
return (struct bcachefs_fid) {
.inum = inode->ei_inode.bi_inum,
.subvol = inode->ei_subvol,
.inum = inode->ei_inum.inum,
.subvol = inode->ei_inum.subvol,
.gen = inode->ei_inode.bi_generation,
};
}
@ -1383,7 +1469,7 @@ static struct dentry *bch2_get_parent(struct dentry *child)
struct bch_fs *c = inode->v.i_sb->s_fs_info;
subvol_inum parent_inum = {
.subvol = inode->ei_inode.bi_parent_subvol ?:
inode->ei_subvol,
inode->ei_inum.subvol,
.inum = inode->ei_inode.bi_dir,
};
@ -1419,7 +1505,7 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child
retry:
bch2_trans_begin(trans);
ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot);
ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot);
if (ret)
goto err;
@ -1450,8 +1536,7 @@ retry:
if (ret)
goto err;
if (target.subvol == inode->ei_subvol &&
target.inum == inode->ei_inode.bi_inum)
if (subvol_inum_eq(target, inode->ei_inum))
goto found;
} else {
/*
@ -1472,8 +1557,7 @@ retry:
if (ret)
continue;
if (target.subvol == inode->ei_subvol &&
target.inum == inode->ei_inode.bi_inum)
if (subvol_inum_eq(target, inode->ei_inum))
goto found;
}
}
@ -1505,12 +1589,15 @@ static const struct export_operations bch_export_ops = {
.get_name = bch2_get_name,
};
static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
static void bch2_vfs_inode_init(struct btree_trans *trans,
subvol_inum inum,
struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
struct bch_subvolume *subvol)
{
bch2_iget5_set(&inode->v, &inum);
inode->v.i_ino = inum.inum;
inode->ei_inum = inum;
inode->ei_inode.bi_inum = inum.inum;
bch2_inode_update_after_write(trans, inode, bi, ~0);
inode->v.i_blocks = bi->bi_sectors;
@ -1522,7 +1609,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
inode->ei_flags = 0;
inode->ei_quota_reserved = 0;
inode->ei_qid = bch_qid(bi);
inode->ei_subvol = inum.subvol;
if (BCH_SUBVOLUME_SNAP(subvol))
set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
@ -1590,6 +1676,12 @@ static void bch2_evict_inode(struct inode *vinode)
struct bch_fs *c = vinode->i_sb->s_fs_info;
struct bch_inode_info *inode = to_bch_ei(vinode);
/*
* evict() has waited for outstanding writeback, we'll do no more IO
* through this inode: it's safe to remove from VFS inode hashtable here
*/
bch2_inode_hash_remove(c, inode);
truncate_inode_pages_final(&inode->v.i_data);
clear_inode(&inode->v);
@ -1631,7 +1723,7 @@ again:
mutex_lock(&c->vfs_inodes_lock);
list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
if (!snapshot_list_has_id(s, inode->ei_subvol))
if (!snapshot_list_has_id(s, inode->ei_inum.subvol))
continue;
if (!(inode->v.i_state & I_DONTCACHE) &&
@ -2119,12 +2211,23 @@ static int bch2_init_fs_context(struct fs_context *fc)
return 0;
}
void bch2_fs_vfs_exit(struct bch_fs *c)
{
if (c->vfs_inodes_table.tbl)
rhashtable_destroy(&c->vfs_inodes_table);
}
int bch2_fs_vfs_init(struct bch_fs *c)
{
return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params);
}
static struct file_system_type bcache_fs_type = {
.owner = THIS_MODULE,
.name = "bcachefs",
.init_fs_context = bch2_init_fs_context,
.kill_sb = bch2_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
.fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("bcachefs");
@ -2139,7 +2242,8 @@ int __init bch2_vfs_init(void)
{
int ret = -ENOMEM;
bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT);
bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT |
SLAB_ACCOUNT);
if (!bch2_inode_cache)
goto err;

View File

@ -13,6 +13,9 @@
struct bch_inode_info {
struct inode v;
struct rhash_head hash;
subvol_inum ei_inum;
struct list_head ei_vfs_inode_list;
unsigned long ei_flags;
@ -24,8 +27,6 @@ struct bch_inode_info {
struct mutex ei_quota_lock;
struct bch_qid ei_qid;
u32 ei_subvol;
/*
* When we've been doing nocow writes we'll need to issue flushes to the
* underlying block devices
@ -50,10 +51,7 @@ struct bch_inode_info {
static inline subvol_inum inode_inum(struct bch_inode_info *inode)
{
return (subvol_inum) {
.subvol = inode->ei_subvol,
.inum = inode->ei_inode.bi_inum,
};
return inode->ei_inum;
}
/*
@ -67,6 +65,7 @@ static inline subvol_inum inode_inum(struct bch_inode_info *inode)
* those:
*/
#define EI_INODE_SNAPSHOT 1
#define EI_INODE_HASHED 2
#define to_bch_ei(_inode) \
container_of_or_null(_inode, struct bch_inode_info, v)
@ -187,6 +186,9 @@ int __bch2_unlink(struct inode *, struct dentry *, bool);
void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *);
void bch2_fs_vfs_exit(struct bch_fs *);
int bch2_fs_vfs_init(struct bch_fs *);
void bch2_vfs_exit(void);
int bch2_vfs_init(void);
@ -196,6 +198,10 @@ int bch2_vfs_init(void);
static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
snapshot_id_list *s) {}
static inline void bch2_fs_vfs_exit(struct bch_fs *c) {}
static inline int bch2_fs_vfs_init(struct bch_fs *c) { return 0; }
static inline void bch2_vfs_exit(void) {}
static inline int bch2_vfs_init(void) { return 0; }

View File

@ -365,7 +365,7 @@ int bch2_inode_peek(struct btree_trans *trans,
subvol_inum inum, unsigned flags)
{
int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags);
bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum);
bch_err_msg(trans->c, ret, "looking up inum %llu:%llu:", inum.subvol, inum.inum);
return ret;
}

View File

@ -286,7 +286,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans,
*/
bool promote_full = (failed ||
*read_full ||
READ_ONCE(c->promote_whole_extents));
READ_ONCE(c->opts.promote_whole_extents));
/* data might have to be decompressed in the write path: */
unsigned sectors = promote_full
? max(pick->crc.compressed_size, pick->crc.live_size)
@ -1214,10 +1214,6 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
swap(bvec_iter.bi_size, bytes);
bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
ret = btree_trans_too_many_iters(trans);
if (ret)
goto err;
err:
if (ret &&
!bch2_err_matches(ret, BCH_ERR_transaction_restart) &&

View File

@ -1353,6 +1353,7 @@ int bch2_journal_read(struct bch_fs *c,
genradix_for_each(&c->journal_entries, radix_iter, _i) {
struct bch_replicas_padded replicas = {
.e.data_type = BCH_DATA_journal,
.e.nr_devs = 0,
.e.nr_required = 1,
};
@ -1379,7 +1380,7 @@ int bch2_journal_read(struct bch_fs *c,
goto err;
darray_for_each(i->ptrs, ptr)
replicas.e.devs[replicas.e.nr_devs++] = ptr->dev;
replicas_entry_add_dev(&replicas.e, ptr->dev);
bch2_replicas_entry_sort(&replicas.e);
@ -1950,7 +1951,8 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
if (error ||
w->noflush ||
(!w->must_flush &&
(jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
time_before(jiffies, j->last_flush_write +
msecs_to_jiffies(c->opts.journal_flush_delay)) &&
test_bit(JOURNAL_may_skip_flush, &j->flags))) {
w->noflush = true;
SET_JSET_NO_FLUSH(w->data, true);
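
The hunk above replaces an open-coded jiffies comparison with time_before(), the wrap-safe helper from <linux/jiffies.h>. A minimal sketch of the pattern; the helper below is illustrative, not code from the tree:

#include <linux/jiffies.h>

/* Illustrative helper: wrap-safe "has the flush delay elapsed?" check. */
static bool flush_delay_elapsed(unsigned long last_flush, unsigned int delay_ms)
{
	/*
	 * time_before() compares via signed subtraction, so it stays
	 * correct across jiffies wraparound:
	 */
	return !time_before(jiffies, last_flush + msecs_to_jiffies(delay_ms));
}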

View File

@ -230,6 +230,8 @@ const struct bch_option bch2_opt_table[] = {
#define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \
.min = 0, .max = U64_MAX, \
.choices = _choices
#define OPT_BITFIELD(_choices) .type = BCH_OPT_BITFIELD, \
.choices = _choices
#define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn
#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \
@ -376,6 +378,13 @@ int bch2_opt_parse(struct bch_fs *c,
*res = ret;
break;
case BCH_OPT_BITFIELD: {
s64 v = bch2_read_flag_list(val, opt->choices);
if (v < 0)
return v;
*res = v;
break;
}
case BCH_OPT_FN:
ret = opt->fn.parse(c, val, res, err);
@ -608,10 +617,20 @@ int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb)
return 0;
}
void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v)
struct bch_dev_sb_opt_set {
void (*set_sb)(struct bch_member *, u64);
};
static const struct bch_dev_sb_opt_set bch2_dev_sb_opt_setters[] = {
#define x(n, set) [Opt_##n] = { .set_sb = SET_##set },
BCH_DEV_OPT_SETTERS()
#undef x
};
void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx,
const struct bch_option *opt, u64 v)
{
if (opt->set_sb == SET_BCH2_NO_SB_OPT)
return;
enum bch_opt_id id = opt - bch2_opt_table;
if (opt->flags & OPT_SB_FIELD_SECTORS)
v >>= 9;
@ -619,16 +638,35 @@ void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v)
if (opt->flags & OPT_SB_FIELD_ILOG2)
v = ilog2(v);
opt->set_sb(sb, v);
if (opt->flags & OPT_SB_FIELD_ONE_BIAS)
v++;
if (opt->flags & OPT_FS) {
if (opt->set_sb != SET_BCH2_NO_SB_OPT)
opt->set_sb(sb, v);
}
if ((opt->flags & OPT_DEVICE) && dev_idx >= 0) {
if (WARN(!bch2_member_exists(sb, dev_idx),
"tried to set device option %s on nonexistent device %i",
opt->attr.name, dev_idx))
return;
struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx);
const struct bch_dev_sb_opt_set *set = bch2_dev_sb_opt_setters + id;
if (set->set_sb)
set->set_sb(m, v);
else
pr_err("option %s cannot be set via opt_set_sb()", opt->attr.name);
}
}
void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v)
void bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca,
const struct bch_option *opt, u64 v)
{
if (opt->set_sb == SET_BCH2_NO_SB_OPT)
return;
mutex_lock(&c->sb_lock);
__bch2_opt_set_sb(c->disk_sb.sb, opt, v);
__bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
}
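
With the new signature, device options can be written into the member section of the superblock by passing a struct bch_dev. A hypothetical caller (not tree code) setting a device's durability might look like:

/* Hypothetical caller: write a member device's durability to the superblock. */
static void demo_set_durability(struct bch_fs *c, struct bch_dev *ca, u64 v)
{
	/*
	 * Passing ca routes the value through the per-device setter
	 * (SET_BCH_MEMBER_DURABILITY, via BCH_DEV_OPT_SETTERS()):
	 */
	bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_durability, v);
}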

View File

@ -53,23 +53,25 @@ void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64);
/* When can be set: */
enum opt_flags {
OPT_FS = (1 << 0), /* Filesystem option */
OPT_DEVICE = (1 << 1), /* Device option */
OPT_INODE = (1 << 2), /* Inode option */
OPT_FORMAT = (1 << 3), /* May be specified at format time */
OPT_MOUNT = (1 << 4), /* May be specified at mount time */
OPT_RUNTIME = (1 << 5), /* May be specified at runtime */
OPT_HUMAN_READABLE = (1 << 6),
OPT_MUST_BE_POW_2 = (1 << 7), /* Must be power of 2 */
OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */
OPT_SB_FIELD_ILOG2 = (1 << 9), /* Superblock field is ilog2 of actual value */
OPT_HIDDEN = (1 << 10),
OPT_FS = BIT(0), /* Filesystem option */
OPT_DEVICE = BIT(1), /* Device option */
OPT_INODE = BIT(2), /* Inode option */
OPT_FORMAT = BIT(3), /* May be specified at format time */
OPT_MOUNT = BIT(4), /* May be specified at mount time */
OPT_RUNTIME = BIT(5), /* May be specified at runtime */
OPT_HUMAN_READABLE = BIT(6),
OPT_MUST_BE_POW_2 = BIT(7), /* Must be power of 2 */
OPT_SB_FIELD_SECTORS = BIT(8), /* Superblock field is >> 9 of actual value */
OPT_SB_FIELD_ILOG2 = BIT(9), /* Superblock field is ilog2 of actual value */
OPT_SB_FIELD_ONE_BIAS = BIT(10), /* 0 means default value */
OPT_HIDDEN = BIT(11),
};
enum opt_type {
BCH_OPT_BOOL,
BCH_OPT_UINT,
BCH_OPT_STR,
BCH_OPT_BITFIELD,
BCH_OPT_FN,
};
@ -263,6 +265,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, true, \
NULL, "Enable inline data extents") \
x(promote_whole_extents, u8, \
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
BCH_SB_PROMOTE_WHOLE_EXTENTS, true, \
NULL, "Promote whole extents, instead of just part being read")\
x(acl, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT, \
OPT_BOOL(), \
@ -472,11 +479,16 @@ enum fsck_err_opts {
BCH2_NO_SB_OPT, 0, \
"size", "Size of filesystem on device") \
x(durability, u8, \
OPT_DEVICE, \
OPT_DEVICE|OPT_SB_FIELD_ONE_BIAS, \
OPT_UINT(0, BCH_REPLICAS_MAX), \
BCH2_NO_SB_OPT, 1, \
"n", "Data written to this device will be considered\n"\
"to have already been replicated n times") \
x(data_allowed, u8, \
OPT_DEVICE, \
OPT_BITFIELD(__bch2_data_types), \
BCH2_NO_SB_OPT, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\
"types", "Allowed data types for this device: journal, btree, and/or user")\
x(btree_node_prefetch, u8, \
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
@ -484,6 +496,11 @@ enum fsck_err_opts {
NULL, "BTREE_ITER_prefetch casuse btree nodes to be\n"\
" prefetched sequentially")
#define BCH_DEV_OPT_SETTERS() \
x(discard, BCH_MEMBER_DISCARD) \
x(durability, BCH_MEMBER_DURABILITY) \
x(data_allowed, BCH_MEMBER_DATA_ALLOWED)
struct bch_opts {
#define x(_name, _bits, ...) unsigned _name##_defined:1;
BCH_OPTS()
@ -563,8 +580,10 @@ void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id);
int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *);
void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64);
void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64);
void __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64);
struct bch_dev;
void bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64);
int bch2_opt_lookup(const char *);
int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *);
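
The OPT_SB_FIELD_ONE_BIAS flag added above biases the value stored in the superblock by one, so a stored zero can mean "unset, use the default". A sketch of the assumed encode/decode pair; only the encode side appears in this hunk, the decode side is an assumption:

/* Assumed semantics of OPT_SB_FIELD_ONE_BIAS; decode side is an assumption. */
static u64 opt_sb_encode_one_bias(u64 v)
{
	return v + 1;
}

static u64 opt_sb_decode_one_bias(u64 sb_v, u64 def)
{
	return sb_v ? sb_v - 1 : def;
}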

650
fs/bcachefs/rcu_pending.c Normal file
View File

@ -0,0 +1,650 @@
// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) "%s() " fmt "\n", __func__
#include <linux/generic-radix-tree.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/srcu.h>
#include <linux/vmalloc.h>
#include "rcu_pending.h"
#include "darray.h"
#include "util.h"
#define static_array_for_each(_a, _i) \
for (typeof(&(_a)[0]) _i = _a; \
_i < (_a) + ARRAY_SIZE(_a); \
_i++)
enum rcu_pending_special {
RCU_PENDING_KVFREE = 1,
RCU_PENDING_CALL_RCU = 2,
};
#define RCU_PENDING_KVFREE_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE)
#define RCU_PENDING_CALL_RCU_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU)
static inline unsigned long __get_state_synchronize_rcu(struct srcu_struct *ssp)
{
return ssp
? get_state_synchronize_srcu(ssp)
: get_state_synchronize_rcu();
}
static inline unsigned long __start_poll_synchronize_rcu(struct srcu_struct *ssp)
{
return ssp
? start_poll_synchronize_srcu(ssp)
: start_poll_synchronize_rcu();
}
static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, unsigned long cookie)
{
return ssp
? poll_state_synchronize_srcu(ssp, cookie)
: poll_state_synchronize_rcu(cookie);
}
static inline void __rcu_barrier(struct srcu_struct *ssp)
{
return ssp
? srcu_barrier(ssp)
: rcu_barrier();
}
static inline void __call_rcu(struct srcu_struct *ssp, struct rcu_head *rhp,
rcu_callback_t func)
{
if (ssp)
call_srcu(ssp, rhp, func);
else
call_rcu(rhp, func);
}
struct rcu_pending_seq {
/*
* We're using a radix tree like a vector - we're just pushing elements
* onto the end; we're using a radix tree instead of an actual vector to
* avoid reallocation overhead
*/
GENRADIX(struct rcu_head *) objs;
size_t nr;
struct rcu_head **cursor;
unsigned long seq;
};
struct rcu_pending_list {
struct rcu_head *head;
struct rcu_head *tail;
unsigned long seq;
};
struct rcu_pending_pcpu {
struct rcu_pending *parent;
spinlock_t lock;
int cpu;
/*
* We can't bound the number of unprocessed gp sequence numbers, and we
* can't efficiently merge radix trees for expired grace periods, so we
* need darray/vector:
*/
DARRAY_PREALLOCATED(struct rcu_pending_seq, 4) objs;
/* Third entry is for expired objects: */
struct rcu_pending_list lists[NUM_ACTIVE_RCU_POLL_OLDSTATE + 1];
struct rcu_head cb;
bool cb_armed;
struct work_struct work;
};
static bool __rcu_pending_has_pending(struct rcu_pending_pcpu *p)
{
if (p->objs.nr)
return true;
static_array_for_each(p->lists, i)
if (i->head)
return true;
return false;
}
static void rcu_pending_list_merge(struct rcu_pending_list *l1,
struct rcu_pending_list *l2)
{
#ifdef __KERNEL__
if (!l1->head)
l1->head = l2->head;
else
l1->tail->next = l2->head;
#else
if (!l1->head)
l1->head = l2->head;
else
l1->tail->next.next = (void *) l2->head;
#endif
l1->tail = l2->tail;
l2->head = l2->tail = NULL;
}
static void rcu_pending_list_add(struct rcu_pending_list *l,
struct rcu_head *n)
{
#ifdef __KERNEL__
if (!l->head)
l->head = n;
else
l->tail->next = n;
l->tail = n;
n->next = NULL;
#else
if (!l->head)
l->head = n;
else
l->tail->next.next = (void *) n;
l->tail = n;
n->next.next = NULL;
#endif
}
static void merge_expired_lists(struct rcu_pending_pcpu *p)
{
struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
for (struct rcu_pending_list *i = p->lists; i < expired; i++)
if (i->head && __poll_state_synchronize_rcu(p->parent->srcu, i->seq))
rcu_pending_list_merge(expired, i);
}
#ifndef __KERNEL__
static inline void kfree_bulk(size_t nr, void ** p)
{
while (nr--)
kfree(*p);
}
#define local_irq_save(flags) \
do { \
flags = 0; \
} while (0)
#endif
static noinline void __process_finished_items(struct rcu_pending *pending,
struct rcu_pending_pcpu *p,
unsigned long flags)
{
struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
struct rcu_pending_seq objs = {};
struct rcu_head *list = NULL;
if (p->objs.nr &&
__poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) {
objs = p->objs.data[0];
darray_remove_item(&p->objs, p->objs.data);
}
merge_expired_lists(p);
list = expired->head;
expired->head = expired->tail = NULL;
spin_unlock_irqrestore(&p->lock, flags);
switch ((ulong) pending->process) {
case RCU_PENDING_KVFREE:
for (size_t i = 0; i < objs.nr; ) {
size_t nr_this_node = min(GENRADIX_NODE_SIZE / sizeof(void *), objs.nr - i);
kfree_bulk(nr_this_node, (void **) genradix_ptr(&objs.objs, i));
i += nr_this_node;
}
genradix_free(&objs.objs);
while (list) {
struct rcu_head *obj = list;
#ifdef __KERNEL__
list = obj->next;
#else
list = (void *) obj->next.next;
#endif
/*
* low bit of pointer indicates whether rcu_head needs
* to be freed - kvfree_rcu_mightsleep()
*/
BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0);
void *ptr = (void *)(((unsigned long) obj->func) & ~1UL);
bool free_head = ((unsigned long) obj->func) & 1UL;
kvfree(ptr);
if (free_head)
kfree(obj);
}
break;
case RCU_PENDING_CALL_RCU:
for (size_t i = 0; i < objs.nr; i++) {
struct rcu_head *obj = *genradix_ptr(&objs.objs, i);
obj->func(obj);
}
genradix_free(&objs.objs);
while (list) {
struct rcu_head *obj = list;
#ifdef __KERNEL__
list = obj->next;
#else
list = (void *) obj->next.next;
#endif
obj->func(obj);
}
break;
default:
for (size_t i = 0; i < objs.nr; i++)
pending->process(pending, *genradix_ptr(&objs.objs, i));
genradix_free(&objs.objs);
while (list) {
struct rcu_head *obj = list;
#ifdef __KERNEL__
list = obj->next;
#else
list = (void *) obj->next.next;
#endif
pending->process(pending, obj);
}
break;
}
}
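
The kvfree_rcu_mightsleep() path above packs a flag into bit 0 of the pointer stored in rcu_head.func; this works because slab allocations are at least ARCH_SLAB_MINALIGN aligned, so bit 0 of a valid pointer is always clear. A minimal sketch of the tagging scheme; the helper names are made up:

/* Illustrative pointer tagging: bit 0 carries "free the rcu_head too". */
static inline void *tag_ptr(void *p, bool free_head)
{
	return (void *)((unsigned long) p | free_head);
}

static inline void *untag_ptr(void *p)
{
	return (void *)((unsigned long) p & ~1UL);
}

static inline bool ptr_is_tagged(void *p)
{
	return (unsigned long) p & 1UL;
}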
static bool process_finished_items(struct rcu_pending *pending,
struct rcu_pending_pcpu *p,
unsigned long flags)
{
/*
* XXX: we should grab the gp seq once and avoid multiple function
* calls, this is called from __rcu_pending_enqueue() fastpath in
* may_sleep==true mode
*/
if ((p->objs.nr && __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) ||
(p->lists[0].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[0].seq)) ||
(p->lists[1].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[1].seq)) ||
p->lists[2].head) {
__process_finished_items(pending, p, flags);
return true;
}
return false;
}
static void rcu_pending_work(struct work_struct *work)
{
struct rcu_pending_pcpu *p =
container_of(work, struct rcu_pending_pcpu, work);
struct rcu_pending *pending = p->parent;
unsigned long flags;
do {
spin_lock_irqsave(&p->lock, flags);
} while (process_finished_items(pending, p, flags));
spin_unlock_irqrestore(&p->lock, flags);
}
static void rcu_pending_rcu_cb(struct rcu_head *rcu)
{
struct rcu_pending_pcpu *p = container_of(rcu, struct rcu_pending_pcpu, cb);
schedule_work_on(p->cpu, &p->work);
unsigned long flags;
spin_lock_irqsave(&p->lock, flags);
if (__rcu_pending_has_pending(p)) {
spin_unlock_irqrestore(&p->lock, flags);
__call_rcu(p->parent->srcu, &p->cb, rcu_pending_rcu_cb);
} else {
p->cb_armed = false;
spin_unlock_irqrestore(&p->lock, flags);
}
}
static __always_inline struct rcu_pending_seq *
get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq)
{
darray_for_each_reverse(p->objs, objs)
if (objs->seq == seq)
return objs;
if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC))
return NULL;
return &darray_last(p->objs);
}
static noinline bool
rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq,
struct rcu_head *head, void *ptr,
unsigned long *flags)
{
if (ptr) {
if (!head) {
/*
* kvfree_rcu_mightsleep(): we weren't passed an
* rcu_head, but we need one: use the low bit of the
* pointer to free to flag that the head needs to be
* freed as well:
*/
ptr = (void *)(((unsigned long) ptr)|1UL);
head = kmalloc(sizeof(*head), __GFP_NOWARN);
if (!head) {
spin_unlock_irqrestore(&p->lock, *flags);
head = kmalloc(sizeof(*head), GFP_KERNEL|__GFP_NOFAIL);
/*
* dropped lock, did GFP_KERNEL allocation,
* check for gp expiration
*/
if (unlikely(__poll_state_synchronize_rcu(p->parent->srcu, seq))) {
kvfree(--ptr);
kfree(head);
spin_lock_irqsave(&p->lock, *flags);
return false;
}
}
}
head->func = ptr;
}
again:
for (struct rcu_pending_list *i = p->lists;
i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
if (i->seq == seq) {
rcu_pending_list_add(i, head);
return false;
}
}
for (struct rcu_pending_list *i = p->lists;
i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
if (!i->head) {
i->seq = seq;
rcu_pending_list_add(i, head);
return true;
}
}
merge_expired_lists(p);
goto again;
}
/*
* __rcu_pending_enqueue: enqueue a pending RCU item, to be processed (via
* pending->process) once a grace period elapses.
*
* Attempt to enqueue items onto a radix tree; if memory allocation fails, fall
* back to a linked list.
*
* - If @ptr is NULL, we're enqueuing an item for a generic @pending with a
* process callback
*
* - If @ptr and @head are both not NULL, we're kvfree_rcu()
*
* - If @ptr is not NULL and @head is, we're kvfree_rcu_mightsleep()
*
* - If @may_sleep is true, will do GFP_KERNEL memory allocations and process
* expired items.
*/
static __always_inline void
__rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head,
void *ptr, bool may_sleep)
{
struct rcu_pending_pcpu *p;
struct rcu_pending_seq *objs;
struct genradix_node *new_node = NULL;
unsigned long seq, flags;
bool start_gp = false;
BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN));
local_irq_save(flags);
p = this_cpu_ptr(pending->p);
spin_lock(&p->lock);
seq = __get_state_synchronize_rcu(pending->srcu);
restart:
if (may_sleep &&
unlikely(process_finished_items(pending, p, flags)))
goto check_expired;
/*
* In kvfree_rcu() mode, the radix tree is only for slab pointers so
* that we can do kfree_bulk() - vmalloc pointers always use the linked
* list:
*/
if (ptr && unlikely(is_vmalloc_addr(ptr)))
goto list_add;
objs = get_object_radix(p, seq);
if (unlikely(!objs))
goto list_add;
if (unlikely(!objs->cursor)) {
/*
* New radix tree nodes must be added under @p->lock because the
* tree root is in a darray that can be resized (typically,
* genradix supports concurrent unlocked allocation of new
* nodes) - hence preallocation and the retry loop:
*/
objs->cursor = genradix_ptr_alloc_preallocated_inlined(&objs->objs,
objs->nr, &new_node, GFP_ATOMIC|__GFP_NOWARN);
if (unlikely(!objs->cursor)) {
if (may_sleep) {
spin_unlock_irqrestore(&p->lock, flags);
gfp_t gfp = GFP_KERNEL;
if (!head)
gfp |= __GFP_NOFAIL;
new_node = genradix_alloc_node(gfp);
if (!new_node)
may_sleep = false;
goto check_expired;
}
list_add:
start_gp = rcu_pending_enqueue_list(p, seq, head, ptr, &flags);
goto start_gp;
}
}
*objs->cursor++ = ptr ?: head;
/* zero cursor if we hit the end of a radix tree node: */
if (!(((ulong) objs->cursor) & (GENRADIX_NODE_SIZE - 1)))
objs->cursor = NULL;
start_gp = !objs->nr;
objs->nr++;
start_gp:
if (unlikely(start_gp)) {
/*
* We only have one callback (ideally, we would have one for
* every outstanding grace period) - so if our callback is
* already in flight, we may still have to start a grace period
* (since we used get_state() above, not start_poll())
*/
if (!p->cb_armed) {
p->cb_armed = true;
__call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb);
} else {
__start_poll_synchronize_rcu(pending->srcu);
}
}
spin_unlock_irqrestore(&p->lock, flags);
free_node:
if (new_node)
genradix_free_node(new_node);
return;
check_expired:
if (unlikely(__poll_state_synchronize_rcu(pending->srcu, seq))) {
switch ((ulong) pending->process) {
case RCU_PENDING_KVFREE:
kvfree(ptr);
break;
case RCU_PENDING_CALL_RCU:
head->func(head);
break;
default:
pending->process(pending, head);
break;
}
goto free_node;
}
local_irq_save(flags);
p = this_cpu_ptr(pending->p);
spin_lock(&p->lock);
goto restart;
}
void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj)
{
__rcu_pending_enqueue(pending, obj, NULL, true);
}
static struct rcu_head *rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu *p)
{
struct rcu_head *ret = NULL;
spin_lock_irq(&p->lock);
darray_for_each(p->objs, objs)
if (objs->nr) {
ret = *genradix_ptr(&objs->objs, --objs->nr);
objs->cursor = NULL;
if (!objs->nr)
genradix_free(&objs->objs);
goto out;
}
static_array_for_each(p->lists, i)
if (i->head) {
ret = i->head;
#ifdef __KERNEL__
i->head = ret->next;
#else
i->head = (void *) ret->next.next;
#endif
if (!i->head)
i->tail = NULL;
goto out;
}
out:
spin_unlock_irq(&p->lock);
return ret;
}
struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending)
{
return rcu_pending_pcpu_dequeue(raw_cpu_ptr(pending->p));
}
struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending)
{
struct rcu_head *ret = rcu_pending_dequeue(pending);
if (ret)
return ret;
int cpu;
for_each_possible_cpu(cpu) {
ret = rcu_pending_pcpu_dequeue(per_cpu_ptr(pending->p, cpu));
if (ret)
break;
}
return ret;
}
static bool rcu_pending_has_pending_or_armed(struct rcu_pending *pending)
{
int cpu;
for_each_possible_cpu(cpu) {
struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
spin_lock_irq(&p->lock);
if (__rcu_pending_has_pending(p) || p->cb_armed) {
spin_unlock_irq(&p->lock);
return true;
}
spin_unlock_irq(&p->lock);
}
return false;
}
void rcu_pending_exit(struct rcu_pending *pending)
{
int cpu;
if (!pending->p)
return;
while (rcu_pending_has_pending_or_armed(pending)) {
__rcu_barrier(pending->srcu);
for_each_possible_cpu(cpu) {
struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
flush_work(&p->work);
}
}
for_each_possible_cpu(cpu) {
struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
flush_work(&p->work);
}
for_each_possible_cpu(cpu) {
struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
static_array_for_each(p->lists, i)
WARN_ON(i->head);
WARN_ON(p->objs.nr);
darray_exit(&p->objs);
}
free_percpu(pending->p);
}
/**
* rcu_pending_init: - initialize a rcu_pending
*
* @pending: Object to init
* @srcu: May optionally be used with an srcu_struct; if NULL, uses normal
* RCU flavor
* @process: Callback function invoked on objects once their RCU barriers
* have completed; if NULL, kvfree() is used.
*/
int rcu_pending_init(struct rcu_pending *pending,
struct srcu_struct *srcu,
rcu_pending_process_fn process)
{
pending->p = alloc_percpu(struct rcu_pending_pcpu);
if (!pending->p)
return -ENOMEM;
int cpu;
for_each_possible_cpu(cpu) {
struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
p->parent = pending;
p->cpu = cpu;
spin_lock_init(&p->lock);
darray_init(&p->objs);
INIT_WORK(&p->work, rcu_pending_work);
}
pending->srcu = srcu;
pending->process = process;
return 0;
}

27
fs/bcachefs/rcu_pending.h Normal file
View File

@ -0,0 +1,27 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RCU_PENDING_H
#define _LINUX_RCU_PENDING_H
#include <linux/rcupdate.h>
struct rcu_pending;
typedef void (*rcu_pending_process_fn)(struct rcu_pending *, struct rcu_head *);
struct rcu_pending_pcpu;
struct rcu_pending {
struct rcu_pending_pcpu __percpu *p;
struct srcu_struct *srcu;
rcu_pending_process_fn process;
};
void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj);
struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending);
struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending);
void rcu_pending_exit(struct rcu_pending *pending);
int rcu_pending_init(struct rcu_pending *pending,
struct srcu_struct *srcu,
rcu_pending_process_fn process);
#endif /* _LINUX_RCU_PENDING_H */
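
A minimal sketch of how a caller might use this interface, assuming plain RCU (NULL srcu_struct) and a trivial process callback; this is illustrative, not code from the tree:

#include <linux/slab.h>
#include "rcu_pending.h"

struct demo_obj {
	struct rcu_head	rcu;
	int		payload;
};

static struct rcu_pending demo_pending;

/* called once a grace period has elapsed for each enqueued object */
static void demo_process(struct rcu_pending *pending, struct rcu_head *rcu)
{
	struct demo_obj *obj = container_of(rcu, struct demo_obj, rcu);

	kfree(obj);
}

static int demo_init(void)
{
	/* NULL srcu_struct: use the normal RCU flavor */
	return rcu_pending_init(&demo_pending, NULL, demo_process);
}

static void demo_retire(struct demo_obj *obj)
{
	rcu_pending_enqueue(&demo_pending, &obj->rcu);
}

static void demo_exit(void)
{
	rcu_pending_exit(&demo_pending);
}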

View File

@ -122,7 +122,7 @@ static void extent_to_replicas(struct bkey_s_c k,
continue;
if (!p.has_ec)
r->devs[r->nr_devs++] = p.ptr.dev;
replicas_entry_add_dev(r, p.ptr.dev);
else
r->nr_required = 0;
}
@ -139,7 +139,7 @@ static void stripe_to_replicas(struct bkey_s_c k,
for (ptr = s.v->ptrs;
ptr < s.v->ptrs + s.v->nr_blocks;
ptr++)
r->devs[r->nr_devs++] = ptr->dev;
replicas_entry_add_dev(r, ptr->dev);
}
void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e,
@ -180,7 +180,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
e->nr_required = 1;
darray_for_each(devs, i)
e->devs[e->nr_devs++] = *i;
replicas_entry_add_dev(e, *i);
bch2_replicas_entry_sort(e);
}

View File

@ -5,7 +5,7 @@
struct bch_replicas_entry_v0 {
__u8 data_type;
__u8 nr_devs;
__u8 devs[];
__u8 devs[] __counted_by(nr_devs);
} __packed;
struct bch_sb_field_replicas_v0 {
@ -17,7 +17,7 @@ struct bch_replicas_entry_v1 {
__u8 data_type;
__u8 nr_devs;
__u8 nr_required;
__u8 devs[];
__u8 devs[] __counted_by(nr_devs);
} __packed;
struct bch_sb_field_replicas {
@ -28,4 +28,9 @@ struct bch_sb_field_replicas {
#define replicas_entry_bytes(_i) \
(offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
#define replicas_entry_add_dev(e, d) ({ \
(e)->nr_devs++; \
(e)->devs[(e)->nr_devs - 1] = (d); \
})
#endif /* _BCACHEFS_REPLICAS_FORMAT_H */
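A plausible reason for the increment-before-store ordering in replicas_entry_add_dev(): with devs[] now annotated __counted_by(nr_devs), accesses to the flexible array can be bounds-checked against the current value of nr_devs, so the counter has to cover the new slot before that slot is written. A minimal illustrative sketch with a hypothetical struct (not the bcachefs one):

struct flex {
        u8 nr;
        u8 vals[] __counted_by(nr);
};

static void flex_add(struct flex *f, u8 v)
{
        /*
         * With __counted_by, "f->vals[f->nr++] = v" would index one past
         * the currently declared length; grow the counter first, then
         * store into the now-valid slot:
         */
        f->nr++;
        f->vals[f->nr - 1] = v;
}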

View File

@ -23,7 +23,7 @@ enum bch_fsck_flags {
x(jset_past_bucket_end, 9, 0) \
x(jset_seq_blacklisted, 10, 0) \
x(journal_entries_missing, 11, 0) \
x(journal_entry_replicas_not_marked, 12, 0) \
x(journal_entry_replicas_not_marked, 12, FSCK_AUTOFIX) \
x(journal_entry_past_jset_end, 13, 0) \
x(journal_entry_replicas_data_mismatch, 14, 0) \
x(journal_entry_bkey_u64s_0, 15, 0) \

View File

@ -464,3 +464,12 @@ void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k)
__bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c));
}
}
unsigned bch2_sb_nr_devices(const struct bch_sb *sb)
{
unsigned nr = 0;
for (unsigned i = 0; i < sb->nr_devices; i++)
nr += bch2_member_exists((struct bch_sb *) sb, i);
return nr;
}

View File

@ -307,6 +307,8 @@ static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev)
return false;
}
unsigned bch2_sb_nr_devices(const struct bch_sb *);
static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
{
return (struct bch_member_cpu) {

View File

@ -31,6 +31,51 @@ int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
int bch2_subvol_is_ro_trans(struct btree_trans *, u32);
int bch2_subvol_is_ro(struct bch_fs *, u32);
static inline struct bkey_s_c
bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos end,
u32 subvolid, unsigned flags)
{
u32 snapshot;
int ret = bch2_subvolume_get_snapshot(iter->trans, subvolid, &snapshot);
if (ret)
return bkey_s_c_err(ret);
bch2_btree_iter_set_snapshot(iter, snapshot);
return bch2_btree_iter_peek_upto_type(iter, end, flags);
}
#define for_each_btree_key_in_subvolume_upto_continue(_trans, _iter, \
_end, _subvolid, _flags, _k, _do) \
({ \
struct bkey_s_c _k; \
int _ret3 = 0; \
\
do { \
_ret3 = lockrestart_do(_trans, ({ \
(_k) = bch2_btree_iter_peek_in_subvolume_upto_type(&(_iter), \
_end, _subvolid, (_flags)); \
if (!(_k).k) \
break; \
\
bkey_err(_k) ?: (_do); \
})); \
} while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \
\
bch2_trans_iter_exit((_trans), &(_iter)); \
_ret3; \
})
#define for_each_btree_key_in_subvolume_upto(_trans, _iter, _btree_id, \
_start, _end, _subvolid, _flags, _k, _do) \
({ \
struct btree_iter _iter; \
bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
(_start), (_flags)); \
\
for_each_btree_key_in_subvolume_upto_continue(_trans, _iter, \
_end, _subvolid, _flags, _k, _do); \
})
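A hedged usage sketch of the new iterator (hypothetical inode number and subvolume id; the xattr listing rework later in this commit is the real in-tree user). On each lock restart it re-resolves the subvolume to a snapshot, then walks keys up to the end position, evaluating the body for each key and returning the first error:

int ret = bch2_trans_run(c,
        for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_xattrs,
                        POS(inum, 0), POS(inum, U64_MAX),
                        subvolid, 0, k, ({
                /* body: runs per key; a nonzero value stops the walk */
                pr_info("xattr key at %llu:%llu\n", k.k->p.inode, k.k->p.offset);
                0;
        })));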
int bch2_delete_dead_snapshots(struct bch_fs *);
void bch2_delete_dead_snapshots_async(struct bch_fs *);

View File

@ -30,7 +30,8 @@ struct snapshot_table {
};
typedef struct {
u32 subvol;
/* we can't have padding in this struct: */
u64 subvol;
u64 inum;
} subvol_inum;
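The "no padding" comment matters when subvol_inum is hashed or compared as raw bytes (e.g. as a hash-table key): with two naturally aligned u64 members the compiler inserts no gaps, so hashing sizeof(subvol_inum) bytes only ever reads defined data. A compile-time check one could add to enforce this (an editorial sketch, not part of this commit):

static_assert(sizeof(subvol_inum) == 2 * sizeof(u64),
              "subvol_inum must not contain padding");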

View File

@ -418,6 +418,9 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2 &&
!BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb))
SET_BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb, 30);
if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2)
SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true);
}
for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) {
@ -1292,15 +1295,9 @@ void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l)
void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
bool print_layout, unsigned fields)
{
u64 fields_have = 0;
unsigned nr_devices = 0;
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 44);
for (int i = 0; i < sb->nr_devices; i++)
nr_devices += bch2_member_exists(sb, i);
prt_printf(out, "External UUID:\t");
pr_uuid(out, sb->user_uuid.b);
prt_newline(out);
@ -1356,9 +1353,10 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
prt_newline(out);
prt_printf(out, "Clean:\t%llu\n", BCH_SB_CLEAN(sb));
prt_printf(out, "Devices:\t%u\n", nr_devices);
prt_printf(out, "Devices:\t%u\n", bch2_sb_nr_devices(sb));
prt_printf(out, "Sections:\t");
u64 fields_have = 0;
vstruct_for_each(sb, f)
fields_have |= 1 << le32_to_cpu(f->type);
prt_bitflags(out, bch2_sb_fields, fields_have);

View File

@ -543,6 +543,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_fs_fs_io_direct_exit(c);
bch2_fs_fs_io_buffered_exit(c);
bch2_fs_fsio_exit(c);
bch2_fs_vfs_exit(c);
bch2_fs_ec_exit(c);
bch2_fs_encryption_exit(c);
bch2_fs_nocow_locking_exit(c);
@ -810,7 +811,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->copy_gc_enabled = 1;
c->rebalance.enabled = 1;
c->promote_whole_extents = true;
c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write];
c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write];
@ -926,6 +926,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_encryption_init(c) ?:
bch2_fs_compress_init(c) ?:
bch2_fs_ec_init(c) ?:
bch2_fs_vfs_init(c) ?:
bch2_fs_fsio_init(c) ?:
bch2_fs_fs_io_buffered_init(c) ?:
bch2_fs_fs_io_direct_init(c);

View File

@ -219,7 +219,6 @@ read_attribute(copy_gc_wait);
rw_attribute(rebalance_enabled);
sysfs_pd_controller_attribute(rebalance);
read_attribute(rebalance_status);
rw_attribute(promote_whole_extents);
read_attribute(new_stripes);
@ -234,7 +233,7 @@ write_attribute(perf_test);
#define x(_name) \
static struct attribute sysfs_time_stat_##_name = \
{ .name = #_name, .mode = 0444 };
{ .name = #_name, .mode = 0644 };
BCH_TIME_STATS()
#undef x
@ -347,8 +346,6 @@ SHOW(bch2_fs)
if (attr == &sysfs_rebalance_status)
bch2_rebalance_status_to_text(out, c);
sysfs_print(promote_whole_extents, c->promote_whole_extents);
/* Debugging: */
if (attr == &sysfs_journal_debug)
@ -436,8 +433,6 @@ STORE(bch2_fs)
sysfs_pd_controller_store(rebalance, &c->rebalance.pd);
sysfs_strtoul(promote_whole_extents, c->promote_whole_extents);
/* Debugging: */
if (!test_bit(BCH_FS_started, &c->flags))
@ -514,7 +509,7 @@ struct attribute *bch2_fs_files[] = {
&sysfs_btree_cache_size,
&sysfs_btree_write_stats,
&sysfs_promote_whole_extents,
&sysfs_rebalance_status,
&sysfs_compression_stats,
@ -614,7 +609,6 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_copy_gc_wait,
&sysfs_rebalance_enabled,
&sysfs_rebalance_status,
sysfs_pd_controller_files(rebalance),
&sysfs_moving_ctxts,
@ -674,7 +668,7 @@ STORE(bch2_fs_opts_dir)
if (ret < 0)
goto err;
bch2_opt_set_sb(c, opt, v);
bch2_opt_set_sb(c, NULL, opt, v);
bch2_opt_set_by_id(&c->opts, id, v);
if (v &&
@ -728,6 +722,13 @@ SHOW(bch2_fs_time_stats)
STORE(bch2_fs_time_stats)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
#define x(name) \
if (attr == &sysfs_time_stat_##name) \
bch2_time_stats_reset(&c->times[BCH_TIME_##name]);
BCH_TIME_STATS()
#undef x
return size;
}
SYSFS_OPS(bch2_fs_time_stats);
@ -821,32 +822,17 @@ STORE(bch2_dev)
{
struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
struct bch_fs *c = ca->fs;
struct bch_member *mi;
if (attr == &sysfs_discard) {
bool v = strtoul_or_return(buf);
mutex_lock(&c->sb_lock);
mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
if (v != BCH_MEMBER_DISCARD(mi)) {
SET_BCH_MEMBER_DISCARD(mi, v);
bch2_write_super(c);
}
mutex_unlock(&c->sb_lock);
bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_discard, v);
}
if (attr == &sysfs_durability) {
u64 v = strtoul_or_return(buf);
mutex_lock(&c->sb_lock);
mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
if (v + 1 != BCH_MEMBER_DURABILITY(mi)) {
SET_BCH_MEMBER_DURABILITY(mi, v + 1);
bch2_write_super(c);
}
mutex_unlock(&c->sb_lock);
bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_durability, v);
}
if (attr == &sysfs_label) {

View File

@ -387,7 +387,7 @@ again:
seen = buf->buf.nr;
char *n = memchr(buf->buf.data, '\n', seen);
if (!n && timeout != MAX_SCHEDULE_TIMEOUT && jiffies >= until) {
if (!n && timeout != MAX_SCHEDULE_TIMEOUT && time_after_eq(jiffies, until)) {
spin_unlock(&buf->lock);
return -ETIME;
}

View File

@ -151,6 +151,20 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
}
}
void bch2_time_stats_reset(struct bch2_time_stats *stats)
{
spin_lock_irq(&stats->lock);
unsigned offset = offsetof(struct bch2_time_stats, min_duration);
memset((void *) stats + offset, 0, sizeof(*stats) - offset);
if (stats->buffer) {
int cpu;
for_each_possible_cpu(cpu)
per_cpu_ptr(stats->buffer, cpu)->nr = 0;
}
spin_unlock_irq(&stats->lock);
}
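The reset leans on struct layout: everything from min_duration onward is accumulated statistics, while the lock and (after the header change just below) the percpu buffer pointer sit above it, so a single memset() starting at offsetof(..., min_duration) clears the stats without touching either. A minimal sketch of the same pattern with a hypothetical struct:

struct stats {
        spinlock_t      lock;
        void __percpu   *buffer;        /* must stay above the reset boundary */
        /* everything from here down is wiped by a reset: */
        u64             min;
        u64             max;
        u64             count;
};

static void stats_reset(struct stats *s)
{
        unsigned offset = offsetof(struct stats, min);

        spin_lock_irq(&s->lock);
        memset((void *) s + offset, 0, sizeof(*s) - offset);
        spin_unlock_irq(&s->lock);
}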
void bch2_time_stats_exit(struct bch2_time_stats *stats)
{
free_percpu(stats->buffer);

View File

@ -70,6 +70,7 @@ struct time_stat_buffer {
struct bch2_time_stats {
spinlock_t lock;
bool have_quantiles;
struct time_stat_buffer __percpu *buffer;
/* all fields are in nanoseconds */
u64 min_duration;
u64 max_duration;
@ -87,7 +88,6 @@ struct bch2_time_stats {
struct mean_and_variance_weighted duration_stats_weighted;
struct mean_and_variance_weighted freq_stats_weighted;
struct time_stat_buffer __percpu *buffer;
};
struct bch2_time_stats_quantiles {
@ -142,6 +142,7 @@ static inline bool track_event_change(struct bch2_time_stats *stats, bool v)
return false;
}
void bch2_time_stats_reset(struct bch2_time_stats *);
void bch2_time_stats_exit(struct bch2_time_stats *);
void bch2_time_stats_init(struct bch2_time_stats *);

View File

@ -3,7 +3,6 @@
#define TRACE_SYSTEM bcachefs
#if !defined(_TRACE_BCACHEFS_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_BCACHEFS_H
#include <linux/tracepoint.h>
@ -558,6 +557,7 @@ TRACE_EVENT(btree_path_relock_fail,
__field(unsigned long, caller_ip )
__field(u8, btree_id )
__field(u8, level )
__field(u8, path_idx)
TRACE_BPOS_entries(pos)
__array(char, node, 24 )
__field(u8, self_read_count )
@ -575,7 +575,8 @@ TRACE_EVENT(btree_path_relock_fail,
strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
__entry->btree_id = path->btree_id;
__entry->level = path->level;
__entry->level = level;
__entry->path_idx = path - trans->paths;
TRACE_BPOS_assign(pos, path->pos);
c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
@ -588,7 +589,7 @@ TRACE_EVENT(btree_path_relock_fail,
c = six_lock_counts(&path->l[level].b->c.lock);
__entry->read_count = c.n[SIX_LOCK_read];
__entry->intent_count = c.n[SIX_LOCK_intent];
scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
scnprintf(__entry->node, sizeof(__entry->node), "%px", &b->c);
}
__entry->iter_lock_seq = path->l[level].lock_seq;
__entry->node_lock_seq = is_btree_node(path, level)
@ -596,9 +597,10 @@ TRACE_EVENT(btree_path_relock_fail,
: 0;
),
TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u",
TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u",
__entry->trans_fn,
(void *) __entry->caller_ip,
__entry->path_idx,
bch2_btree_id_str(__entry->btree_id),
__entry->pos_inode,
__entry->pos_offset,
@ -625,6 +627,7 @@ TRACE_EVENT(btree_path_upgrade_fail,
__field(unsigned long, caller_ip )
__field(u8, btree_id )
__field(u8, level )
__field(u8, path_idx)
TRACE_BPOS_entries(pos)
__field(u8, locked )
__field(u8, self_read_count )
@ -642,6 +645,7 @@ TRACE_EVENT(btree_path_upgrade_fail,
__entry->caller_ip = caller_ip;
__entry->btree_id = path->btree_id;
__entry->level = level;
__entry->path_idx = path - trans->paths;
TRACE_BPOS_assign(pos, path->pos);
__entry->locked = btree_node_locked(path, level);
@ -657,9 +661,10 @@ TRACE_EVENT(btree_path_upgrade_fail,
: 0;
),
TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
__entry->trans_fn,
(void *) __entry->caller_ip,
__entry->path_idx,
bch2_btree_id_str(__entry->btree_id),
__entry->pos_inode,
__entry->pos_offset,
@ -1438,6 +1443,456 @@ TRACE_EVENT(error_downcast,
TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip)
);
#ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS
TRACE_EVENT(update_by_path,
TP_PROTO(struct btree_trans *trans, struct btree_path *path,
struct btree_insert_entry *i, bool overwrite),
TP_ARGS(trans, path, i, overwrite),
TP_STRUCT__entry(
__array(char, trans_fn, 32 )
__field(btree_path_idx_t, path_idx )
__field(u8, btree_id )
TRACE_BPOS_entries(pos)
__field(u8, overwrite )
__field(btree_path_idx_t, update_idx )
__field(btree_path_idx_t, nr_updates )
),
TP_fast_assign(
strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->path_idx = path - trans->paths;
__entry->btree_id = path->btree_id;
TRACE_BPOS_assign(pos, path->pos);
__entry->overwrite = overwrite;
__entry->update_idx = i - trans->updates;
__entry->nr_updates = trans->nr_updates;
),
TP_printk("%s path %3u btree %s pos %llu:%llu:%u overwrite %u update %u/%u",
__entry->trans_fn,
__entry->path_idx,
bch2_btree_id_str(__entry->btree_id),
__entry->pos_inode,
__entry->pos_offset,
__entry->pos_snapshot,
__entry->overwrite,
__entry->update_idx,
__entry->nr_updates)
);
TRACE_EVENT(btree_path_lock,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
struct btree_bkey_cached_common *b),
TP_ARGS(trans, caller_ip, b),
TP_STRUCT__entry(
__array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
__field(u8, btree_id )
__field(u8, level )
__array(char, node, 24 )
__field(u32, lock_seq )
),
TP_fast_assign(
strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
__entry->btree_id = b->btree_id;
__entry->level = b->level;
scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
__entry->lock_seq = six_lock_seq(&b->lock);
),
TP_printk("%s %pS\nbtree %s level %u node %s lock seq %u",
__entry->trans_fn,
(void *) __entry->caller_ip,
bch2_btree_id_str(__entry->btree_id),
__entry->level,
__entry->node,
__entry->lock_seq)
);
DECLARE_EVENT_CLASS(btree_path_ev,
TP_PROTO(struct btree_trans *trans, struct btree_path *path),
TP_ARGS(trans, path),
TP_STRUCT__entry(
__field(u16, idx )
__field(u8, ref )
__field(u8, btree_id )
TRACE_BPOS_entries(pos)
),
TP_fast_assign(
__entry->idx = path - trans->paths;
__entry->ref = path->ref;
__entry->btree_id = path->btree_id;
TRACE_BPOS_assign(pos, path->pos);
),
TP_printk("path %3u ref %u btree %s pos %llu:%llu:%u",
__entry->idx, __entry->ref,
bch2_btree_id_str(__entry->btree_id),
__entry->pos_inode,
__entry->pos_offset,
__entry->pos_snapshot)
);
DEFINE_EVENT(btree_path_ev, btree_path_get_ll,
TP_PROTO(struct btree_trans *trans, struct btree_path *path),
TP_ARGS(trans, path)
);
DEFINE_EVENT(btree_path_ev, btree_path_put_ll,
TP_PROTO(struct btree_trans *trans, struct btree_path *path),
TP_ARGS(trans, path)
);
DEFINE_EVENT(btree_path_ev, btree_path_should_be_locked,
TP_PROTO(struct btree_trans *trans, struct btree_path *path),
TP_ARGS(trans, path)
);
TRACE_EVENT(btree_path_alloc,
TP_PROTO(struct btree_trans *trans, struct btree_path *path),
TP_ARGS(trans, path),
TP_STRUCT__entry(
__field(btree_path_idx_t, idx )
__field(u8, locks_want )
__field(u8, btree_id )
TRACE_BPOS_entries(pos)
),
TP_fast_assign(
__entry->idx = path - trans->paths;
__entry->locks_want = path->locks_want;
__entry->btree_id = path->btree_id;
TRACE_BPOS_assign(pos, path->pos);
),
TP_printk("path %3u btree %s locks_want %u pos %llu:%llu:%u",
__entry->idx,
bch2_btree_id_str(__entry->btree_id),
__entry->locks_want,
__entry->pos_inode,
__entry->pos_offset,
__entry->pos_snapshot)
);
TRACE_EVENT(btree_path_get,
TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos),
TP_ARGS(trans, path, new_pos),
TP_STRUCT__entry(
__field(btree_path_idx_t, idx )
__field(u8, ref )
__field(u8, preserve )
__field(u8, locks_want )
__field(u8, btree_id )
TRACE_BPOS_entries(old_pos)
TRACE_BPOS_entries(new_pos)
),
TP_fast_assign(
__entry->idx = path - trans->paths;
__entry->ref = path->ref;
__entry->preserve = path->preserve;
__entry->locks_want = path->locks_want;
__entry->btree_id = path->btree_id;
TRACE_BPOS_assign(old_pos, path->pos);
TRACE_BPOS_assign(new_pos, *new_pos);
),
TP_printk(" path %3u ref %u preserve %u btree %s locks_want %u pos %llu:%llu:%u -> %llu:%llu:%u",
__entry->idx,
__entry->ref,
__entry->preserve,
bch2_btree_id_str(__entry->btree_id),
__entry->locks_want,
__entry->old_pos_inode,
__entry->old_pos_offset,
__entry->old_pos_snapshot,
__entry->new_pos_inode,
__entry->new_pos_offset,
__entry->new_pos_snapshot)
);
DECLARE_EVENT_CLASS(btree_path_clone,
TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
TP_ARGS(trans, path, new),
TP_STRUCT__entry(
__field(btree_path_idx_t, idx )
__field(u8, new_idx )
__field(u8, btree_id )
__field(u8, ref )
__field(u8, preserve )
TRACE_BPOS_entries(pos)
),
TP_fast_assign(
__entry->idx = path - trans->paths;
__entry->new_idx = new - trans->paths;
__entry->btree_id = path->btree_id;
__entry->ref = path->ref;
__entry->preserve = path->preserve;
TRACE_BPOS_assign(pos, path->pos);
),
TP_printk(" path %3u ref %u preserve %u btree %s %llu:%llu:%u -> %u",
__entry->idx,
__entry->ref,
__entry->preserve,
bch2_btree_id_str(__entry->btree_id),
__entry->pos_inode,
__entry->pos_offset,
__entry->pos_snapshot,
__entry->new_idx)
);
DEFINE_EVENT(btree_path_clone, btree_path_clone,
TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
TP_ARGS(trans, path, new)
);
DEFINE_EVENT(btree_path_clone, btree_path_save_pos,
TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
TP_ARGS(trans, path, new)
);
DECLARE_EVENT_CLASS(btree_path_traverse,
TP_PROTO(struct btree_trans *trans,
struct btree_path *path),
TP_ARGS(trans, path),
TP_STRUCT__entry(
__array(char, trans_fn, 32 )
__field(btree_path_idx_t, idx )
__field(u8, ref )
__field(u8, preserve )
__field(u8, should_be_locked )
__field(u8, btree_id )
__field(u8, level )
TRACE_BPOS_entries(pos)
__field(u8, locks_want )
__field(u8, nodes_locked )
__array(char, node0, 24 )
__array(char, node1, 24 )
__array(char, node2, 24 )
__array(char, node3, 24 )
),
TP_fast_assign(
strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->idx = path - trans->paths;
__entry->ref = path->ref;
__entry->preserve = path->preserve;
__entry->btree_id = path->btree_id;
__entry->level = path->level;
TRACE_BPOS_assign(pos, path->pos);
__entry->locks_want = path->locks_want;
__entry->nodes_locked = path->nodes_locked;
struct btree *b = path->l[0].b;
if (IS_ERR(b))
strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
else
scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c);
b = path->l[1].b;
if (IS_ERR(b))
strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
else
scnprintf(__entry->node1, sizeof(__entry->node0), "%px", &b->c);
b = path->l[2].b;
if (IS_ERR(b))
strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
else
scnprintf(__entry->node2, sizeof(__entry->node0), "%px", &b->c);
b = path->l[3].b;
if (IS_ERR(b))
strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
else
scnprintf(__entry->node3, sizeof(__entry->node0), "%px", &b->c);
),
TP_printk("%s\npath %3u ref %u preserve %u btree %s %llu:%llu:%u level %u locks_want %u\n"
"locks %u %u %u %u node %s %s %s %s",
__entry->trans_fn,
__entry->idx,
__entry->ref,
__entry->preserve,
bch2_btree_id_str(__entry->btree_id),
__entry->pos_inode,
__entry->pos_offset,
__entry->pos_snapshot,
__entry->level,
__entry->locks_want,
(__entry->nodes_locked >> 6) & 3,
(__entry->nodes_locked >> 4) & 3,
(__entry->nodes_locked >> 2) & 3,
(__entry->nodes_locked >> 0) & 3,
__entry->node3,
__entry->node2,
__entry->node1,
__entry->node0)
);
DEFINE_EVENT(btree_path_traverse, btree_path_traverse_start,
TP_PROTO(struct btree_trans *trans,
struct btree_path *path),
TP_ARGS(trans, path)
);
DEFINE_EVENT(btree_path_traverse, btree_path_traverse_end,
TP_PROTO(struct btree_trans *trans, struct btree_path *path),
TP_ARGS(trans, path)
);
TRACE_EVENT(btree_path_set_pos,
TP_PROTO(struct btree_trans *trans,
struct btree_path *path,
struct bpos *new_pos),
TP_ARGS(trans, path, new_pos),
TP_STRUCT__entry(
__field(btree_path_idx_t, idx )
__field(u8, ref )
__field(u8, preserve )
__field(u8, btree_id )
TRACE_BPOS_entries(old_pos)
TRACE_BPOS_entries(new_pos)
__field(u8, locks_want )
__field(u8, nodes_locked )
__array(char, node0, 24 )
__array(char, node1, 24 )
__array(char, node2, 24 )
__array(char, node3, 24 )
),
TP_fast_assign(
__entry->idx = path - trans->paths;
__entry->ref = path->ref;
__entry->preserve = path->preserve;
__entry->btree_id = path->btree_id;
TRACE_BPOS_assign(old_pos, path->pos);
TRACE_BPOS_assign(new_pos, *new_pos);
__entry->nodes_locked = path->nodes_locked;
struct btree *b = path->l[0].b;
if (IS_ERR(b))
strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
else
scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c);
b = path->l[1].b;
if (IS_ERR(b))
strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
else
scnprintf(__entry->node1, sizeof(__entry->node0), "%px", &b->c);
b = path->l[2].b;
if (IS_ERR(b))
strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
else
scnprintf(__entry->node2, sizeof(__entry->node0), "%px", &b->c);
b = path->l[3].b;
if (IS_ERR(b))
strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
else
scnprintf(__entry->node3, sizeof(__entry->node0), "%px", &b->c);
),
TP_printk("\npath %3u ref %u preserve %u btree %s %llu:%llu:%u -> %llu:%llu:%u\n"
"locks %u %u %u %u node %s %s %s %s",
__entry->idx,
__entry->ref,
__entry->preserve,
bch2_btree_id_str(__entry->btree_id),
__entry->old_pos_inode,
__entry->old_pos_offset,
__entry->old_pos_snapshot,
__entry->new_pos_inode,
__entry->new_pos_offset,
__entry->new_pos_snapshot,
(__entry->nodes_locked >> 6) & 3,
(__entry->nodes_locked >> 4) & 3,
(__entry->nodes_locked >> 2) & 3,
(__entry->nodes_locked >> 0) & 3,
__entry->node3,
__entry->node2,
__entry->node1,
__entry->node0)
);
TRACE_EVENT(btree_path_free,
TP_PROTO(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup),
TP_ARGS(trans, path, dup),
TP_STRUCT__entry(
__field(btree_path_idx_t, idx )
__field(u8, preserve )
__field(u8, should_be_locked)
__field(s8, dup )
__field(u8, dup_locked )
),
TP_fast_assign(
__entry->idx = path;
__entry->preserve = trans->paths[path].preserve;
__entry->should_be_locked = trans->paths[path].should_be_locked;
__entry->dup = dup ? dup - trans->paths : -1;
__entry->dup_locked = dup ? btree_node_locked(dup, dup->level) : 0;
),
TP_printk(" path %3u %c %c dup %2i locked %u", __entry->idx,
__entry->preserve ? 'P' : ' ',
__entry->should_be_locked ? 'S' : ' ',
__entry->dup,
__entry->dup_locked)
);
TRACE_EVENT(btree_path_free_trans_begin,
TP_PROTO(btree_path_idx_t path),
TP_ARGS(path),
TP_STRUCT__entry(
__field(btree_path_idx_t, idx )
),
TP_fast_assign(
__entry->idx = path;
),
TP_printk(" path %3u", __entry->idx)
);
#else /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */
#ifndef _TRACE_BCACHEFS_H
static inline void trace_update_by_path(struct btree_trans *trans, struct btree_path *path,
struct btree_insert_entry *i, bool overwrite) {}
static inline void trace_btree_path_lock(struct btree_trans *trans, unsigned long caller_ip, struct btree_bkey_cached_common *b) {}
static inline void trace_btree_path_get_ll(struct btree_trans *trans, struct btree_path *path) {}
static inline void trace_btree_path_put_ll(struct btree_trans *trans, struct btree_path *path) {}
static inline void trace_btree_path_should_be_locked(struct btree_trans *trans, struct btree_path *path) {}
static inline void trace_btree_path_alloc(struct btree_trans *trans, struct btree_path *path) {}
static inline void trace_btree_path_get(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {}
static inline void trace_btree_path_clone(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {}
static inline void trace_btree_path_save_pos(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {}
static inline void trace_btree_path_traverse_start(struct btree_trans *trans, struct btree_path *path) {}
static inline void trace_btree_path_traverse_end(struct btree_trans *trans, struct btree_path *path) {}
static inline void trace_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {}
static inline void trace_btree_path_free(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup) {}
static inline void trace_btree_path_free_trans_begin(btree_path_idx_t path) {}
#endif
#endif /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */
#define _TRACE_BCACHEFS_H
#endif /* _TRACE_BCACHEFS_H */
/* This part must be outside protection */

View File

@ -204,7 +204,7 @@ STRTO_H(strtoll, long long)
STRTO_H(strtoull, unsigned long long)
STRTO_H(strtou64, u64)
u64 bch2_read_flag_list(char *opt, const char * const list[])
u64 bch2_read_flag_list(const char *opt, const char * const list[])
{
u64 ret = 0;
char *p, *s, *d = kstrdup(opt, GFP_KERNEL);

View File

@ -195,7 +195,7 @@ static inline int bch2_strtoul_h(const char *cp, long *res)
bool bch2_is_zero(const void *, size_t);
u64 bch2_read_flag_list(char *, const char * const[]);
u64 bch2_read_flag_list(const char *, const char * const[]);
void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned);
void bch2_prt_u64_base2(struct printbuf *, u64);

View File

@ -250,17 +250,27 @@ static int __bch2_xattr_emit(const char *prefix,
return 0;
}
static inline const char *bch2_xattr_prefix(unsigned type, struct dentry *dentry)
{
const struct xattr_handler *handler = bch2_xattr_type_to_handler(type);
if (!xattr_handler_can_list(handler, dentry))
return NULL;
return xattr_prefix(handler);
}
static int bch2_xattr_emit(struct dentry *dentry,
const struct bch_xattr *xattr,
struct xattr_buf *buf)
{
const struct xattr_handler *handler =
bch2_xattr_type_to_handler(xattr->x_type);
const char *prefix;
return handler && (!handler->list || handler->list(dentry))
? __bch2_xattr_emit(handler->prefix ?: handler->name,
xattr->x_name, xattr->x_name_len, buf)
: 0;
prefix = bch2_xattr_prefix(xattr->x_type, dentry);
if (!prefix)
return 0;
return __bch2_xattr_emit(prefix, xattr->x_name, xattr->x_name_len, buf);
}
static int bch2_xattr_list_bcachefs(struct bch_fs *c,
@ -295,54 +305,23 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
struct bch_fs *c = dentry->d_sb->s_fs_info;
struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
u64 offset = 0, inum = inode->ei_inode.bi_inum;
u32 snapshot;
int ret;
retry:
bch2_trans_begin(trans);
iter = (struct btree_iter) { NULL };
ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
if (ret)
goto err;
int ret = bch2_trans_run(c,
for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_xattrs,
POS(inum, offset),
POS(inum, U64_MAX),
inode->ei_inum.subvol, 0, k, ({
if (k.k->type != KEY_TYPE_xattr)
continue;
for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_xattrs,
SPOS(inum, offset, snapshot),
POS(inum, U64_MAX), 0, k, ret) {
if (k.k->type != KEY_TYPE_xattr)
continue;
bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf);
}))) ?:
bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false) ?:
bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true);
ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf);
if (ret)
break;
}
offset = iter.pos.offset;
bch2_trans_iter_exit(trans, &iter);
err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
bch2_trans_put(trans);
if (ret)
goto out;
ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false);
if (ret)
goto out;
ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true);
if (ret)
goto out;
return buf.used;
out:
return bch2_err_class(ret);
return ret ? bch2_err_class(ret) : buf.used;
}
static int bch2_xattr_get_handler(const struct xattr_handler *handler,
@ -632,10 +611,6 @@ static const struct xattr_handler bch_xattr_bcachefs_effective_handler = {
const struct xattr_handler *bch2_xattr_handlers[] = {
&bch_xattr_user_handler,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
&nop_posix_acl_access,
&nop_posix_acl_default,
#endif
&bch_xattr_trusted_handler,
&bch_xattr_security_handler,
#ifndef NO_BCACHEFS_FS

View File

@ -13,7 +13,7 @@ struct bch_xattr {
__u8 x_type;
__u8 x_name_len;
__le16 x_val_len;
__u8 x_name[];
__u8 x_name[] __counted_by(x_name_len);
} __packed __aligned(8);
#endif /* _BCACHEFS_XATTR_FORMAT_H */

View File

@ -438,14 +438,6 @@ static void init_once(void *foo)
inode_init_once(inode);
}
/*
* inode->i_lock must be held
*/
void __iget(struct inode *inode)
{
atomic_inc(&inode->i_count);
}
/*
* get additional reference to inode; caller must already hold one.
*/

View File

@ -3100,7 +3100,14 @@ static inline bool is_zero_ino(ino_t ino)
return (u32)ino == 0;
}
extern void __iget(struct inode * inode);
/*
* inode->i_lock must be held
*/
static inline void __iget(struct inode *inode)
{
atomic_inc(&inode->i_count);
}
extern void iget_failed(struct inode *);
extern void clear_inode(struct inode *);
extern void __destroy_inode(struct inode *);

View File

@ -41,6 +41,7 @@
#include <linux/limits.h>
#include <linux/log2.h>
#include <linux/math.h>
#include <linux/slab.h>
#include <linux/types.h>
struct genradix_root;
@ -48,10 +49,63 @@ struct genradix_root;
#define GENRADIX_NODE_SHIFT 9
#define GENRADIX_NODE_SIZE (1U << GENRADIX_NODE_SHIFT)
#define GENRADIX_ARY (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *))
#define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY)
/* depth that's needed for a genradix that can address up to ULONG_MAX: */
#define GENRADIX_MAX_DEPTH \
DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT)
#define GENRADIX_DEPTH_MASK \
((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
static inline int genradix_depth_shift(unsigned depth)
{
return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth;
}
/*
* Returns size (of data, in bytes) that a tree of a given depth holds:
*/
static inline size_t genradix_depth_size(unsigned depth)
{
return 1UL << genradix_depth_shift(depth);
}
static inline unsigned genradix_root_to_depth(struct genradix_root *r)
{
return (unsigned long) r & GENRADIX_DEPTH_MASK;
}
static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r)
{
return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
}
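For a sense of scale (a worked example, not text from the source): with GENRADIX_NODE_SHIFT = 9 a node is 512 bytes; on a 64-bit kernel GENRADIX_ARY = 512 / 8 = 64, so GENRADIX_ARY_SHIFT = 6 and genradix_depth_shift(d) = 9 + 6*d. A tree of depth 0, 1, 2, 3 therefore addresses 512 B, 32 KiB, 2 MiB and 128 MiB of object data respectively, and GENRADIX_MAX_DEPTH = DIV_ROUND_UP(64 - 9, 6) = 10.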
struct __genradix {
struct genradix_root *root;
};
struct genradix_node {
union {
/* Interior node: */
struct genradix_node *children[GENRADIX_ARY];
/* Leaf: */
u8 data[GENRADIX_NODE_SIZE];
};
};
static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask)
{
return kzalloc(GENRADIX_NODE_SIZE, gfp_mask);
}
static inline void genradix_free_node(struct genradix_node *node)
{
kfree(node);
}
/*
* NOTE: currently, sizeof(_type) must not be larger than GENRADIX_NODE_SIZE:
*/
@ -128,6 +182,30 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
#define __genradix_idx_to_offset(_radix, _idx) \
__idx_to_offset(_idx, __genradix_obj_size(_radix))
static inline void *__genradix_ptr_inlined(struct __genradix *radix, size_t offset)
{
struct genradix_root *r = READ_ONCE(radix->root);
struct genradix_node *n = genradix_root_to_node(r);
unsigned level = genradix_root_to_depth(r);
unsigned shift = genradix_depth_shift(level);
if (unlikely(ilog2(offset) >= genradix_depth_shift(level)))
return NULL;
while (n && shift > GENRADIX_NODE_SHIFT) {
shift -= GENRADIX_ARY_SHIFT;
n = n->children[offset >> shift];
offset &= (1UL << shift) - 1;
}
return n ? &n->data[offset] : NULL;
}
#define genradix_ptr_inlined(_radix, _idx) \
(__genradix_cast(_radix) \
__genradix_ptr_inlined(&(_radix)->tree, \
__genradix_idx_to_offset(_radix, _idx)))
void *__genradix_ptr(struct __genradix *, size_t);
/**
@ -142,7 +220,24 @@ void *__genradix_ptr(struct __genradix *, size_t);
__genradix_ptr(&(_radix)->tree, \
__genradix_idx_to_offset(_radix, _idx)))
void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);
void *__genradix_ptr_alloc(struct __genradix *, size_t,
struct genradix_node **, gfp_t);
#define genradix_ptr_alloc_inlined(_radix, _idx, _gfp) \
(__genradix_cast(_radix) \
(__genradix_ptr_inlined(&(_radix)->tree, \
__genradix_idx_to_offset(_radix, _idx)) ?: \
__genradix_ptr_alloc(&(_radix)->tree, \
__genradix_idx_to_offset(_radix, _idx), \
NULL, _gfp)))
#define genradix_ptr_alloc_preallocated_inlined(_radix, _idx, _new_node, _gfp)\
(__genradix_cast(_radix) \
(__genradix_ptr_inlined(&(_radix)->tree, \
__genradix_idx_to_offset(_radix, _idx)) ?: \
__genradix_ptr_alloc(&(_radix)->tree, \
__genradix_idx_to_offset(_radix, _idx), \
_new_node, _gfp)))
/**
* genradix_ptr_alloc - get a pointer to a genradix entry, allocating it
@ -157,7 +252,13 @@ void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);
(__genradix_cast(_radix) \
__genradix_ptr_alloc(&(_radix)->tree, \
__genradix_idx_to_offset(_radix, _idx), \
_gfp))
NULL, _gfp))
#define genradix_ptr_alloc_preallocated(_radix, _idx, _new_node, _gfp)\
(__genradix_cast(_radix) \
__genradix_ptr_alloc(&(_radix)->tree, \
__genradix_idx_to_offset(_radix, _idx), \
_new_node, _gfp))
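The _preallocated variants presumably exist so a caller can do the sleeping node allocation outside an atomic section and then perform the lookup/insert with a non-sleeping gfp mask. A hedged caller sketch (hypothetical names; ownership of the node is handed over via the swap() in __genradix_ptr_alloc(), so check that implementation for the exact contract):

/* d is a hypothetical struct with: GENRADIX(struct my_entry) entries; spinlock_t lock; */
struct genradix_node *n = genradix_alloc_node(GFP_KERNEL);
if (!n)
        return -ENOMEM;

spin_lock(&d->lock);
struct my_entry *e =
        genradix_ptr_alloc_preallocated(&d->entries, idx, &n, GFP_NOWAIT);
spin_unlock(&d->lock);

if (!e)
        return -ENOMEM;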
struct genradix_iter {
size_t offset;

View File

@ -5,99 +5,31 @@
#include <linux/gfp.h>
#include <linux/kmemleak.h>
#define GENRADIX_ARY (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *))
#define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY)
struct genradix_node {
union {
/* Interior node: */
struct genradix_node *children[GENRADIX_ARY];
/* Leaf: */
u8 data[GENRADIX_NODE_SIZE];
};
};
static inline int genradix_depth_shift(unsigned depth)
{
return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth;
}
/*
* Returns size (of data, in bytes) that a tree of a given depth holds:
*/
static inline size_t genradix_depth_size(unsigned depth)
{
return 1UL << genradix_depth_shift(depth);
}
/* depth that's needed for a genradix that can address up to ULONG_MAX: */
#define GENRADIX_MAX_DEPTH \
DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT)
#define GENRADIX_DEPTH_MASK \
((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
static inline unsigned genradix_root_to_depth(struct genradix_root *r)
{
return (unsigned long) r & GENRADIX_DEPTH_MASK;
}
static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r)
{
return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
}
/*
* Returns pointer to the specified byte @offset within @radix, or NULL if not
* allocated
*/
void *__genradix_ptr(struct __genradix *radix, size_t offset)
{
struct genradix_root *r = READ_ONCE(radix->root);
struct genradix_node *n = genradix_root_to_node(r);
unsigned level = genradix_root_to_depth(r);
if (ilog2(offset) >= genradix_depth_shift(level))
return NULL;
while (1) {
if (!n)
return NULL;
if (!level)
break;
level--;
n = n->children[offset >> genradix_depth_shift(level)];
offset &= genradix_depth_size(level) - 1;
}
return &n->data[offset];
return __genradix_ptr_inlined(radix, offset);
}
EXPORT_SYMBOL(__genradix_ptr);
static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask)
{
return kzalloc(GENRADIX_NODE_SIZE, gfp_mask);
}
static inline void genradix_free_node(struct genradix_node *node)
{
kfree(node);
}
/*
* Returns pointer to the specified byte @offset within @radix, allocating it if
* necessary - newly allocated slots are always zeroed out:
*/
void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset,
struct genradix_node **preallocated,
gfp_t gfp_mask)
{
struct genradix_root *v = READ_ONCE(radix->root);
struct genradix_node *n, *new_node = NULL;
unsigned level;
if (preallocated)
swap(new_node, *preallocated);
/* Increase tree depth if necessary: */
while (1) {
struct genradix_root *r = v, *new_root;
@ -281,7 +213,7 @@ int __genradix_prealloc(struct __genradix *radix, size_t size,
size_t offset;
for (offset = 0; offset < size; offset += GENRADIX_NODE_SIZE)
if (!__genradix_ptr_alloc(radix, offset, gfp_mask))
if (!__genradix_ptr_alloc(radix, offset, NULL, gfp_mask))
return -ENOMEM;
return 0;