Merge branch 'for-next' of git://evilpiepirate.org/bcachefs.git
commit 7d0f928e8b
@@ -175,7 +175,7 @@ errors in our thinking by running our code and seeing what happens. If your
 time is being wasted because your tools are bad or too slow - don't accept it,
 fix it.
 
-Put effort into your documentation, commmit messages, and code comments - but
+Put effort into your documentation, commit messages, and code comments - but
 don't go overboard. A good commit message is wonderful - but if the information
 was important enough to go in a commit message, ask yourself if it would be
 even better as a code comment.
@@ -87,6 +87,13 @@ config BCACHEFS_SIX_OPTIMISTIC_SPIN
	  is held by another thread, spin for a short while, as long as the
	  thread owning the lock is running.
 
+config BCACHEFS_PATH_TRACEPOINTS
+	bool "Extra btree_path tracepoints"
+	depends on BCACHEFS_FS
+	help
+	  Enable extra tracepoints for debugging btree_path operations; we don't
+	  normally want these enabled because they happen at very high rates.
+
 config MEAN_AND_VARIANCE_UNIT_TEST
	tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
	depends on KUNIT
@@ -69,6 +69,7 @@ bcachefs-y := \
	printbuf.o \
	quota.o \
	rebalance.o \
+	rcu_pending.o \
	recovery.o \
	recovery_passes.o \
	reflink.o \
@@ -361,7 +361,7 @@ retry:
	bch2_trans_begin(trans);
	acl = _acl;
 
-	ret = bch2_subvol_is_ro_trans(trans, inode->ei_subvol) ?:
+	ret = bch2_subvol_is_ro_trans(trans, inode->ei_inum.subvol) ?:
		bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
				BTREE_ITER_intent);
	if (ret)
@@ -30,6 +30,7 @@
 #include <linux/rcupdate.h>
 #include <linux/sched/task.h>
 #include <linux/sort.h>
+#include <linux/jiffies.h>
 
 static void bch2_discard_one_bucket_fast(struct bch_dev *, u64);
 
@@ -2183,7 +2184,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
	 * freespace/need_discard/need_gc_gens btrees as needed:
	 */
	while (1) {
-		if (last_updated + HZ * 10 < jiffies) {
+		if (time_after(jiffies, last_updated + HZ * 10)) {
			bch_info(ca, "%s: currently at %llu/%llu",
				 __func__, iter.pos.offset, ca->mi.nbuckets);
			last_updated = jiffies;
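The time_after() conversion in the hunk above is the standard guard against jiffies wraparound: the open-coded comparison misfires when the jiffies counter overflows, while time_after() compares through signed subtraction and stays correct across the wrap. A minimal sketch of the pattern, with a hypothetical helper name (only time_after(), jiffies, and HZ are real kernel API):

#include <linux/jiffies.h>

/* Hypothetical helper: has the 10-second interval elapsed?  The
 * open-coded "last + HZ * 10 < jiffies" breaks when jiffies wraps;
 * time_after() subtracts in signed arithmetic, so it does not. */
static bool ten_seconds_elapsed(unsigned long *last)
{
	if (time_after(jiffies, *last + HZ * 10)) {
		*last = jiffies;
		return true;
	}
	return false;
}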
@@ -1022,9 +1022,6 @@ static int __open_bucket_add_buckets(struct btree_trans *trans,
	open_bucket_for_each(c, ptrs, ob, i)
		__clear_bit(ob->dev, devs.d);
 
-	if (erasure_code && ec_open_bucket(c, ptrs))
-		return 0;
-
	ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
				 nr_replicas, nr_effective,
				 have_cache, erasure_code, flags);
@@ -1079,7 +1076,7 @@ static int open_bucket_add_buckets(struct btree_trans *trans,
 {
	int ret;
 
-	if (erasure_code) {
+	if (erasure_code && !ec_open_bucket(trans->c, ptrs)) {
		ret = __open_bucket_add_buckets(trans, ptrs, wp,
				devs_have, target, erasure_code,
				nr_replicas, nr_effective, have_cache,
@@ -542,7 +542,7 @@ struct bch_dev {
	 * gc_gens_lock, for device resize - holding any is sufficient for
	 * access: Or rcu_read_lock(), but only for dev_ptr_stale():
	 */
-	struct bucket_array __rcu *buckets_gc;
+	GENRADIX(struct bucket) buckets_gc;
	struct bucket_gens __rcu *bucket_gens;
	u8			*oldest_gen;
	unsigned long		*buckets_nouse;
@@ -1023,6 +1023,7 @@ struct bch_fs {
	/* fs.c */
	struct list_head	vfs_inodes_list;
	struct mutex		vfs_inodes_lock;
+	struct rhashtable	vfs_inodes_table;
 
	/* VFS IO PATH - fs-io.c */
	struct bio_set		writepage_bioset;
@@ -1085,7 +1086,6 @@ struct bch_fs {
	u64 __percpu		*counters;
 
	unsigned		copy_gc_enabled:1;
-	bool			promote_whole_extents;
 
	struct bch2_time_stats	times[BCH_TIME_STAT_NR];
 
@@ -795,6 +795,8 @@ LE64_BITMASK(BCH_SB_HAS_ERRORS,		struct bch_sb, flags[0], 60, 61);
 LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62);
 
 LE64_BITMASK(BCH_SB_BIG_ENDIAN,		struct bch_sb, flags[0], 62, 63);
+LE64_BITMASK(BCH_SB_PROMOTE_WHOLE_EXTENTS,
+					struct bch_sb, flags[0], 63, 64);
 
 LE64_BITMASK(BCH_SB_STR_HASH_TYPE,	struct bch_sb, flags[1],  0,  4);
 LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1],  4,  8);
@@ -304,11 +304,6 @@ struct bkey_float {
 };
 #define BKEY_MANTISSA_BITS	16
 
-static unsigned bkey_float_byte_offset(unsigned idx)
-{
-	return idx * sizeof(struct bkey_float);
-}
-
 struct ro_aux_tree {
	u8			nothing[0];
	struct bkey_float	f[];
@@ -328,8 +323,7 @@ static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
		return t->aux_data_offset;
	case BSET_RO_AUX_TREE:
		return t->aux_data_offset +
-			DIV_ROUND_UP(t->size * sizeof(struct bkey_float) +
-				     t->size * sizeof(u8), 8);
+			DIV_ROUND_UP(t->size * sizeof(struct bkey_float), 8);
	case BSET_RW_AUX_TREE:
		return t->aux_data_offset +
			DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
@@ -360,14 +354,6 @@ static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
	return __aux_tree_base(b, t);
 }
 
-static u8 *ro_aux_tree_prev(const struct btree *b,
-			    const struct bset_tree *t)
-{
-	EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
-
-	return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size);
-}
-
 static struct bkey_float *bkey_float(const struct btree *b,
				     const struct bset_tree *t,
				     unsigned idx)
@@ -479,15 +465,6 @@ static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
			bkey_float(b, t, j)->key_offset);
 }
 
-static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
-					     const struct bset_tree *t,
-					     unsigned j)
-{
-	unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];
-
-	return (void *) ((u64 *) tree_to_bkey(b, t, j)->_data - prev_u64s);
-}
-
 static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
				       const struct bset_tree *t)
 {
@@ -585,8 +562,7 @@ static unsigned rw_aux_tree_bsearch(struct btree *b,
 }
 
 static inline unsigned bkey_mantissa(const struct bkey_packed *k,
-				     const struct bkey_float *f,
-				     unsigned idx)
+				     const struct bkey_float *f)
 {
	u64 v;
 
@@ -617,7 +593,7 @@ static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t,
	struct bkey_packed *m = tree_to_bkey(b, t, j);
	struct bkey_packed *l = is_power_of_2(j)
		? min_key
-		: tree_to_prev_bkey(b, t, j >> ffs(j));
+		: tree_to_bkey(b, t, j >> ffs(j));
	struct bkey_packed *r = is_power_of_2(j + 1)
		? max_key
		: tree_to_bkey(b, t, j >> (ffz(j) + 1));
@@ -668,7 +644,7 @@ static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t,
	EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
 
	f->exponent = shift;
-	mantissa = bkey_mantissa(m, f, j);
+	mantissa = bkey_mantissa(m, f);
 
	/*
	 * If we've got garbage bits, set them to all 1s - it's legal for the
@@ -690,8 +666,7 @@ static unsigned __bset_tree_capacity(struct btree *b, const struct bset_tree *t)
 
 static unsigned bset_ro_tree_capacity(struct btree *b, const struct bset_tree *t)
 {
-	return __bset_tree_capacity(b, t) /
-		(sizeof(struct bkey_float) + sizeof(u8));
+	return __bset_tree_capacity(b, t) / sizeof(struct bkey_float);
 }
 
 static unsigned bset_rw_tree_capacity(struct btree *b, const struct bset_tree *t)
@@ -720,7 +695,7 @@ static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
 
 static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
 {
-	struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
+	struct bkey_packed *k = btree_bkey_first(b, t);
	struct bkey_i min_key, max_key;
	unsigned cacheline = 1;
 
@@ -733,12 +708,12 @@ retry:
		return;
	}
 
-	t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
+	t->extra = eytzinger1_extra(t->size - 1);
 
	/* First we figure out where the first key in each cacheline is */
	eytzinger1_for_each(j, t->size - 1) {
		while (bkey_to_cacheline(b, t, k) < cacheline)
-			prev = k, k = bkey_p_next(k);
+			k = bkey_p_next(k);
 
		if (k >= btree_bkey_last(b, t)) {
			/* XXX: this path sucks */
@@ -746,17 +721,12 @@ retry:
			goto retry;
		}
 
-		ro_aux_tree_prev(b, t)[j] = prev->u64s;
		bkey_float(b, t, j)->key_offset =
			bkey_to_cacheline_offset(b, t, cacheline++, k);
 
-		EBUG_ON(tree_to_prev_bkey(b, t, j) != prev);
		EBUG_ON(tree_to_bkey(b, t, j) != k);
	}
 
-	while (k != btree_bkey_last(b, t))
-		prev = k, k = bkey_p_next(k);
-
	if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) {
		bkey_init(&min_key.k);
		min_key.k.p = b->data->min_key;
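The eytzinger1_* helpers used by __build_ro_aux_tree() above refer to a 1-based Eytzinger layout: the tree is stored breadth-first in a flat array, so navigation is pure index arithmetic with good cache locality. The following is not the bcachefs header, just the arithmetic the layout is built on, for orientation:

/* 1-based Eytzinger (BFS) layout: root at index 1,
 * children of node i at 2i and 2i + 1, parent at i / 2. */
static inline unsigned eyt1_left_child(unsigned i)  { return 2 * i; }
static inline unsigned eyt1_right_child(unsigned i) { return 2 * i + 1; }
static inline unsigned eyt1_parent(unsigned i)      { return i / 2; }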
@@ -915,66 +885,18 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
 
 /* Insert */
 
-static void bch2_bset_fix_lookup_table(struct btree *b,
-				       struct bset_tree *t,
-				       struct bkey_packed *_where,
-				       unsigned clobber_u64s,
-				       unsigned new_u64s)
+static void rw_aux_tree_insert_entry(struct btree *b,
+				     struct bset_tree *t,
+				     unsigned idx)
 {
-	int shift = new_u64s - clobber_u64s;
-	unsigned l, j, where = __btree_node_key_to_offset(b, _where);
-
-	EBUG_ON(bset_has_ro_aux_tree(t));
-
-	if (!bset_has_rw_aux_tree(t))
-		return;
-
-	/* returns first entry >= where */
-	l = rw_aux_tree_bsearch(b, t, where);
-
-	if (!l) /* never delete first entry */
-		l++;
-	else if (l < t->size &&
-		 where < t->end_offset &&
-		 rw_aux_tree(b, t)[l].offset == where)
-		rw_aux_tree_set(b, t, l++, _where);
-
-	/* l now > where */
-
-	for (j = l;
-	     j < t->size &&
-	     rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
-	     j++)
-		;
-
-	if (j < t->size &&
-	    rw_aux_tree(b, t)[j].offset + shift ==
-	    rw_aux_tree(b, t)[l - 1].offset)
-		j++;
-
-	memmove(&rw_aux_tree(b, t)[l],
-		&rw_aux_tree(b, t)[j],
-		(void *) &rw_aux_tree(b, t)[t->size] -
-		(void *) &rw_aux_tree(b, t)[j]);
-	t->size -= j - l;
-
-	for (j = l; j < t->size; j++)
-		rw_aux_tree(b, t)[j].offset += shift;
-
-	EBUG_ON(l < t->size &&
-		rw_aux_tree(b, t)[l].offset ==
-		rw_aux_tree(b, t)[l - 1].offset);
+	EBUG_ON(!idx || idx > t->size);
+	struct bkey_packed *start = rw_aux_to_bkey(b, t, idx - 1);
+	struct bkey_packed *end = idx < t->size
+		? rw_aux_to_bkey(b, t, idx)
+		: btree_bkey_last(b, t);
 
	if (t->size < bset_rw_tree_capacity(b, t) &&
-	    (l < t->size
-	     ? rw_aux_tree(b, t)[l].offset
-	     : t->end_offset) -
-	    rw_aux_tree(b, t)[l - 1].offset >
-	    L1_CACHE_BYTES / sizeof(u64)) {
-		struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
-		struct bkey_packed *end = l < t->size
-			? rw_aux_to_bkey(b, t, l)
-			: btree_bkey_last(b, t);
+	    (void *) end - (void *) start > L1_CACHE_BYTES) {
		struct bkey_packed *k = start;
 
		while (1) {
@@ -983,23 +905,78 @@ static void bch2_bset_fix_lookup_table(struct btree *b,
				break;
 
			if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
-				memmove(&rw_aux_tree(b, t)[l + 1],
-					&rw_aux_tree(b, t)[l],
+				memmove(&rw_aux_tree(b, t)[idx + 1],
+					&rw_aux_tree(b, t)[idx],
					(void *) &rw_aux_tree(b, t)[t->size] -
-					(void *) &rw_aux_tree(b, t)[l]);
+					(void *) &rw_aux_tree(b, t)[idx]);
				t->size++;
-				rw_aux_tree_set(b, t, l, k);
+				rw_aux_tree_set(b, t, idx, k);
				break;
			}
		}
	}
 }
 
+static void bch2_bset_fix_lookup_table(struct btree *b,
+				       struct bset_tree *t,
+				       struct bkey_packed *_where,
+				       unsigned clobber_u64s,
+				       unsigned new_u64s)
+{
+	int shift = new_u64s - clobber_u64s;
+	unsigned idx, j, where = __btree_node_key_to_offset(b, _where);
+
+	EBUG_ON(bset_has_ro_aux_tree(t));
+
+	if (!bset_has_rw_aux_tree(t))
+		return;
+
+	if (where > rw_aux_tree(b, t)[t->size - 1].offset) {
+		rw_aux_tree_insert_entry(b, t, t->size);
+		goto verify;
+	}
+
+	/* returns first entry >= where */
+	idx = rw_aux_tree_bsearch(b, t, where);
+
+	if (rw_aux_tree(b, t)[idx].offset == where) {
+		if (!idx) { /* never delete first entry */
+			idx++;
+		} else if (where < t->end_offset) {
+			rw_aux_tree_set(b, t, idx++, _where);
+		} else {
+			EBUG_ON(where != t->end_offset);
+			rw_aux_tree_insert_entry(b, t, --t->size);
+			goto verify;
+		}
+	}
+
+	EBUG_ON(idx < t->size && rw_aux_tree(b, t)[idx].offset <= where);
+	if (idx < t->size &&
+	    rw_aux_tree(b, t)[idx].offset + shift ==
+	    rw_aux_tree(b, t)[idx - 1].offset) {
+		memmove(&rw_aux_tree(b, t)[idx],
+			&rw_aux_tree(b, t)[idx + 1],
+			(void *) &rw_aux_tree(b, t)[t->size] -
+			(void *) &rw_aux_tree(b, t)[idx + 1]);
+		t->size -= 1;
+	}
+
+	for (j = idx; j < t->size; j++)
+		rw_aux_tree(b, t)[j].offset += shift;
+
+	EBUG_ON(idx < t->size &&
+		rw_aux_tree(b, t)[idx].offset ==
+		rw_aux_tree(b, t)[idx - 1].offset);
+
+	rw_aux_tree_insert_entry(b, t, idx);
+
+verify:
+	bch2_bset_verify_rw_aux_tree(b, t);
+	bset_aux_tree_verify(b);
+}
+
 void bch2_bset_insert(struct btree *b,
-		      struct btree_node_iter *iter,
		      struct bkey_packed *where,
		      struct bkey_i *insert,
		      unsigned clobber_u64s)
@@ -1098,8 +1075,7 @@ static inline void prefetch_four_cachelines(void *p)
 }
 
 static inline bool bkey_mantissa_bits_dropped(const struct btree *b,
-					      const struct bkey_float *f,
-					      unsigned idx)
+					      const struct bkey_float *f)
 {
 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits;
@@ -1133,9 +1109,9 @@ static struct bkey_packed *bset_search_tree(const struct btree *b,
			goto slowpath;
 
		l = f->mantissa;
-		r = bkey_mantissa(packed_search, f, n);
+		r = bkey_mantissa(packed_search, f);
 
-		if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n))
+		if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f))
			goto slowpath;
 
		n = n * 2 + (l < r);
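The closing `n = n * 2 + (l < r)` in bset_search_tree() is the descent step for that Eytzinger layout: stepping to the left child is 2n, to the right child 2n + 1, and the mantissa comparison supplies the extra bit with no branch. A generic sketch of the same technique (names are illustrative, not from the patch):

/* Branchless binary-search descent over a 1-based Eytzinger array:
 * the comparison result (0 or 1) picks the left or right child. */
static unsigned eytzinger_descend(const u16 *vals, unsigned size, u16 probe)
{
	unsigned n = 1;

	while (n < size)
		n = n * 2 + (vals[n] < probe);
	return n;
}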
@@ -270,8 +270,8 @@ void bch2_bset_init_first(struct btree *, struct bset *);
 void bch2_bset_init_next(struct btree *, struct btree_node_entry *);
 void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
 
-void bch2_bset_insert(struct btree *, struct btree_node_iter *,
-		      struct bkey_packed *, struct bkey_i *, unsigned);
+void bch2_bset_insert(struct btree *, struct bkey_packed *, struct bkey_i *,
+		      unsigned);
 void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned);
 
 /* Bkey utility code */
@@ -671,9 +671,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
		: &bc->freed_nonpcpu;
	struct btree *b, *b2;
	u64 start_time = local_clock();
-	unsigned flags;
 
-	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);
 
	/*
@@ -745,8 +743,6 @@ out:
	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
			       start_time);
 
-	memalloc_nofs_restore(flags);
-
	int ret = bch2_trans_relock(trans);
	if (unlikely(ret)) {
		bch2_btree_node_to_freelist(c, b);
@@ -781,7 +777,6 @@ err:
	}
 
	mutex_unlock(&bc->lock);
-	memalloc_nofs_restore(flags);
	return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
 }
 
@@ -753,10 +753,8 @@ static void bch2_gc_free(struct bch_fs *c)
	genradix_free(&c->reflink_gc_table);
	genradix_free(&c->gc_stripes);
 
-	for_each_member_device(c, ca) {
-		kvfree(rcu_dereference_protected(ca->buckets_gc, 1));
-		ca->buckets_gc = NULL;
-	}
+	for_each_member_device(c, ca)
+		genradix_free(&ca->buckets_gc);
 }
 
 static int bch2_gc_start(struct bch_fs *c)
@@ -910,20 +908,12 @@ static int bch2_gc_alloc_start(struct bch_fs *c)
	int ret = 0;
 
	for_each_member_device(c, ca) {
-		struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) +
-				ca->mi.nbuckets * sizeof(struct bucket),
-				GFP_KERNEL|__GFP_ZERO);
-		if (!buckets) {
+		ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL);
+		if (ret) {
			bch2_dev_put(ca);
			ret = -BCH_ERR_ENOMEM_gc_alloc_start;
			break;
		}
-
-		buckets->first_bucket	= ca->mi.first_bucket;
-		buckets->nbuckets	= ca->mi.nbuckets;
-		buckets->nbuckets_minus_first =
-			buckets->nbuckets - buckets->first_bucket;
-		rcu_assign_pointer(ca->buckets_gc, buckets);
	}
 
	bch_err_fn(c, ret);
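The genradix_* calls above are the kernel's generic radix trees (include/linux/generic-radix-tree.h): a flat, sparse array of fixed-size objects backed by radix-tree pages, which here replaces one big kvmalloc'd bucket_array. A minimal sketch of the lifecycle this hunk switches to — the surrounding function and names are hypothetical:

#include <linux/generic-radix-tree.h>

struct bucket_example { u64 gen; };

static GENRADIX(struct bucket_example) buckets;	/* zero-init is valid */

static int genradix_example(size_t nbuckets)
{
	/* Preallocate the backing pages up front, as the new
	 * bch2_gc_alloc_start() does, so later accesses cannot fail: */
	int ret = genradix_prealloc(&buckets, nbuckets, GFP_KERNEL);
	if (ret)
		return ret;

	genradix_ptr(&buckets, 0)->gen = 1;	/* indexed access */

	genradix_free(&buckets);		/* drop all backing pages */
	return 0;
}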
@@ -1010,9 +1010,9 @@ retry_all:
		 * the same position:
		 */
		if (trans->paths[idx].uptodate) {
-			__btree_path_get(&trans->paths[idx], false);
+			__btree_path_get(trans, &trans->paths[idx], false);
			ret = bch2_btree_path_traverse_one(trans, idx, 0, _THIS_IP_);
-			__btree_path_put(&trans->paths[idx], false);
+			__btree_path_put(trans, &trans->paths[idx], false);
 
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
			    bch2_err_matches(ret, ENOMEM))
@@ -1131,6 +1131,8 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
	if (unlikely(!trans->srcu_held))
		bch2_trans_srcu_lock(trans);
 
+	trace_btree_path_traverse_start(trans, path);
+
	/*
	 * Ensure we obey path->should_be_locked: if it's set, we can't unlock
	 * and re-traverse the path without a transaction restart:
@@ -1194,6 +1196,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
 
 out_uptodate:
	path->uptodate = BTREE_ITER_UPTODATE;
+	trace_btree_path_traverse_end(trans, path);
 out:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
		panic("ret %s (%i) trans->restarted %s (%i)\n",
@@ -1225,7 +1228,7 @@ static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_i
 {
	btree_path_idx_t new = btree_path_alloc(trans, src);
	btree_path_copy(trans, trans->paths + new, trans->paths + src);
-	__btree_path_get(trans->paths + new, intent);
+	__btree_path_get(trans, trans->paths + new, intent);
 #ifdef TRACK_PATH_ALLOCATED
	trans->paths[new].ip_allocated = ip;
 #endif
@@ -1236,8 +1239,10 @@ __flatten
 btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans,
			btree_path_idx_t path, bool intent, unsigned long ip)
 {
-	__btree_path_put(trans->paths + path, intent);
+	struct btree_path *old = trans->paths + path;
+	__btree_path_put(trans, trans->paths + path, intent);
	path = btree_path_clone(trans, path, intent, ip);
+	trace_btree_path_clone(trans, old, trans->paths + path);
	trans->paths[path].preserve = false;
	return path;
 }
@@ -1252,6 +1257,8 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
	bch2_trans_verify_not_in_restart(trans);
	EBUG_ON(!trans->paths[path_idx].ref);
 
+	trace_btree_path_set_pos(trans, trans->paths + path_idx, &new_pos);
+
	path_idx = bch2_btree_path_make_mut(trans, path_idx, intent, ip);
 
	struct btree_path *path = trans->paths + path_idx;
@@ -1361,13 +1368,15 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in
 {
	struct btree_path *path = trans->paths + path_idx, *dup;
 
-	if (!__btree_path_put(path, intent))
+	if (!__btree_path_put(trans, path, intent))
		return;
 
	dup = path->preserve
		? have_path_at_pos(trans, path)
		: have_node_at_pos(trans, path);
 
+	trace_btree_path_free(trans, path_idx, dup);
+
	if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
		return;
 
@@ -1392,7 +1401,7 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in
 static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path,
				 bool intent)
 {
-	if (!__btree_path_put(trans->paths + path, intent))
+	if (!__btree_path_put(trans, trans->paths + path, intent))
		return;
 
	__bch2_path_free(trans, path);
@@ -1421,8 +1430,8 @@ void __noreturn bch2_trans_unlocked_error(struct btree_trans *trans)
 noinline __cold
 void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
 {
-	prt_printf(buf, "transaction updates for %s journal seq %llu\n",
-		   trans->fn, trans->journal_res.seq);
+	prt_printf(buf, "%u transaction updates for %s journal seq %llu\n",
+		   trans->nr_updates, trans->fn, trans->journal_res.seq);
	printbuf_indent_add(buf, 2);
 
	trans_for_each_update(trans, i) {
@@ -1464,7 +1473,7 @@ static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_tra
 {
	struct btree_path *path = trans->paths + path_idx;
 
-	prt_printf(out, "path: idx %2u ref %u:%u %c %c %c btree=%s l=%u pos ",
+	prt_printf(out, "path: idx %3u ref %u:%u %c %c %c btree=%s l=%u pos ",
		   path_idx, path->ref, path->intent_ref,
		   path->preserve ? 'P' : ' ',
		   path->should_be_locked ? 'S' : ' ',
@@ -1716,14 +1725,16 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
	    trans->paths[path_pos].cached	== cached &&
	    trans->paths[path_pos].btree_id	== btree_id &&
	    trans->paths[path_pos].level	== level) {
-		__btree_path_get(trans->paths + path_pos, intent);
+		trace_btree_path_get(trans, trans->paths + path_pos, &pos);
+
+		__btree_path_get(trans, trans->paths + path_pos, intent);
		path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
		path = trans->paths + path_idx;
	} else {
		path_idx = btree_path_alloc(trans, path_pos);
		path = trans->paths + path_idx;
 
-		__btree_path_get(path, intent);
+		__btree_path_get(trans, path, intent);
		path->pos			= pos;
		path->btree_id			= btree_id;
		path->cached			= cached;
@@ -1738,6 +1749,8 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
		path->ip_allocated		= ip;
 #endif
		trans->paths_sorted		= false;
+
+		trace_btree_path_alloc(trans, path);
	}
 
	if (!(flags & BTREE_ITER_nopreserve))
@@ -1857,7 +1870,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
 
	struct btree_path *path = btree_iter_path(trans, iter);
	if (btree_path_node(path, path->level))
-		btree_path_set_should_be_locked(path);
+		btree_path_set_should_be_locked(trans, path);
	return 0;
 }
 
@@ -1889,7 +1902,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
	iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
					iter->flags & BTREE_ITER_intent,
					btree_iter_ip_allocated(iter));
-	btree_path_set_should_be_locked(btree_iter_path(trans, iter));
+	btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
 out:
	bch2_btree_iter_verify_entry_exit(iter);
	bch2_btree_iter_verify(iter);
@@ -1983,7 +1996,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
	iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
					iter->flags & BTREE_ITER_intent,
					btree_iter_ip_allocated(iter));
-	btree_path_set_should_be_locked(btree_iter_path(trans, iter));
+	btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
	EBUG_ON(btree_iter_path(trans, iter)->uptodate);
 out:
	bch2_btree_iter_verify_entry_exit(iter);
@@ -2155,7 +2168,7 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
	if (unlikely(ret))
		return bkey_s_c_err(ret);
 
-	btree_path_set_should_be_locked(trans->paths + iter->key_cache_path);
+	btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
 
	k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u);
	if (k.k && !bkey_err(k)) {
@@ -2199,7 +2212,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
			goto out;
		}
 
-		btree_path_set_should_be_locked(path);
+		btree_path_set_should_be_locked(trans, path);
 
		k = btree_path_level_peek_all(trans->c, l, &iter->k);
 
@@ -2326,7 +2339,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
				 * advance, same as on exit for iter->path, but only up
				 * to snapshot
				 */
-				__btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
+				__btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
				iter->update_path = iter->path;
 
				iter->update_path = bch2_btree_path_set_pos(trans,
@@ -2382,14 +2395,14 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
					iter->flags & BTREE_ITER_intent,
					btree_iter_ip_allocated(iter));
 
-	btree_path_set_should_be_locked(btree_iter_path(trans, iter));
+	btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
 out_no_locked:
	if (iter->update_path) {
		ret = bch2_btree_path_relock(trans, trans->paths + iter->update_path, _THIS_IP_);
		if (unlikely(ret))
			k = bkey_s_c_err(ret);
		else
-			btree_path_set_should_be_locked(trans->paths + iter->update_path);
+			btree_path_set_should_be_locked(trans, trans->paths + iter->update_path);
	}
 
	if (!(iter->flags & BTREE_ITER_all_snapshots))
@@ -2511,6 +2524,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
						iter->flags & BTREE_ITER_intent,
						_THIS_IP_);
			path = btree_iter_path(trans, iter);
+			trace_btree_path_save_pos(trans, path, trans->paths + saved_path);
			saved_k = *k.k;
			saved_v = k.v;
		}
@@ -2527,7 +2541,7 @@ got_key:
				continue;
			}
 
-			btree_path_set_should_be_locked(path);
+			btree_path_set_should_be_locked(trans, path);
			break;
		} else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) {
			/* Advance to previous leaf node: */
@@ -2685,7 +2699,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
		}
	}
 out:
-	btree_path_set_should_be_locked(btree_iter_path(trans, iter));
+	btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
 out_no_locked:
	bch2_btree_iter_verify_entry_exit(iter);
	bch2_btree_iter_verify(iter);
@@ -2712,6 +2726,7 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
	return bch2_btree_iter_peek_slot(iter);
 }
 
+/* Obsolete, but still used by rust wrapper in -tools */
 struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter)
 {
	struct bkey_s_c k;
@@ -2911,9 +2926,9 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
	dst->ip_allocated = _RET_IP_;
 #endif
	if (src->path)
-		__btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_intent);
+		__btree_path_get(trans, trans->paths + src->path, src->flags & BTREE_ITER_intent);
	if (src->update_path)
-		__btree_path_get(trans->paths + src->update_path, src->flags & BTREE_ITER_intent);
+		__btree_path_get(trans, trans->paths + src->update_path, src->flags & BTREE_ITER_intent);
	dst->key_cache_path = 0;
 }
 
@@ -3237,7 +3252,7 @@ void bch2_trans_put(struct btree_trans *trans)
	bch2_trans_unlock(trans);
 
	trans_for_each_update(trans, i)
-		__btree_path_put(trans->paths + i->path, true);
+		__btree_path_put(trans, trans->paths + i->path, true);
	trans->nr_updates = 0;
 
	check_btree_paths_leaked(trans);
@@ -6,6 +6,12 @@
 #include "btree_types.h"
 #include "trace.h"
 
+void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
+void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t);
+void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
+void bch2_dump_trans_updates(struct btree_trans *);
+void bch2_dump_trans_paths_updates(struct btree_trans *);
+
 static inline int __bkey_err(const struct bkey *k)
 {
	return PTR_ERR_OR_ZERO(k);
@@ -13,16 +19,28 @@ static inline int __bkey_err(const struct bkey *k)
 
 #define bkey_err(_k) __bkey_err((_k).k)
 
-static inline void __btree_path_get(struct btree_path *path, bool intent)
+static inline void __btree_path_get(struct btree_trans *trans, struct btree_path *path, bool intent)
 {
+	unsigned idx = path - trans->paths;
+
+	EBUG_ON(!test_bit(idx, trans->paths_allocated));
+	if (unlikely(path->ref == U8_MAX)) {
+		bch2_dump_trans_paths_updates(trans);
+		panic("path %u refcount overflow\n", idx);
+	}
+
	path->ref++;
	path->intent_ref += intent;
+	trace_btree_path_get_ll(trans, path);
 }
 
-static inline bool __btree_path_put(struct btree_path *path, bool intent)
+static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
 {
+	EBUG_ON(!test_bit(path - trans->paths, trans->paths_allocated));
	EBUG_ON(!path->ref);
	EBUG_ON(!path->intent_ref && intent);
 
+	trace_btree_path_put_ll(trans, path);
	path->intent_ref -= intent;
	return --path->ref == 0;
 }
@@ -814,20 +832,6 @@ transaction_restart:						\
 
 struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
 
-static inline struct bkey_s_c
-__bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
-				   struct btree_iter *iter, unsigned flags)
-{
-	struct bkey_s_c k;
-
-	while (btree_trans_too_many_iters(trans) ||
-	       (k = bch2_btree_iter_peek_type(iter, flags),
-		bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
-		bch2_trans_begin(trans);
-
-	return k;
-}
-
 #define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id,	\
			   _start, _end, _flags, _k, _ret)		\
	for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id),	\
@@ -868,7 +872,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
									\
	if (bch2_err_matches(_ret, ENOMEM)) {				\
		_gfp = GFP_KERNEL;					\
-		_ret = drop_locks_do(trans, _do);			\
+		_ret = drop_locks_do(_trans, _do);			\
	}								\
	_ret;								\
 })
@@ -881,7 +885,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
	_ret = 0;							\
	if (unlikely(!_p)) {						\
		_gfp = GFP_KERNEL;					\
-		_ret = drop_locks_do(trans, ((_p = _do), 0));		\
+		_ret = drop_locks_do(_trans, ((_p = _do), 0));		\
	}								\
	_p;								\
 })
@@ -894,12 +898,6 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
	_ret;								\
 })
 
-void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
-void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t);
-void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
-void bch2_dump_trans_updates(struct btree_trans *);
-void bch2_dump_trans_paths_updates(struct btree_trans *);
-
 struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned);
 void bch2_trans_put(struct btree_trans *);
 
@@ -79,130 +79,41 @@ static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
	return true;
 }
 
-static void bkey_cached_evict(struct btree_key_cache *c,
+static bool bkey_cached_evict(struct btree_key_cache *c,
			      struct bkey_cached *ck)
 {
-	BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
-				      bch2_btree_key_cache_params));
-	memset(&ck->key, ~0, sizeof(ck->key));
+	bool ret = !rhashtable_remove_fast(&c->table, &ck->hash,
+					   bch2_btree_key_cache_params);
+	if (ret) {
+		memset(&ck->key, ~0, sizeof(ck->key));
+		atomic_long_dec(&c->nr_keys);
+	}
 
-	atomic_long_dec(&c->nr_keys);
+	return ret;
 }
 
-static void bkey_cached_free(struct btree_key_cache *bc,
-			     struct bkey_cached *ck)
+static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu)
 {
-	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+	struct bch_fs *c = container_of(pending->srcu, struct bch_fs, btree_trans_barrier);
+	struct bkey_cached *ck = container_of(rcu, struct bkey_cached, rcu);
 
-	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
-
-	ck->btree_trans_barrier_seq =
-		start_poll_synchronize_srcu(&c->btree_trans_barrier);
-
-	if (ck->c.lock.readers) {
-		list_move_tail(&ck->list, &bc->freed_pcpu);
-		bc->nr_freed_pcpu++;
-	} else {
-		list_move_tail(&ck->list, &bc->freed_nonpcpu);
-		bc->nr_freed_nonpcpu++;
-	}
-	atomic_long_inc(&bc->nr_freed);
-
-	kfree(ck->k);
-	ck->k		= NULL;
-	ck->u64s	= 0;
-
-	six_unlock_write(&ck->c.lock);
-	six_unlock_intent(&ck->c.lock);
+	this_cpu_dec(*c->btree_key_cache.nr_pending);
+	kmem_cache_free(bch2_key_cache, ck);
 }
 
-#ifdef __KERNEL__
-static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
-						   struct bkey_cached *ck)
+static void bkey_cached_free(struct btree_key_cache *bc,
+			     struct bkey_cached *ck)
 {
-	struct bkey_cached *pos;
-
-	bc->nr_freed_nonpcpu++;
-
-	list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
-		if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
-				 pos->btree_trans_barrier_seq)) {
-			list_move(&ck->list, &pos->list);
-			return;
-		}
-	}
-
-	list_move(&ck->list, &bc->freed_nonpcpu);
-}
-#endif
-
-static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
-					 struct bkey_cached *ck)
-{
-	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
-
-	if (!ck->c.lock.readers) {
-#ifdef __KERNEL__
-		struct btree_key_cache_freelist *f;
-		bool freed = false;
-
-		preempt_disable();
-		f = this_cpu_ptr(bc->pcpu_freed);
-
-		if (f->nr < ARRAY_SIZE(f->objs)) {
-			f->objs[f->nr++] = ck;
-			freed = true;
-		}
-		preempt_enable();
-
-		if (!freed) {
-			mutex_lock(&bc->lock);
-			preempt_disable();
-			f = this_cpu_ptr(bc->pcpu_freed);
-
-			while (f->nr > ARRAY_SIZE(f->objs) / 2) {
-				struct bkey_cached *ck2 = f->objs[--f->nr];
-
-				__bkey_cached_move_to_freelist_ordered(bc, ck2);
-			}
-			preempt_enable();
-
-			__bkey_cached_move_to_freelist_ordered(bc, ck);
-			mutex_unlock(&bc->lock);
-		}
-#else
-		mutex_lock(&bc->lock);
-		list_move_tail(&ck->list, &bc->freed_nonpcpu);
-		bc->nr_freed_nonpcpu++;
-		mutex_unlock(&bc->lock);
-#endif
-	} else {
-		mutex_lock(&bc->lock);
-		list_move_tail(&ck->list, &bc->freed_pcpu);
-		bc->nr_freed_pcpu++;
-		mutex_unlock(&bc->lock);
-	}
-}
-
-static void bkey_cached_free_fast(struct btree_key_cache *bc,
-				  struct bkey_cached *ck)
-{
-	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
-
-	ck->btree_trans_barrier_seq =
-		start_poll_synchronize_srcu(&c->btree_trans_barrier);
-
-	list_del_init(&ck->list);
-	atomic_long_inc(&bc->nr_freed);
-
	kfree(ck->k);
	ck->k		= NULL;
	ck->u64s	= 0;
 
-	bkey_cached_move_to_freelist(bc, ck);
-
	six_unlock_write(&ck->c.lock);
	six_unlock_intent(&ck->c.lock);
+
+	bool pcpu_readers = ck->c.lock.readers != NULL;
+	rcu_pending_enqueue(&bc->pending[pcpu_readers], &ck->rcu);
+	this_cpu_inc(*bc->nr_pending);
 }
 
 static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp)
@@ -224,74 +135,14 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k
 {
	struct bch_fs *c = trans->c;
	struct btree_key_cache *bc = &c->btree_key_cache;
-	struct bkey_cached *ck = NULL;
	bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
	int ret;
 
-	if (!pcpu_readers) {
-#ifdef __KERNEL__
-		struct btree_key_cache_freelist *f;
-
-		preempt_disable();
-		f = this_cpu_ptr(bc->pcpu_freed);
-		if (f->nr)
-			ck = f->objs[--f->nr];
-		preempt_enable();
-
-		if (!ck) {
-			mutex_lock(&bc->lock);
-			preempt_disable();
-			f = this_cpu_ptr(bc->pcpu_freed);
-
-			while (!list_empty(&bc->freed_nonpcpu) &&
-			       f->nr < ARRAY_SIZE(f->objs) / 2) {
-				ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
-				list_del_init(&ck->list);
-				bc->nr_freed_nonpcpu--;
-				f->objs[f->nr++] = ck;
-			}
-
-			ck = f->nr ? f->objs[--f->nr] : NULL;
-			preempt_enable();
-			mutex_unlock(&bc->lock);
-		}
-#else
-		mutex_lock(&bc->lock);
-		if (!list_empty(&bc->freed_nonpcpu)) {
-			ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
-			list_del_init(&ck->list);
-			bc->nr_freed_nonpcpu--;
-		}
-		mutex_unlock(&bc->lock);
-#endif
-	} else {
-		mutex_lock(&bc->lock);
-		if (!list_empty(&bc->freed_pcpu)) {
-			ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
-			list_del_init(&ck->list);
-			bc->nr_freed_pcpu--;
-		}
-		mutex_unlock(&bc->lock);
-	}
-
-	if (ck) {
-		ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
-		if (unlikely(ret)) {
-			bkey_cached_move_to_freelist(bc, ck);
-			return ERR_PTR(ret);
-		}
-
-		btree_path_cached_set(trans, path, ck, BTREE_NODE_INTENT_LOCKED);
-
-		ret = bch2_btree_node_lock_write(trans, path, &ck->c);
-		if (unlikely(ret)) {
-			btree_node_unlock(trans, path, 0);
-			bkey_cached_move_to_freelist(bc, ck);
-			return ERR_PTR(ret);
-		}
-
-		return ck;
-	}
+	struct bkey_cached *ck = container_of_or_null(
+		rcu_pending_dequeue(&bc->pending[pcpu_readers]),
+		struct bkey_cached, rcu);
+	if (ck)
+		goto lock;
 
	ck = allocate_dropping_locks(trans, ret,
				     __bkey_cached_alloc(key_u64s, _gfp));
@@ -302,15 +153,19 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k
		return ERR_PTR(ret);
	}
 
-	if (!ck)
-		return NULL;
-
-	INIT_LIST_HEAD(&ck->list);
-	bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
+	if (ck) {
+		bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
+		ck->c.cached = true;
+		goto lock;
+	}
 
-	ck->c.cached = true;
-	BUG_ON(!six_trylock_intent(&ck->c.lock));
-	BUG_ON(!six_trylock_write(&ck->c.lock));
+	ck = container_of_or_null(rcu_pending_dequeue_from_all(&bc->pending[pcpu_readers]),
				  struct bkey_cached, rcu);
+	if (ck)
+		goto lock;
+lock:
+	six_lock_intent(&ck->c.lock, NULL, NULL);
+	six_lock_write(&ck->c.lock, NULL, NULL);
	return ck;
 }
 
@@ -322,21 +177,21 @@ bkey_cached_reuse(struct btree_key_cache *c)
	struct bkey_cached *ck;
	unsigned i;
 
-	mutex_lock(&c->lock);
	rcu_read_lock();
	tbl = rht_dereference_rcu(c->table.tbl, &c->table);
	for (i = 0; i < tbl->size; i++)
		rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
			if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
			    bkey_cached_lock_for_evict(ck)) {
-				bkey_cached_evict(c, ck);
+				if (bkey_cached_evict(c, ck))
					goto out;
+				six_unlock_write(&ck->c.lock);
+				six_unlock_intent(&ck->c.lock);
			}
		}
	ck = NULL;
 out:
	rcu_read_unlock();
-	mutex_unlock(&c->lock);
	return ck;
 }
 
@@ -415,7 +270,7 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *
	path->uptodate = BTREE_ITER_UPTODATE;
	return 0;
 err:
-	bkey_cached_free_fast(bc, ck);
+	bkey_cached_free(bc, ck);
	mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
 
	return ret;
@@ -611,8 +466,12 @@ evict:
		}
 
		mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
-		bkey_cached_evict(&c->btree_key_cache, ck);
-		bkey_cached_free_fast(&c->btree_key_cache, ck);
+		if (bkey_cached_evict(&c->btree_key_cache, ck)) {
+			bkey_cached_free(&c->btree_key_cache, ck);
+		} else {
+			six_unlock_write(&ck->c.lock);
+			six_unlock_intent(&ck->c.lock);
+		}
	}
 out:
	bch2_trans_iter_exit(trans, &b_iter);
@@ -722,7 +581,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
	}
 
	bkey_cached_evict(bc, ck);
-	bkey_cached_free_fast(bc, ck);
+	bkey_cached_free(bc, ck);
 
	mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
	btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
@@ -735,48 +594,14 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
	struct bch_fs *c = shrink->private_data;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bucket_table *tbl;
-	struct bkey_cached *ck, *t;
+	struct bkey_cached *ck;
	size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
-	unsigned start, flags;
+	unsigned iter, start;
	int srcu_idx;
 
-	mutex_lock(&bc->lock);
+	bc->requested_to_free += sc->nr_to_scan;
+
	srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
-	flags = memalloc_nofs_save();
-
-	/*
-	 * Newest freed entries are at the end of the list - once we hit one
-	 * that's too new to be freed, we can bail out:
-	 */
-	list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
-		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
-						 ck->btree_trans_barrier_seq))
-			break;
-
-		list_del(&ck->list);
-		six_lock_exit(&ck->c.lock);
-		kmem_cache_free(bch2_key_cache, ck);
-		atomic_long_dec(&bc->nr_freed);
-		bc->nr_freed_nonpcpu--;
-		bc->freed++;
-	}
-
-	list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
-		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
-						 ck->btree_trans_barrier_seq))
-			break;
-
-		list_del(&ck->list);
-		six_lock_exit(&ck->c.lock);
-		kmem_cache_free(bch2_key_cache, ck);
-		atomic_long_dec(&bc->nr_freed);
-		bc->nr_freed_pcpu--;
-		bc->freed++;
-	}
-
	rcu_read_lock();
 
	tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
 
	/*
@@ -792,17 +617,18 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
		return SHRINK_STOP;
	}
 
-	if (bc->shrink_iter >= tbl->size)
-		bc->shrink_iter = 0;
-	start = bc->shrink_iter;
+	iter = bc->shrink_iter;
+	if (iter >= tbl->size)
+		iter = 0;
+	start = iter;
 
	do {
		struct rhash_head *pos, *next;
 
-		pos = rht_ptr_rcu(&tbl->buckets[bc->shrink_iter]);
+		pos = rht_ptr_rcu(&tbl->buckets[iter]);
 
		while (!rht_is_a_nulls(pos)) {
-			next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
+			next = rht_dereference_bucket_rcu(pos->next, tbl, iter);
			ck = container_of(pos, struct bkey_cached, hash);
 
			if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
@@ -812,29 +638,31 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
				bc->skipped_accessed++;
			} else if (!bkey_cached_lock_for_evict(ck)) {
				bc->skipped_lock_fail++;
-			} else {
-				bkey_cached_evict(bc, ck);
+			} else if (bkey_cached_evict(bc, ck)) {
				bkey_cached_free(bc, ck);
-				bc->moved_to_freelist++;
+				bc->freed++;
				freed++;
+			} else {
+				six_unlock_write(&ck->c.lock);
+				six_unlock_intent(&ck->c.lock);
			}
 
			scanned++;
			if (scanned >= nr)
-				break;
+				goto out;
 
			pos = next;
		}
 
-		bc->shrink_iter++;
-		if (bc->shrink_iter >= tbl->size)
-			bc->shrink_iter = 0;
-	} while (scanned < nr && bc->shrink_iter != start);
+		iter++;
+		if (iter >= tbl->size)
+			iter = 0;
+	} while (scanned < nr && iter != start);
+out:
+	bc->shrink_iter = iter;
 
	rcu_read_unlock();
-	memalloc_nofs_restore(flags);
	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
-	mutex_unlock(&bc->lock);
 
	return freed;
 }
@@ -862,18 +690,13 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
 {
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
	struct bucket_table *tbl;
-	struct bkey_cached *ck, *n;
+	struct bkey_cached *ck;
	struct rhash_head *pos;
-	LIST_HEAD(items);
	unsigned i;
-#ifdef __KERNEL__
-	int cpu;
-#endif
 
	shrinker_free(bc->shrink);
 
-	mutex_lock(&bc->lock);
-
	/*
	 * The loop is needed to guard against racing with rehash:
	 */
@@ -892,44 +715,14 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
		for (i = 0; i < tbl->size; i++)
			while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) {
				ck = container_of(pos, struct bkey_cached, hash);
-				bkey_cached_evict(bc, ck);
-				list_add(&ck->list, &items);
+				BUG_ON(!bkey_cached_evict(bc, ck));
+				kfree(ck->k);
+				kmem_cache_free(bch2_key_cache, ck);
			}
	}
	rcu_read_unlock();
 }
 
-#ifdef __KERNEL__
-	if (bc->pcpu_freed) {
-		for_each_possible_cpu(cpu) {
-			struct btree_key_cache_freelist *f =
-				per_cpu_ptr(bc->pcpu_freed, cpu);
-
-			for (i = 0; i < f->nr; i++) {
-				ck = f->objs[i];
-				list_add(&ck->list, &items);
-			}
-		}
-	}
-#endif
-
-	BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
-	BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);
-
-	list_splice(&bc->freed_pcpu,	&items);
-	list_splice(&bc->freed_nonpcpu,	&items);
-
-	mutex_unlock(&bc->lock);
-
-	list_for_each_entry_safe(ck, n, &items, list) {
-		cond_resched();
-
-		list_del(&ck->list);
-		kfree(ck->k);
-		six_lock_exit(&ck->c.lock);
-		kmem_cache_free(bch2_key_cache, ck);
-	}
-
	if (atomic_long_read(&bc->nr_dirty) &&
	    !bch2_journal_error(&c->journal) &&
	    test_bit(BCH_FS_was_rw, &c->flags))
@@ -943,14 +736,14 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);
 
-	free_percpu(bc->pcpu_freed);
+	rcu_pending_exit(&bc->pending[0]);
+	rcu_pending_exit(&bc->pending[1]);
+
+	free_percpu(bc->nr_pending);
 }
 
 void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
 {
	mutex_init(&c->lock);
-	INIT_LIST_HEAD(&c->freed_pcpu);
-	INIT_LIST_HEAD(&c->freed_nonpcpu);
 }
 
 int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
@@ -958,11 +751,13 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
	struct shrinker *shrink;
 
 #ifdef __KERNEL__
-	bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
-	if (!bc->pcpu_freed)
+	bc->nr_pending = alloc_percpu(size_t);
+	if (!bc->nr_pending)
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+
+	if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) ||
+	    rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free))
+		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
 #endif
 
	if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
@@ -984,45 +779,21 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
 
 void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc)
 {
-	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
-
	printbuf_tabstop_push(out, 24);
	printbuf_tabstop_push(out, 12);
 
-	unsigned flags = memalloc_nofs_save();
-	mutex_lock(&bc->lock);
	prt_printf(out, "keys:\t%lu\r\n",		atomic_long_read(&bc->nr_keys));
	prt_printf(out, "dirty:\t%lu\r\n",		atomic_long_read(&bc->nr_dirty));
-	prt_printf(out, "freelist:\t%lu\r\n",		atomic_long_read(&bc->nr_freed));
-	prt_printf(out, "nonpcpu freelist:\t%zu\r\n",	bc->nr_freed_nonpcpu);
-	prt_printf(out, "pcpu freelist:\t%zu\r\n",	bc->nr_freed_pcpu);
-
-	prt_printf(out, "\nshrinker:\n");
+	prt_printf(out, "table size:\t%u\r\n",		bc->table.tbl->size);
+	prt_newline(out);
+	prt_printf(out, "shrinker:\n");
	prt_printf(out, "requested_to_free:\t%lu\r\n",	bc->requested_to_free);
	prt_printf(out, "freed:\t%lu\r\n",		bc->freed);
	prt_printf(out, "moved_to_freelist:\t%lu\r\n",	bc->moved_to_freelist);
	prt_printf(out, "skipped_dirty:\t%lu\r\n",	bc->skipped_dirty);
	prt_printf(out, "skipped_accessed:\t%lu\r\n",	bc->skipped_accessed);
	prt_printf(out, "skipped_lock_fail:\t%lu\r\n",	bc->skipped_lock_fail);
 
-	prt_printf(out, "srcu seq:\t%lu\r\n",		get_state_synchronize_srcu(&c->btree_trans_barrier));
-
-	struct bkey_cached *ck;
-	unsigned iter = 0;
-	list_for_each_entry(ck, &bc->freed_nonpcpu, list) {
-		prt_printf(out, "freed_nonpcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
-		if (++iter > 10)
-			break;
-	}
-
-	iter = 0;
-	list_for_each_entry(ck, &bc->freed_pcpu, list) {
-		prt_printf(out, "freed_pcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
-		if (++iter > 10)
-			break;
-	}
-	mutex_unlock(&bc->lock);
-	memalloc_flags_restore(flags);
+	prt_newline(out);
+	prt_printf(out, "pending:\t%zu\r\n",		per_cpu_sum(bc->nr_pending));
 }
 
 void bch2_btree_key_cache_exit(void)
@@ -2,33 +2,25 @@
 #ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
 #define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
 
-struct btree_key_cache_freelist {
-	struct bkey_cached	*objs[16];
-	unsigned		nr;
-};
+#include "rcu_pending.h"
 
 struct btree_key_cache {
	struct mutex		lock;
	struct rhashtable	table;
	bool			table_init_done;
 
-	struct list_head	freed_pcpu;
-	size_t			nr_freed_pcpu;
-	struct list_head	freed_nonpcpu;
-	size_t			nr_freed_nonpcpu;
-
	struct shrinker		*shrink;
	unsigned		shrink_iter;
-	struct btree_key_cache_freelist __percpu *pcpu_freed;
-
-	atomic_long_t		nr_freed;
+
+	/* 0: non pcpu reader locks, 1: pcpu reader locks */
+	struct rcu_pending	pending[2];
+	size_t __percpu		*nr_pending;
 
	atomic_long_t		nr_keys;
	atomic_long_t		nr_dirty;
 
	/* shrinker stats */
	unsigned long		requested_to_free;
	unsigned long		freed;
	unsigned long		moved_to_freelist;
	unsigned long		skipped_dirty;
	unsigned long		skipped_accessed;
	unsigned long		skipped_lock_fail;
@@ -218,16 +218,17 @@ static inline int __btree_node_lock_nopath(struct btree_trans *trans,
					   bool lock_may_not_fail,
					   unsigned long ip)
 {
-	int ret;
-
	trans->lock_may_not_fail = lock_may_not_fail;
	trans->lock_must_abort	= false;
	trans->locking		= b;
 
-	ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait,
+	int ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait,
				 bch2_six_check_for_deadlock, trans, ip);
	WRITE_ONCE(trans->locking, NULL);
	WRITE_ONCE(trans->locking_wait.start_time, 0);
 
+	if (!ret)
+		trace_btree_path_lock(trans, _THIS_IP_, b);
	return ret;
 }
 
@@ -281,6 +282,7 @@ static inline int btree_node_lock(struct btree_trans *trans,
	int ret = 0;
 
	EBUG_ON(level >= BTREE_MAX_DEPTH);
+	bch2_trans_verify_not_unlocked(trans);
 
	if (likely(six_trylock_type(&b->lock, type)) ||
	    btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) ||
@@ -400,12 +402,13 @@ static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
 
 /* misc: */
 
-static inline void btree_path_set_should_be_locked(struct btree_path *path)
+static inline void btree_path_set_should_be_locked(struct btree_trans *trans, struct btree_path *path)
 {
	EBUG_ON(!btree_node_locked(path, path->level));
	EBUG_ON(path->uptodate);
 
	path->should_be_locked = true;
+	trace_btree_path_should_be_locked(trans, path);
 }
 
 static inline void __btree_path_set_level_up(struct btree_trans *trans,
@@ -214,7 +214,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans,
 
	k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
 overwrite:
-	bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
+	bch2_bset_insert(b, k, insert, clobber_u64s);
	new_u64s = k->u64s;
 fix_iter:
	if (clobber_u64s != new_u64s)
@@ -386,17 +386,16 @@ struct bkey_cached {
	struct btree_bkey_cached_common c;
 
	unsigned long		flags;
-	unsigned long		btree_trans_barrier_seq;
	u16			u64s;
	struct bkey_cached_key	key;
 
	struct rhash_head	hash;
-	struct list_head	list;
 
	struct journal_entry_pin journal;
	u64			seq;
 
	struct bkey_i		*k;
+	struct rcu_head		rcu;
 };
 
 static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
@@ -374,7 +374,7 @@ static noinline int flush_new_cached_update(struct btree_trans *trans,
	i->key_cache_already_flushed = true;
	i->flags |= BTREE_TRIGGER_norun;
 
-	btree_path_set_should_be_locked(btree_path);
+	btree_path_set_should_be_locked(trans, btree_path);
	ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip);
 out:
	bch2_path_put(trans, path_idx, true);
@@ -422,7 +422,9 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
			break;
	}
 
-	if (!cmp && i < trans->updates + trans->nr_updates) {
+	bool overwrite = !cmp && i < trans->updates + trans->nr_updates;
+
+	if (overwrite) {
		EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
 
		bch2_path_put(trans, i->path, true);
@@ -449,7 +451,9 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
		}
	}

	__btree_path_get(trans->paths + i->path, true);
	__btree_path_get(trans, trans->paths + i->path, true);

	trace_update_by_path(trans, path, i, overwrite);

	/*
	 * If a key is present in the key cache, it must also exist in the
@@ -498,7 +502,7 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
			return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
		}

		btree_path_set_should_be_locked(trans->paths + iter->key_cache_path);
		btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
	}

	return 0;
@@ -731,6 +731,18 @@ static void btree_update_nodes_written(struct btree_update *as)
	bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
			     "%s", bch2_err_str(ret));
err:
	/*
	 * Ensure transaction is unlocked before using btree_node_lock_nopath()
	 * (the use of which is always suspect, we need to work on removing this
	 * in the future)
	 *
	 * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get()
	 * calls bch2_path_upgrade(), before we call path_make_mut(), so we may
	 * rarely end up with a locked path besides the one we have here:
	 */
	bch2_trans_unlock(trans);
	bch2_trans_begin(trans);

	/*
	 * We have to be careful because another thread might be getting ready
	 * to free as->b and calling btree_update_reparent() on us - we'll
@@ -750,18 +762,6 @@ err:
	 * we're in journal error state:
	 */

	/*
	 * Ensure transaction is unlocked before using
	 * btree_node_lock_nopath() (the use of which is always suspect,
	 * we need to work on removing this in the future)
	 *
	 * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get()
	 * calls bch2_path_upgrade(), before we call path_make_mut(), so
	 * we may rarely end up with a locked path besides the one we
	 * have here:
	 */
	bch2_trans_unlock(trans);
	bch2_trans_begin(trans);
	btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans,
					as->btree_id, b->c.level, b->key.k.p);
	struct btree_path *path = trans->paths + path_idx;
@@ -1981,7 +1981,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
	if (ret)
		goto err;

	btree_path_set_should_be_locked(trans->paths + sib_path);
	btree_path_set_should_be_locked(trans, trans->paths + sib_path);

	m = trans->paths[sib_path].l[level].b;

@@ -159,6 +159,8 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
					      unsigned level,
					      unsigned flags)
{
	bch2_trans_verify_not_unlocked(trans);

	return bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
						   btree_prev_sib) ?:
		bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
@@ -740,7 +740,7 @@ static int __trigger_extent(struct btree_trans *trans,
			return ret;
		} else if (!p.has_ec) {
			*replicas_sectors	+= disk_sectors;
			acc_replicas_key.replicas.devs[acc_replicas_key.replicas.nr_devs++] = p.ptr.dev;
			replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev);
		} else {
			ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
			if (ret)
@@ -80,22 +80,9 @@ static inline void bucket_lock(struct bucket *b)
			 TASK_UNINTERRUPTIBLE);
}

static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca)
{
	return rcu_dereference_check(ca->buckets_gc,
				     !ca->fs ||
				     percpu_rwsem_is_held(&ca->fs->mark_lock) ||
				     lockdep_is_held(&ca->fs->state_lock) ||
				     lockdep_is_held(&ca->bucket_lock));
}

static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
{
	struct bucket_array *buckets = gc_bucket_array(ca);

	if (b - buckets->first_bucket >= buckets->nbuckets_minus_first)
		return NULL;
	return buckets->b + b;
	return genradix_ptr(&ca->buckets_gc, b);
}

static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
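Note on the gc_bucket() change above: a genradix lookup returns NULL for any index that was never allocated, so the explicit first_bucket/nbuckets range check can go away along with the flat bucket_array. A minimal sketch of the idiom, assuming a GENRADIX-typed buckets_gc field as the new code implies (payload struct invented for illustration):

	#include <linux/generic-radix-tree.h>

	struct bucket { u8 gen; };		/* stand-in payload */

	static GENRADIX(struct bucket) buckets_gc;

	static struct bucket *lookup_gc_bucket(size_t idx)
	{
		/* NULL for any index past what was ever allocated,
		 * preserving the old NULL-on-out-of-bounds contract: */
		return genradix_ptr(&buckets_gc, idx);
	}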
@@ -19,14 +19,6 @@ struct bucket {
	u32			stripe_sectors;
} __aligned(sizeof(long));

struct bucket_array {
	struct rcu_head		rcu;
	u16			first_bucket;
	size_t			nbuckets;
	size_t			nbuckets_minus_first;
	struct bucket		b[];
};

struct bucket_gens {
	struct rcu_head		rcu;
	u16			first_bucket;
@@ -20,15 +20,6 @@ static inline void bch2_increment_clock(struct bch_fs *c, u64 sectors,

void bch2_io_clock_schedule_timeout(struct io_clock *, u64);

#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\
({									\
	long __ret = timeout;						\
	might_sleep();							\
	if (!___wait_cond_timeout(condition))				\
		__ret = __wait_event_timeout(wq, condition, timeout);	\
	__ret;								\
})

void bch2_io_timers_to_text(struct printbuf *, struct io_clock *);

void bch2_io_clock_exit(struct io_clock *);
@@ -4,12 +4,12 @@
#include <linux/slab.h>
#include "darray.h"

int __bch2_darray_resize(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
{
	if (new_size > d->size) {
		new_size = roundup_pow_of_two(new_size);

		void *data = kvmalloc_array(new_size, element_size, gfp);
		void *data = kvmalloc_array_noprof(new_size, element_size, gfp);
		if (!data)
			return -ENOMEM;

@@ -22,29 +22,23 @@ struct { \
typedef DARRAY(char)	darray_char;
typedef DARRAY(char *)	darray_str;

int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t);
int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);

static inline int __darray_resize(darray_char *d, size_t element_size,
				  size_t new_size, gfp_t gfp)
{
	return unlikely(new_size > d->size)
		? __bch2_darray_resize(d, element_size, new_size, gfp)
		: 0;
}
#define __bch2_darray_resize(...) alloc_hooks(__bch2_darray_resize_noprof(__VA_ARGS__))

#define __darray_resize(_d, _element_size, _new_size, _gfp)		\
	(unlikely((_new_size) > (_d)->size)				\
	 ? __bch2_darray_resize((_d), (_element_size), (_new_size), (_gfp))\
	 : 0)

#define darray_resize_gfp(_d, _new_size, _gfp)				\
	unlikely(__darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp))
	__darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp)

#define darray_resize(_d, _new_size)					\
	darray_resize_gfp(_d, _new_size, GFP_KERNEL)

static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more, gfp_t gfp)
{
	return __darray_resize(d, t_size, d->nr + more, gfp);
}

#define darray_make_room_gfp(_d, _more, _gfp)				\
	__darray_make_room((darray_char *) (_d), sizeof((_d)->data[0]), (_more), _gfp)
	darray_resize_gfp((_d), (_d)->nr + (_more), _gfp)

#define darray_make_room(_d, _more)					\
	darray_make_room_gfp(_d, _more, GFP_KERNEL)
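The _noprof rename plus the alloc_hooks() wrapper above is the kernel's memory-allocation-profiling idiom: the _noprof function does the actual work via kvmalloc_array_noprof(), and the wrapper macro charges the allocation to the caller's source location rather than to the helper. A minimal sketch of the pattern (helper name hypothetical):

	#include <linux/alloc_tag.h>

	void *my_alloc_noprof(size_t size, gfp_t gfp);	/* hypothetical helper */

	/* Callers use my_alloc(); the alloc is accounted to their call site: */
	#define my_alloc(...)	alloc_hooks(my_alloc_noprof(__VA_ARGS__))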
@@ -337,6 +337,7 @@ restart_drop_extra_replicas:
		printbuf_exit(&buf);

		bch2_fatal_error(c);
		ret = -EIO;
		goto out;
	}

@@ -552,25 +552,14 @@ static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subv

int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	subvol_inum target;
	u32 snapshot;
	struct bkey_buf sk;
	int ret;

	bch2_bkey_buf_init(&sk);
retry:
	bch2_trans_begin(trans);

	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
	if (ret)
		goto err;

	for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
			   SPOS(inum.inum, ctx->pos, snapshot),
			   POS(inum.inum, U64_MAX), 0, k, ret) {
	int ret = bch2_trans_run(c,
		for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_dirents,
				   POS(inum.inum, ctx->pos),
				   POS(inum.inum, U64_MAX),
				   inum.subvol, 0, k, ({
		if (k.k->type != KEY_TYPE_dirent)
			continue;

@@ -578,36 +567,15 @@ retry:
		bch2_bkey_buf_reassemble(&sk, c, k);
		struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k);

		ret = bch2_dirent_read_target(trans, inum, dirent, &target);
		if (ret < 0)
			break;
		if (ret)
		subvol_inum target;
		int ret2 = bch2_dirent_read_target(trans, inum, dirent, &target);
		if (ret2 > 0)
			continue;

		/*
		 * read_target looks up subvolumes, we can overflow paths if the
		 * directory has many subvolumes in it
		 *
		 * XXX: btree_trans_too_many_iters() is something we'd like to
		 * get rid of, and there's no good reason to be using it here
		 * except that we don't yet have a for_each_btree_key() helper
		 * that does subvolume_get_snapshot().
		 */
		ret = drop_locks_do(trans,
			bch2_dir_emit(ctx, dirent, target)) ?:
			btree_trans_too_many_iters(trans);
		if (ret) {
			ret = ret < 0 ? ret : 0;
			break;
		}
	}
	bch2_trans_iter_exit(trans, &iter);
err:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;
		ret2 ?: drop_locks_do(trans, bch2_dir_emit(ctx, dirent, target));
	})));

	bch2_trans_put(trans);
	bch2_bkey_buf_exit(&sk, c);

	return ret;
	return ret < 0 ? ret : 0;
}
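The for_each_btree_key_in_subvolume_upto() conversions in this series lean on GCC/Clang statement expressions: the last expression in a ({ ... }) block is the block's value, so the loop body can both do work and yield a per-key return code, as the readdir() body above does with "ret2 ?: drop_locks_do(...)". A minimal, runnable illustration of just the language feature:

	#include <stdio.h>

	int main(void)
	{
		/* The value of a ({ ... }) statement expression is its
		 * final expression - here t * 3, i.e. 6: */
		int x = ({ int t = 2; t * 3; });

		printf("%d\n", x);	/* prints 6 */
		return 0;
	}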
@@ -929,8 +929,29 @@ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
			bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
				if (p1.ptr.dev		== p2.ptr.dev &&
				    p1.ptr.gen		== p2.ptr.gen &&

				    /*
				     * This checks that the two pointers point
				     * to the same region on disk - adjusting
				     * for the difference in where the extents
				     * start, since one may have been trimmed:
				     */
				    (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
				    (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
				    (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) &&

				    /*
				     * This additionally checks that the
				     * extents overlap on disk, since the
				     * previous check may trigger spuriously
				     * when one extent is immediately partially
				     * overwritten with another extent (so that
				     * on disk they are adjacent) and
				     * compression is in use:
				     */
				    ((p1.ptr.offset >= p2.ptr.offset &&
				      p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) ||
				     (p2.ptr.offset >= p1.ptr.offset &&
				      p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size)))
					return true;

	return false;
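A worked example for the new overlap check (all numbers invented for illustration):

	extent 1: logical start 0, ptr.offset 100, crc.offset 0, compressed_size 8
	extent 2: logical start 8, ptr.offset 108, crc.offset 0, compressed_size 8

	same-region check:  100 + 0 - 0 = 100  and  108 + 0 - 8 = 100  -> match
	overlap check:      disk ranges [100,108) vs [108,116)         -> disjoint, rejected

Extent 2 was written immediately after extent 1 on disk (the compressed partial-overwrite case the comment describes), so the offset-adjusted equality holds spuriously; the added disk-overlap test is what correctly rejects the pair.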
@@ -357,7 +357,7 @@ out: \
	__bkey_for_each_ptr_decode(_k, (_p).start, (_p).end,		\
				   _ptr, _entry)

#define bkey_crc_next(_k, _start, _end, _crc, _iter)			\
#define bkey_crc_next(_k, _end, _crc, _iter)				\
({									\
	__bkey_extent_entry_for_each_from(_iter, _end, _iter)		\
		if (extent_entry_is_crc(_iter)) {			\
@@ -372,7 +372,7 @@ out: \
#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter)		\
	for ((_crc) = bch2_extent_crc_unpack(_k, NULL),			\
	     (_iter) = (_start);					\
	     bkey_crc_next(_k, _start, _end, _crc, _iter);		\
	     bkey_crc_next(_k, _end, _crc, _iter);			\
	     (_iter) = extent_entry_next(_iter))

#define bkey_for_each_crc(_k, _p, _crc, _iter)				\
@@ -151,7 +151,6 @@ static void bchfs_read(struct btree_trans *trans,
	struct bkey_buf sk;
	int flags = BCH_READ_RETRY_IF_STALE|
		BCH_READ_MAY_PROMOTE;
	u32 snapshot;
	int ret = 0;

	rbio->c = c;
@@ -159,29 +158,23 @@ static void bchfs_read(struct btree_trans *trans,
	rbio->subvol = inum.subvol;

	bch2_bkey_buf_init(&sk);
retry:
	bch2_trans_begin(trans);
	iter = (struct btree_iter) { NULL };

	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
	if (ret)
		goto err;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
			     POS(inum.inum, rbio->bio.bi_iter.bi_sector),
			     BTREE_ITER_slots);
	while (1) {
		struct bkey_s_c k;
		unsigned bytes, sectors, offset_into_extent;
		enum btree_id data_btree = BTREE_ID_extents;

		/*
		 * read_extent -> io_time_reset may cause a transaction restart
		 * without returning an error, we need to check for that here:
		 */
		ret = bch2_trans_relock(trans);
		bch2_trans_begin(trans);

		u32 snapshot;
		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
		if (ret)
			break;
			goto err;

		bch2_btree_iter_set_snapshot(&iter, snapshot);

		bch2_btree_iter_set_pos(&iter,
				POS(inum.inum, rbio->bio.bi_iter.bi_sector));
@@ -189,7 +182,7 @@ retry:
		k = bch2_btree_iter_peek_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			break;
			goto err;

		offset_into_extent = iter.pos.offset -
			bkey_start_offset(k.k);
@@ -200,7 +193,7 @@ retry:
		ret = bch2_read_indirect_extent(trans, &data_btree,
					&offset_into_extent, &sk);
		if (ret)
			break;
			goto err;

		k = bkey_i_to_s_c(sk.k);

@@ -210,7 +203,7 @@ retry:
			ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
						  extent_partial_reads_expensive(k));
			if (ret)
				break;
				goto err;
		}

		bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
@@ -229,17 +222,13 @@ retry:

		swap(rbio->bio.bi_iter.bi_size, bytes);
		bio_advance(&rbio->bio, bytes);

		ret = btree_trans_too_many_iters(trans);
		if (ret)
err:
		if (ret &&
		    !bch2_err_matches(ret, BCH_ERR_transaction_restart))
			break;
	}
err:
	bch2_trans_iter_exit(trans, &iter);

	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	if (ret) {
		bch_err_inum_offset_ratelimited(c,
				iter.pos.inode,
@@ -486,7 +475,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
	op->nr_replicas		= nr_replicas;
	op->res.nr_replicas	= nr_replicas;
	op->write_point		= writepoint_hashed(inode->ei_last_dirtied);
	op->subvol		= inode->ei_subvol;
	op->subvol		= inode->ei_inum.subvol;
	op->pos			= POS(inode->v.i_ino, sector);
	op->end_io		= bch2_writepage_io_done;
	op->devs_need_flush	= &inode->ei_devs_need_flush;
@@ -500,7 +500,7 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
	dio->op.target		= dio->op.opts.foreground_target;
	dio->op.write_point	= writepoint_hashed((unsigned long) current);
	dio->op.nr_replicas	= dio->op.opts.data_replicas;
	dio->op.subvol		= inode->ei_subvol;
	dio->op.subvol		= inode->ei_inum.subvol;
	dio->op.pos		= POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
	dio->op.devs_need_flush	= &inode->ei_devs_need_flush;

@@ -182,18 +182,11 @@ static void __bch2_folio_set(struct folio *folio,
int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
		   struct folio **fs, unsigned nr_folios)
{
	struct btree_trans *trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bch_folio *s;
	u64 offset = folio_sector(fs[0]);
	unsigned folio_idx;
	u32 snapshot;
	bool need_set = false;
	int ret;

	for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
		s = bch2_folio_create(fs[folio_idx], GFP_KERNEL);
	for (unsigned folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
		struct bch_folio *s = bch2_folio_create(fs[folio_idx], GFP_KERNEL);
		if (!s)
			return -ENOMEM;

@@ -203,18 +196,13 @@ int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
	if (!need_set)
		return 0;

	folio_idx = 0;
	trans = bch2_trans_get(c);
retry:
	bch2_trans_begin(trans);
	unsigned folio_idx = 0;

	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
	if (ret)
		goto err;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
			   SPOS(inum.inum, offset, snapshot),
			   BTREE_ITER_slots, k, ret) {
	return bch2_trans_run(c,
		for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents,
				   POS(inum.inum, offset),
				   POS(inum.inum, U64_MAX),
				   inum.subvol, BTREE_ITER_slots, k, ({
		unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
		unsigned state = bkey_to_sector_state(k);

@@ -240,16 +228,8 @@ retry:

		if (folio_idx == nr_folios)
			break;
	}

	offset = iter.pos.offset;
	bch2_trans_iter_exit(trans, &iter);
err:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;
	bch2_trans_put(trans);

	return ret;
		0;
	})));
}

void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
@@ -99,9 +99,7 @@ static inline void bch2_folio_release(struct folio *folio)

static inline struct bch_folio *__bch2_folio(struct folio *folio)
{
	return folio_has_private(folio)
		? (struct bch_folio *) folio_get_private(folio)
		: NULL;
	return folio_get_private(folio);
}

static inline struct bch_folio *bch2_folio(struct folio *folio)
@@ -221,30 +221,11 @@ static inline int range_has_data(struct bch_fs *c, u32 subvol,
				 struct bpos start,
				 struct bpos end)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;
retry:
	bch2_trans_begin(trans);

	ret = bch2_subvolume_get_snapshot(trans, subvol, &start.snapshot);
	if (ret)
		goto err;

	for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, start, end, 0, k, ret)
		if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) {
			ret = 1;
			break;
		}
	start = iter.pos;
	bch2_trans_iter_exit(trans, &iter);
err:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	bch2_trans_put(trans);
	return ret;
	return bch2_trans_run(c,
		for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, start, end,
						     subvol, 0, k, ({
			bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k);
		})));
}

static int __bch2_truncate_folio(struct bch_inode_info *inode,
@@ -267,7 +248,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode,
	 * XXX: we're doing two index lookups when we end up reading the
	 * folio
	 */
	ret = range_has_data(c, inode->ei_subvol,
	ret = range_has_data(c, inode->ei_inum.subvol,
			POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)),
			POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS));
	if (ret <= 0)
@@ -618,7 +599,7 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
		bch2_trans_begin(trans);

		ret = bch2_subvolume_get_snapshot(trans,
					inode->ei_subvol, &snapshot);
					inode->ei_inum.subvol, &snapshot);
		if (ret)
			goto bkey_err;

@@ -813,41 +794,23 @@ static int quota_reserve_range(struct bch_inode_info *inode,
			       u64 start, u64 end)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	u32 snapshot;
	u64 sectors = end - start;
	u64 pos = start;
	int ret;
retry:
	bch2_trans_begin(trans);

	ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
	if (ret)
		goto err;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     SPOS(inode->v.i_ino, pos, snapshot), 0);

	while (!(ret = btree_trans_too_many_iters(trans)) &&
	       (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k &&
	       !(ret = bkey_err(k))) {
	int ret = bch2_trans_run(c,
		for_each_btree_key_in_subvolume_upto(trans, iter,
				BTREE_ID_extents,
				POS(inode->v.i_ino, start),
				POS(inode->v.i_ino, end - 1),
				inode->ei_inum.subvol, 0, k, ({
		if (bkey_extent_is_allocation(k.k)) {
			u64 s = min(end, k.k->p.offset) -
				max(start, bkey_start_offset(k.k));
			BUG_ON(s > sectors);
			sectors -= s;
		}
		bch2_btree_iter_advance(&iter);
	}
	pos = iter.pos.offset;
	bch2_trans_iter_exit(trans, &iter);
err:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	bch2_trans_put(trans);
		0;
	})));

	return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true);
}
@@ -942,42 +905,25 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
{
	struct bch_inode_info *inode = file_bch_inode(file);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct btree_trans *trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	subvol_inum inum = inode_inum(inode);
	u64 isize, next_data = MAX_LFS_FILESIZE;
	u32 snapshot;
	int ret;

	isize = i_size_read(&inode->v);
	if (offset >= isize)
		return -ENXIO;

	trans = bch2_trans_get(c);
retry:
	bch2_trans_begin(trans);

	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
	if (ret)
		goto err;

	for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents,
			   SPOS(inode->v.i_ino, offset >> 9, snapshot),
	int ret = bch2_trans_run(c,
		for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents,
			   POS(inode->v.i_ino, offset >> 9),
			   POS(inode->v.i_ino, U64_MAX),
			   0, k, ret) {
			   inum.subvol, 0, k, ({
		if (bkey_extent_is_data(k.k)) {
			next_data = max(offset, bkey_start_offset(k.k) << 9);
			break;
		} else if (k.k->p.offset >> 9 > isize)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);
err:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	bch2_trans_put(trans);
		0;
	})));
	if (ret)
		return ret;

@@ -995,29 +941,18 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
{
	struct bch_inode_info *inode = file_bch_inode(file);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct btree_trans *trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	subvol_inum inum = inode_inum(inode);
	u64 isize, next_hole = MAX_LFS_FILESIZE;
	u32 snapshot;
	int ret;

	isize = i_size_read(&inode->v);
	if (offset >= isize)
		return -ENXIO;

	trans = bch2_trans_get(c);
retry:
	bch2_trans_begin(trans);

	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
	if (ret)
		goto err;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
			   SPOS(inode->v.i_ino, offset >> 9, snapshot),
			   BTREE_ITER_slots, k, ret) {
	int ret = bch2_trans_run(c,
		for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents,
			   POS(inode->v.i_ino, offset >> 9),
			   POS(inode->v.i_ino, U64_MAX),
			   inum.subvol, BTREE_ITER_slots, k, ({
		if (k.k->p.inode != inode->v.i_ino) {
			next_hole = bch2_seek_pagecache_hole(&inode->v,
					offset, MAX_LFS_FILESIZE, 0, false);
@@ -1032,13 +967,8 @@ retry:
		} else {
			offset = max(offset, bkey_start_offset(k.k) << 9);
		}
	}
	bch2_trans_iter_exit(trans, &iter);
err:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	bch2_trans_put(trans);
		0;
	})));
	if (ret)
		return ret;

@@ -100,7 +100,7 @@ static int bch2_ioc_setflags(struct bch_fs *c,
	}

	mutex_lock(&inode->ei_update_lock);
	ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
	ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
		bch2_write_inode(c, inode, bch2_inode_flags_set, &s,
				 ATTR_CTIME);
	mutex_unlock(&inode->ei_update_lock);
@@ -184,7 +184,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c,
	}

	mutex_lock(&inode->ei_update_lock);
	ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
	ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
		bch2_set_projid(c, inode, fa.fsx_projid) ?:
		bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
				 ATTR_CTIME);
326	fs/bcachefs/fs.c
@@ -108,7 +108,7 @@ retry:
		goto retry;

	bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
			     "%s: inode %u:%llu not found when updating",
			     "%s: inode %llu:%llu not found when updating",
			     bch2_err_str(ret),
			     inode_inum(inode).subvol,
			     inode_inum(inode).inum);
@@ -152,42 +152,101 @@ int bch2_fs_quota_transfer(struct bch_fs *c,
	return ret;
}

static int bch2_iget5_test(struct inode *vinode, void *p)
static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
{
	struct bch_inode_info *inode = to_bch_ei(vinode);
	subvol_inum *inum = p;

	return inode->ei_subvol == inum->subvol &&
		inode->ei_inode.bi_inum == inum->inum;
	return a.subvol == b.subvol && a.inum == b.inum;
}

static int bch2_iget5_set(struct inode *vinode, void *p)
static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg,
				 const void *obj)
{
	struct bch_inode_info *inode = to_bch_ei(vinode);
	subvol_inum *inum = p;
	const struct bch_inode_info *inode = obj;
	const subvol_inum *v = arg->key;

	inode->v.i_ino		= inum->inum;
	inode->ei_subvol	= inum->subvol;
	inode->ei_inode.bi_inum	= inum->inum;
	return 0;
	return !subvol_inum_eq(inode->ei_inum, *v);
}

static unsigned bch2_inode_hash(subvol_inum inum)
static const struct rhashtable_params bch2_vfs_inodes_params = {
	.head_offset		= offsetof(struct bch_inode_info, hash),
	.key_offset		= offsetof(struct bch_inode_info, ei_inum),
	.key_len		= sizeof(subvol_inum),
	.obj_cmpfn		= bch2_vfs_inode_cmp_fn,
	.automatic_shrinking	= true,
};
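Two conventions worth noting in bch2_vfs_inodes_params: obj_cmpfn follows the rhashtable convention of returning 0 on a match (hence the negated subvol_inum_eq()), and lookups take the key by pointer. A sketch of a lookup against these params, given a struct bch_fs *c (key values invented):

	subvol_inum key = { .subvol = 1, .inum = 4096 };
	struct bch_inode_info *inode =
		rhashtable_lookup_fast(&c->vfs_inodes_table, &key,
				       bch2_vfs_inodes_params);
	/* NULL if nothing is hashed under (subvol, inum) */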

static void __wait_on_freeing_inode(struct inode *inode)
{
	return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
	wait_queue_head_t *wq;
	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
	wq = bit_waitqueue(&inode->i_state, __I_NEW);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&inode->i_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}
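__wait_on_freeing_inode() above open-codes the standard wait-on-bit sequence (the caller holds i_lock, which is dropped before sleeping). The generic shape of that sequence, as a sketch for some bit NR in *word:

	wait_queue_head_t *wq = bit_waitqueue(word, NR);
	DEFINE_WAIT_BIT(wait, word, NR);

	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	/* drop any lock that publication of the bit depends on here */
	schedule();
	finish_wait(wq, &wait.wq_entry);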

static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode)
static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans,
						   subvol_inum inum)
{
	subvol_inum inum = inode_inum(inode);
	struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v,
				      bch2_inode_hash(inum),
				      bch2_iget5_test,
				      bch2_iget5_set,
				      &inum));
	BUG_ON(!old);
	struct bch_inode_info *inode;
repeat:
	inode = rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
	if (inode) {
		spin_lock(&inode->v.i_lock);
		if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) {
			spin_unlock(&inode->v.i_lock);
			return NULL;
		}
		if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) {
			if (!trans) {
				__wait_on_freeing_inode(&inode->v);
			} else {
				bch2_trans_unlock(trans);
				__wait_on_freeing_inode(&inode->v);
				int ret = bch2_trans_relock(trans);
				if (ret)
					return ERR_PTR(ret);
			}
			goto repeat;
		}
		__iget(&inode->v);
		spin_unlock(&inode->v.i_lock);
	}

	return inode;
}

static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode)
{
	spin_lock(&inode->v.i_lock);
	bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags);
	spin_unlock(&inode->v.i_lock);

	if (remove) {
		int ret = rhashtable_remove_fast(&c->vfs_inodes_table,
					&inode->hash, bch2_vfs_inodes_params);
		BUG_ON(ret);
		inode->v.i_hash.pprev = NULL;
	}
}
static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
						     struct btree_trans *trans,
						     struct bch_inode_info *inode)
{
	struct bch_inode_info *old = inode;

	set_bit(EI_INODE_HASHED, &inode->ei_flags);
retry:
	if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table,
					&inode->hash,
					bch2_vfs_inodes_params))) {
		old = bch2_inode_hash_find(c, trans, inode->ei_inum);
		if (!old)
			goto retry;

		clear_bit(EI_INODE_HASHED, &inode->ei_flags);

		if (unlikely(old != inode)) {
			/*
			 * bcachefs doesn't use I_NEW; we have no use for it since we
			 * only insert fully created inodes in the inode hash table. But
@@ -201,21 +260,17 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino
			 */
			set_nlink(&inode->v, 1);
			discard_new_inode(&inode->v);
		inode = old;
			return old;
	} else {
		inode_fake_hash(&inode->v);

		inode_sb_list_add(&inode->v);

		mutex_lock(&c->vfs_inodes_lock);
		list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
		mutex_unlock(&c->vfs_inodes_lock);
		/*
		 * Again, I_NEW makes no sense for bcachefs. This is only needed
		 * for clearing I_NEW, but since the inode was already fully
		 * created and initialized we didn't actually want
		 * inode_insert5() to set it for us.
		 */
		unlock_new_inode(&inode->v);
	}

	return inode;
}
}
#define memalloc_flags_do(_flags, _do) \
@@ -233,7 +288,8 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)

static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
{
	struct bch_inode_info *inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
	struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb,
						      bch2_inode_cache, GFP_NOFS);
	if (!inode)
		return NULL;

@@ -275,13 +331,24 @@ static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
	return inode;
}

static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans,
							   subvol_inum inum,
							   struct bch_inode_unpacked *bi,
							   struct bch_subvolume *subvol)
{
	struct bch_inode_info *inode = bch2_new_inode(trans);
	if (IS_ERR(inode))
		return inode;

	bch2_vfs_inode_init(trans, inum, inode, bi, subvol);

	return bch2_inode_hash_insert(trans->c, trans, inode);

}
struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
{
	struct bch_inode_info *inode =
		to_bch_ei(ilookup5_nowait(c->vfs_sb,
					  bch2_inode_hash(inum),
					  bch2_iget5_test,
					  &inum));
	struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum);
	if (inode)
		return &inode->v;

@@ -292,11 +359,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
	int ret = lockrestart_do(trans,
		bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
		bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
		PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
	if (!ret) {
		bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
		inode = bch2_inode_insert(c, inode);
	}
		PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
	bch2_trans_put(trans);

	return ret ? ERR_PTR(ret) : &inode->v;
@@ -317,6 +380,8 @@ __bch2_create(struct mnt_idmap *idmap,
	subvol_inum inum;
	struct bch_subvolume subvol;
	u64 journal_seq = 0;
	kuid_t kuid;
	kgid_t kgid;
	int ret;

	/*
@@ -343,13 +408,15 @@ __bch2_create(struct mnt_idmap *idmap,
retry:
	bch2_trans_begin(trans);

	ret = bch2_subvol_is_ro_trans(trans, dir->ei_subvol) ?:
	kuid = mapped_fsuid(idmap, i_user_ns(&dir->v));
	kgid = mapped_fsgid(idmap, i_user_ns(&dir->v));
	ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?:
		bch2_create_trans(trans,
				  inode_inum(dir), &dir_u, &inode_u,
				  !(flags & BCH_CREATE_TMPFILE)
				  ? &dentry->d_name : NULL,
				  from_kuid(i_user_ns(&dir->v), current_fsuid()),
				  from_kgid(i_user_ns(&dir->v), current_fsgid()),
				  from_kuid(i_user_ns(&dir->v), kuid),
				  from_kgid(i_user_ns(&dir->v), kgid),
				  mode, rdev,
				  default_acl, acl, snapshot_src, flags) ?:
		bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
@@ -357,7 +424,7 @@ retry:
	if (unlikely(ret))
		goto err_before_quota;

	inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
	inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol;
	inum.inum = inode_u.bi_inum;

	ret = bch2_subvolume_get(trans, inum.subvol, true,
@@ -387,8 +454,16 @@ err_before_quota:
	 * we must insert the new inode into the inode cache before calling
	 * bch2_trans_exit() and dropping locks, else we could race with another
	 * thread pulling the inode in and modifying it:
	 *
	 * also, calling bch2_inode_hash_insert() without passing in the
	 * transaction object is sketchy - if we could ever end up in
	 * __wait_on_freeing_inode(), we'd risk deadlock.
	 *
	 * But that shouldn't be possible, since we still have the inode locked
	 * that we just created, and we _really_ can't take a transaction
	 * restart here.
	 */
	inode = bch2_inode_insert(c, inode);
	inode = bch2_inode_hash_insert(c, NULL, inode);
	bch2_trans_put(trans);
err:
	posix_acl_release(default_acl);
@@ -428,11 +503,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
	if (ret)
		goto err;

	struct bch_inode_info *inode =
		to_bch_ei(ilookup5_nowait(c->vfs_sb,
					  bch2_inode_hash(inum),
					  bch2_iget5_test,
					  &inum));
	struct bch_inode_info *inode = bch2_inode_hash_find(c, trans, inum);
	if (inode)
		goto out;

@@ -440,7 +511,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
	struct bch_inode_unpacked inode_u;
	ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
		bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
		PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
		PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));

	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
				c, "dirent to missing inode:\n  %s",
@@ -460,9 +531,6 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
		ret = -ENOENT;
		goto err;
	}

	bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
	inode = bch2_inode_insert(c, inode);
out:
	bch2_trans_iter_exit(trans, &dirent_iter);
	printbuf_exit(&buf);
@@ -549,8 +617,8 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,

	lockdep_assert_held(&inode->v.i_rwsem);

	ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
		bch2_subvol_is_ro(c, inode->ei_subvol) ?:
	ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
		bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
		__bch2_link(c, inode, dir, dentry);
	if (unlikely(ret))
		return bch2_err_class(ret);
@@ -606,7 +674,7 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
	struct bch_inode_info *dir= to_bch_ei(vdir);
	struct bch_fs *c = dir->v.i_sb->s_fs_info;

	int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
	int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?:
		__bch2_unlink(vdir, dentry, false);
	return bch2_err_class(ret);
}
@@ -689,8 +757,8 @@ static int bch2_rename2(struct mnt_idmap *idmap,

	trans = bch2_trans_get(c);

	ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_subvol) ?:
		bch2_subvol_is_ro_trans(trans, dst_dir->ei_subvol);
	ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?:
		bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol);
	if (ret)
		goto err;

@@ -771,11 +839,17 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap,
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	unsigned int ia_valid = attr->ia_valid;
	kuid_t kuid;
	kgid_t kgid;

	if (ia_valid & ATTR_UID)
		bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
	if (ia_valid & ATTR_GID)
		bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
	if (ia_valid & ATTR_UID) {
		kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
		bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid);
	}
	if (ia_valid & ATTR_GID) {
		kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
		bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid);
	}

	if (ia_valid & ATTR_SIZE)
		bi->bi_size = attr->ia_size;
@@ -790,11 +864,11 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap,
	if (ia_valid & ATTR_MODE) {
		umode_t mode = attr->ia_mode;
		kgid_t gid = ia_valid & ATTR_GID
			? attr->ia_gid
			? kgid
			: inode->v.i_gid;

		if (!in_group_p(gid) &&
		    !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID))
		if (!in_group_or_capable(idmap, &inode->v,
				make_vfsgid(idmap, i_user_ns(&inode->v), gid)))
			mode &= ~S_ISGID;
		bi->bi_mode = mode;
	}
@@ -810,17 +884,23 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap,
	struct btree_iter inode_iter = { NULL };
	struct bch_inode_unpacked inode_u;
	struct posix_acl *acl = NULL;
	kuid_t kuid;
	kgid_t kgid;
	int ret;

	mutex_lock(&inode->ei_update_lock);

	qid = inode->ei_qid;

	if (attr->ia_valid & ATTR_UID)
		qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid);
	if (attr->ia_valid & ATTR_UID) {
		kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid);
		qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid);
	}

	if (attr->ia_valid & ATTR_GID)
		qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid);
	if (attr->ia_valid & ATTR_GID) {
		kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid);
		qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid);
	}

	ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
				     KEY_TYPE_QUOTA_PREALLOC);
@@ -876,13 +956,15 @@ static int bch2_getattr(struct mnt_idmap *idmap,
{
	struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v);
	vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v);

	stat->dev	= inode->v.i_sb->s_dev;
	stat->ino	= inode->v.i_ino;
	stat->mode	= inode->v.i_mode;
	stat->nlink	= inode->v.i_nlink;
	stat->uid	= inode->v.i_uid;
	stat->gid	= inode->v.i_gid;
	stat->uid	= vfsuid_into_kuid(vfsuid);
	stat->gid	= vfsgid_into_kgid(vfsgid);
	stat->rdev	= inode->v.i_rdev;
	stat->size	= i_size_read(&inode->v);
	stat->atime	= inode_get_atime(&inode->v);
@@ -891,7 +973,7 @@ static int bch2_getattr(struct mnt_idmap *idmap,
	stat->blksize	= block_bytes(c);
	stat->blocks	= inode->v.i_blocks;

	stat->subvol	= inode->ei_subvol;
	stat->subvol	= inode->ei_inum.subvol;
	stat->result_mask |= STATX_SUBVOL;

	if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) {
@@ -933,7 +1015,7 @@ static int bch2_setattr(struct mnt_idmap *idmap,

	lockdep_assert_held(&inode->v.i_rwsem);

	ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?:
	ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
		setattr_prepare(idmap, dentry, iattr);
	if (ret)
		return ret;
@@ -1026,7 +1108,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
	struct bkey_buf cur, prev;
	unsigned offset_into_extent, sectors;
	bool have_extent = false;
	u32 snapshot;
	int ret = 0;

	ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
@@ -1042,20 +1123,29 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
	bch2_bkey_buf_init(&cur);
	bch2_bkey_buf_init(&prev);
	trans = bch2_trans_get(c);
retry:

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     POS(ei->v.i_ino, start), 0);

	while (true) {
		enum btree_id data_btree = BTREE_ID_extents;

		bch2_trans_begin(trans);

		ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot);
		u32 snapshot;
		ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot);
		if (ret)
			goto err;

		bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
				     SPOS(ei->v.i_ino, start, snapshot), 0);
		bch2_btree_iter_set_snapshot(&iter, snapshot);

		while (!(ret = btree_trans_too_many_iters(trans)) &&
		       (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
		       !(ret = bkey_err(k))) {
			enum btree_id data_btree = BTREE_ID_extents;
		k = bch2_btree_iter_peek_upto(&iter, end);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (!k.k)
			break;

		if (!bkey_extent_is_data(k.k) &&
		    k.k->type != KEY_TYPE_reservation) {
@@ -1100,16 +1190,12 @@ retry:

		bch2_btree_iter_set_pos(&iter,
			POS(iter.pos.inode, iter.pos.offset + sectors));

		ret = bch2_trans_relock(trans);
		if (ret)
err:
		if (ret &&
		    !bch2_err_matches(ret, BCH_ERR_transaction_restart))
			break;
	}
	start = iter.pos.offset;
	bch2_trans_iter_exit(trans, &iter);
err:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto retry;

	if (!ret && have_extent) {
		bch2_trans_unlock(trans);
@@ -1165,7 +1251,7 @@ static int bch2_open(struct inode *vinode, struct file *file)
		struct bch_inode_info *inode = to_bch_ei(vinode);
		struct bch_fs *c = inode->v.i_sb->s_fs_info;

		int ret = bch2_subvol_is_ro(c, inode->ei_subvol);
		int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol);
		if (ret)
			return ret;
	}
@@ -1297,8 +1383,8 @@ static int bcachefs_fid_valid(int fh_len, int fh_type)
static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
{
	return (struct bcachefs_fid) {
		.inum	= inode->ei_inode.bi_inum,
		.subvol	= inode->ei_subvol,
		.inum	= inode->ei_inum.inum,
		.subvol	= inode->ei_inum.subvol,
		.gen	= inode->ei_inode.bi_generation,
	};
}
@@ -1383,7 +1469,7 @@ static struct dentry *bch2_get_parent(struct dentry *child)
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	subvol_inum parent_inum = {
		.subvol = inode->ei_inode.bi_parent_subvol ?:
			inode->ei_subvol,
			inode->ei_inum.subvol,
		.inum = inode->ei_inode.bi_dir,
	};

@@ -1419,7 +1505,7 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child
retry:
	bch2_trans_begin(trans);

	ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot);
	ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot);
	if (ret)
		goto err;

@@ -1450,8 +1536,7 @@ retry:
	if (ret)
		goto err;

	if (target.subvol	== inode->ei_subvol &&
	    target.inum		== inode->ei_inode.bi_inum)
	if (subvol_inum_eq(target, inode->ei_inum))
		goto found;
	} else {
		/*
@@ -1472,8 +1557,7 @@ retry:
		if (ret)
			continue;

		if (target.subvol	== inode->ei_subvol &&
		    target.inum		== inode->ei_inode.bi_inum)
		if (subvol_inum_eq(target, inode->ei_inum))
			goto found;
	}
}
@@ -1505,12 +1589,15 @@ static const struct export_operations bch_export_ops = {
	.get_name = bch2_get_name,
};

static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
static void bch2_vfs_inode_init(struct btree_trans *trans,
				subvol_inum inum,
				struct bch_inode_info *inode,
				struct bch_inode_unpacked *bi,
				struct bch_subvolume *subvol)
{
	bch2_iget5_set(&inode->v, &inum);
	inode->v.i_ino		= inum.inum;
	inode->ei_inum		= inum;
	inode->ei_inode.bi_inum	= inum.inum;
	bch2_inode_update_after_write(trans, inode, bi, ~0);

	inode->v.i_blocks	= bi->bi_sectors;
@@ -1522,7 +1609,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
	inode->ei_flags		= 0;
	inode->ei_quota_reserved = 0;
	inode->ei_qid		= bch_qid(bi);
	inode->ei_subvol	= inum.subvol;

	if (BCH_SUBVOLUME_SNAP(subvol))
		set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
@@ -1590,6 +1676,12 @@ static void bch2_evict_inode(struct inode *vinode)
	struct bch_fs *c = vinode->i_sb->s_fs_info;
	struct bch_inode_info *inode = to_bch_ei(vinode);

	/*
	 * evict() has waited for outstanding writeback, we'll do no more IO
	 * through this inode: it's safe to remove from VFS inode hashtable here
	 */
	bch2_inode_hash_remove(c, inode);

	truncate_inode_pages_final(&inode->v.i_data);

	clear_inode(&inode->v);
@@ -1631,7 +1723,7 @@ again:

	mutex_lock(&c->vfs_inodes_lock);
	list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) {
		if (!snapshot_list_has_id(s, inode->ei_subvol))
		if (!snapshot_list_has_id(s, inode->ei_inum.subvol))
			continue;

		if (!(inode->v.i_state & I_DONTCACHE) &&
|
||||
return 0;
|
||||
}
|
||||
|
||||
void bch2_fs_vfs_exit(struct bch_fs *c)
|
||||
{
|
||||
if (c->vfs_inodes_table.tbl)
|
||||
rhashtable_destroy(&c->vfs_inodes_table);
|
||||
}
|
||||
|
||||
int bch2_fs_vfs_init(struct bch_fs *c)
|
||||
{
|
||||
return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params);
|
||||
}
|
||||
|
||||
static struct file_system_type bcache_fs_type = {
|
||||
.owner = THIS_MODULE,
|
||||
.name = "bcachefs",
|
||||
.init_fs_context = bch2_init_fs_context,
|
||||
.kill_sb = bch2_kill_sb,
|
||||
.fs_flags = FS_REQUIRES_DEV,
|
||||
.fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
|
||||
};
|
||||
|
||||
MODULE_ALIAS_FS("bcachefs");
|
||||
@@ -2139,7 +2242,8 @@ int __init bch2_vfs_init(void)
{
	int ret = -ENOMEM;

	bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT);
	bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT |
				      SLAB_ACCOUNT);
	if (!bch2_inode_cache)
		goto err;

@@ -13,6 +13,9 @@

struct bch_inode_info {
	struct inode		v;
	struct rhash_head	hash;
	subvol_inum		ei_inum;

	struct list_head	ei_vfs_inode_list;
	unsigned long		ei_flags;

@@ -24,8 +27,6 @@ struct bch_inode_info {
	struct mutex		ei_quota_lock;
	struct bch_qid		ei_qid;

	u32			ei_subvol;

	/*
	 * When we've been doing nocow writes we'll need to issue flushes to the
	 * underlying block devices
@@ -50,10 +51,7 @@ struct bch_inode_info {

static inline subvol_inum inode_inum(struct bch_inode_info *inode)
{
	return (subvol_inum) {
		.subvol	= inode->ei_subvol,
		.inum	= inode->ei_inode.bi_inum,
	};
	return inode->ei_inum;
}
/*
@@ -67,6 +65,7 @@ static inline subvol_inum inode_inum(struct bch_inode_info *inode)
 * those:
 */
#define EI_INODE_SNAPSHOT	1
#define EI_INODE_HASHED		2

#define to_bch_ei(_inode)					\
	container_of_or_null(_inode, struct bch_inode_info, v)
@@ -187,6 +186,9 @@ int __bch2_unlink(struct inode *, struct dentry *, bool);

void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *);

void bch2_fs_vfs_exit(struct bch_fs *);
int bch2_fs_vfs_init(struct bch_fs *);

void bch2_vfs_exit(void);
int bch2_vfs_init(void);

@@ -196,6 +198,10 @@ int bch2_vfs_init(void);

static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
					       snapshot_id_list *s) {}

static inline void bch2_fs_vfs_exit(struct bch_fs *c) {}
static inline int bch2_fs_vfs_init(struct bch_fs *c) { return 0; }

static inline void bch2_vfs_exit(void) {}
static inline int bch2_vfs_init(void) { return 0; }
@@ -365,7 +365,7 @@ int bch2_inode_peek(struct btree_trans *trans,
		    subvol_inum inum, unsigned flags)
{
	int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags);
	bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum);
	bch_err_msg(trans->c, ret, "looking up inum %llu:%llu:", inum.subvol, inum.inum);
	return ret;
}
@@ -286,7 +286,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans,
	 */
	bool promote_full = (failed ||
			     *read_full ||
			     READ_ONCE(c->promote_whole_extents));
			     READ_ONCE(c->opts.promote_whole_extents));
	/* data might have to be decompressed in the write path: */
	unsigned sectors = promote_full
		? max(pick->crc.compressed_size, pick->crc.live_size)
@@ -1214,10 +1214,6 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,

		swap(bvec_iter.bi_size, bytes);
		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);

		ret = btree_trans_too_many_iters(trans);
		if (ret)
			goto err;
err:
		if (ret &&
		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
@@ -1353,6 +1353,7 @@ int bch2_journal_read(struct bch_fs *c,
	genradix_for_each(&c->journal_entries, radix_iter, _i) {
		struct bch_replicas_padded replicas = {
			.e.data_type = BCH_DATA_journal,
			.e.nr_devs = 0,
			.e.nr_required = 1,
		};

@@ -1379,7 +1380,7 @@ int bch2_journal_read(struct bch_fs *c,
		goto err;

	darray_for_each(i->ptrs, ptr)
		replicas.e.devs[replicas.e.nr_devs++] = ptr->dev;
		replicas_entry_add_dev(&replicas.e, ptr->dev);

	bch2_replicas_entry_sort(&replicas.e);
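replicas_entry_add_dev(), used here and in __trigger_extent() earlier, replaces the open-coded devs[nr_devs++] stores. Judging purely by the lines it supersedes, the helper is presumably a small append along these lines (a sketch, not the verbatim definition):

	static inline void replicas_entry_add_dev(struct bch_replicas_entry_v1 *e,
						  unsigned dev)
	{
		/* append dev to the entry's device list: */
		e->devs[e->nr_devs++] = dev;
	}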
@@ -1950,7 +1951,8 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
	if (error ||
	    w->noflush ||
	    (!w->must_flush &&
	     (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
	     time_before(jiffies, j->last_flush_write +
			 msecs_to_jiffies(c->opts.journal_flush_delay)) &&
	     test_bit(JOURNAL_may_skip_flush, &j->flags))) {
		w->noflush = true;
		SET_JSET_NO_FLUSH(w->data, true);
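Whether or not the replaced expression misbehaved here (an unsigned difference compare is itself wrap-safe), the time_before() helpers make the intent explicit; the general hazard they guard against is comparing wrapped jiffies values directly. A small userspace demonstration, with time_before() re-derived from its kernel definition:

	#include <stdio.h>

	#define time_before(a, b)	((long)((a) - (b)) < 0)

	int main(void)
	{
		unsigned long deadline = (unsigned long)-10; /* just before wrap */
		unsigned long now = 5;		/* 15 ticks later, after wrap  */

		printf("raw <:       %d\n", now < deadline);	     /* 1 - wrong   */
		printf("time_before: %d\n", time_before(now, deadline)); /* 0 - correct */
		return 0;
	}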
@@ -230,6 +230,8 @@ const struct bch_option bch2_opt_table[] = {
#define OPT_STR_NOLIMIT(_choices)	.type = BCH_OPT_STR,		\
					.min = 0, .max = U64_MAX,	\
					.choices = _choices
#define OPT_BITFIELD(_choices)		.type = BCH_OPT_BITFIELD,	\
					.choices = _choices
#define OPT_FN(_fn)			.type = BCH_OPT_FN, .fn = _fn

#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \
@@ -376,6 +378,13 @@ int bch2_opt_parse(struct bch_fs *c,

		*res = ret;
		break;
	case BCH_OPT_BITFIELD: {
		s64 v = bch2_read_flag_list(val, opt->choices);
		if (v < 0)
			return v;
		*res = v;
		break;
	}
	case BCH_OPT_FN:
		ret = opt->fn.parse(c, val, res, err);
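The new BCH_OPT_BITFIELD case parses a comma-separated list of names from opt->choices into a bitmask. Roughly, for the data_allowed option defined further down (illustrative, not a verbatim call site):

	/* Expected behavior per the case above (hypothetical values): */
	s64 v = bch2_read_flag_list("journal,btree", __bch2_data_types);
	/* v == BIT(BCH_DATA_journal) | BIT(BCH_DATA_btree);
	 * an unrecognized name yields a negative errno instead. */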
@@ -608,10 +617,20 @@ int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb)
	return 0;
}

void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v)
struct bch_dev_sb_opt_set {
	void			(*set_sb)(struct bch_member *, u64);
};

static const struct bch_dev_sb_opt_set bch2_dev_sb_opt_setters [] = {
#define x(n, set)	[Opt_##n] = { .set_sb = SET_##set },
	BCH_DEV_OPT_SETTERS()
#undef x
};
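With BCH_DEV_OPT_SETTERS() as defined in opts.h further down, the x-macro table above expands to (written out by hand for clarity):

	static const struct bch_dev_sb_opt_set bch2_dev_sb_opt_setters[] = {
		[Opt_discard]		= { .set_sb = SET_BCH_MEMBER_DISCARD },
		[Opt_durability]	= { .set_sb = SET_BCH_MEMBER_DURABILITY },
		[Opt_data_allowed]	= { .set_sb = SET_BCH_MEMBER_DATA_ALLOWED },
	};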

void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx,
		       const struct bch_option *opt, u64 v)
{
	if (opt->set_sb == SET_BCH2_NO_SB_OPT)
		return;
	enum bch_opt_id id = opt - bch2_opt_table;

	if (opt->flags & OPT_SB_FIELD_SECTORS)
		v >>= 9;
@@ -619,16 +638,35 @@ void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v)
	if (opt->flags & OPT_SB_FIELD_ILOG2)
		v = ilog2(v);

	opt->set_sb(sb, v);
}
	if (opt->flags & OPT_SB_FIELD_ONE_BIAS)
		v++;

void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v)
{
	if (opt->set_sb == SET_BCH2_NO_SB_OPT)
	if (opt->flags & OPT_FS) {
		if (opt->set_sb != SET_BCH2_NO_SB_OPT)
			opt->set_sb(sb, v);
	}

	if ((opt->flags & OPT_DEVICE) && dev_idx >= 0) {
		if (WARN(!bch2_member_exists(sb, dev_idx),
			 "tried to set device option %s on nonexistent device %i",
			 opt->attr.name, dev_idx))
			return;

		struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx);

		const struct bch_dev_sb_opt_set *set = bch2_dev_sb_opt_setters + id;
		if (set->set_sb)
			set->set_sb(m, v);
		else
			pr_err("option %s cannot be set via opt_set_sb()", opt->attr.name);
	}
}

void bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca,
		     const struct bch_option *opt, u64 v)
{
	mutex_lock(&c->sb_lock);
	__bch2_opt_set_sb(c->disk_sb.sb, opt, v);
	__bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v);
	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);
}
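Callers now pass the bch_dev (or NULL for filesystem-wide options), and device options get routed to that member's superblock fields. A hedged usage sketch for setting a device's durability at runtime (Opt_durability is generated from the x() option list in opts.h):

	bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_durability, 2);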
@@ -53,23 +53,25 @@ void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64);

/* When can be set: */
enum opt_flags {
	OPT_FS		= (1 << 0),	/* Filesystem option */
	OPT_DEVICE	= (1 << 1),	/* Device option */
	OPT_INODE	= (1 << 2),	/* Inode option */
	OPT_FORMAT	= (1 << 3),	/* May be specified at format time */
	OPT_MOUNT	= (1 << 4),	/* May be specified at mount time */
	OPT_RUNTIME	= (1 << 5),	/* May be specified at runtime */
	OPT_HUMAN_READABLE = (1 << 6),
	OPT_MUST_BE_POW_2 = (1 << 7),	/* Must be power of 2 */
	OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */
	OPT_SB_FIELD_ILOG2 = (1 << 9),	/* Superblock field is ilog2 of actual value */
	OPT_HIDDEN	= (1 << 10),
	OPT_FS		= BIT(0),	/* Filesystem option */
	OPT_DEVICE	= BIT(1),	/* Device option */
	OPT_INODE	= BIT(2),	/* Inode option */
	OPT_FORMAT	= BIT(3),	/* May be specified at format time */
	OPT_MOUNT	= BIT(4),	/* May be specified at mount time */
	OPT_RUNTIME	= BIT(5),	/* May be specified at runtime */
	OPT_HUMAN_READABLE = BIT(6),
	OPT_MUST_BE_POW_2 = BIT(7),	/* Must be power of 2 */
	OPT_SB_FIELD_SECTORS = BIT(8),	/* Superblock field is >> 9 of actual value */
	OPT_SB_FIELD_ILOG2 = BIT(9),	/* Superblock field is ilog2 of actual value */
	OPT_SB_FIELD_ONE_BIAS = BIT(10), /* 0 means default value */
	OPT_HIDDEN	= BIT(11),
};

enum opt_type {
	BCH_OPT_BOOL,
	BCH_OPT_UINT,
	BCH_OPT_STR,
	BCH_OPT_BITFIELD,
	BCH_OPT_FN,
};
|
||||
|
||||
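
The new OPT_SB_FIELD_ONE_BIAS flag is worth a note (a sketch, not from the commit): the superblock stores value + 1 so that an all-zeroes field reads back as "unset, use the default". durability adopts this below, replacing the hand-rolled SET_BCH_MEMBER_DURABILITY(mi, v + 1) that the sysfs.c hunk deletes. Assuming that decode convention:

	/* encode, as in __bch2_opt_set_sb(): 0 now unambiguously means "default" */
	if (opt->flags & OPT_SB_FIELD_ONE_BIAS)
		v++;

	/* decode (sketch): stored 0 means never set, else subtract the bias */
	u64 stored = BCH_MEMBER_DURABILITY(m);
	u64 durability = stored ? stored - 1 : 1;	/* 1 assumed as the default */
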
@ -263,6 +265,11 @@ enum fsck_err_opts {
	  OPT_BOOL(),							\
	  BCH2_NO_SB_OPT,		true,				\
	  NULL,		"Enable inline data extents")			\
	x(promote_whole_extents,	u8,				\
	  OPT_FS|OPT_MOUNT|OPT_RUNTIME,					\
	  OPT_BOOL(),							\
	  BCH_SB_PROMOTE_WHOLE_EXTENTS,	true,				\
	  NULL,		"Promote whole extents, instead of just part being read")\
	x(acl,				u8,				\
	  OPT_FS|OPT_FORMAT|OPT_MOUNT,					\
	  OPT_BOOL(),							\
@ -472,11 +479,16 @@ enum fsck_err_opts {
	  BCH2_NO_SB_OPT,		0,				\
	  "size",	"Size of filesystem on device")			\
	x(durability,			u8,				\
	  OPT_DEVICE,							\
	  OPT_DEVICE|OPT_SB_FIELD_ONE_BIAS,				\
	  OPT_UINT(0, BCH_REPLICAS_MAX),				\
	  BCH2_NO_SB_OPT,		1,				\
	  "n",		"Data written to this device will be considered\n"\
			"to have already been replicated n times")	\
	x(data_allowed,			u8,				\
	  OPT_DEVICE,							\
	  OPT_BITFIELD(__bch2_data_types),				\
	  BCH2_NO_SB_OPT,	BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\
	  "types",	"Allowed data types for this device: journal, btree, and/or user")\
	x(btree_node_prefetch,		u8,				\
	  OPT_FS|OPT_MOUNT|OPT_RUNTIME,					\
	  OPT_BOOL(),							\
@ -484,6 +496,11 @@ enum fsck_err_opts {
	  NULL,		"BTREE_ITER_prefetch causes btree nodes to be\n"\
			" prefetched sequentially")

#define BCH_DEV_OPT_SETTERS()				\
	x(discard,		BCH_MEMBER_DISCARD)	\
	x(durability,		BCH_MEMBER_DURABILITY)	\
	x(data_allowed,		BCH_MEMBER_DATA_ALLOWED)

struct bch_opts {
#define x(_name, _bits, ...)	unsigned _name##_defined:1;
	BCH_OPTS()
@ -563,8 +580,10 @@ void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);

u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id);
int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *);
void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64);
void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64);
void __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64);

struct bch_dev;
void bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64);

int bch2_opt_lookup(const char *);
int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *);
650	fs/bcachefs/rcu_pending.c	Normal file
@ -0,0 +1,650 @@
// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) "%s() " fmt "\n", __func__

#include <linux/generic-radix-tree.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/srcu.h>
#include <linux/vmalloc.h>

#include "rcu_pending.h"
#include "darray.h"
#include "util.h"

#define static_array_for_each(_a, _i)			\
	for (typeof(&(_a)[0]) _i = _a;			\
	     _i < (_a) + ARRAY_SIZE(_a);		\
	     _i++)

enum rcu_pending_special {
	RCU_PENDING_KVFREE	= 1,
	RCU_PENDING_CALL_RCU	= 2,
};

#define RCU_PENDING_KVFREE_FN		((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE)
#define RCU_PENDING_CALL_RCU_FN		((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU)

static inline unsigned long __get_state_synchronize_rcu(struct srcu_struct *ssp)
{
	return ssp
		? get_state_synchronize_srcu(ssp)
		: get_state_synchronize_rcu();
}

static inline unsigned long __start_poll_synchronize_rcu(struct srcu_struct *ssp)
{
	return ssp
		? start_poll_synchronize_srcu(ssp)
		: start_poll_synchronize_rcu();
}

static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, unsigned long cookie)
{
	return ssp
		? poll_state_synchronize_srcu(ssp, cookie)
		: poll_state_synchronize_rcu(cookie);
}

static inline void __rcu_barrier(struct srcu_struct *ssp)
{
	return ssp
		? srcu_barrier(ssp)
		: rcu_barrier();
}

static inline void __call_rcu(struct srcu_struct *ssp, struct rcu_head *rhp,
			      rcu_callback_t func)
{
	if (ssp)
		call_srcu(ssp, rhp, func);
	else
		call_rcu(rhp, func);
}

struct rcu_pending_seq {
	/*
	 * We're using a radix tree like a vector - we're just pushing elements
	 * onto the end; we're using a radix tree instead of an actual vector to
	 * avoid reallocation overhead
	 */
	GENRADIX(struct rcu_head *)	objs;
	size_t				nr;
	struct rcu_head			**cursor;
	unsigned long			seq;
};

struct rcu_pending_list {
	struct rcu_head		*head;
	struct rcu_head		*tail;
	unsigned long		seq;
};

struct rcu_pending_pcpu {
	struct rcu_pending	*parent;
	spinlock_t		lock;
	int			cpu;

	/*
	 * We can't bound the number of unprocessed gp sequence numbers, and we
	 * can't efficiently merge radix trees for expired grace periods, so we
	 * need darray/vector:
	 */
	DARRAY_PREALLOCATED(struct rcu_pending_seq, 4) objs;

	/* Third entry is for expired objects: */
	struct rcu_pending_list	lists[NUM_ACTIVE_RCU_POLL_OLDSTATE + 1];

	struct rcu_head		cb;
	bool			cb_armed;
	struct work_struct	work;
};

static bool __rcu_pending_has_pending(struct rcu_pending_pcpu *p)
{
	if (p->objs.nr)
		return true;

	static_array_for_each(p->lists, i)
		if (i->head)
			return true;

	return false;
}

static void rcu_pending_list_merge(struct rcu_pending_list *l1,
				   struct rcu_pending_list *l2)
{
#ifdef __KERNEL__
	if (!l1->head)
		l1->head = l2->head;
	else
		l1->tail->next = l2->head;
#else
	if (!l1->head)
		l1->head = l2->head;
	else
		l1->tail->next.next = (void *) l2->head;
#endif

	l1->tail = l2->tail;
	l2->head = l2->tail = NULL;
}

static void rcu_pending_list_add(struct rcu_pending_list *l,
				 struct rcu_head *n)
{
#ifdef __KERNEL__
	if (!l->head)
		l->head = n;
	else
		l->tail->next = n;
	l->tail = n;
	n->next = NULL;
#else
	if (!l->head)
		l->head = n;
	else
		l->tail->next.next = (void *) n;
	l->tail = n;
	n->next.next = NULL;
#endif
}

static void merge_expired_lists(struct rcu_pending_pcpu *p)
{
	struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];

	for (struct rcu_pending_list *i = p->lists; i < expired; i++)
		if (i->head && __poll_state_synchronize_rcu(p->parent->srcu, i->seq))
			rcu_pending_list_merge(expired, i);
}

#ifndef __KERNEL__
static inline void kfree_bulk(size_t nr, void ** p)
{
	while (nr--)
		kfree(*p);
}

#define local_irq_save(flags)		\
do {					\
	flags = 0;			\
} while (0)
#endif

static noinline void __process_finished_items(struct rcu_pending *pending,
					      struct rcu_pending_pcpu *p,
					      unsigned long flags)
{
	struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
	struct rcu_pending_seq objs = {};
	struct rcu_head *list = NULL;

	if (p->objs.nr &&
	    __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) {
		objs = p->objs.data[0];
		darray_remove_item(&p->objs, p->objs.data);
	}

	merge_expired_lists(p);

	list = expired->head;
	expired->head = expired->tail = NULL;

	spin_unlock_irqrestore(&p->lock, flags);

	switch ((ulong) pending->process) {
	case RCU_PENDING_KVFREE:
		for (size_t i = 0; i < objs.nr; ) {
			size_t nr_this_node = min(GENRADIX_NODE_SIZE / sizeof(void *), objs.nr - i);

			kfree_bulk(nr_this_node, (void **) genradix_ptr(&objs.objs, i));
			i += nr_this_node;
		}
		genradix_free(&objs.objs);

		while (list) {
			struct rcu_head *obj = list;
#ifdef __KERNEL__
			list = obj->next;
#else
			list = (void *) obj->next.next;
#endif

			/*
			 * low bit of pointer indicates whether rcu_head needs
			 * to be freed - kvfree_rcu_mightsleep()
			 */
			BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0);

			void *ptr = (void *)(((unsigned long) obj->func) & ~1UL);
			bool free_head = ((unsigned long) obj->func) & 1UL;

			kvfree(ptr);
			if (free_head)
				kfree(obj);
		}

		break;

	case RCU_PENDING_CALL_RCU:
		for (size_t i = 0; i < objs.nr; i++) {
			struct rcu_head *obj = *genradix_ptr(&objs.objs, i);
			obj->func(obj);
		}
		genradix_free(&objs.objs);

		while (list) {
			struct rcu_head *obj = list;
#ifdef __KERNEL__
			list = obj->next;
#else
			list = (void *) obj->next.next;
#endif
			obj->func(obj);
		}
		break;

	default:
		for (size_t i = 0; i < objs.nr; i++)
			pending->process(pending, *genradix_ptr(&objs.objs, i));
		genradix_free(&objs.objs);

		while (list) {
			struct rcu_head *obj = list;
#ifdef __KERNEL__
			list = obj->next;
#else
			list = (void *) obj->next.next;
#endif
			pending->process(pending, obj);
		}
		break;
	}
}

static bool process_finished_items(struct rcu_pending *pending,
				   struct rcu_pending_pcpu *p,
				   unsigned long flags)
{
	/*
	 * XXX: we should grab the gp seq once and avoid multiple function
	 * calls, this is called from __rcu_pending_enqueue() fastpath in
	 * may_sleep==true mode
	 */
	if ((p->objs.nr && __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) ||
	    (p->lists[0].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[0].seq)) ||
	    (p->lists[1].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[1].seq)) ||
	    p->lists[2].head) {
		__process_finished_items(pending, p, flags);
		return true;
	}

	return false;
}

static void rcu_pending_work(struct work_struct *work)
{
	struct rcu_pending_pcpu *p =
		container_of(work, struct rcu_pending_pcpu, work);
	struct rcu_pending *pending = p->parent;
	unsigned long flags;

	do {
		spin_lock_irqsave(&p->lock, flags);
	} while (process_finished_items(pending, p, flags));

	spin_unlock_irqrestore(&p->lock, flags);
}

static void rcu_pending_rcu_cb(struct rcu_head *rcu)
{
	struct rcu_pending_pcpu *p = container_of(rcu, struct rcu_pending_pcpu, cb);

	schedule_work_on(p->cpu, &p->work);

	unsigned long flags;
	spin_lock_irqsave(&p->lock, flags);
	if (__rcu_pending_has_pending(p)) {
		spin_unlock_irqrestore(&p->lock, flags);
		__call_rcu(p->parent->srcu, &p->cb, rcu_pending_rcu_cb);
	} else {
		p->cb_armed = false;
		spin_unlock_irqrestore(&p->lock, flags);
	}
}

static __always_inline struct rcu_pending_seq *
get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq)
{
	darray_for_each_reverse(p->objs, objs)
		if (objs->seq == seq)
			return objs;

	if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC))
		return NULL;

	return &darray_last(p->objs);
}

static noinline bool
rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq,
			 struct rcu_head *head, void *ptr,
			 unsigned long *flags)
{
	if (ptr) {
		if (!head) {
			/*
			 * kvfree_rcu_mightsleep(): we weren't passed an
			 * rcu_head, but we need one: use the low bit of the
			 * pointer to free to flag that the head needs to be
			 * freed as well:
			 */
			ptr = (void *)(((unsigned long) ptr)|1UL);
			head = kmalloc(sizeof(*head), __GFP_NOWARN);
			if (!head) {
				spin_unlock_irqrestore(&p->lock, *flags);
				head = kmalloc(sizeof(*head), GFP_KERNEL|__GFP_NOFAIL);
				/*
				 * dropped lock, did GFP_KERNEL allocation,
				 * check for gp expiration
				 */
				if (unlikely(__poll_state_synchronize_rcu(p->parent->srcu, seq))) {
					kvfree(--ptr);
					kfree(head);
					spin_lock_irqsave(&p->lock, *flags);
					return false;
				}
			}
		}

		head->func = ptr;
	}
again:
	for (struct rcu_pending_list *i = p->lists;
	     i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
		if (i->seq == seq) {
			rcu_pending_list_add(i, head);
			return false;
		}
	}

	for (struct rcu_pending_list *i = p->lists;
	     i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
		if (!i->head) {
			i->seq = seq;
			rcu_pending_list_add(i, head);
			return true;
		}
	}

	merge_expired_lists(p);
	goto again;
}

/*
 * __rcu_pending_enqueue: enqueue a pending RCU item, to be processed (via
 * pending->process) once grace period elapses.
 *
 * Attempt to enqueue items onto a radix tree; if memory allocation fails, fall
 * back to a linked list.
 *
 * - If @ptr is NULL, we're enqueuing an item for a generic @pending with a
 *   process callback
 *
 * - If @ptr and @head are both not NULL, we're kvfree_rcu()
 *
 * - If @ptr is not NULL and @head is, we're kvfree_rcu_mightsleep()
 *
 * - If @may_sleep is true, will do GFP_KERNEL memory allocations and process
 *   expired items.
 */
static __always_inline void
__rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head,
		      void *ptr, bool may_sleep)
{

	struct rcu_pending_pcpu *p;
	struct rcu_pending_seq *objs;
	struct genradix_node *new_node = NULL;
	unsigned long seq, flags;
	bool start_gp = false;

	BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN));

	local_irq_save(flags);
	p = this_cpu_ptr(pending->p);
	spin_lock(&p->lock);
	seq = __get_state_synchronize_rcu(pending->srcu);
restart:
	if (may_sleep &&
	    unlikely(process_finished_items(pending, p, flags)))
		goto check_expired;

	/*
	 * In kvfree_rcu() mode, the radix tree is only for slab pointers so
	 * that we can do kfree_bulk() - vmalloc pointers always use the linked
	 * list:
	 */
	if (ptr && unlikely(is_vmalloc_addr(ptr)))
		goto list_add;

	objs = get_object_radix(p, seq);
	if (unlikely(!objs))
		goto list_add;

	if (unlikely(!objs->cursor)) {
		/*
		 * New radix tree nodes must be added under @p->lock because the
		 * tree root is in a darray that can be resized (typically,
		 * genradix supports concurrent unlocked allocation of new
		 * nodes) - hence preallocation and the retry loop:
		 */
		objs->cursor = genradix_ptr_alloc_preallocated_inlined(&objs->objs,
						objs->nr, &new_node, GFP_ATOMIC|__GFP_NOWARN);
		if (unlikely(!objs->cursor)) {
			if (may_sleep) {
				spin_unlock_irqrestore(&p->lock, flags);

				gfp_t gfp = GFP_KERNEL;
				if (!head)
					gfp |= __GFP_NOFAIL;

				new_node = genradix_alloc_node(gfp);
				if (!new_node)
					may_sleep = false;
				goto check_expired;
			}
list_add:
			start_gp = rcu_pending_enqueue_list(p, seq, head, ptr, &flags);
			goto start_gp;
		}
	}

	*objs->cursor++ = ptr ?: head;
	/* zero cursor if we hit the end of a radix tree node: */
	if (!(((ulong) objs->cursor) & (GENRADIX_NODE_SIZE - 1)))
		objs->cursor = NULL;
	start_gp = !objs->nr;
	objs->nr++;
start_gp:
	if (unlikely(start_gp)) {
		/*
		 * We only have one callback (ideally, we would have one for
		 * every outstanding grace period) - so if our callback is
		 * already in flight, we may still have to start a grace period
		 * (since we used get_state() above, not start_poll())
		 */
		if (!p->cb_armed) {
			p->cb_armed = true;
			__call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb);
		} else {
			__start_poll_synchronize_rcu(pending->srcu);
		}
	}
	spin_unlock_irqrestore(&p->lock, flags);
free_node:
	if (new_node)
		genradix_free_node(new_node);
	return;
check_expired:
	if (unlikely(__poll_state_synchronize_rcu(pending->srcu, seq))) {
		switch ((ulong) pending->process) {
		case RCU_PENDING_KVFREE:
			kvfree(ptr);
			break;
		case RCU_PENDING_CALL_RCU:
			head->func(head);
			break;
		default:
			pending->process(pending, head);
			break;
		}
		goto free_node;
	}

	local_irq_save(flags);
	p = this_cpu_ptr(pending->p);
	spin_lock(&p->lock);
	goto restart;
}

void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj)
{
	__rcu_pending_enqueue(pending, obj, NULL, true);
}

static struct rcu_head *rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu *p)
{
	struct rcu_head *ret = NULL;

	spin_lock_irq(&p->lock);
	darray_for_each(p->objs, objs)
		if (objs->nr) {
			ret = *genradix_ptr(&objs->objs, --objs->nr);
			objs->cursor = NULL;
			if (!objs->nr)
				genradix_free(&objs->objs);
			goto out;
		}

	static_array_for_each(p->lists, i)
		if (i->head) {
			ret = i->head;
#ifdef __KERNEL__
			i->head = ret->next;
#else
			i->head = (void *) ret->next.next;
#endif
			if (!i->head)
				i->tail = NULL;
			goto out;
		}
out:
	spin_unlock_irq(&p->lock);

	return ret;
}

struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending)
{
	return rcu_pending_pcpu_dequeue(raw_cpu_ptr(pending->p));
}

struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending)
{
	struct rcu_head *ret = rcu_pending_dequeue(pending);

	if (ret)
		return ret;

	int cpu;
	for_each_possible_cpu(cpu) {
		ret = rcu_pending_pcpu_dequeue(per_cpu_ptr(pending->p, cpu));
		if (ret)
			break;
	}
	return ret;
}

static bool rcu_pending_has_pending_or_armed(struct rcu_pending *pending)
{
	int cpu;
	for_each_possible_cpu(cpu) {
		struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
		spin_lock_irq(&p->lock);
		if (__rcu_pending_has_pending(p) || p->cb_armed) {
			spin_unlock_irq(&p->lock);
			return true;
		}
		spin_unlock_irq(&p->lock);
	}

	return false;
}

void rcu_pending_exit(struct rcu_pending *pending)
{
	int cpu;

	if (!pending->p)
		return;

	while (rcu_pending_has_pending_or_armed(pending)) {
		__rcu_barrier(pending->srcu);

		for_each_possible_cpu(cpu) {
			struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
			flush_work(&p->work);
		}
	}

	for_each_possible_cpu(cpu) {
		struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
		flush_work(&p->work);
	}

	for_each_possible_cpu(cpu) {
		struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);

		static_array_for_each(p->lists, i)
			WARN_ON(i->head);
		WARN_ON(p->objs.nr);
		darray_exit(&p->objs);
	}
	free_percpu(pending->p);
}

/**
 * rcu_pending_init: - initialize a rcu_pending
 *
 * @pending:	Object to init
 * @srcu:	May optionally be used with an srcu_struct; if NULL, uses normal
 *		RCU flavor
 * @process:	Callback function invoked on objects once their RCU barriers
 *		have completed; if NULL, kvfree() is used.
 */
int rcu_pending_init(struct rcu_pending *pending,
		     struct srcu_struct *srcu,
		     rcu_pending_process_fn process)
{
	pending->p = alloc_percpu(struct rcu_pending_pcpu);
	if (!pending->p)
		return -ENOMEM;

	int cpu;
	for_each_possible_cpu(cpu) {
		struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
		p->parent	= pending;
		p->cpu		= cpu;
		spin_lock_init(&p->lock);
		darray_init(&p->objs);
		INIT_WORK(&p->work, rcu_pending_work);
	}

	pending->srcu = srcu;
	pending->process = process;

	return 0;
}
27	fs/bcachefs/rcu_pending.h	Normal file
@ -0,0 +1,27 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RCU_PENDING_H
#define _LINUX_RCU_PENDING_H

#include <linux/rcupdate.h>

struct rcu_pending;
typedef void (*rcu_pending_process_fn)(struct rcu_pending *, struct rcu_head *);

struct rcu_pending_pcpu;

struct rcu_pending {
	struct rcu_pending_pcpu __percpu *p;
	struct srcu_struct	*srcu;
	rcu_pending_process_fn	process;
};

void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj);
struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending);
struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending);

void rcu_pending_exit(struct rcu_pending *pending);
int rcu_pending_init(struct rcu_pending *pending,
		     struct srcu_struct *srcu,
		     rcu_pending_process_fn process);

#endif /* _LINUX_RCU_PENDING_H */
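
A usage sketch for the API above (illustrative only; the my_-prefixed names are hypothetical). The process callback sees an object only after its grace period has expired; passing a NULL srcu_struct selects regular RCU:

	struct my_obj {
		struct rcu_head	rcu;
		int		payload;
	};

	static void my_process(struct rcu_pending *pending, struct rcu_head *h)
	{
		kfree(container_of(h, struct my_obj, rcu));
	}

	static struct rcu_pending my_pending;

	static int my_setup(void)
	{
		/* NULL: use regular RCU rather than SRCU */
		return rcu_pending_init(&my_pending, NULL, my_process);
	}

	static void my_retire(struct my_obj *obj)
	{
		/* my_process() is called on obj once a grace period has elapsed */
		rcu_pending_enqueue(&my_pending, &obj->rcu);
	}

	static void my_teardown(void)
	{
		rcu_pending_exit(&my_pending);	/* flushes everything still pending */
	}
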
@ -122,7 +122,7 @@ static void extent_to_replicas(struct bkey_s_c k,
			continue;

		if (!p.has_ec)
			r->devs[r->nr_devs++] = p.ptr.dev;
			replicas_entry_add_dev(r, p.ptr.dev);
		else
			r->nr_required = 0;
	}
@ -139,7 +139,7 @@ static void stripe_to_replicas(struct bkey_s_c k,
	for (ptr = s.v->ptrs;
	     ptr < s.v->ptrs + s.v->nr_blocks;
	     ptr++)
		r->devs[r->nr_devs++] = ptr->dev;
		replicas_entry_add_dev(r, ptr->dev);
}

void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e,
@ -180,7 +180,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
	e->nr_required = 1;

	darray_for_each(devs, i)
		e->devs[e->nr_devs++] = *i;
		replicas_entry_add_dev(e, *i);

	bch2_replicas_entry_sort(e);
}
@ -5,7 +5,7 @@
struct bch_replicas_entry_v0 {
	__u8			data_type;
	__u8			nr_devs;
	__u8			devs[];
	__u8			devs[] __counted_by(nr_devs);
} __packed;

struct bch_sb_field_replicas_v0 {
@ -17,7 +17,7 @@ struct bch_replicas_entry_v1 {
	__u8			data_type;
	__u8			nr_devs;
	__u8			nr_required;
	__u8			devs[];
	__u8			devs[] __counted_by(nr_devs);
} __packed;

struct bch_sb_field_replicas {
@ -28,4 +28,9 @@ struct bch_sb_field_replicas {
#define replicas_entry_bytes(_i)					\
	(offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)

#define replicas_entry_add_dev(e, d) ({		\
	(e)->nr_devs++;				\
	(e)->devs[(e)->nr_devs - 1] = (d);	\
})

#endif /* _BCACHEFS_REPLICAS_FORMAT_H */
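
Why the macro raises the count before storing (an aside, not from the commit): with devs[] annotated __counted_by(nr_devs), FORTIFY/UBSAN bounds checks treat nr_devs as the array's live length, so the count must grow before the new slot is written. The open-coded callers replaced above did devs[nr_devs++] = d, which indexes with the old count:

	/* sketch, assuming runtime bounds checking on the counted array: */

	e->devs[e->nr_devs++] = d;	/* writes index nr_devs while the length is still nr_devs: flagged */

	e->nr_devs++;			/* grow the counted length first... */
	e->devs[e->nr_devs - 1] = d;	/* ...then the store is in bounds */
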
@ -23,7 +23,7 @@ enum bch_fsck_flags {
	x(jset_past_bucket_end,					  9,	0)		\
	x(jset_seq_blacklisted,					 10,	0)		\
	x(journal_entries_missing,				 11,	0)		\
	x(journal_entry_replicas_not_marked,			 12,	0)		\
	x(journal_entry_replicas_not_marked,			 12,	FSCK_AUTOFIX)	\
	x(journal_entry_past_jset_end,				 13,	0)		\
	x(journal_entry_replicas_data_mismatch,			 14,	0)		\
	x(journal_entry_bkey_u64s_0,				 15,	0)		\
@ -464,3 +464,12 @@ void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k)
			__bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c));
	}
}

unsigned bch2_sb_nr_devices(const struct bch_sb *sb)
{
	unsigned nr = 0;

	for (unsigned i = 0; i < sb->nr_devices; i++)
		nr += bch2_member_exists((struct bch_sb *) sb, i);
	return nr;
}
@ -307,6 +307,8 @@ static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev)
	return false;
}

unsigned bch2_sb_nr_devices(const struct bch_sb *);

static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
{
	return (struct bch_member_cpu) {
@ -31,6 +31,51 @@ int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
int bch2_subvol_is_ro_trans(struct btree_trans *, u32);
int bch2_subvol_is_ro(struct bch_fs *, u32);

static inline struct bkey_s_c
bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos end,
					    u32 subvolid, unsigned flags)
{
	u32 snapshot;
	int ret = bch2_subvolume_get_snapshot(iter->trans, subvolid, &snapshot);
	if (ret)
		return bkey_s_c_err(ret);

	bch2_btree_iter_set_snapshot(iter, snapshot);
	return bch2_btree_iter_peek_upto_type(iter, end, flags);
}

#define for_each_btree_key_in_subvolume_upto_continue(_trans, _iter,		\
					_end, _subvolid, _flags, _k, _do)	\
({										\
	struct bkey_s_c _k;							\
	int _ret3 = 0;								\
										\
	do {									\
		_ret3 = lockrestart_do(_trans, ({				\
			(_k) = bch2_btree_iter_peek_in_subvolume_upto_type(&(_iter),	\
						_end, _subvolid, (_flags));	\
			if (!(_k).k)						\
				break;						\
										\
			bkey_err(_k) ?: (_do);					\
		}));								\
	} while (!_ret3 && bch2_btree_iter_advance(&(_iter)));			\
										\
	bch2_trans_iter_exit((_trans), &(_iter));				\
	_ret3;									\
})

#define for_each_btree_key_in_subvolume_upto(_trans, _iter, _btree_id,		\
				_start, _end, _subvolid, _flags, _k, _do)	\
({										\
	struct btree_iter _iter;						\
	bch2_trans_iter_init((_trans), &(_iter), (_btree_id),			\
			     (_start), (_flags));				\
										\
	for_each_btree_key_in_subvolume_upto_continue(_trans, _iter,		\
					_end, _subvolid, _flags, _k, _do);	\
})

int bch2_delete_dead_snapshots(struct bch_fs *);
void bch2_delete_dead_snapshots_async(struct bch_fs *);
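
A usage sketch of the new iterator macro (hypothetical; the bch2_xattr_list() conversion at the end of this diff is the real example). The body expression _do runs under lockrestart_do(), so transaction restarts are handled transparently, and a nonzero result stops iteration:

	int ret = bch2_trans_run(c,
		for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents,
				POS(inum, 0), POS(inum, U64_MAX),
				subvol, 0, k, ({
			/* runs per key k; bkey_err() has already been checked */
			pr_info("extent at %llu\n", k.k->p.offset);
			0;
		})));
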
@ -30,7 +30,8 @@ struct snapshot_table {
};

typedef struct {
	u32		subvol;
	/* we can't have padding in this struct: */
	u64		subvol;
	u64		inum;
} subvol_inum;
@ -418,6 +418,9 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
		if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2 &&
		    !BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb))
			SET_BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb, 30);

		if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2)
			SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true);
	}

	for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) {
@ -1292,15 +1295,9 @@ void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l)
void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
		     bool print_layout, unsigned fields)
{
	u64 fields_have = 0;
	unsigned nr_devices = 0;

	if (!out->nr_tabstops)
		printbuf_tabstop_push(out, 44);

	for (int i = 0; i < sb->nr_devices; i++)
		nr_devices += bch2_member_exists(sb, i);

	prt_printf(out, "External UUID:\t");
	pr_uuid(out, sb->user_uuid.b);
	prt_newline(out);
@ -1356,9 +1353,10 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
	prt_newline(out);

	prt_printf(out, "Clean:\t%llu\n", BCH_SB_CLEAN(sb));
	prt_printf(out, "Devices:\t%u\n", nr_devices);
	prt_printf(out, "Devices:\t%u\n", bch2_sb_nr_devices(sb));

	prt_printf(out, "Sections:\t");
	u64 fields_have = 0;
	vstruct_for_each(sb, f)
		fields_have |= 1 << le32_to_cpu(f->type);
	prt_bitflags(out, bch2_sb_fields, fields_have);
@ -543,6 +543,7 @@ static void __bch2_fs_free(struct bch_fs *c)
	bch2_fs_fs_io_direct_exit(c);
	bch2_fs_fs_io_buffered_exit(c);
	bch2_fs_fsio_exit(c);
	bch2_fs_vfs_exit(c);
	bch2_fs_ec_exit(c);
	bch2_fs_encryption_exit(c);
	bch2_fs_nocow_locking_exit(c);
@ -810,7 +811,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)

	c->copy_gc_enabled		= 1;
	c->rebalance.enabled		= 1;
	c->promote_whole_extents	= true;

	c->journal.flush_write_time	= &c->times[BCH_TIME_journal_flush_write];
	c->journal.noflush_write_time	= &c->times[BCH_TIME_journal_noflush_write];
@ -926,6 +926,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
	    bch2_fs_encryption_init(c) ?:
	    bch2_fs_compress_init(c) ?:
	    bch2_fs_ec_init(c) ?:
	    bch2_fs_vfs_init(c) ?:
	    bch2_fs_fsio_init(c) ?:
	    bch2_fs_fs_io_buffered_init(c) ?:
	    bch2_fs_fs_io_direct_init(c);
@ -219,7 +219,6 @@ read_attribute(copy_gc_wait);
rw_attribute(rebalance_enabled);
sysfs_pd_controller_attribute(rebalance);
read_attribute(rebalance_status);
rw_attribute(promote_whole_extents);

read_attribute(new_stripes);

@ -234,7 +233,7 @@ write_attribute(perf_test);

#define x(_name)						\
	static struct attribute sysfs_time_stat_##_name =	\
		{ .name = #_name, .mode = 0444 };
		{ .name = #_name, .mode = 0644 };
	BCH_TIME_STATS()
#undef x

@ -347,8 +346,6 @@ SHOW(bch2_fs)
	if (attr == &sysfs_rebalance_status)
		bch2_rebalance_status_to_text(out, c);

	sysfs_print(promote_whole_extents, c->promote_whole_extents);

	/* Debugging: */

	if (attr == &sysfs_journal_debug)
@ -436,8 +433,6 @@ STORE(bch2_fs)

	sysfs_pd_controller_store(rebalance, &c->rebalance.pd);

	sysfs_strtoul(promote_whole_extents, c->promote_whole_extents);

	/* Debugging: */

	if (!test_bit(BCH_FS_started, &c->flags))
@ -514,7 +509,7 @@ struct attribute *bch2_fs_files[] = {
	&sysfs_btree_cache_size,
	&sysfs_btree_write_stats,

	&sysfs_promote_whole_extents,
	&sysfs_rebalance_status,

	&sysfs_compression_stats,

@ -614,7 +609,6 @@ struct attribute *bch2_fs_internal_files[] = {
	&sysfs_copy_gc_wait,

	&sysfs_rebalance_enabled,
	&sysfs_rebalance_status,
	sysfs_pd_controller_files(rebalance),

	&sysfs_moving_ctxts,
@ -674,7 +668,7 @@ STORE(bch2_fs_opts_dir)
	if (ret < 0)
		goto err;

	bch2_opt_set_sb(c, opt, v);
	bch2_opt_set_sb(c, NULL, opt, v);
	bch2_opt_set_by_id(&c->opts, id, v);

	if (v &&
@ -728,6 +722,13 @@ SHOW(bch2_fs_time_stats)

STORE(bch2_fs_time_stats)
{
	struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);

#define x(name)							\
	if (attr == &sysfs_time_stat_##name)			\
		bch2_time_stats_reset(&c->times[BCH_TIME_##name]);
	BCH_TIME_STATS()
#undef x
	return size;
}
SYSFS_OPS(bch2_fs_time_stats);
@ -821,32 +822,17 @@ STORE(bch2_dev)
{
	struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
	struct bch_fs *c = ca->fs;
	struct bch_member *mi;

	if (attr == &sysfs_discard) {
		bool v = strtoul_or_return(buf);

		mutex_lock(&c->sb_lock);
		mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);

		if (v != BCH_MEMBER_DISCARD(mi)) {
			SET_BCH_MEMBER_DISCARD(mi, v);
			bch2_write_super(c);
		}
		mutex_unlock(&c->sb_lock);
		bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_discard, v);
	}

	if (attr == &sysfs_durability) {
		u64 v = strtoul_or_return(buf);

		mutex_lock(&c->sb_lock);
		mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);

		if (v + 1 != BCH_MEMBER_DURABILITY(mi)) {
			SET_BCH_MEMBER_DURABILITY(mi, v + 1);
			bch2_write_super(c);
		}
		mutex_unlock(&c->sb_lock);
		bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_durability, v);
	}

	if (attr == &sysfs_label) {
@ -387,7 +387,7 @@ again:
		seen = buf->buf.nr;
		char *n = memchr(buf->buf.data, '\n', seen);

		if (!n && timeout != MAX_SCHEDULE_TIMEOUT && jiffies >= until) {
		if (!n && timeout != MAX_SCHEDULE_TIMEOUT && time_after_eq(jiffies, until)) {
			spin_unlock(&buf->lock);
			return -ETIME;
		}
@ -151,6 +151,20 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
	}
}

void bch2_time_stats_reset(struct bch2_time_stats *stats)
{
	spin_lock_irq(&stats->lock);
	unsigned offset = offsetof(struct bch2_time_stats, min_duration);
	memset((void *) stats + offset, 0, sizeof(*stats) - offset);

	if (stats->buffer) {
		int cpu;
		for_each_possible_cpu(cpu)
			per_cpu_ptr(stats->buffer, cpu)->nr = 0;
	}
	spin_unlock_irq(&stats->lock);
}

void bch2_time_stats_exit(struct bch2_time_stats *stats)
{
	free_percpu(stats->buffer);
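
A note on bch2_time_stats_reset() above (editorial, not part of the commit): the memset starts at offsetof(struct bch2_time_stats, min_duration), so everything declared before min_duration survives a reset. That is exactly why the time_stats.h hunk below moves the percpu buffer pointer above min_duration: in its old position after the stats fields it would have been zeroed, leaking the percpu allocation. Layout assumed by the reset:

	/*
	 * struct bch2_time_stats {
	 *	spinlock_t lock;				// preserved
	 *	bool have_quantiles;				// preserved
	 *	struct time_stat_buffer __percpu *buffer;	// preserved
	 *	u64 min_duration;				// zeroed from here on
	 *	...
	 * };
	 */
	unsigned offset = offsetof(struct bch2_time_stats, min_duration);
	memset((void *) stats + offset, 0, sizeof(*stats) - offset);
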
@ -70,6 +70,7 @@ struct time_stat_buffer {
struct bch2_time_stats {
	spinlock_t	lock;
	bool		have_quantiles;
	struct time_stat_buffer __percpu *buffer;
	/* all fields are in nanoseconds */
	u64		min_duration;
	u64		max_duration;
@ -87,7 +88,6 @@ struct bch2_time_stats {

	struct mean_and_variance_weighted duration_stats_weighted;
	struct mean_and_variance_weighted freq_stats_weighted;
	struct time_stat_buffer __percpu *buffer;
};

struct bch2_time_stats_quantiles {
@ -142,6 +142,7 @@ static inline bool track_event_change(struct bch2_time_stats *stats, bool v)
	return false;
}

void bch2_time_stats_reset(struct bch2_time_stats *);
void bch2_time_stats_exit(struct bch2_time_stats *);
void bch2_time_stats_init(struct bch2_time_stats *);
@ -3,7 +3,6 @@
#define TRACE_SYSTEM bcachefs

#if !defined(_TRACE_BCACHEFS_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_BCACHEFS_H

#include <linux/tracepoint.h>

@ -558,6 +557,7 @@ TRACE_EVENT(btree_path_relock_fail,
		__field(unsigned long,		caller_ip	)
		__field(u8,			btree_id	)
		__field(u8,			level		)
		__field(u8,			path_idx	)
		TRACE_BPOS_entries(pos)
		__array(char,			node, 24	)
		__field(u8,			self_read_count	)
@ -575,7 +575,8 @@ TRACE_EVENT(btree_path_relock_fail,
		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
		__entry->caller_ip		= caller_ip;
		__entry->btree_id		= path->btree_id;
		__entry->level			= path->level;
		__entry->level			= level;
		__entry->path_idx		= path - trans->paths;
		TRACE_BPOS_assign(pos, path->pos);

		c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
@ -588,7 +589,7 @@ TRACE_EVENT(btree_path_relock_fail,
			c = six_lock_counts(&path->l[level].b->c.lock);
			__entry->read_count	= c.n[SIX_LOCK_read];
			__entry->intent_count	= c.n[SIX_LOCK_intent];
			scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
			scnprintf(__entry->node, sizeof(__entry->node), "%px", &b->c);
		}
		__entry->iter_lock_seq		= path->l[level].lock_seq;
		__entry->node_lock_seq		= is_btree_node(path, level)
@ -596,9 +597,10 @@ TRACE_EVENT(btree_path_relock_fail,
			: 0;
	),

	TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u",
	TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u",
		  __entry->trans_fn,
		  (void *) __entry->caller_ip,
		  __entry->path_idx,
		  bch2_btree_id_str(__entry->btree_id),
		  __entry->pos_inode,
		  __entry->pos_offset,
@ -625,6 +627,7 @@ TRACE_EVENT(btree_path_upgrade_fail,
		__field(unsigned long,		caller_ip	)
		__field(u8,			btree_id	)
		__field(u8,			level		)
		__field(u8,			path_idx	)
		TRACE_BPOS_entries(pos)
		__field(u8,			locked		)
		__field(u8,			self_read_count	)
@ -642,6 +645,7 @@ TRACE_EVENT(btree_path_upgrade_fail,
		__entry->caller_ip		= caller_ip;
		__entry->btree_id		= path->btree_id;
		__entry->level			= level;
		__entry->path_idx		= path - trans->paths;
		TRACE_BPOS_assign(pos, path->pos);
		__entry->locked			= btree_node_locked(path, level);

@ -657,9 +661,10 @@ TRACE_EVENT(btree_path_upgrade_fail,
			: 0;
	),

	TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
	TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
		  __entry->trans_fn,
		  (void *) __entry->caller_ip,
		  __entry->path_idx,
		  bch2_btree_id_str(__entry->btree_id),
		  __entry->pos_inode,
		  __entry->pos_offset,
@ -1438,6 +1443,456 @@ TRACE_EVENT(error_downcast,
	TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip)
);

#ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS

TRACE_EVENT(update_by_path,
	TP_PROTO(struct btree_trans *trans, struct btree_path *path,
		 struct btree_insert_entry *i, bool overwrite),
	TP_ARGS(trans, path, i, overwrite),

	TP_STRUCT__entry(
		__array(char,			trans_fn, 32	)
		__field(btree_path_idx_t,	path_idx	)
		__field(u8,			btree_id	)
		TRACE_BPOS_entries(pos)
		__field(u8,			overwrite	)
		__field(btree_path_idx_t,	update_idx	)
		__field(btree_path_idx_t,	nr_updates	)
	),

	TP_fast_assign(
		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
		__entry->path_idx		= path - trans->paths;
		__entry->btree_id		= path->btree_id;
		TRACE_BPOS_assign(pos, path->pos);
		__entry->overwrite		= overwrite;
		__entry->update_idx		= i - trans->updates;
		__entry->nr_updates		= trans->nr_updates;
	),

	TP_printk("%s path %3u btree %s pos %llu:%llu:%u overwrite %u update %u/%u",
		  __entry->trans_fn,
		  __entry->path_idx,
		  bch2_btree_id_str(__entry->btree_id),
		  __entry->pos_inode,
		  __entry->pos_offset,
		  __entry->pos_snapshot,
		  __entry->overwrite,
		  __entry->update_idx,
		  __entry->nr_updates)
);

TRACE_EVENT(btree_path_lock,
	TP_PROTO(struct btree_trans *trans,
		 unsigned long caller_ip,
		 struct btree_bkey_cached_common *b),
	TP_ARGS(trans, caller_ip, b),

	TP_STRUCT__entry(
		__array(char,			trans_fn, 32	)
		__field(unsigned long,		caller_ip	)
		__field(u8,			btree_id	)
		__field(u8,			level		)
		__array(char,			node, 24	)
		__field(u32,			lock_seq	)
	),

	TP_fast_assign(
		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
		__entry->caller_ip		= caller_ip;
		__entry->btree_id		= b->btree_id;
		__entry->level			= b->level;

		scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
		__entry->lock_seq		= six_lock_seq(&b->lock);
	),

	TP_printk("%s %pS\nbtree %s level %u node %s lock seq %u",
		  __entry->trans_fn,
		  (void *) __entry->caller_ip,
		  bch2_btree_id_str(__entry->btree_id),
		  __entry->level,
		  __entry->node,
		  __entry->lock_seq)
);

DECLARE_EVENT_CLASS(btree_path_ev,
	TP_PROTO(struct btree_trans *trans, struct btree_path *path),
	TP_ARGS(trans, path),

	TP_STRUCT__entry(
		__field(u16,			idx		)
		__field(u8,			ref		)
		__field(u8,			btree_id	)
		TRACE_BPOS_entries(pos)
	),

	TP_fast_assign(
		__entry->idx			= path - trans->paths;
		__entry->ref			= path->ref;
		__entry->btree_id		= path->btree_id;
		TRACE_BPOS_assign(pos, path->pos);
	),

	TP_printk("path %3u ref %u btree %s pos %llu:%llu:%u",
		  __entry->idx, __entry->ref,
		  bch2_btree_id_str(__entry->btree_id),
		  __entry->pos_inode,
		  __entry->pos_offset,
		  __entry->pos_snapshot)
);

DEFINE_EVENT(btree_path_ev, btree_path_get_ll,
	TP_PROTO(struct btree_trans *trans, struct btree_path *path),
	TP_ARGS(trans, path)
);

DEFINE_EVENT(btree_path_ev, btree_path_put_ll,
	TP_PROTO(struct btree_trans *trans, struct btree_path *path),
	TP_ARGS(trans, path)
);

DEFINE_EVENT(btree_path_ev, btree_path_should_be_locked,
	TP_PROTO(struct btree_trans *trans, struct btree_path *path),
	TP_ARGS(trans, path)
);

TRACE_EVENT(btree_path_alloc,
	TP_PROTO(struct btree_trans *trans, struct btree_path *path),
	TP_ARGS(trans, path),

	TP_STRUCT__entry(
		__field(btree_path_idx_t,	idx		)
		__field(u8,			locks_want	)
		__field(u8,			btree_id	)
		TRACE_BPOS_entries(pos)
	),

	TP_fast_assign(
		__entry->idx			= path - trans->paths;
		__entry->locks_want		= path->locks_want;
		__entry->btree_id		= path->btree_id;
		TRACE_BPOS_assign(pos, path->pos);
	),

	TP_printk("path %3u btree %s locks_want %u pos %llu:%llu:%u",
		  __entry->idx,
		  bch2_btree_id_str(__entry->btree_id),
		  __entry->locks_want,
		  __entry->pos_inode,
		  __entry->pos_offset,
		  __entry->pos_snapshot)
);

TRACE_EVENT(btree_path_get,
	TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos),
	TP_ARGS(trans, path, new_pos),

	TP_STRUCT__entry(
		__field(btree_path_idx_t,	idx		)
		__field(u8,			ref		)
		__field(u8,			preserve	)
		__field(u8,			locks_want	)
		__field(u8,			btree_id	)
		TRACE_BPOS_entries(old_pos)
		TRACE_BPOS_entries(new_pos)
	),

	TP_fast_assign(
		__entry->idx			= path - trans->paths;
		__entry->ref			= path->ref;
		__entry->preserve		= path->preserve;
		__entry->locks_want		= path->locks_want;
		__entry->btree_id		= path->btree_id;
		TRACE_BPOS_assign(old_pos, path->pos);
		TRACE_BPOS_assign(new_pos, *new_pos);
	),

	TP_printk(" path %3u ref %u preserve %u btree %s locks_want %u pos %llu:%llu:%u -> %llu:%llu:%u",
		  __entry->idx,
		  __entry->ref,
		  __entry->preserve,
		  bch2_btree_id_str(__entry->btree_id),
		  __entry->locks_want,
		  __entry->old_pos_inode,
		  __entry->old_pos_offset,
		  __entry->old_pos_snapshot,
		  __entry->new_pos_inode,
		  __entry->new_pos_offset,
		  __entry->new_pos_snapshot)
);

DECLARE_EVENT_CLASS(btree_path_clone,
	TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
	TP_ARGS(trans, path, new),

	TP_STRUCT__entry(
		__field(btree_path_idx_t,	idx		)
		__field(u8,			new_idx		)
		__field(u8,			btree_id	)
		__field(u8,			ref		)
		__field(u8,			preserve	)
		TRACE_BPOS_entries(pos)
	),

	TP_fast_assign(
		__entry->idx			= path - trans->paths;
		__entry->new_idx		= new - trans->paths;
		__entry->btree_id		= path->btree_id;
		__entry->ref			= path->ref;
		__entry->preserve		= path->preserve;
		TRACE_BPOS_assign(pos, path->pos);
	),

	TP_printk(" path %3u ref %u preserve %u btree %s %llu:%llu:%u -> %u",
		  __entry->idx,
		  __entry->ref,
		  __entry->preserve,
		  bch2_btree_id_str(__entry->btree_id),
		  __entry->pos_inode,
		  __entry->pos_offset,
		  __entry->pos_snapshot,
		  __entry->new_idx)
);

DEFINE_EVENT(btree_path_clone, btree_path_clone,
	TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
	TP_ARGS(trans, path, new)
);

DEFINE_EVENT(btree_path_clone, btree_path_save_pos,
	TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new),
	TP_ARGS(trans, path, new)
);

DECLARE_EVENT_CLASS(btree_path_traverse,
	TP_PROTO(struct btree_trans *trans,
		 struct btree_path *path),
	TP_ARGS(trans, path),

	TP_STRUCT__entry(
		__array(char,			trans_fn, 32	)
		__field(btree_path_idx_t,	idx		)
		__field(u8,			ref		)
		__field(u8,			preserve	)
		__field(u8,			should_be_locked )
		__field(u8,			btree_id	)
		__field(u8,			level		)
		TRACE_BPOS_entries(pos)
		__field(u8,			locks_want	)
		__field(u8,			nodes_locked	)
		__array(char,			node0, 24	)
		__array(char,			node1, 24	)
		__array(char,			node2, 24	)
		__array(char,			node3, 24	)
	),

	TP_fast_assign(
		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));

		__entry->idx			= path - trans->paths;
		__entry->ref			= path->ref;
		__entry->preserve		= path->preserve;
		__entry->btree_id		= path->btree_id;
		__entry->level			= path->level;
		TRACE_BPOS_assign(pos, path->pos);

		__entry->locks_want		= path->locks_want;
		__entry->nodes_locked		= path->nodes_locked;
		struct btree *b = path->l[0].b;
		if (IS_ERR(b))
			strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
		else
			scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c);
		b = path->l[1].b;
		if (IS_ERR(b))
			strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
		else
			scnprintf(__entry->node1, sizeof(__entry->node0), "%px", &b->c);
		b = path->l[2].b;
		if (IS_ERR(b))
			strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
		else
			scnprintf(__entry->node2, sizeof(__entry->node0), "%px", &b->c);
		b = path->l[3].b;
		if (IS_ERR(b))
			strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
		else
			scnprintf(__entry->node3, sizeof(__entry->node0), "%px", &b->c);
	),

	TP_printk("%s\npath %3u ref %u preserve %u btree %s %llu:%llu:%u level %u locks_want %u\n"
		  "locks %u %u %u %u node %s %s %s %s",
		  __entry->trans_fn,
		  __entry->idx,
		  __entry->ref,
		  __entry->preserve,
		  bch2_btree_id_str(__entry->btree_id),
		  __entry->pos_inode,
		  __entry->pos_offset,
		  __entry->pos_snapshot,
		  __entry->level,
		  __entry->locks_want,
		  (__entry->nodes_locked >> 6) & 3,
		  (__entry->nodes_locked >> 4) & 3,
		  (__entry->nodes_locked >> 2) & 3,
		  (__entry->nodes_locked >> 0) & 3,
		  __entry->node3,
		  __entry->node2,
		  __entry->node1,
		  __entry->node0)
);

DEFINE_EVENT(btree_path_traverse, btree_path_traverse_start,
	TP_PROTO(struct btree_trans *trans,
		 struct btree_path *path),
	TP_ARGS(trans, path)
);

DEFINE_EVENT(btree_path_traverse, btree_path_traverse_end,
	TP_PROTO(struct btree_trans *trans, struct btree_path *path),
	TP_ARGS(trans, path)
);

TRACE_EVENT(btree_path_set_pos,
	TP_PROTO(struct btree_trans *trans,
		 struct btree_path *path,
		 struct bpos *new_pos),
	TP_ARGS(trans, path, new_pos),

	TP_STRUCT__entry(
		__field(btree_path_idx_t,	idx		)
		__field(u8,			ref		)
		__field(u8,			preserve	)
		__field(u8,			btree_id	)
		TRACE_BPOS_entries(old_pos)
		TRACE_BPOS_entries(new_pos)
		__field(u8,			locks_want	)
		__field(u8,			nodes_locked	)
		__array(char,			node0, 24	)
		__array(char,			node1, 24	)
		__array(char,			node2, 24	)
		__array(char,			node3, 24	)
	),

	TP_fast_assign(
		__entry->idx			= path - trans->paths;
		__entry->ref			= path->ref;
		__entry->preserve		= path->preserve;
		__entry->btree_id		= path->btree_id;
		TRACE_BPOS_assign(old_pos, path->pos);
		TRACE_BPOS_assign(new_pos, *new_pos);

		__entry->nodes_locked		= path->nodes_locked;
		struct btree *b = path->l[0].b;
		if (IS_ERR(b))
			strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
		else
			scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c);
		b = path->l[1].b;
		if (IS_ERR(b))
			strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
		else
			scnprintf(__entry->node1, sizeof(__entry->node0), "%px", &b->c);
		b = path->l[2].b;
		if (IS_ERR(b))
			strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
		else
			scnprintf(__entry->node2, sizeof(__entry->node0), "%px", &b->c);
		b = path->l[3].b;
		if (IS_ERR(b))
			strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0));
		else
			scnprintf(__entry->node3, sizeof(__entry->node0), "%px", &b->c);
	),

	TP_printk("\npath %3u ref %u preserve %u btree %s %llu:%llu:%u -> %llu:%llu:%u\n"
		  "locks %u %u %u %u node %s %s %s %s",
		  __entry->idx,
		  __entry->ref,
		  __entry->preserve,
		  bch2_btree_id_str(__entry->btree_id),
		  __entry->old_pos_inode,
		  __entry->old_pos_offset,
		  __entry->old_pos_snapshot,
		  __entry->new_pos_inode,
		  __entry->new_pos_offset,
		  __entry->new_pos_snapshot,
		  (__entry->nodes_locked >> 6) & 3,
		  (__entry->nodes_locked >> 4) & 3,
		  (__entry->nodes_locked >> 2) & 3,
		  (__entry->nodes_locked >> 0) & 3,
		  __entry->node3,
		  __entry->node2,
		  __entry->node1,
		  __entry->node0)
);

TRACE_EVENT(btree_path_free,
	TP_PROTO(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup),
	TP_ARGS(trans, path, dup),

	TP_STRUCT__entry(
		__field(btree_path_idx_t,	idx		)
		__field(u8,			preserve	)
		__field(u8,			should_be_locked)
		__field(s8,			dup		)
		__field(u8,			dup_locked	)
	),

	TP_fast_assign(
		__entry->idx			= path;
		__entry->preserve		= trans->paths[path].preserve;
		__entry->should_be_locked	= trans->paths[path].should_be_locked;
		__entry->dup			= dup ? dup - trans->paths : -1;
		__entry->dup_locked		= dup ? btree_node_locked(dup, dup->level) : 0;
	),

	TP_printk(" path %3u %c %c dup %2i locked %u", __entry->idx,
		  __entry->preserve ? 'P' : ' ',
		  __entry->should_be_locked ? 'S' : ' ',
		  __entry->dup,
		  __entry->dup_locked)
);

TRACE_EVENT(btree_path_free_trans_begin,
	TP_PROTO(btree_path_idx_t path),
	TP_ARGS(path),

	TP_STRUCT__entry(
		__field(btree_path_idx_t,	idx		)
	),

	TP_fast_assign(
		__entry->idx			= path;
	),

	TP_printk(" path %3u", __entry->idx)
);

#else /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */
#ifndef _TRACE_BCACHEFS_H

static inline void trace_update_by_path(struct btree_trans *trans, struct btree_path *path,
					struct btree_insert_entry *i, bool overwrite) {}
static inline void trace_btree_path_lock(struct btree_trans *trans, unsigned long caller_ip, struct btree_bkey_cached_common *b) {}
static inline void trace_btree_path_get_ll(struct btree_trans *trans, struct btree_path *path) {}
static inline void trace_btree_path_put_ll(struct btree_trans *trans, struct btree_path *path) {}
static inline void trace_btree_path_should_be_locked(struct btree_trans *trans, struct btree_path *path) {}
static inline void trace_btree_path_alloc(struct btree_trans *trans, struct btree_path *path) {}
static inline void trace_btree_path_get(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {}
static inline void trace_btree_path_clone(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {}
static inline void trace_btree_path_save_pos(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {}
static inline void trace_btree_path_traverse_start(struct btree_trans *trans, struct btree_path *path) {}
static inline void trace_btree_path_traverse_end(struct btree_trans *trans, struct btree_path *path) {}
static inline void trace_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {}
static inline void trace_btree_path_free(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup) {}
static inline void trace_btree_path_free_trans_begin(btree_path_idx_t path) {}

#endif
#endif /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */

#define _TRACE_BCACHEFS_H
#endif /* _TRACE_BCACHEFS_H */

/* This part must be outside protection */
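
An aside on the include-guard shuffle above (editorial, not from the commit): _TRACE_BCACHEFS_H is now defined at the very end rather than at the top, so the #ifndef _TRACE_BCACHEFS_H block, which supplies the no-op stubs when CONFIG_BCACHEFS_PATH_TRACEPOINTS is off, is compiled exactly once even though trace headers are re-expanded via TRACE_HEADER_MULTI_READ. The generic shape of the pattern, with _MY_TRACE_H as a placeholder name:

	#if !defined(_MY_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
	/* TRACE_EVENT() definitions: may be expanded multiple times */

	#ifndef _MY_TRACE_H
	/* one-time declarations, e.g. inline stubs for disabled tracepoints */
	#endif

	#define _MY_TRACE_H	/* set the guard only after the one-time block ran */
	#endif
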
@ -204,7 +204,7 @@ STRTO_H(strtoll, long long)
STRTO_H(strtoull, unsigned long long)
STRTO_H(strtou64, u64)

u64 bch2_read_flag_list(char *opt, const char * const list[])
u64 bch2_read_flag_list(const char *opt, const char * const list[])
{
	u64 ret = 0;
	char *p, *s, *d = kstrdup(opt, GFP_KERNEL);
@ -195,7 +195,7 @@ static inline int bch2_strtoul_h(const char *cp, long *res)

bool bch2_is_zero(const void *, size_t);

u64 bch2_read_flag_list(char *, const char * const[]);
u64 bch2_read_flag_list(const char *, const char * const[]);

void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned);
void bch2_prt_u64_base2(struct printbuf *, u64);

@ -250,17 +250,27 @@ static int __bch2_xattr_emit(const char *prefix,
        return 0;
}

static inline const char *bch2_xattr_prefix(unsigned type, struct dentry *dentry)
{
        const struct xattr_handler *handler = bch2_xattr_type_to_handler(type);

        if (!xattr_handler_can_list(handler, dentry))
                return NULL;

        return xattr_prefix(handler);
}

static int bch2_xattr_emit(struct dentry *dentry,
                           const struct bch_xattr *xattr,
                           struct xattr_buf *buf)
{
        const struct xattr_handler *handler =
                bch2_xattr_type_to_handler(xattr->x_type);
        const char *prefix;

        return handler && (!handler->list || handler->list(dentry))
                ? __bch2_xattr_emit(handler->prefix ?: handler->name,
                                    xattr->x_name, xattr->x_name_len, buf)
                : 0;
        prefix = bch2_xattr_prefix(xattr->x_type, dentry);
        if (!prefix)
                return 0;

        return __bch2_xattr_emit(prefix, xattr->x_name, xattr->x_name_len, buf);
}

static int bch2_xattr_list_bcachefs(struct bch_fs *c,

@ -295,54 +305,23 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
        struct bch_fs *c = dentry->d_sb->s_fs_info;
        struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
        struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
        u64 offset = 0, inum = inode->ei_inode.bi_inum;
        u32 snapshot;
        int ret;
retry:
        bch2_trans_begin(trans);
        iter = (struct btree_iter) { NULL };

        ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
        if (ret)
                goto err;

        for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_xattrs,
                        SPOS(inum, offset, snapshot),
                        POS(inum, U64_MAX), 0, k, ret) {
        int ret = bch2_trans_run(c,
                for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_xattrs,
                                POS(inum, offset),
                                POS(inum, U64_MAX),
                                inode->ei_inum.subvol, 0, k, ({
                if (k.k->type != KEY_TYPE_xattr)
                        continue;

                ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf);
                if (ret)
                        break;
        }
                        bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf);
                }))) ?:
                bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false) ?:
                bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true);

        offset = iter.pos.offset;
        bch2_trans_iter_exit(trans, &iter);
err:
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;

        bch2_trans_put(trans);

        if (ret)
                goto out;

        ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false);
        if (ret)
                goto out;

        ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true);
        if (ret)
                goto out;

        return buf.used;
out:
        return bch2_err_class(ret);
        return ret ? bch2_err_class(ret) : buf.used;
}

static int bch2_xattr_get_handler(const struct xattr_handler *handler,
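
Two idioms carry the rewrite: for_each_btree_key_in_subvolume_upto() folds
the subvolume snapshot lookup and the transaction-restart retry loop into
the iterator itself, and the pieces are then chained with GNU C's binary
"?:", which yields its left operand when that is nonzero - so the first
error short-circuits the rest. A standalone sketch of that chaining idiom
(functions invented for illustration; build with gcc or clang, since
"a ?: b" is a GNU extension):

#include <stdio.h>

static int step_a(void) { return 0; }   /* succeeds */
static int step_b(void) { return -22; } /* fails, e.g. -EINVAL */
static int step_c(void) { return 0; }   /* never evaluated */

int main(void)
{
        /* a ?: b evaluates a once, yields it if nonzero, else yields b. */
        int ret = step_a() ?: step_b() ?: step_c();

        printf("ret = %d\n", ret);      /* prints -22 */
        return 0;
}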

@ -632,10 +611,6 @@ static const struct xattr_handler bch_xattr_bcachefs_effective_handler = {

const struct xattr_handler *bch2_xattr_handlers[] = {
        &bch_xattr_user_handler,
#ifdef CONFIG_BCACHEFS_POSIX_ACL
        &nop_posix_acl_access,
        &nop_posix_acl_default,
#endif
        &bch_xattr_trusted_handler,
        &bch_xattr_security_handler,
#ifndef NO_BCACHEFS_FS

@ -13,7 +13,7 @@ struct bch_xattr {
        __u8            x_type;
        __u8            x_name_len;
        __le16          x_val_len;
        __u8            x_name[];
        __u8            x_name[] __counted_by(x_name_len);
} __packed __aligned(8);

#endif /* _BCACHEFS_XATTR_FORMAT_H */
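
__counted_by() ties a flexible array to the member holding its element
count, so FORTIFY_SOURCE and UBSAN bounds checks know the real length at
runtime. A minimal sketch of the same annotation outside the kernel (the
fallback define stands in for linux/compiler_attributes.h; struct and
function names are invented):

#include <stdlib.h>
#include <string.h>

#ifndef __has_attribute
#define __has_attribute(x) 0
#endif
#if __has_attribute(__counted_by__)
#define __counted_by(member) __attribute__((__counted_by__(member)))
#else
#define __counted_by(member)    /* older compilers: annotation is a no-op */
#endif

struct name_rec {
        unsigned char   len;
        char            name[] __counted_by(len);  /* bounds come from len */
};

static struct name_rec *name_rec_new(const char *s)
{
        size_t n = strlen(s);
        struct name_rec *r;

        if (n > 255)
                return NULL;
        r = malloc(sizeof(*r) + n);
        if (r) {
                r->len = (unsigned char) n; /* set count before filling name[] */
                memcpy(r->name, s, n);
        }
        return r;
}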

@ -438,14 +438,6 @@ static void init_once(void *foo)
        inode_init_once(inode);
}

/*
 * inode->i_lock must be held
 */
void __iget(struct inode *inode)
{
        atomic_inc(&inode->i_count);
}

/*
 * get additional reference to inode; caller must already hold one.
 */

@ -3100,7 +3100,14 @@ static inline bool is_zero_ino(ino_t ino)
        return (u32)ino == 0;
}

extern void __iget(struct inode * inode);
/*
 * inode->i_lock must be held
 */
static inline void __iget(struct inode *inode)
{
        atomic_inc(&inode->i_count);
}

extern void iget_failed(struct inode *);
extern void clear_inode(struct inode *);
extern void __destroy_inode(struct inode *);
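
Moving __iget() out of fs/inode.c and into linux/fs.h as a static inline
lets filesystems bump an inode's refcount without an out-of-line call; the
locking contract is unchanged - inode->i_lock must be held so the new
reference cannot race with eviction. A sketch of the usual caller pattern,
modeled loosely on find_inode() (kernel code; the wrapper name is invented):

static struct inode *grab_inode(struct inode *inode)
{
        spin_lock(&inode->i_lock);
        if (inode->i_state & (I_FREEING | I_WILL_FREE)) {
                /* inode is being torn down: don't take a reference */
                spin_unlock(&inode->i_lock);
                return NULL;
        }
        __iget(inode);          /* safe: i_lock held */
        spin_unlock(&inode->i_lock);
        return inode;
}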

@ -41,6 +41,7 @@
#include <linux/limits.h>
#include <linux/log2.h>
#include <linux/math.h>
#include <linux/slab.h>
#include <linux/types.h>

struct genradix_root;

@ -48,10 +49,63 @@ struct genradix_root;
#define GENRADIX_NODE_SHIFT     9
#define GENRADIX_NODE_SIZE      (1U << GENRADIX_NODE_SHIFT)

#define GENRADIX_ARY            (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *))
#define GENRADIX_ARY_SHIFT      ilog2(GENRADIX_ARY)

/* depth that's needed for a genradix that can address up to ULONG_MAX: */
#define GENRADIX_MAX_DEPTH      \
        DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT)

#define GENRADIX_DEPTH_MASK                             \
        ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
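
Plugging in the constants makes the geometry concrete. On a 64-bit build,
nodes are 1 << 9 = 512 bytes, an interior node holds 512 / 8 = 64 child
pointers (so GENRADIX_ARY_SHIFT is 6), DIV_ROUND_UP(64 - 9, 6) = 10 levels
address all of ULONG_MAX, and the depth mask is roundup_pow_of_two(11) - 1
= 15, which fits in the low bits of a 512-byte-aligned root pointer. A
standalone check of that arithmetic:

#include <stdio.h>

int main(void)
{
        unsigned node_shift = 9;
        unsigned node_size  = 1u << node_shift;            /* 512 */
        unsigned ary        = node_size / sizeof(void *);  /* 64 on LP64 */
        unsigned ary_shift  = 6;                           /* ilog2(64) */
        unsigned max_depth  = (64 - node_shift + ary_shift - 1) / ary_shift;
        unsigned depth_mask = 16 - 1;   /* roundup_pow_of_two(10 + 1) - 1 */

        printf("node %u B, ary %u, max depth %u, mask %u\n",
               node_size, ary, max_depth, depth_mask);     /* 512 64 10 15 */
        return 0;
}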

static inline int genradix_depth_shift(unsigned depth)
{
        return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth;
}

/*
 * Returns size (of data, in bytes) that a tree of a given depth holds:
 */
static inline size_t genradix_depth_size(unsigned depth)
{
        return 1UL << genradix_depth_shift(depth);
}

static inline unsigned genradix_root_to_depth(struct genradix_root *r)
{
        return (unsigned long) r & GENRADIX_DEPTH_MASK;
}

static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r)
{
        return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
}

struct __genradix {
        struct genradix_root            *root;
};

struct genradix_node {
        union {
                /* Interior node: */
                struct genradix_node    *children[GENRADIX_ARY];

                /* Leaf: */
                u8                      data[GENRADIX_NODE_SIZE];
        };
};

static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask)
{
        return kzalloc(GENRADIX_NODE_SIZE, gfp_mask);
}

static inline void genradix_free_node(struct genradix_node *node)
{
        kfree(node);
}

/*
 * NOTE: currently, sizeof(_type) must not be larger than GENRADIX_NODE_SIZE:
 */

@ -128,6 +182,30 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
#define __genradix_idx_to_offset(_radix, _idx)                  \
        __idx_to_offset(_idx, __genradix_obj_size(_radix))

static inline void *__genradix_ptr_inlined(struct __genradix *radix, size_t offset)
{
        struct genradix_root *r = READ_ONCE(radix->root);
        struct genradix_node *n = genradix_root_to_node(r);
        unsigned level = genradix_root_to_depth(r);
        unsigned shift = genradix_depth_shift(level);

        if (unlikely(ilog2(offset) >= genradix_depth_shift(level)))
                return NULL;

        while (n && shift > GENRADIX_NODE_SHIFT) {
                shift -= GENRADIX_ARY_SHIFT;
                n = n->children[offset >> shift];
                offset &= (1UL << shift) - 1;
        }

        return n ? &n->data[offset] : NULL;
}

#define genradix_ptr_inlined(_radix, _idx)                      \
        (__genradix_cast(_radix)                                \
         __genradix_ptr_inlined(&(_radix)->tree,                \
                        __genradix_idx_to_offset(_radix, _idx)))
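
With the node layout and helpers now in the header, genradix_ptr_inlined()
gives callers the whole lookup as an inline radix walk, with no function
call on the fast path. Typical use (kernel code; the struct and names here
are invented for illustration):

struct foo {
        u64     key;
        u32     flags;
};

static genradix(struct foo) foos;

static void foo_mark(size_t i)
{
        struct foo *f = genradix_ptr_inlined(&foos, i);

        if (f)          /* NULL if slot i was never allocated */
                f->flags |= 1;
}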

void *__genradix_ptr(struct __genradix *, size_t);

/**
@ -142,7 +220,24 @@ void *__genradix_ptr(struct __genradix *, size_t);
         __genradix_ptr(&(_radix)->tree,                        \
                        __genradix_idx_to_offset(_radix, _idx)))

void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);
void *__genradix_ptr_alloc(struct __genradix *, size_t,
                           struct genradix_node **, gfp_t);

#define genradix_ptr_alloc_inlined(_radix, _idx, _gfp)          \
        (__genradix_cast(_radix)                                \
         (__genradix_ptr_inlined(&(_radix)->tree,               \
                        __genradix_idx_to_offset(_radix, _idx)) ?: \
          __genradix_ptr_alloc(&(_radix)->tree,                 \
                        __genradix_idx_to_offset(_radix, _idx), \
                        NULL, _gfp)))

#define genradix_ptr_alloc_preallocated_inlined(_radix, _idx, _new_node, _gfp)\
        (__genradix_cast(_radix)                                \
         (__genradix_ptr_inlined(&(_radix)->tree,               \
                        __genradix_idx_to_offset(_radix, _idx)) ?: \
          __genradix_ptr_alloc(&(_radix)->tree,                 \
                        __genradix_idx_to_offset(_radix, _idx), \
                        _new_node, _gfp)))

/**
 * genradix_ptr_alloc - get a pointer to a genradix entry, allocating it
@ -157,7 +252,13 @@ void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t);
        (__genradix_cast(_radix)                                \
         __genradix_ptr_alloc(&(_radix)->tree,                  \
                        __genradix_idx_to_offset(_radix, _idx), \
                        _gfp))
                        NULL, _gfp))

#define genradix_ptr_alloc_preallocated(_radix, _idx, _new_node, _gfp)\
        (__genradix_cast(_radix)                                \
         __genradix_ptr_alloc(&(_radix)->tree,                  \
                        __genradix_idx_to_offset(_radix, _idx), \
                        _new_node, _gfp))
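
The preallocated variants exist so a caller can allocate while it is still
allowed to sleep, then complete the insert from a context that cannot. The
node is handed in by address and consumed only if the tree actually needs
to grow; as I read the .c change below, an unused preallocation is freed by
__genradix_ptr_alloc() itself. A sketch of the intended shape (kernel code;
ctx, foo and the lock are invented):

struct my_ctx {
        spinlock_t              lock;
        genradix(struct foo)    foos;
};

static struct foo *foo_get_alloc(struct my_ctx *ctx, size_t idx)
{
        /* May sleep: preallocate while blocking is still allowed. */
        struct genradix_node *new_node = genradix_alloc_node(GFP_KERNEL);
        struct foo *f;

        spin_lock(&ctx->lock);
        /* Won't sleep: consumes new_node only if the tree must grow. */
        f = genradix_ptr_alloc_preallocated(&ctx->foos, idx,
                                            &new_node, GFP_NOWAIT);
        spin_unlock(&ctx->lock);

        return f;
}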

struct genradix_iter {
        size_t                  offset;

@ -5,99 +5,31 @@
#include <linux/gfp.h>
#include <linux/kmemleak.h>

#define GENRADIX_ARY            (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *))
#define GENRADIX_ARY_SHIFT      ilog2(GENRADIX_ARY)

struct genradix_node {
        union {
                /* Interior node: */
                struct genradix_node    *children[GENRADIX_ARY];

                /* Leaf: */
                u8                      data[GENRADIX_NODE_SIZE];
        };
};

static inline int genradix_depth_shift(unsigned depth)
{
        return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth;
}

/*
 * Returns size (of data, in bytes) that a tree of a given depth holds:
 */
static inline size_t genradix_depth_size(unsigned depth)
{
        return 1UL << genradix_depth_shift(depth);
}

/* depth that's needed for a genradix that can address up to ULONG_MAX: */
#define GENRADIX_MAX_DEPTH      \
        DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT)

#define GENRADIX_DEPTH_MASK                             \
        ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))

static inline unsigned genradix_root_to_depth(struct genradix_root *r)
{
        return (unsigned long) r & GENRADIX_DEPTH_MASK;
}

static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r)
{
        return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK);
}

/*
 * Returns pointer to the specified byte @offset within @radix, or NULL if not
 * allocated
 */
void *__genradix_ptr(struct __genradix *radix, size_t offset)
{
        struct genradix_root *r = READ_ONCE(radix->root);
        struct genradix_node *n = genradix_root_to_node(r);
        unsigned level = genradix_root_to_depth(r);

        if (ilog2(offset) >= genradix_depth_shift(level))
                return NULL;

        while (1) {
                if (!n)
                        return NULL;
                if (!level)
                        break;

                level--;

                n = n->children[offset >> genradix_depth_shift(level)];
                offset &= genradix_depth_size(level) - 1;
        }

        return &n->data[offset];
        return __genradix_ptr_inlined(radix, offset);
}
EXPORT_SYMBOL(__genradix_ptr);

static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask)
{
        return kzalloc(GENRADIX_NODE_SIZE, gfp_mask);
}

static inline void genradix_free_node(struct genradix_node *node)
{
        kfree(node);
}

/*
 * Returns pointer to the specified byte @offset within @radix, allocating it if
 * necessary - newly allocated slots are always zeroed out:
 */
void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset,
                           struct genradix_node **preallocated,
                           gfp_t gfp_mask)
{
        struct genradix_root *v = READ_ONCE(radix->root);
        struct genradix_node *n, *new_node = NULL;
        unsigned level;

        if (preallocated)
                swap(new_node, *preallocated);

        /* Increase tree depth if necessary: */
        while (1) {
                struct genradix_root *r = v, *new_root;

@ -281,7 +213,7 @@ int __genradix_prealloc(struct __genradix *radix, size_t size,
        size_t offset;

        for (offset = 0; offset < size; offset += GENRADIX_NODE_SIZE)
                if (!__genradix_ptr_alloc(radix, offset, gfp_mask))
                if (!__genradix_ptr_alloc(radix, offset, NULL, gfp_mask))
                        return -ENOMEM;

        return 0;