mirror of
https://mirrors.bfsu.edu.cn/git/linux.git
synced 2024-11-13 14:24:11 +08:00
bcachefs fixes for 6.12-rc4
- New metadata version inode_has_child_snapshots This fixes bugs with handling of unlinked inodes + snapshots, in particular when an inode is reattached after taking a snapshot; deleted inodes now get correctly cleaned up across snapshots. - Disk accounting rewrite fixes - validation fixes for when a device has been removed - fix journal replay failing with "journal_reclaim_would_deadlock" - Some more small fixes for erasure coding + device removal - Assorted small syzbot fixes -----BEGIN PGP SIGNATURE----- iQIzBAABCgAdFiEEKnAFLkS8Qha+jvQrE6szbY3KbnYFAmcNw4UACgkQE6szbY3K bnbSzBAAmSCCQCqRwnFSp4OdNSlBK9q1e5WsbKOqHgtoXZU/mOUBe/5bnPPqm6Mg GkTc7FqVOs/95/rEDKXw2LneFgxRrt8MriJCUdXZvV5fC2R4Kdl0TkwABtMtm2Ae wp37n6iQO81j4uZHfOj67RzC2NRo7dMdun5HnQPRBTKzyuDaZXqwjMmF2LmaeODh oiBFUvD5nFBo5XvXPABBin6xpdquHO+6ZWf6SFD4+iRe11NrJAOAIS/crJvxsFfr I/X152Z+gzKPE+NhANKMxlHyNnVGo7iHUqhUjVuI4SSaXb9Ap6k4sXgfoIzncR17 GA5qWtaNS1W72+awT3R2EaF9Tqi+Vng2RVfxxQ04giImnBq0eziOjlZ26enOE0LU 0ZZrBFzqpItqYbNnzPissHuKb1mAQGPWy6kxoGIrqDKbichA7lzyWDz2lgEE85Sx E1mvHwYbKhUuLC4c4460hueGVUgMWmjqM3E8oex+oNDpauPB+/bnYkcgZEG2RBla +ZlDL28fg4fxtqlUrOQeonQ1RecGNdRMJz7xiGnkYU9rQpUuv8QwFiBZGAbLP6zn 6fbFZGxS/pO95sY7GmAtKz7ZgKxJQCzII4s+Oht5AgOvoBlPjAiol1UbwYadYQxz HKF+WBaPC9z/L6JjP+gx+uUzTWRIfBmhHylhWbKr4vLGfx3Jc1g= =Rkq2 -----END PGP SIGNATURE----- Merge tag 'bcachefs-2024-10-14' of git://evilpiepirate.org/bcachefs Pull bcachefs fixes from Kent Overstreet: - New metadata version inode_has_child_snapshots This fixes bugs with handling of unlinked inodes + snapshots, in particular when an inode is reattached after taking a snapshot; deleted inodes now get correctly cleaned up across snapshots. 
- Disk accounting rewrite fixes - validation fixes for when a device has been removed - fix journal replay failing with "journal_reclaim_would_deadlock" - Some more small fixes for erasure coding + device removal - Assorted small syzbot fixes * tag 'bcachefs-2024-10-14' of git://evilpiepirate.org/bcachefs: (27 commits) bcachefs: Fix sysfs warning in fstests generic/730,731 bcachefs: Handle race between stripe reuse, invalidate_stripe_to_dev bcachefs: Fix kasan splat in new_stripe_alloc_buckets() bcachefs: Add missing validation for bch_stripe.csum_granularity_bits bcachefs: Fix missing bounds checks in bch2_alloc_read() bcachefs: fix uaf in bch2_dio_write_done() bcachefs: Improve check_snapshot_exists() bcachefs: Fix bkey_nocow_lock() bcachefs: Fix accounting replay flags bcachefs: Fix invalid shift in member_to_text() bcachefs: Fix bch2_have_enough_devs() for BCH_SB_MEMBER_INVALID bcachefs: __wait_for_freeing_inode: Switch to wait_bit_queue_entry bcachefs: Check if stuck in journal_res_get() closures: Add closure_wait_event_timeout() bcachefs: Fix state lock involved deadlock bcachefs: Fix NULL pointer dereference in bch2_opt_to_text bcachefs: Release transaction before wake up bcachefs: add check for btree id against max in try read node bcachefs: Disk accounting device validation fixes bcachefs: bch2_inode_or_descendents_is_open() ...
This commit is contained in:
commit
bdc7276512
@ -639,6 +639,16 @@ int bch2_alloc_read(struct bch_fs *c)
|
||||
continue;
|
||||
}
|
||||
|
||||
if (k.k->p.offset < ca->mi.first_bucket) {
|
||||
bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket));
|
||||
continue;
|
||||
}
|
||||
|
||||
if (k.k->p.offset >= ca->mi.nbuckets) {
|
||||
bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
|
||||
continue;
|
||||
}
|
||||
|
||||
struct bch_alloc_v4 a;
|
||||
*bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
|
||||
0;
|
||||
|
@ -678,7 +678,8 @@ struct bch_sb_field_ext {
|
||||
x(disk_accounting_v2, BCH_VERSION(1, 9)) \
|
||||
x(disk_accounting_v3, BCH_VERSION(1, 10)) \
|
||||
x(disk_accounting_inum, BCH_VERSION(1, 11)) \
|
||||
x(rebalance_work_acct_fix, BCH_VERSION(1, 12))
|
||||
x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \
|
||||
x(inode_has_child_snapshots, BCH_VERSION(1, 13))
|
||||
|
||||
enum bcachefs_metadata_version {
|
||||
bcachefs_metadata_version_min = 9,
|
||||
|
@ -1224,17 +1224,20 @@ int bch2_gc_gens(struct bch_fs *c)
|
||||
u64 b, start_time = local_clock();
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* Ideally we would be using state_lock and not gc_gens_lock here, but that
|
||||
* introduces a deadlock in the RO path - we currently take the state
|
||||
* lock at the start of going RO, thus the gc thread may get stuck:
|
||||
*/
|
||||
if (!mutex_trylock(&c->gc_gens_lock))
|
||||
return 0;
|
||||
|
||||
trace_and_count(c, gc_gens_start, c);
|
||||
|
||||
down_read(&c->state_lock);
|
||||
/*
|
||||
* We have to use trylock here. Otherwise, we would
|
||||
* introduce a deadlock in the RO path - we take the
|
||||
* state lock at the start of going RO.
|
||||
*/
|
||||
if (!down_read_trylock(&c->state_lock)) {
|
||||
mutex_unlock(&c->gc_gens_lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
for_each_member_device(c, ca) {
|
||||
struct bucket_gens *gens = bucket_gens(ca);
|
||||
|
@ -1838,10 +1838,11 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
|
||||
struct btree_trans *trans = bch2_trans_get(c);
|
||||
|
||||
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
|
||||
|
||||
/* we don't need transaction context anymore after we got the lock. */
|
||||
bch2_trans_put(trans);
|
||||
__btree_node_write_done(c, b);
|
||||
six_unlock_read(&b->c.lock);
|
||||
|
||||
bch2_trans_put(trans);
|
||||
}
|
||||
|
||||
static void btree_node_write_work(struct work_struct *work)
|
||||
|
@ -2381,9 +2381,9 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
|
||||
else
|
||||
iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
|
||||
|
||||
if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
|
||||
? bkey_gt(iter_pos, end)
|
||||
: bkey_ge(iter_pos, end)))
|
||||
if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(iter_pos, end) :
|
||||
iter->flags & BTREE_ITER_is_extents ? bkey_ge(iter_pos, end) :
|
||||
bkey_gt(iter_pos, end)))
|
||||
goto end;
|
||||
|
||||
break;
|
||||
|
@ -857,6 +857,14 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
|
||||
for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\
|
||||
SPOS_MAX, _flags, _k, _ret)
|
||||
|
||||
/*
 * Iterate backwards over keys in @_btree_id, starting from @_start.
 *
 * The iterator is initialized once with bch2_trans_iter_init(); on every pass
 * the key returned by bch2_btree_iter_peek_prev_type() is stored in @_k and
 * its error status (bkey_err()) in @_ret.  The loop ends on the first error or
 * when no key is returned, and steps with bch2_btree_iter_rewind().
 *
 * The macro never exits the iterator itself; the caller still has to do that
 * once the loop is finished.
 */
#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id,	\
					     _start, _flags, _k, _ret)	\
	for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id),	\
				  (_start), (_flags));			\
	     (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags)),	\
	     !((_ret) = bkey_err(_k)) && (_k).k;			\
	     bch2_btree_iter_rewind(&(_iter)))
|
||||
|
||||
#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
|
||||
for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret)
|
||||
|
||||
|
@ -171,6 +171,9 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
|
||||
if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
|
||||
return;
|
||||
|
||||
if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX)
|
||||
return;
|
||||
|
||||
rcu_read_lock();
|
||||
struct found_btree_node n = {
|
||||
.btree_id = BTREE_NODE_ID(bn),
|
||||
|
@ -80,6 +80,7 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc
|
||||
if (ptr2 == ptr)
|
||||
break;
|
||||
|
||||
ca = bch2_dev_have_ref(c, ptr2->dev);
|
||||
bucket = PTR_BUCKET_POS(ca, ptr2);
|
||||
bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
|
||||
}
|
||||
|
@ -242,6 +242,14 @@ void bch2_accounting_swab(struct bkey_s k)
|
||||
*p = swab64(*p);
|
||||
}
|
||||
|
||||
static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r,
|
||||
struct disk_accounting_pos acc)
|
||||
{
|
||||
unsafe_memcpy(r, &acc.replicas,
|
||||
replicas_entry_bytes(&acc.replicas),
|
||||
"variable length struct");
|
||||
}
|
||||
|
||||
static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p)
|
||||
{
|
||||
struct disk_accounting_pos acc_k;
|
||||
@ -249,9 +257,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc
|
||||
|
||||
switch (acc_k.type) {
|
||||
case BCH_DISK_ACCOUNTING_replicas:
|
||||
unsafe_memcpy(r, &acc_k.replicas,
|
||||
replicas_entry_bytes(&acc_k.replicas),
|
||||
"variable length struct");
|
||||
__accounting_to_replicas(r, acc_k);
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
@ -608,6 +614,81 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
|
||||
struct disk_accounting_pos acc,
|
||||
u64 *v, unsigned nr)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct printbuf buf = PRINTBUF;
|
||||
int ret = 0, invalid_dev = -1;
|
||||
|
||||
switch (acc.type) {
|
||||
case BCH_DISK_ACCOUNTING_replicas: {
|
||||
struct bch_replicas_padded r;
|
||||
__accounting_to_replicas(&r.e, acc);
|
||||
|
||||
for (unsigned i = 0; i < r.e.nr_devs; i++)
|
||||
if (r.e.devs[i] != BCH_SB_MEMBER_INVALID &&
|
||||
!bch2_dev_exists(c, r.e.devs[i])) {
|
||||
invalid_dev = r.e.devs[i];
|
||||
goto invalid_device;
|
||||
}
|
||||
|
||||
/*
|
||||
* All replicas entry checks except for invalid device are done
|
||||
* in bch2_accounting_validate
|
||||
*/
|
||||
BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf));
|
||||
|
||||
if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
|
||||
trans, accounting_replicas_not_marked,
|
||||
"accounting not marked in superblock replicas\n %s",
|
||||
(printbuf_reset(&buf),
|
||||
bch2_accounting_key_to_text(&buf, &acc),
|
||||
buf.buf))) {
|
||||
/*
|
||||
* We're not RW yet and still single threaded, dropping
|
||||
* and retaking lock is ok:
|
||||
*/
|
||||
percpu_up_write(&c->mark_lock);
|
||||
ret = bch2_mark_replicas(c, &r.e);
|
||||
if (ret)
|
||||
goto fsck_err;
|
||||
percpu_down_write(&c->mark_lock);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case BCH_DISK_ACCOUNTING_dev_data_type:
|
||||
if (!bch2_dev_exists(c, acc.dev_data_type.dev)) {
|
||||
invalid_dev = acc.dev_data_type.dev;
|
||||
goto invalid_device;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
fsck_err:
|
||||
printbuf_exit(&buf);
|
||||
return ret;
|
||||
invalid_device:
|
||||
if (fsck_err(trans, accounting_to_invalid_device,
|
||||
"accounting entry points to invalid device %i\n %s",
|
||||
invalid_dev,
|
||||
(printbuf_reset(&buf),
|
||||
bch2_accounting_key_to_text(&buf, &acc),
|
||||
buf.buf))) {
|
||||
for (unsigned i = 0; i < nr; i++)
|
||||
v[i] = -v[i];
|
||||
|
||||
ret = commit_do(trans, NULL, NULL, 0,
|
||||
bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?:
|
||||
-BCH_ERR_remove_disk_accounting_entry;
|
||||
} else {
|
||||
ret = -BCH_ERR_remove_disk_accounting_entry;
|
||||
}
|
||||
goto fsck_err;
|
||||
}
|
||||
|
||||
/*
|
||||
* At startup time, initialize the in memory accounting from the btree (and
|
||||
* journal)
|
||||
@ -666,44 +747,42 @@ int bch2_accounting_read(struct bch_fs *c)
|
||||
}
|
||||
keys->gap = keys->nr = dst - keys->data;
|
||||
|
||||
percpu_down_read(&c->mark_lock);
|
||||
for (unsigned i = 0; i < acc->k.nr; i++) {
|
||||
percpu_down_write(&c->mark_lock);
|
||||
unsigned i = 0;
|
||||
while (i < acc->k.nr) {
|
||||
unsigned idx = inorder_to_eytzinger0(i, acc->k.nr);
|
||||
|
||||
struct disk_accounting_pos acc_k;
|
||||
bpos_to_disk_accounting_pos(&acc_k, acc->k.data[idx].pos);
|
||||
|
||||
u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
|
||||
bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);
|
||||
|
||||
if (bch2_is_zero(v, sizeof(v[0]) * acc->k.data[i].nr_counters))
|
||||
continue;
|
||||
|
||||
struct bch_replicas_padded r;
|
||||
if (!accounting_to_replicas(&r.e, acc->k.data[i].pos))
|
||||
continue;
|
||||
bch2_accounting_mem_read_counters(acc, idx, v, ARRAY_SIZE(v), false);
|
||||
|
||||
/*
|
||||
* If the replicas entry is invalid it'll get cleaned up by
|
||||
* check_allocations:
|
||||
* If the entry counters are zeroed, it should be treated as
|
||||
* nonexistent - it might point to an invalid device.
|
||||
*
|
||||
* Remove it, so that if it's re-added it gets re-marked in the
|
||||
* superblock:
|
||||
*/
|
||||
if (bch2_replicas_entry_validate(&r.e, c, &buf))
|
||||
ret = bch2_is_zero(v, sizeof(v[0]) * acc->k.data[idx].nr_counters)
|
||||
? -BCH_ERR_remove_disk_accounting_entry
|
||||
: bch2_disk_accounting_validate_late(trans, acc_k,
|
||||
v, acc->k.data[idx].nr_counters);
|
||||
|
||||
if (ret == -BCH_ERR_remove_disk_accounting_entry) {
|
||||
free_percpu(acc->k.data[idx].v[0]);
|
||||
free_percpu(acc->k.data[idx].v[1]);
|
||||
darray_remove_item(&acc->k, &acc->k.data[idx]);
|
||||
eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
|
||||
accounting_pos_cmp, NULL);
|
||||
ret = 0;
|
||||
continue;
|
||||
|
||||
struct disk_accounting_pos k;
|
||||
bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos);
|
||||
|
||||
if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
|
||||
trans, accounting_replicas_not_marked,
|
||||
"accounting not marked in superblock replicas\n %s",
|
||||
(printbuf_reset(&buf),
|
||||
bch2_accounting_key_to_text(&buf, &k),
|
||||
buf.buf))) {
|
||||
/*
|
||||
* We're not RW yet and still single threaded, dropping
|
||||
* and retaking lock is ok:
|
||||
*/
|
||||
percpu_up_read(&c->mark_lock);
|
||||
ret = bch2_mark_replicas(c, &r.e);
|
||||
if (ret)
|
||||
goto fsck_err;
|
||||
percpu_down_read(&c->mark_lock);
|
||||
}
|
||||
|
||||
if (ret)
|
||||
goto fsck_err;
|
||||
i++;
|
||||
}
|
||||
|
||||
preempt_disable();
|
||||
@ -742,7 +821,7 @@ int bch2_accounting_read(struct bch_fs *c)
|
||||
}
|
||||
preempt_enable();
|
||||
fsck_err:
|
||||
percpu_up_read(&c->mark_lock);
|
||||
percpu_up_write(&c->mark_lock);
|
||||
err:
|
||||
printbuf_exit(&buf);
|
||||
bch2_trans_put(trans);
|
||||
|
@ -124,6 +124,11 @@ int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k,
|
||||
"incorrect value size (%zu < %u)",
|
||||
bkey_val_u64s(k.k), stripe_val_u64s(s));
|
||||
|
||||
bkey_fsck_err_on(s->csum_granularity_bits >= 64,
|
||||
c, stripe_csum_granularity_bad,
|
||||
"invalid csum granularity (%u >= 64)",
|
||||
s->csum_granularity_bits);
|
||||
|
||||
ret = bch2_bkey_ptrs_validate(c, k, flags);
|
||||
fsck_err:
|
||||
return ret;
|
||||
@ -145,7 +150,11 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
|
||||
nr_data,
|
||||
s.nr_redundant);
|
||||
bch2_prt_csum_type(out, s.csum_type);
|
||||
prt_printf(out, " gran %u", 1U << s.csum_granularity_bits);
|
||||
prt_str(out, " gran ");
|
||||
if (s.csum_granularity_bits < 64)
|
||||
prt_printf(out, "%llu", 1ULL << s.csum_granularity_bits);
|
||||
else
|
||||
prt_printf(out, "(invalid shift %u)", s.csum_granularity_bits);
|
||||
|
||||
if (s.disk_label) {
|
||||
prt_str(out, " label");
|
||||
@ -1197,47 +1206,62 @@ void bch2_do_stripe_deletes(struct bch_fs *c)
|
||||
/* stripe creation: */
|
||||
|
||||
static int ec_stripe_key_update(struct btree_trans *trans,
|
||||
struct bkey_i_stripe *new,
|
||||
bool create)
|
||||
struct bkey_i_stripe *old,
|
||||
struct bkey_i_stripe *new)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
int ret;
|
||||
bool create = !old;
|
||||
|
||||
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
|
||||
new->k.p, BTREE_ITER_intent);
|
||||
ret = bkey_err(k);
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
|
||||
new->k.p, BTREE_ITER_intent);
|
||||
int ret = bkey_err(k);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) {
|
||||
bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s",
|
||||
create ? "creating" : "updating",
|
||||
bch2_bkey_types[k.k->type]);
|
||||
if (bch2_fs_inconsistent_on(k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe),
|
||||
c, "error %s stripe: got existing key type %s",
|
||||
create ? "creating" : "updating",
|
||||
bch2_bkey_types[k.k->type])) {
|
||||
ret = -EINVAL;
|
||||
goto err;
|
||||
}
|
||||
|
||||
if (k.k->type == KEY_TYPE_stripe) {
|
||||
const struct bch_stripe *old = bkey_s_c_to_stripe(k).v;
|
||||
unsigned i;
|
||||
const struct bch_stripe *v = bkey_s_c_to_stripe(k).v;
|
||||
|
||||
if (old->nr_blocks != new->v.nr_blocks) {
|
||||
bch_err(c, "error updating stripe: nr_blocks does not match");
|
||||
ret = -EINVAL;
|
||||
goto err;
|
||||
}
|
||||
BUG_ON(old->v.nr_blocks != new->v.nr_blocks);
|
||||
BUG_ON(old->v.nr_blocks != v->nr_blocks);
|
||||
|
||||
for (i = 0; i < new->v.nr_blocks; i++) {
|
||||
unsigned v = stripe_blockcount_get(old, i);
|
||||
for (unsigned i = 0; i < new->v.nr_blocks; i++) {
|
||||
unsigned sectors = stripe_blockcount_get(v, i);
|
||||
|
||||
BUG_ON(v &&
|
||||
(old->ptrs[i].dev != new->v.ptrs[i].dev ||
|
||||
old->ptrs[i].gen != new->v.ptrs[i].gen ||
|
||||
old->ptrs[i].offset != new->v.ptrs[i].offset));
|
||||
if (!bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]) && sectors) {
|
||||
struct printbuf buf = PRINTBUF;
|
||||
|
||||
stripe_blockcount_set(&new->v, i, v);
|
||||
prt_printf(&buf, "stripe changed nonempty block %u", i);
|
||||
prt_str(&buf, "\nold: ");
|
||||
bch2_bkey_val_to_text(&buf, c, k);
|
||||
prt_str(&buf, "\nnew: ");
|
||||
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new->k_i));
|
||||
bch2_fs_inconsistent(c, "%s", buf.buf);
|
||||
printbuf_exit(&buf);
|
||||
ret = -EINVAL;
|
||||
goto err;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the stripe ptr changed underneath us, it must have
|
||||
* been dev_remove_stripes() -> invalidate_stripe_to_dev()
|
||||
*/
|
||||
if (!bch2_extent_ptr_eq(old->v.ptrs[i], v->ptrs[i])) {
|
||||
BUG_ON(v->ptrs[i].dev != BCH_SB_MEMBER_INVALID);
|
||||
|
||||
if (bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]))
|
||||
new->v.ptrs[i].dev = BCH_SB_MEMBER_INVALID;
|
||||
}
|
||||
|
||||
stripe_blockcount_set(&new->v, i, sectors);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1499,8 +1523,10 @@ static void ec_stripe_create(struct ec_stripe_new *s)
|
||||
BCH_TRANS_COMMIT_no_check_rw|
|
||||
BCH_TRANS_COMMIT_no_enospc,
|
||||
ec_stripe_key_update(trans,
|
||||
bkey_i_to_stripe(&s->new_stripe.key),
|
||||
!s->have_existing_stripe));
|
||||
s->have_existing_stripe
|
||||
? bkey_i_to_stripe(&s->existing_stripe.key)
|
||||
: NULL,
|
||||
bkey_i_to_stripe(&s->new_stripe.key)));
|
||||
bch_err_msg(c, ret, "creating stripe key");
|
||||
if (ret) {
|
||||
goto err;
|
||||
@ -1876,7 +1902,15 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
|
||||
bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
|
||||
|
||||
for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) {
|
||||
__clear_bit(v->ptrs[i].dev, devs.d);
|
||||
/*
|
||||
* Note: we don't yet repair invalid blocks (failed/removed
|
||||
* devices) when reusing stripes - we still need a codepath to
|
||||
* walk backpointers and update all extents that point to that
|
||||
* block when updating the stripe
|
||||
*/
|
||||
if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID)
|
||||
__clear_bit(v->ptrs[i].dev, devs.d);
|
||||
|
||||
if (i < h->s->nr_data)
|
||||
nr_have_data++;
|
||||
else
|
||||
|
@ -268,7 +268,8 @@
|
||||
x(BCH_ERR_nopromote, nopromote_no_writes) \
|
||||
x(BCH_ERR_nopromote, nopromote_enomem) \
|
||||
x(0, invalid_snapshot_node) \
|
||||
x(0, option_needs_open_fs)
|
||||
x(0, option_needs_open_fs) \
|
||||
x(0, remove_disk_accounting_entry)
|
||||
|
||||
enum bch_errcode {
|
||||
BCH_ERR_START = 2048,
|
||||
|
@ -695,6 +695,16 @@ void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
|
||||
int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c,
|
||||
enum bch_validate_flags);
|
||||
|
||||
static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1,
|
||||
struct bch_extent_ptr ptr2)
|
||||
{
|
||||
return (ptr1.cached == ptr2.cached &&
|
||||
ptr1.unwritten == ptr2.unwritten &&
|
||||
ptr1.offset == ptr2.offset &&
|
||||
ptr1.dev == ptr2.dev &&
|
||||
ptr1.dev == ptr2.dev);
|
||||
}
|
||||
|
||||
void bch2_ptr_swab(struct bkey_s);
|
||||
|
||||
const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
|
||||
|
@ -369,6 +369,7 @@ static noinline void bch2_dio_write_flush(struct dio_write *dio)
|
||||
|
||||
static __always_inline long bch2_dio_write_done(struct dio_write *dio)
|
||||
{
|
||||
struct bch_fs *c = dio->op.c;
|
||||
struct kiocb *req = dio->req;
|
||||
struct bch_inode_info *inode = dio->inode;
|
||||
bool sync = dio->sync;
|
||||
@ -387,7 +388,7 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio)
|
||||
ret = dio->op.error ?: ((long) dio->written << 9);
|
||||
bio_put(&dio->op.wbio.bio);
|
||||
|
||||
bch2_write_ref_put(dio->op.c, BCH_WRITE_REF_dio_write);
|
||||
bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
|
||||
|
||||
/* inode->i_dio_count is our ref on inode and thus bch_fs */
|
||||
inode_dio_end(&inode->v);
|
||||
|
102
fs/bcachefs/fs.c
102
fs/bcachefs/fs.c
@ -157,6 +157,20 @@ static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
|
||||
return a.subvol == b.subvol && a.inum == b.inum;
|
||||
}
|
||||
|
||||
/*
 * rhashtable key hash: @data points at a subvol_inum, but only its inode
 * number is fed to jhash() - the subvolume ID is deliberately left out, and
 * @len is ignored.
 */
static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed)
{
	const subvol_inum *key = data;

	return jhash(&key->inum, sizeof(key->inum), seed);
}
|
||||
|
||||
/*
 * rhashtable object hash: @data points at a bch_inode_info; hash its ei_inum
 * with the key hash function so objects and lookup keys hash identically.
 */
static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed)
{
	const struct bch_inode_info *obj = data;
	const subvol_inum *key = &obj->ei_inum;

	return bch2_vfs_inode_hash_fn(key, sizeof(*key), seed);
}
|
||||
|
||||
static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg,
|
||||
const void *obj)
|
||||
{
|
||||
@ -170,11 +184,91 @@ static const struct rhashtable_params bch2_vfs_inodes_params = {
|
||||
.head_offset = offsetof(struct bch_inode_info, hash),
|
||||
.key_offset = offsetof(struct bch_inode_info, ei_inum),
|
||||
.key_len = sizeof(subvol_inum),
|
||||
.hashfn = bch2_vfs_inode_hash_fn,
|
||||
.obj_hashfn = bch2_vfs_inode_obj_hash_fn,
|
||||
.obj_cmpfn = bch2_vfs_inode_cmp_fn,
|
||||
.automatic_shrinking = true,
|
||||
};
|
||||
|
||||
struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
|
||||
int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct rhashtable *ht = &c->vfs_inodes_table;
|
||||
subvol_inum inum = (subvol_inum) { .inum = p.offset };
|
||||
DARRAY(u32) subvols;
|
||||
int ret = 0;
|
||||
|
||||
if (!test_bit(BCH_FS_started, &c->flags))
|
||||
return false;
|
||||
|
||||
darray_init(&subvols);
|
||||
restart_from_top:
|
||||
|
||||
/*
|
||||
* Tweaked version of __rhashtable_lookup(); we need to get a list of
|
||||
* subvolumes in which the given inode number is open.
|
||||
*
|
||||
* For this to work, we don't include the subvolume ID in the key that
|
||||
* we hash - all inodes with the same inode number regardless of
|
||||
* subvolume will hash to the same slot.
|
||||
*
|
||||
* This will be less than ideal if the same file is ever open
|
||||
* simultaneously in many different snapshots:
|
||||
*/
|
||||
rcu_read_lock();
|
||||
struct rhash_lock_head __rcu *const *bkt;
|
||||
struct rhash_head *he;
|
||||
unsigned int hash;
|
||||
struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
|
||||
restart:
|
||||
hash = rht_key_hashfn(ht, tbl, &inum, bch2_vfs_inodes_params);
|
||||
bkt = rht_bucket(tbl, hash);
|
||||
do {
|
||||
struct bch_inode_info *inode;
|
||||
|
||||
rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) {
|
||||
if (inode->ei_inum.inum == inum.inum) {
|
||||
ret = darray_push_gfp(&subvols, inode->ei_inum.subvol,
|
||||
GFP_NOWAIT|__GFP_NOWARN);
|
||||
if (ret) {
|
||||
rcu_read_unlock();
|
||||
ret = darray_make_room(&subvols, 1);
|
||||
if (ret)
|
||||
goto err;
|
||||
subvols.nr = 0;
|
||||
goto restart_from_top;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* An object might have been moved to a different hash chain,
|
||||
* while we walk along it - better check and retry.
|
||||
*/
|
||||
} while (he != RHT_NULLS_MARKER(bkt));
|
||||
|
||||
/* Ensure we see any new tables. */
|
||||
smp_rmb();
|
||||
|
||||
tbl = rht_dereference_rcu(tbl->future_tbl, ht);
|
||||
if (unlikely(tbl))
|
||||
goto restart;
|
||||
rcu_read_unlock();
|
||||
|
||||
darray_for_each(subvols, i) {
|
||||
u32 snap;
|
||||
ret = bch2_subvolume_get_snapshot(trans, *i, &snap);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot);
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
err:
|
||||
darray_exit(&subvols);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Look up @inum in the VFS inode hash table; returns whatever
 * rhashtable_lookup_fast() finds for that key.
 */
static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
{
	struct rhashtable *table = &c->vfs_inodes_table;

	return rhashtable_lookup_fast(table, &inum, bch2_vfs_inodes_params);
}
|
||||
@ -184,7 +278,8 @@ static void __wait_on_freeing_inode(struct bch_fs *c,
|
||||
subvol_inum inum)
|
||||
{
|
||||
wait_queue_head_t *wq;
|
||||
DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
|
||||
struct wait_bit_queue_entry wait;
|
||||
|
||||
wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW);
|
||||
prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
|
||||
spin_unlock(&inode->v.i_lock);
|
||||
@ -252,7 +347,8 @@ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
|
||||
|
||||
set_bit(EI_INODE_HASHED, &inode->ei_flags);
|
||||
retry:
|
||||
if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table,
|
||||
if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table,
|
||||
&inode->ei_inum,
|
||||
&inode->hash,
|
||||
bch2_vfs_inodes_params))) {
|
||||
old = bch2_inode_hash_find(c, trans, inode->ei_inum);
|
||||
|
@ -54,8 +54,6 @@ static inline subvol_inum inode_inum(struct bch_inode_info *inode)
|
||||
return inode->ei_inum;
|
||||
}
|
||||
|
||||
struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum);
|
||||
|
||||
/*
|
||||
* Set if we've gotten a btree error for this inode, and thus the vfs inode and
|
||||
* btree inode may be inconsistent:
|
||||
@ -148,6 +146,8 @@ struct bch_inode_info *
|
||||
__bch2_create(struct mnt_idmap *, struct bch_inode_info *,
|
||||
struct dentry *, umode_t, dev_t, subvol_inum, unsigned);
|
||||
|
||||
int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p);
|
||||
|
||||
int bch2_fs_quota_transfer(struct bch_fs *,
|
||||
struct bch_inode_info *,
|
||||
struct bch_qid,
|
||||
@ -198,10 +198,7 @@ int bch2_vfs_init(void);
|
||||
|
||||
#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); })
|
||||
|
||||
static inline struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
static inline int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { return 0; }
|
||||
|
||||
static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
|
||||
snapshot_id_list *s) {}
|
||||
|
@ -326,17 +326,54 @@ err:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
|
||||
{
|
||||
if (inode->bi_inum == BCACHEFS_ROOT_INO &&
|
||||
inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
|
||||
return false;
|
||||
|
||||
return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked);
|
||||
}
|
||||
|
||||
/*
 * Look up the dirent at (@d_pos.inode, @d_pos.offset) in @snapshot; if the key
 * found there is exactly the key at @d_pos (snapshot field included), queue a
 * whiteout at the iterator's position.
 *
 * Returns 0 on success (including when nothing had to be done), or an error
 * from the lookup, the transaction allocation or bch2_trans_update().
 */
static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot)
{
	struct btree_iter iter;
	struct bkey_s_c existing =
		bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents,
				   SPOS(d_pos.inode, d_pos.offset, snapshot),
				   BTREE_ITER_intent|
				   BTREE_ITER_with_updates);
	int ret = bkey_err(existing);

	if (ret)
		return ret;

	/* A different key is visible in this snapshot: nothing to do */
	if (!bpos_eq(existing.k->p, d_pos))
		goto out;

	/*
	 * delete_at() doesn't work because the update path doesn't
	 * internally use BTREE_ITER_with_updates yet
	 */
	struct bkey_i *whiteout = bch2_trans_kmalloc(trans, sizeof(*whiteout));

	ret = PTR_ERR_OR_ZERO(whiteout);
	if (ret)
		goto out;

	bkey_init(&whiteout->k);
	whiteout->k.type = KEY_TYPE_whiteout;
	whiteout->k.p = iter.pos;
	ret = bch2_trans_update(trans, &iter, whiteout,
				BTREE_UPDATE_internal_snapshot_node);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
|
||||
|
||||
static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct bch_hash_info dir_hash;
|
||||
struct bch_inode_unpacked lostfound;
|
||||
char name_buf[20];
|
||||
struct qstr name;
|
||||
u64 dir_offset = 0;
|
||||
u32 dirent_snapshot = inode->bi_snapshot;
|
||||
int ret;
|
||||
|
||||
u32 dirent_snapshot = inode->bi_snapshot;
|
||||
if (inode->bi_subvol) {
|
||||
inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
|
||||
|
||||
@ -367,9 +404,10 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
dir_hash = bch2_hash_info_init(c, &lostfound);
|
||||
struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound);
|
||||
struct qstr name = (struct qstr) QSTR(name_buf);
|
||||
|
||||
name = (struct qstr) QSTR(name_buf);
|
||||
inode->bi_dir = lostfound.bi_inum;
|
||||
|
||||
ret = bch2_dirent_create_snapshot(trans,
|
||||
inode->bi_parent_subvol, lostfound.bi_inum,
|
||||
@ -378,17 +416,70 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *
|
||||
inode_d_type(inode),
|
||||
&name,
|
||||
inode->bi_subvol ?: inode->bi_inum,
|
||||
&dir_offset,
|
||||
&inode->bi_dir_offset,
|
||||
STR_HASH_must_create);
|
||||
if (ret) {
|
||||
bch_err_msg(c, ret, "error creating dirent");
|
||||
return ret;
|
||||
}
|
||||
|
||||
inode->bi_dir = lostfound.bi_inum;
|
||||
inode->bi_dir_offset = dir_offset;
|
||||
ret = __bch2_fsck_write_inode(trans, inode);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
return __bch2_fsck_write_inode(trans, inode);
|
||||
/*
|
||||
* Fix up inodes in child snapshots: if they should also be reattached
|
||||
* update the backpointer field, if they should not be we need to emit
|
||||
* whiteouts for the dirent we just created.
|
||||
*/
|
||||
if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) {
|
||||
snapshot_id_list whiteouts_done;
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
|
||||
darray_init(&whiteouts_done);
|
||||
|
||||
for_each_btree_key_reverse_norestart(trans, iter,
|
||||
BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1),
|
||||
BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) {
|
||||
if (k.k->p.offset != inode->bi_inum)
|
||||
break;
|
||||
|
||||
if (!bkey_is_inode(k.k) ||
|
||||
!bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) ||
|
||||
snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot))
|
||||
continue;
|
||||
|
||||
struct bch_inode_unpacked child_inode;
|
||||
bch2_inode_unpack(k, &child_inode);
|
||||
|
||||
if (!inode_should_reattach(&child_inode)) {
|
||||
ret = maybe_delete_dirent(trans,
|
||||
SPOS(lostfound.bi_inum, inode->bi_dir_offset,
|
||||
dirent_snapshot),
|
||||
k.k->p.snapshot);
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot);
|
||||
if (ret)
|
||||
break;
|
||||
} else {
|
||||
iter.snapshot = k.k->p.snapshot;
|
||||
child_inode.bi_dir = inode->bi_dir;
|
||||
child_inode.bi_dir_offset = inode->bi_dir_offset;
|
||||
|
||||
ret = bch2_inode_write_flags(trans, &iter, &child_inode,
|
||||
BTREE_UPDATE_internal_snapshot_node);
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
}
|
||||
darray_exit(&whiteouts_done);
|
||||
bch2_trans_iter_exit(trans, &iter);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int remove_backpointer(struct btree_trans *trans,
|
||||
@ -994,7 +1085,6 @@ static int check_inode_dirent_inode(struct btree_trans *trans,
|
||||
*/
|
||||
inode->bi_dir = 0;
|
||||
inode->bi_dir_offset = 0;
|
||||
inode->bi_flags &= ~BCH_INODE_backptr_untrusted;
|
||||
*write_inode = true;
|
||||
}
|
||||
|
||||
@ -1006,28 +1096,11 @@ fsck_err:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool bch2_inode_is_open(struct bch_fs *c, struct bpos p)
|
||||
{
|
||||
subvol_inum inum = {
|
||||
.subvol = snapshot_t(c, p.snapshot)->subvol,
|
||||
.inum = p.offset,
|
||||
};
|
||||
|
||||
/* snapshot tree corruption, can't safely delete */
|
||||
if (!inum.subvol) {
|
||||
bch_warn_ratelimited(c, "%s(): snapshot %u has no subvol, unlinked but can't safely delete", __func__, p.snapshot);
|
||||
return true;
|
||||
}
|
||||
|
||||
return __bch2_inode_hash_find(c, inum) != NULL;
|
||||
}
|
||||
|
||||
static int check_inode(struct btree_trans *trans,
|
||||
struct btree_iter *iter,
|
||||
struct bkey_s_c k,
|
||||
struct bch_inode_unpacked *prev,
|
||||
struct snapshots_seen *s,
|
||||
bool full)
|
||||
struct snapshots_seen *s)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct printbuf buf = PRINTBUF;
|
||||
@ -1050,12 +1123,6 @@ static int check_inode(struct btree_trans *trans,
|
||||
|
||||
BUG_ON(bch2_inode_unpack(k, &u));
|
||||
|
||||
if (!full &&
|
||||
!(u.bi_flags & (BCH_INODE_i_size_dirty|
|
||||
BCH_INODE_i_sectors_dirty|
|
||||
BCH_INODE_unlinked)))
|
||||
return 0;
|
||||
|
||||
if (prev->bi_inum != u.bi_inum)
|
||||
*prev = u;
|
||||
|
||||
@ -1101,28 +1168,27 @@ static int check_inode(struct btree_trans *trans,
|
||||
ret = 0;
|
||||
}
|
||||
|
||||
if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
|
||||
bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
|
||||
struct bpos new_min_pos;
|
||||
ret = bch2_inode_has_child_snapshots(trans, k.k->p);
|
||||
if (ret < 0)
|
||||
goto err;
|
||||
|
||||
ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
|
||||
if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot),
|
||||
trans, inode_has_child_snapshots_wrong,
|
||||
"inode has_child_snapshots flag wrong (should be %u)\n%s",
|
||||
ret,
|
||||
(printbuf_reset(&buf),
|
||||
bch2_inode_unpacked_to_text(&buf, &u),
|
||||
buf.buf))) {
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
|
||||
|
||||
ret = __bch2_fsck_write_inode(trans, &u);
|
||||
|
||||
bch_err_msg(c, ret, "in fsck updating inode");
|
||||
if (ret)
|
||||
goto err_noprint;
|
||||
|
||||
if (!bpos_eq(new_min_pos, POS_MIN))
|
||||
bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
|
||||
goto err_noprint;
|
||||
u.bi_flags |= BCH_INODE_has_child_snapshot;
|
||||
else
|
||||
u.bi_flags &= ~BCH_INODE_has_child_snapshot;
|
||||
do_update = true;
|
||||
}
|
||||
ret = 0;
|
||||
|
||||
if (u.bi_flags & BCH_INODE_unlinked) {
|
||||
if ((u.bi_flags & BCH_INODE_unlinked) &&
|
||||
!(u.bi_flags & BCH_INODE_has_child_snapshot)) {
|
||||
if (!test_bit(BCH_FS_started, &c->flags)) {
|
||||
/*
|
||||
* If we're not in online fsck, don't delete unlinked
|
||||
@ -1147,7 +1213,11 @@ static int check_inode(struct btree_trans *trans,
|
||||
if (ret)
|
||||
goto err;
|
||||
} else {
|
||||
if (fsck_err_on(!bch2_inode_is_open(c, k.k->p),
|
||||
ret = bch2_inode_or_descendents_is_open(trans, k.k->p);
|
||||
if (ret < 0)
|
||||
goto err;
|
||||
|
||||
if (fsck_err_on(!ret,
|
||||
trans, inode_unlinked_and_not_open,
|
||||
"inode %llu%u unlinked and not open",
|
||||
u.bi_inum, u.bi_snapshot)) {
|
||||
@ -1155,69 +1225,10 @@ static int check_inode(struct btree_trans *trans,
|
||||
bch_err_msg(c, ret, "in fsck deleting inode");
|
||||
goto err_noprint;
|
||||
}
|
||||
ret = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* i_size_dirty is vestigial, since we now have logged ops for truncate */
|
||||
if (u.bi_flags & BCH_INODE_i_size_dirty &&
|
||||
(!test_bit(BCH_FS_clean_recovery, &c->flags) ||
|
||||
fsck_err(trans, inode_i_size_dirty_but_clean,
|
||||
"filesystem marked clean, but inode %llu has i_size dirty",
|
||||
u.bi_inum))) {
|
||||
bch_verbose(c, "truncating inode %llu", u.bi_inum);
|
||||
|
||||
/*
|
||||
* XXX: need to truncate partial blocks too here - or ideally
|
||||
* just switch units to bytes and that issue goes away
|
||||
*/
|
||||
ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
|
||||
SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
|
||||
iter->pos.snapshot),
|
||||
POS(u.bi_inum, U64_MAX),
|
||||
0, NULL);
|
||||
bch_err_msg(c, ret, "in fsck truncating inode");
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
/*
|
||||
* We truncated without our normal sector accounting hook, just
|
||||
* make sure we recalculate it:
|
||||
*/
|
||||
u.bi_flags |= BCH_INODE_i_sectors_dirty;
|
||||
|
||||
u.bi_flags &= ~BCH_INODE_i_size_dirty;
|
||||
do_update = true;
|
||||
}
|
||||
|
||||
/* i_sectors_dirty is vestigial, i_sectors is always updated transactionally */
|
||||
if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
|
||||
(!test_bit(BCH_FS_clean_recovery, &c->flags) ||
|
||||
fsck_err(trans, inode_i_sectors_dirty_but_clean,
|
||||
"filesystem marked clean, but inode %llu has i_sectors dirty",
|
||||
u.bi_inum))) {
|
||||
s64 sectors;
|
||||
|
||||
bch_verbose(c, "recounting sectors for inode %llu",
|
||||
u.bi_inum);
|
||||
|
||||
sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
|
||||
if (sectors < 0) {
|
||||
bch_err_msg(c, sectors, "in fsck recounting inode sectors");
|
||||
return sectors;
|
||||
}
|
||||
|
||||
u.bi_sectors = sectors;
|
||||
u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
|
||||
do_update = true;
|
||||
}
|
||||
|
||||
if (u.bi_flags & BCH_INODE_backptr_untrusted) {
|
||||
u.bi_dir = 0;
|
||||
u.bi_dir_offset = 0;
|
||||
u.bi_flags &= ~BCH_INODE_backptr_untrusted;
|
||||
do_update = true;
|
||||
}
|
||||
|
||||
if (fsck_err_on(u.bi_parent_subvol &&
|
||||
(u.bi_subvol == 0 ||
|
||||
u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
|
||||
@ -1274,7 +1285,6 @@ err_noprint:
|
||||
|
||||
int bch2_check_inodes(struct bch_fs *c)
|
||||
{
|
||||
bool full = c->opts.fsck;
|
||||
struct bch_inode_unpacked prev = { 0 };
|
||||
struct snapshots_seen s;
|
||||
|
||||
@ -1285,13 +1295,104 @@ int bch2_check_inodes(struct bch_fs *c)
|
||||
POS_MIN,
|
||||
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
|
||||
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
|
||||
check_inode(trans, &iter, k, &prev, &s, full)));
|
||||
check_inode(trans, &iter, k, &prev, &s)));
|
||||
|
||||
snapshots_seen_exit(&s);
|
||||
bch_err_fn(c, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int find_oldest_inode_needs_reattach(struct btree_trans *trans,
|
||||
struct bch_inode_unpacked *inode)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
int ret = 0;
|
||||
|
||||
/*
|
||||
* We look for inodes to reattach in natural key order, leaves first,
|
||||
* but we should do the reattach at the oldest version that needs to be
|
||||
* reattached:
|
||||
*/
|
||||
for_each_btree_key_norestart(trans, iter,
|
||||
BTREE_ID_inodes,
|
||||
SPOS(0, inode->bi_inum, inode->bi_snapshot + 1),
|
||||
BTREE_ITER_all_snapshots, k, ret) {
|
||||
if (k.k->p.offset != inode->bi_inum)
|
||||
break;
|
||||
|
||||
if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot))
|
||||
continue;
|
||||
|
||||
if (!bkey_is_inode(k.k))
|
||||
break;
|
||||
|
||||
struct bch_inode_unpacked parent_inode;
|
||||
bch2_inode_unpack(k, &parent_inode);
|
||||
|
||||
if (!inode_should_reattach(&parent_inode))
|
||||
break;
|
||||
|
||||
*inode = parent_inode;
|
||||
}
|
||||
bch2_trans_iter_exit(trans, &iter);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int check_unreachable_inode(struct btree_trans *trans,
|
||||
struct btree_iter *iter,
|
||||
struct bkey_s_c k)
|
||||
{
|
||||
struct printbuf buf = PRINTBUF;
|
||||
int ret = 0;
|
||||
|
||||
if (!bkey_is_inode(k.k))
|
||||
return 0;
|
||||
|
||||
struct bch_inode_unpacked inode;
|
||||
BUG_ON(bch2_inode_unpack(k, &inode));
|
||||
|
||||
if (!inode_should_reattach(&inode))
|
||||
return 0;
|
||||
|
||||
ret = find_oldest_inode_needs_reattach(trans, &inode);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (fsck_err(trans, inode_unreachable,
|
||||
"unreachable inode:\n%s",
|
||||
(bch2_inode_unpacked_to_text(&buf, &inode),
|
||||
buf.buf)))
|
||||
ret = reattach_inode(trans, &inode);
|
||||
fsck_err:
|
||||
printbuf_exit(&buf);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Reattach unreachable (but not unlinked) inodes
|
||||
*
|
||||
* Run after check_inodes() and check_dirents(), so we know that inode
|
||||
* backpointer fields point to valid dirents, and every inode that has a dirent
|
||||
* that points to it has its backpointer field set - so we're just looking for
|
||||
* non-unlinked inodes without backpointers:
|
||||
*
|
||||
* XXX: this is racy w.r.t. hardlink removal in online fsck
|
||||
*/
|
||||
int bch2_check_unreachable_inodes(struct bch_fs *c)
|
||||
{
|
||||
int ret = bch2_trans_run(c,
|
||||
for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
|
||||
POS_MIN,
|
||||
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
|
||||
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
|
||||
check_unreachable_inode(trans, &iter, k)));
|
||||
bch_err_fn(c, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
|
||||
{
|
||||
switch (btree) {
|
||||
@ -1694,8 +1795,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
|
||||
!key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
|
||||
continue;
|
||||
|
||||
if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
|
||||
k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
|
||||
if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
|
||||
!bkey_extent_is_reservation(k),
|
||||
trans, extent_past_end_of_inode,
|
||||
"extent type past end of inode %llu:%u, i_size %llu\n %s",
|
||||
@ -2450,22 +2550,6 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
/*
|
||||
* We've checked that inode backpointers point to valid dirents;
|
||||
* here, it's sufficient to check that the subvolume root has a
|
||||
* dirent:
|
||||
*/
|
||||
if (fsck_err_on(!subvol_root.bi_dir,
|
||||
trans, subvol_unreachable,
|
||||
"unreachable subvolume %s",
|
||||
(bch2_bkey_val_to_text(&buf, c, s.s_c),
|
||||
prt_newline(&buf),
|
||||
bch2_inode_unpacked_to_text(&buf, &subvol_root),
|
||||
buf.buf))) {
|
||||
ret = reattach_subvol(trans, s);
|
||||
break;
|
||||
}
|
||||
|
||||
u32 parent = le32_to_cpu(s.v->fs_path_parent);
|
||||
|
||||
if (darray_u32_has(&subvol_path, parent)) {
|
||||
@ -2526,12 +2610,6 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check that a given inode is reachable from its subvolume root - we already
|
||||
* verified subvolume connectivity:
|
||||
*
|
||||
* XXX: we should also be verifying that inodes are in the right subvolumes
|
||||
*/
|
||||
static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
@ -2545,6 +2623,9 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
|
||||
|
||||
BUG_ON(bch2_inode_unpack(inode_k, &inode));
|
||||
|
||||
if (!S_ISDIR(inode.bi_mode))
|
||||
return 0;
|
||||
|
||||
while (!inode.bi_subvol) {
|
||||
struct btree_iter dirent_iter;
|
||||
struct bkey_s_c_dirent d;
|
||||
@ -2559,21 +2640,15 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
|
||||
bch2_trans_iter_exit(trans, &dirent_iter);
|
||||
|
||||
if (bch2_err_matches(ret, ENOENT)) {
|
||||
ret = 0;
|
||||
if (fsck_err(trans, inode_unreachable,
|
||||
"unreachable inode\n%s",
|
||||
(printbuf_reset(&buf),
|
||||
bch2_bkey_val_to_text(&buf, c, inode_k),
|
||||
buf.buf)))
|
||||
ret = reattach_inode(trans, &inode);
|
||||
printbuf_reset(&buf);
|
||||
bch2_bkey_val_to_text(&buf, c, inode_k);
|
||||
bch_err(c, "unreachable inode in check_directory_structure: %s\n%s",
|
||||
bch2_err_str(ret), buf.buf);
|
||||
goto out;
|
||||
}
|
||||
|
||||
bch2_trans_iter_exit(trans, &dirent_iter);
|
||||
|
||||
if (!S_ISDIR(inode.bi_mode))
|
||||
break;
|
||||
|
||||
ret = darray_push(p, ((struct pathbuf_entry) {
|
||||
.inum = inode.bi_inum,
|
||||
.snapshot = snapshot,
|
||||
@ -2626,9 +2701,8 @@ fsck_err:
|
||||
}
|
||||
|
||||
/*
|
||||
* Check for unreachable inodes, as well as loops in the directory structure:
|
||||
* After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's
|
||||
* unreachable:
|
||||
* Check for loops in the directory structure: all other connectivity issues
|
||||
* have been fixed by prior passes
|
||||
*/
|
||||
int bch2_check_directory_structure(struct bch_fs *c)
|
||||
{
|
||||
@ -2756,6 +2830,10 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
|
||||
if (S_ISDIR(u.bi_mode))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Previous passes ensured that bi_nlink is nonzero if
|
||||
* it had multiple hardlinks:
|
||||
*/
|
||||
if (!u.bi_nlink)
|
||||
continue;
|
||||
|
||||
|
@ -9,6 +9,7 @@ int bch2_check_dirents(struct bch_fs *);
|
||||
int bch2_check_xattrs(struct bch_fs *);
|
||||
int bch2_check_root(struct bch_fs *);
|
||||
int bch2_check_subvolume_structure(struct bch_fs *);
|
||||
int bch2_check_unreachable_inodes(struct bch_fs *);
|
||||
int bch2_check_directory_structure(struct bch_fs *);
|
||||
int bch2_check_nlinks(struct bch_fs *);
|
||||
int bch2_fix_reflink_p(struct bch_fs *);
|
||||
|
@ -12,6 +12,7 @@
|
||||
#include "error.h"
|
||||
#include "extents.h"
|
||||
#include "extent_update.h"
|
||||
#include "fs.h"
|
||||
#include "inode.h"
|
||||
#include "str_hash.h"
|
||||
#include "snapshot.h"
|
||||
@ -34,6 +35,8 @@ static const char * const bch2_inode_flag_strs[] = {
|
||||
};
|
||||
#undef x
|
||||
|
||||
static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos);
|
||||
|
||||
static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
|
||||
|
||||
static int inode_decode_field(const u8 *in, const u8 *end,
|
||||
@ -575,9 +578,137 @@ static inline u64 bkey_inode_flags(struct bkey_s_c k)
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool bkey_is_deleted_inode(struct bkey_s_c k)
|
||||
static inline void bkey_inode_flags_set(struct bkey_s k, u64 f)
|
||||
{
|
||||
return bkey_inode_flags(k) & BCH_INODE_unlinked;
|
||||
switch (k.k->type) {
|
||||
case KEY_TYPE_inode:
|
||||
bkey_s_to_inode(k).v->bi_flags = cpu_to_le32(f);
|
||||
return;
|
||||
case KEY_TYPE_inode_v2:
|
||||
bkey_s_to_inode_v2(k).v->bi_flags = cpu_to_le64(f);
|
||||
return;
|
||||
case KEY_TYPE_inode_v3:
|
||||
bkey_s_to_inode_v3(k).v->bi_flags = cpu_to_le64(f);
|
||||
return;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool bkey_is_unlinked_inode(struct bkey_s_c k)
|
||||
{
|
||||
unsigned f = bkey_inode_flags(k) & BCH_INODE_unlinked;
|
||||
|
||||
return (f & BCH_INODE_unlinked) && !(f & BCH_INODE_has_child_snapshot);
|
||||
}
|
||||
|
||||
static struct bkey_s_c
|
||||
bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter,
|
||||
enum btree_id btree, struct bpos pos,
|
||||
unsigned flags)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct bkey_s_c k;
|
||||
int ret = 0;
|
||||
|
||||
for_each_btree_key_upto_norestart(trans, *iter, btree,
|
||||
bpos_successor(pos),
|
||||
SPOS(pos.inode, pos.offset, U32_MAX),
|
||||
flags|BTREE_ITER_all_snapshots, k, ret)
|
||||
if (bch2_snapshot_is_ancestor(c, pos.snapshot, k.k->p.snapshot))
|
||||
return k;
|
||||
|
||||
bch2_trans_iter_exit(trans, iter);
|
||||
return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
|
||||
}
|
||||
|
||||
static struct bkey_s_c
|
||||
bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter,
|
||||
struct bpos pos, unsigned flags)
|
||||
{
|
||||
struct bkey_s_c k;
|
||||
again:
|
||||
k = bch2_bkey_get_iter_snapshot_parent(trans, iter, BTREE_ID_inodes, pos, flags);
|
||||
if (!k.k ||
|
||||
bkey_err(k) ||
|
||||
bkey_is_inode(k.k))
|
||||
return k;
|
||||
|
||||
bch2_trans_iter_exit(trans, iter);
|
||||
pos = k.k->p;
|
||||
goto again;
|
||||
}
|
||||
|
||||
int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
int ret = 0;
|
||||
|
||||
for_each_btree_key_upto_norestart(trans, iter,
|
||||
BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos),
|
||||
BTREE_ITER_all_snapshots|
|
||||
BTREE_ITER_with_updates, k, ret)
|
||||
if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) &&
|
||||
bkey_is_inode(k.k)) {
|
||||
ret = 1;
|
||||
break;
|
||||
}
|
||||
bch2_trans_iter_exit(trans, &iter);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int update_inode_has_children(struct btree_trans *trans,
|
||||
struct bkey_s k,
|
||||
bool have_child)
|
||||
{
|
||||
if (!have_child) {
|
||||
int ret = bch2_inode_has_child_snapshots(trans, k.k->p);
|
||||
if (ret)
|
||||
return ret < 0 ? ret : 0;
|
||||
}
|
||||
|
||||
u64 f = bkey_inode_flags(k.s_c);
|
||||
if (have_child != !!(f & BCH_INODE_has_child_snapshot))
|
||||
bkey_inode_flags_set(k, f ^ BCH_INODE_has_child_snapshot);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int update_parent_inode_has_children(struct btree_trans *trans, struct bpos pos,
|
||||
bool have_child)
|
||||
{
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k = bch2_inode_get_iter_snapshot_parent(trans,
|
||||
&iter, pos, BTREE_ITER_with_updates);
|
||||
int ret = bkey_err(k);
|
||||
if (ret)
|
||||
return ret;
|
||||
if (!k.k)
|
||||
return 0;
|
||||
|
||||
if (!have_child) {
|
||||
ret = bch2_inode_has_child_snapshots(trans, k.k->p);
|
||||
if (ret) {
|
||||
ret = ret < 0 ? ret : 0;
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
|
||||
u64 f = bkey_inode_flags(k);
|
||||
if (have_child != !!(f & BCH_INODE_has_child_snapshot)) {
|
||||
struct bkey_i *update = bch2_bkey_make_mut(trans, &iter, &k,
|
||||
BTREE_UPDATE_internal_snapshot_node);
|
||||
ret = PTR_ERR_OR_ZERO(update);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
bkey_inode_flags_set(bkey_i_to_s(update), f ^ BCH_INODE_has_child_snapshot);
|
||||
}
|
||||
err:
|
||||
bch2_trans_iter_exit(trans, &iter);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int bch2_trigger_inode(struct btree_trans *trans,
|
||||
@ -586,6 +717,8 @@ int bch2_trigger_inode(struct btree_trans *trans,
|
||||
struct bkey_s new,
|
||||
enum btree_iter_update_trigger_flags flags)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
|
||||
if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
|
||||
BUG_ON(!trans->journal_res.seq);
|
||||
bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
|
||||
@ -599,13 +732,41 @@ int bch2_trigger_inode(struct btree_trans *trans,
|
||||
return ret;
|
||||
}
|
||||
|
||||
int deleted_delta = (int) bkey_is_deleted_inode(new.s_c) -
|
||||
(int) bkey_is_deleted_inode(old);
|
||||
if ((flags & BTREE_TRIGGER_transactional) && deleted_delta) {
|
||||
int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
|
||||
new.k->p, deleted_delta > 0);
|
||||
if (ret)
|
||||
return ret;
|
||||
if (flags & BTREE_TRIGGER_transactional) {
|
||||
int unlinked_delta = (int) bkey_is_unlinked_inode(new.s_c) -
|
||||
(int) bkey_is_unlinked_inode(old);
|
||||
if (unlinked_delta) {
|
||||
int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
|
||||
new.k->p, unlinked_delta > 0);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* If we're creating or deleting an inode at this snapshot ID,
|
||||
* and there might be an inode in a parent snapshot ID, we might
|
||||
* need to set or clear the has_child_snapshot flag on the
|
||||
* parent.
|
||||
*/
|
||||
int deleted_delta = (int) bkey_is_inode(new.k) -
|
||||
(int) bkey_is_inode(old.k);
|
||||
if (deleted_delta &&
|
||||
bch2_snapshot_parent(c, new.k->p.snapshot)) {
|
||||
int ret = update_parent_inode_has_children(trans, new.k->p,
|
||||
deleted_delta > 0);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* When an inode is first updated in a new snapshot, we may need
|
||||
* to clear has_child_snapshot
|
||||
*/
|
||||
if (deleted_delta > 0) {
|
||||
int ret = update_inode_has_children(trans, new, false);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -888,6 +1049,11 @@ err:
|
||||
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
||||
goto retry;
|
||||
|
||||
if (ret)
|
||||
goto err2;
|
||||
|
||||
ret = delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot));
|
||||
err2:
|
||||
bch2_trans_put(trans);
|
||||
return ret;
|
||||
}
|
||||
@ -992,7 +1158,7 @@ int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_i
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
|
||||
static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct btree_iter iter = { NULL };
|
||||
@ -1055,6 +1221,45 @@ err:
|
||||
return ret ?: -BCH_ERR_transaction_restart_nested;
|
||||
}
|
||||
|
||||
/*
|
||||
* After deleting an inode, there may be versions in older snapshots that should
|
||||
* also be deleted - if they're not referenced by sibling snapshots and not open
|
||||
* in other subvolumes:
|
||||
*/
|
||||
static int delete_ancestor_snapshot_inodes(struct btree_trans *trans, struct bpos pos)
|
||||
{
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
int ret;
|
||||
next_parent:
|
||||
ret = lockrestart_do(trans,
|
||||
bkey_err(k = bch2_inode_get_iter_snapshot_parent(trans, &iter, pos, 0)));
|
||||
if (ret || !k.k)
|
||||
return ret;
|
||||
|
||||
bool unlinked = bkey_is_unlinked_inode(k);
|
||||
pos = k.k->p;
|
||||
bch2_trans_iter_exit(trans, &iter);
|
||||
|
||||
if (!unlinked)
|
||||
return 0;
|
||||
|
||||
ret = lockrestart_do(trans, bch2_inode_or_descendents_is_open(trans, pos));
|
||||
if (ret)
|
||||
return ret < 0 ? ret : 0;
|
||||
|
||||
ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot);
|
||||
if (ret)
|
||||
return ret;
|
||||
goto next_parent;
|
||||
}
|
||||
|
||||
/*
 * Delete inode @inum in @snapshot, then - only if that returned 0 - run
 * delete_ancestor_snapshot_inodes() starting from the same position.
 *
 * Returns the first nonzero result, or 0.
 *
 * NOTE(review): the visible tail of __bch2_inode_rm_snapshot() ends in
 * "return ret ?: -BCH_ERR_transaction_restart_nested" - confirm that it can
 * actually return 0, otherwise the ancestor cleanup here is never reached.
 */
int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
{
	int ret = __bch2_inode_rm_snapshot(trans, inum, snapshot);

	if (ret)
		return ret;

	return delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot));
}
|
||||
|
||||
static int may_delete_deleted_inode(struct btree_trans *trans,
|
||||
struct btree_iter *iter,
|
||||
struct bpos pos,
|
||||
@ -1064,6 +1269,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
|
||||
struct btree_iter inode_iter;
|
||||
struct bkey_s_c k;
|
||||
struct bch_inode_unpacked inode;
|
||||
struct printbuf buf = PRINTBUF;
|
||||
int ret;
|
||||
|
||||
k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached);
|
||||
@ -1099,6 +1305,31 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
|
||||
pos.offset, pos.snapshot))
|
||||
goto delete;
|
||||
|
||||
if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot,
|
||||
trans, deleted_inode_has_child_snapshots,
|
||||
"inode with child snapshots %llu:%u in deleted_inodes btree",
|
||||
pos.offset, pos.snapshot))
|
||||
goto delete;
|
||||
|
||||
ret = bch2_inode_has_child_snapshots(trans, k.k->p);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
if (ret) {
|
||||
if (fsck_err(trans, inode_has_child_snapshots_wrong,
|
||||
"inode has_child_snapshots flag wrong (should be set)\n%s",
|
||||
(printbuf_reset(&buf),
|
||||
bch2_inode_unpacked_to_text(&buf, &inode),
|
||||
buf.buf))) {
|
||||
inode.bi_flags |= BCH_INODE_has_child_snapshot;
|
||||
ret = __bch2_fsck_write_inode(trans, &inode);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
goto delete;
|
||||
|
||||
}
|
||||
|
||||
if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
|
||||
!fsck_err(trans, deleted_inode_but_clean,
|
||||
"filesystem marked as clean but have deleted inode %llu:%u",
|
||||
@ -1107,33 +1338,11 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (bch2_snapshot_is_internal_node(c, pos.snapshot)) {
|
||||
struct bpos new_min_pos;
|
||||
|
||||
ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
inode.bi_flags &= ~BCH_INODE_unlinked;
|
||||
|
||||
ret = bch2_inode_write_flags(trans, &inode_iter, &inode,
|
||||
BTREE_UPDATE_internal_snapshot_node);
|
||||
bch_err_msg(c, ret, "clearing inode unlinked flag");
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* We'll need another write buffer flush to pick up the new
|
||||
* unlinked inodes in the snapshot leaves:
|
||||
*/
|
||||
*need_another_pass = true;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = 1;
|
||||
out:
|
||||
fsck_err:
|
||||
bch2_trans_iter_exit(trans, &inode_iter);
|
||||
printbuf_exit(&buf);
|
||||
return ret;
|
||||
delete:
|
||||
ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false);
|
||||
|
@ -5,6 +5,7 @@
|
||||
#include "bkey.h"
|
||||
#include "bkey_methods.h"
|
||||
#include "opts.h"
|
||||
#include "snapshot.h"
|
||||
|
||||
enum bch_validate_flags;
|
||||
extern const char * const bch2_inode_opts[];
|
||||
@ -17,6 +18,15 @@ int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c,
|
||||
enum bch_validate_flags);
|
||||
void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
|
||||
|
||||
int __bch2_inode_has_child_snapshots(struct btree_trans *, struct bpos);
|
||||
|
||||
static inline int bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos)
|
||||
{
|
||||
return bch2_snapshot_is_leaf(trans->c, pos.snapshot) <= 0
|
||||
? __bch2_inode_has_child_snapshots(trans, pos)
|
||||
: 0;
|
||||
}
|
||||
|
||||
int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned,
|
||||
struct bkey_s_c, struct bkey_s,
|
||||
enum btree_iter_update_trigger_flags);
|
||||
|
@ -133,7 +133,8 @@ enum inode_opt_id {
|
||||
x(i_size_dirty, 5) \
|
||||
x(i_sectors_dirty, 6) \
|
||||
x(unlinked, 7) \
|
||||
x(backptr_untrusted, 8)
|
||||
x(backptr_untrusted, 8) \
|
||||
x(has_child_snapshot, 9)
|
||||
|
||||
/* bits 20+ reserved for packed fields below: */
|
||||
|
||||
|
@ -603,6 +603,19 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (closure_wait_event_timeout(&j->async_wait,
|
||||
(ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
|
||||
(flags & JOURNAL_RES_GET_NONBLOCK),
|
||||
HZ * 10))
|
||||
return ret;
|
||||
|
||||
struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
||||
struct printbuf buf = PRINTBUF;
|
||||
bch2_journal_debug_to_text(&buf, j);
|
||||
bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s",
|
||||
buf.buf);
|
||||
printbuf_exit(&buf);
|
||||
|
||||
closure_wait_event(&j->async_wait,
|
||||
(ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
|
||||
(flags & JOURNAL_RES_GET_NONBLOCK));
|
||||
|
@ -427,7 +427,9 @@ void bch2_opt_to_text(struct printbuf *out,
|
||||
prt_printf(out, "%lli", v);
|
||||
break;
|
||||
case BCH_OPT_STR:
|
||||
if (flags & OPT_SHOW_FULL_LIST)
|
||||
if (v < opt->min || v >= opt->max - 1)
|
||||
prt_printf(out, "(invalid option %lli)", v);
|
||||
else if (flags & OPT_SHOW_FULL_LIST)
|
||||
prt_string_option(out, opt->choices, v);
|
||||
else
|
||||
prt_str(out, opt->choices[v]);
|
||||
|
@ -287,7 +287,8 @@ int bch2_journal_replay(struct bch_fs *c)
|
||||
BCH_TRANS_COMMIT_no_enospc|
|
||||
BCH_TRANS_COMMIT_journal_reclaim|
|
||||
BCH_TRANS_COMMIT_skip_accounting_apply|
|
||||
BCH_TRANS_COMMIT_no_journal_res,
|
||||
BCH_TRANS_COMMIT_no_journal_res|
|
||||
BCH_WATERMARK_reclaim,
|
||||
bch2_journal_replay_accounting_key(trans, k));
|
||||
if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret)))
|
||||
goto err;
|
||||
|
@ -46,6 +46,7 @@
|
||||
x(check_dirents, 27, PASS_FSCK) \
|
||||
x(check_xattrs, 28, PASS_FSCK) \
|
||||
x(check_root, 29, PASS_ONLINE|PASS_FSCK) \
|
||||
x(check_unreachable_inodes, 40, PASS_ONLINE|PASS_FSCK) \
|
||||
x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \
|
||||
x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
|
||||
x(check_nlinks, 31, PASS_FSCK) \
|
||||
|
@ -66,9 +66,9 @@ void bch2_replicas_entry_to_text(struct printbuf *out,
|
||||
prt_printf(out, "]");
|
||||
}
|
||||
|
||||
static int bch2_replicas_entry_validate_locked(struct bch_replicas_entry_v1 *r,
|
||||
struct bch_sb *sb,
|
||||
struct printbuf *err)
|
||||
static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r,
|
||||
struct bch_sb *sb,
|
||||
struct printbuf *err)
|
||||
{
|
||||
if (!r->nr_devs) {
|
||||
prt_printf(err, "no devices in entry ");
|
||||
@ -98,10 +98,28 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
|
||||
struct bch_fs *c,
|
||||
struct printbuf *err)
|
||||
{
|
||||
mutex_lock(&c->sb_lock);
|
||||
int ret = bch2_replicas_entry_validate_locked(r, c->disk_sb.sb, err);
|
||||
mutex_unlock(&c->sb_lock);
|
||||
return ret;
|
||||
if (!r->nr_devs) {
|
||||
prt_printf(err, "no devices in entry ");
|
||||
goto bad;
|
||||
}
|
||||
|
||||
if (r->nr_required > 1 &&
|
||||
r->nr_required >= r->nr_devs) {
|
||||
prt_printf(err, "bad nr_required in entry ");
|
||||
goto bad;
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < r->nr_devs; i++)
|
||||
if (r->devs[i] != BCH_SB_MEMBER_INVALID &&
|
||||
!bch2_dev_exists(c, r->devs[i])) {
|
||||
prt_printf(err, "invalid device %u in entry ", r->devs[i]);
|
||||
goto bad;
|
||||
}
|
||||
|
||||
return 0;
|
||||
bad:
|
||||
bch2_replicas_entry_to_text(err, r);
|
||||
return -BCH_ERR_invalid_replicas_entry;
|
||||
}
|
||||
|
||||
void bch2_cpu_replicas_to_text(struct printbuf *out,
|
||||
@ -686,7 +704,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
|
||||
struct bch_replicas_entry_v1 *e =
|
||||
cpu_replicas_entry(cpu_r, i);
|
||||
|
||||
int ret = bch2_replicas_entry_validate_locked(e, sb, err);
|
||||
int ret = bch2_replicas_entry_sb_validate(e, sb, err);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
@ -803,6 +821,11 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
|
||||
|
||||
rcu_read_lock();
|
||||
for (unsigned i = 0; i < e->nr_devs; i++) {
|
||||
if (e->devs[i] == BCH_SB_MEMBER_INVALID) {
|
||||
nr_failed++;
|
||||
continue;
|
||||
}
|
||||
|
||||
nr_online += test_bit(e->devs[i], devs.d);
|
||||
|
||||
struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]);
|
||||
|
@ -78,7 +78,10 @@
|
||||
BCH_FSCK_ERR_accounting_mismatch) \
|
||||
x(rebalance_work_acct_fix, \
|
||||
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
|
||||
BCH_FSCK_ERR_accounting_mismatch)
|
||||
BCH_FSCK_ERR_accounting_mismatch) \
|
||||
x(inode_has_child_snapshots, \
|
||||
BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
|
||||
BCH_FSCK_ERR_inode_has_child_snapshots_wrong)
|
||||
|
||||
#define DOWNGRADE_TABLE() \
|
||||
x(bucket_stripe_sectors, \
|
||||
|
@ -180,6 +180,7 @@ enum bch_fsck_flags {
|
||||
x(reflink_p_to_missing_reflink_v, 166, 0) \
|
||||
x(stripe_pos_bad, 167, 0) \
|
||||
x(stripe_val_size_bad, 168, 0) \
|
||||
x(stripe_csum_granularity_bad, 290, 0) \
|
||||
x(stripe_sector_count_wrong, 169, 0) \
|
||||
x(snapshot_tree_pos_bad, 170, 0) \
|
||||
x(snapshot_tree_to_missing_snapshot, 171, 0) \
|
||||
@ -225,11 +226,13 @@ enum bch_fsck_flags {
|
||||
x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \
|
||||
x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \
|
||||
x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \
|
||||
x(inode_has_child_snapshots_wrong, 287, 0) \
|
||||
x(inode_unreachable, 210, FSCK_AUTOFIX) \
|
||||
x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \
|
||||
x(deleted_inode_missing, 212, FSCK_AUTOFIX) \
|
||||
x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \
|
||||
x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \
|
||||
x(deleted_inode_has_child_snapshots, 288, FSCK_AUTOFIX) \
|
||||
x(extent_overlapping, 215, 0) \
|
||||
x(key_in_missing_inode, 216, 0) \
|
||||
x(key_in_wrong_inode_type, 217, 0) \
|
||||
@ -289,6 +292,7 @@ enum bch_fsck_flags {
|
||||
x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \
|
||||
x(accounting_mismatch, 272, FSCK_AUTOFIX) \
|
||||
x(accounting_replicas_not_marked, 273, 0) \
|
||||
x(accounting_to_invalid_device, 289, 0) \
|
||||
x(invalid_btree_id, 274, 0) \
|
||||
x(alloc_key_io_time_bad, 275, 0) \
|
||||
x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \
|
||||
@ -298,7 +302,7 @@ enum bch_fsck_flags {
|
||||
x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \
|
||||
x(accounting_key_version_0, 282, FSCK_AUTOFIX) \
|
||||
x(logged_op_but_clean, 283, FSCK_AUTOFIX) \
|
||||
x(MAX, 287, 0)
|
||||
x(MAX, 291, 0)
|
||||
|
||||
enum bch_sb_error_id {
|
||||
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
|
||||
|
@ -163,6 +163,11 @@ static int validate_member(struct printbuf *err,
|
||||
return -BCH_ERR_invalid_sb_members;
|
||||
}
|
||||
|
||||
if (m.btree_bitmap_shift >= 64) {
|
||||
prt_printf(err, "device %u: invalid btree_bitmap_shift %u", i, m.btree_bitmap_shift);
|
||||
return -BCH_ERR_invalid_sb_members;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -247,7 +252,10 @@ static void member_to_text(struct printbuf *out,
|
||||
prt_newline(out);
|
||||
|
||||
prt_printf(out, "Btree allocated bitmap blocksize:\t");
|
||||
prt_units_u64(out, 1ULL << m.btree_bitmap_shift);
|
||||
if (m.btree_bitmap_shift < 64)
|
||||
prt_units_u64(out, 1ULL << m.btree_bitmap_shift);
|
||||
else
|
||||
prt_printf(out, "(invalid shift %u)", m.btree_bitmap_shift);
|
||||
prt_newline(out);
|
||||
|
||||
prt_printf(out, "Btree allocated bitmap:\t");
|
||||
|
@ -905,12 +905,30 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id)
|
||||
if (bch2_snapshot_equiv(c, id))
|
||||
return 0;
|
||||
|
||||
/* 0 is an invalid tree ID */
|
||||
/* Do we need to reconstruct the snapshot_tree entry as well? */
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
int ret = 0;
|
||||
u32 tree_id = 0;
|
||||
int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
|
||||
|
||||
for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN,
|
||||
0, k, ret) {
|
||||
if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) {
|
||||
tree_id = k.k->p.offset;
|
||||
break;
|
||||
}
|
||||
}
|
||||
bch2_trans_iter_exit(trans, &iter);
|
||||
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (!tree_id) {
|
||||
ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
|
||||
ret = PTR_ERR_OR_ZERO(snapshot);
|
||||
if (ret)
|
||||
@ -921,6 +939,16 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id)
|
||||
snapshot->v.tree = cpu_to_le32(tree_id);
|
||||
snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c));
|
||||
|
||||
for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
|
||||
0, k, ret) {
|
||||
if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) {
|
||||
snapshot->v.subvol = cpu_to_le32(k.k->p.offset);
|
||||
SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true);
|
||||
break;
|
||||
}
|
||||
}
|
||||
bch2_trans_iter_exit(trans, &iter);
|
||||
|
||||
return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
|
||||
bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
|
||||
bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?:
|
||||
@ -1732,103 +1760,6 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id)
|
||||
{
|
||||
const struct snapshot_t *s = snapshot_t(c, id);
|
||||
|
||||
return s->children[1] ?: s->children[0];
|
||||
}
|
||||
|
||||
static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id)
|
||||
{
|
||||
u32 child;
|
||||
|
||||
while ((child = bch2_snapshot_smallest_child(c, id)))
|
||||
id = child;
|
||||
return id;
|
||||
}
|
||||
|
||||
static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans,
|
||||
enum btree_id btree,
|
||||
struct bkey_s_c interior_k,
|
||||
u32 leaf_id, struct bpos *new_min_pos)
|
||||
{
|
||||
struct btree_iter iter;
|
||||
struct bpos pos = interior_k.k->p;
|
||||
struct bkey_s_c k;
|
||||
struct bkey_i *new;
|
||||
int ret;
|
||||
|
||||
pos.snapshot = leaf_id;
|
||||
|
||||
bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent);
|
||||
k = bch2_btree_iter_peek_slot(&iter);
|
||||
ret = bkey_err(k);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
/* key already overwritten in this snapshot? */
|
||||
if (k.k->p.snapshot != interior_k.k->p.snapshot)
|
||||
goto out;
|
||||
|
||||
if (bpos_eq(*new_min_pos, POS_MIN)) {
|
||||
*new_min_pos = k.k->p;
|
||||
new_min_pos->snapshot = leaf_id;
|
||||
}
|
||||
|
||||
new = bch2_bkey_make_mut_noupdate(trans, interior_k);
|
||||
ret = PTR_ERR_OR_ZERO(new);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
new->k.p.snapshot = leaf_id;
|
||||
ret = bch2_trans_update(trans, &iter, new, 0);
|
||||
out:
|
||||
bch2_set_btree_iter_dontneed(&iter);
|
||||
bch2_trans_iter_exit(trans, &iter);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans,
|
||||
enum btree_id btree,
|
||||
struct bkey_s_c k,
|
||||
struct bpos *new_min_pos)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct bkey_buf sk;
|
||||
u32 restart_count = trans->restart_count;
|
||||
int ret = 0;
|
||||
|
||||
bch2_bkey_buf_init(&sk);
|
||||
bch2_bkey_buf_reassemble(&sk, c, k);
|
||||
k = bkey_i_to_s_c(sk.k);
|
||||
|
||||
*new_min_pos = POS_MIN;
|
||||
|
||||
for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot);
|
||||
id < k.k->p.snapshot;
|
||||
id++) {
|
||||
if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) ||
|
||||
!bch2_snapshot_is_leaf(c, id))
|
||||
continue;
|
||||
again:
|
||||
ret = btree_trans_too_many_iters(trans) ?:
|
||||
bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos) ?:
|
||||
bch2_trans_commit(trans, NULL, NULL, 0);
|
||||
if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
|
||||
bch2_trans_begin(trans);
|
||||
goto again;
|
||||
}
|
||||
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
|
||||
bch2_bkey_buf_exit(&sk, c);
|
||||
|
||||
return ret ?: trans_was_restarted(trans, restart_count);
|
||||
}
|
||||
|
||||
static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
|
@ -259,9 +259,6 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
|
||||
return __bch2_key_has_snapshot_overwrites(trans, id, pos);
|
||||
}
|
||||
|
||||
int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *, enum btree_id,
|
||||
struct bkey_s_c, struct bpos *);
|
||||
|
||||
int bch2_snapshots_read(struct bch_fs *);
|
||||
void bch2_fs_snapshots_exit(struct bch_fs *);
|
||||
|
||||
|
@ -184,6 +184,7 @@ static DEFINE_MUTEX(bch_fs_list_lock);
|
||||
|
||||
DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait);
|
||||
|
||||
static void bch2_dev_unlink(struct bch_dev *);
|
||||
static void bch2_dev_free(struct bch_dev *);
|
||||
static int bch2_dev_alloc(struct bch_fs *, unsigned);
|
||||
static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
|
||||
@ -620,9 +621,7 @@ void __bch2_fs_stop(struct bch_fs *c)
|
||||
up_write(&c->state_lock);
|
||||
|
||||
for_each_member_device(c, ca)
|
||||
if (ca->kobj.state_in_sysfs &&
|
||||
ca->disk_sb.bdev)
|
||||
sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
|
||||
bch2_dev_unlink(ca);
|
||||
|
||||
if (c->kobj.state_in_sysfs)
|
||||
kobject_del(&c->kobj);
|
||||
@ -1187,9 +1186,7 @@ static void bch2_dev_free(struct bch_dev *ca)
|
||||
{
|
||||
cancel_work_sync(&ca->io_error_work);
|
||||
|
||||
if (ca->kobj.state_in_sysfs &&
|
||||
ca->disk_sb.bdev)
|
||||
sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
|
||||
bch2_dev_unlink(ca);
|
||||
|
||||
if (ca->kobj.state_in_sysfs)
|
||||
kobject_del(&ca->kobj);
|
||||
@ -1226,10 +1223,7 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
|
||||
percpu_ref_kill(&ca->io_ref);
|
||||
wait_for_completion(&ca->io_ref_completion);
|
||||
|
||||
if (ca->kobj.state_in_sysfs) {
|
||||
sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
|
||||
sysfs_remove_link(&ca->kobj, "block");
|
||||
}
|
||||
bch2_dev_unlink(ca);
|
||||
|
||||
bch2_free_super(&ca->disk_sb);
|
||||
bch2_dev_journal_exit(ca);
|
||||
@ -1251,6 +1245,26 @@ static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
|
||||
complete(&ca->io_ref_completion);
|
||||
}
|
||||
|
||||
static void bch2_dev_unlink(struct bch_dev *ca)
|
||||
{
|
||||
struct kobject *b;
|
||||
|
||||
/*
|
||||
* This is racy w.r.t. the underlying block device being hot-removed,
|
||||
* which removes it from sysfs.
|
||||
*
|
||||
* It'd be lovely if we had a way to handle this race, but the sysfs
|
||||
* code doesn't appear to provide a good method and block/holder.c is
|
||||
* susceptible as well:
|
||||
*/
|
||||
if (ca->kobj.state_in_sysfs &&
|
||||
ca->disk_sb.bdev &&
|
||||
(b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) {
|
||||
sysfs_remove_link(b, "bcachefs");
|
||||
sysfs_remove_link(&ca->kobj, "block");
|
||||
}
|
||||
}
|
||||
|
||||
static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
int ret;
|
||||
|
@ -454,4 +454,39 @@ do { \
|
||||
__closure_wait_event(waitlist, _cond); \
|
||||
} while (0)
|
||||
|
||||
#define __closure_wait_event_timeout(waitlist, _cond, _until) \
|
||||
({ \
|
||||
struct closure cl; \
|
||||
long _t; \
|
||||
\
|
||||
closure_init_stack(&cl); \
|
||||
\
|
||||
while (1) { \
|
||||
closure_wait(waitlist, &cl); \
|
||||
if (_cond) { \
|
||||
_t = max_t(long, 1L, _until - jiffies); \
|
||||
break; \
|
||||
} \
|
||||
_t = max_t(long, 0L, _until - jiffies); \
|
||||
if (!_t) \
|
||||
break; \
|
||||
closure_sync_timeout(&cl, _t); \
|
||||
} \
|
||||
closure_wake_up(waitlist); \
|
||||
closure_sync(&cl); \
|
||||
_t; \
|
||||
})
|
||||
|
||||
/*
|
||||
* Returns 0 if timeout expired, remaining time in jiffies (at least 1) if
|
||||
* condition became true
|
||||
*/
|
||||
#define closure_wait_event_timeout(waitlist, _cond, _timeout) \
|
||||
({ \
|
||||
unsigned long _until = jiffies + _timeout; \
|
||||
(_cond) \
|
||||
? max_t(long, 1L, _until - jiffies) \
|
||||
: __closure_wait_event_timeout(waitlist, _cond, _until);\
|
||||
})
|
||||
|
||||
#endif /* _LINUX_CLOSURE_H */
|
||||
|
Loading…
Reference in New Issue
Block a user