bcachefs: KEY_TYPE_accounting

New key type for the disk space accounting rewrite.

 - Holds a variable sized array of u64s (may be more than one for
   accounting e.g. compressed and uncompressed size, or buckets and
   sectors for a given data type)

 - Updates are deltas, not new versions of the key: this means updates
   to accounting can happen via the btree write buffer, which we'll be
   teaching to accumulate deltas.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet 2023-12-27 18:31:46 -05:00
parent 929d954330
commit 2744e5c9eb
8 changed files with 303 additions and 49 deletions

View File

@ -29,10 +29,11 @@ bcachefs-y := \
clock.o \
compress.o \
darray.o \
data_update.o \
debug.o \
dirent.o \
disk_accounting.o \
disk_groups.o \
data_update.o \
ec.o \
errcode.o \
error.o \

View File

@ -417,7 +417,8 @@ static inline void bkey_init(struct bkey *k)
x(bucket_gens, 30) \
x(snapshot_tree, 31) \
x(logged_op_truncate, 32) \
x(logged_op_finsert, 33)
x(logged_op_finsert, 33) \
x(accounting, 34)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
@ -505,6 +506,9 @@ struct bch_sb_field {
x(downgrade, 14)
#include "alloc_background_format.h"
#include "dirent_format.h"
#include "disk_accounting_format.h"
#include "disk_groups_format.h"
#include "extents_format.h"
#include "ec_format.h"
#include "dirent_format.h"
@ -602,49 +606,6 @@ LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16);
LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32);
LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
/*
 * X-macro list of data types, one x(name, number) entry per type.
 *
 * Users define x() to pick the columns they need, invoke BCH_DATA_TYPES(),
 * then #undef x again.
 */
#define BCH_DATA_TYPES() \
x(free, 0) \
x(sb, 1) \
x(journal, 2) \
x(btree, 3) \
x(user, 4) \
x(cached, 5) \
x(parity, 6) \
x(stripe, 7) \
x(need_gc_gens, 8) \
x(need_discard, 9) \
x(unstriped, 10)
/*
 * One BCH_DATA_<name> enumerator per BCH_DATA_TYPES() entry.
 *
 * The x() expansion used here drops the number column, so enumerators are
 * numbered positionally from 0; that matches the numbers in the list only as
 * long as the list stays dense and in ascending order.
 */
enum bch_data_type {
#define x(t, n) BCH_DATA_##t,
BCH_DATA_TYPES()
#undef x
/* Number of data types; follows the last list entry */
BCH_DATA_NR
};
/*
 * Report whether @type is one of the data types treated as empty:
 * free, need_gc_gens or need_discard. Every other value yields false.
 */
static inline bool data_type_is_empty(enum bch_data_type type)
{
	return type == BCH_DATA_free ||
	       type == BCH_DATA_need_gc_gens ||
	       type == BCH_DATA_need_discard;
}
/*
 * Report whether @type is one of the data types treated as hidden:
 * sb or journal. Every other value yields false.
 */
static inline bool data_type_is_hidden(enum bch_data_type type)
{
	return type == BCH_DATA_sb ||
	       type == BCH_DATA_journal;
}
/*
* On clean shutdown, store btree roots and current journal sequence number in
* the superblock:
@ -724,7 +685,8 @@ struct bch_sb_field_ext {
x(subvolume_fs_parent, BCH_VERSION(1, 5)) \
x(btree_subvolume_children, BCH_VERSION(1, 6)) \
x(mi_btree_bitmap, BCH_VERSION(1, 7)) \
x(bucket_stripe_sectors, BCH_VERSION(1, 8))
x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \
x(disk_accounting_v2, BCH_VERSION(1, 9))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@ -1377,7 +1339,9 @@ enum btree_id_flags {
x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \
BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \
x(subvolume_children, 19, 0, \
BIT_ULL(KEY_TYPE_set))
BIT_ULL(KEY_TYPE_set)) \
x(accounting, 20, BTREE_ID_SNAPSHOT_FIELD, \
BIT_ULL(KEY_TYPE_accounting)) \
enum btree_id {
#define x(name, nr, ...) BTREE_ID_##name = nr,

View File

@ -7,6 +7,7 @@
#include "btree_types.h"
#include "alloc_background.h"
#include "dirent.h"
#include "disk_accounting.h"
#include "ec.h"
#include "error.h"
#include "extents.h"

View File

@ -0,0 +1,70 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "btree_update.h"
#include "buckets.h"
#include "disk_accounting.h"
#include "replicas.h"
/*
 * Printable names of the disk accounting types, indexed by type number.
 *
 * Generated from BCH_DISK_ACCOUNTING_TYPES(): each entry stores the
 * stringified type name at its own number. The trailing NULL lands in the
 * slot after the highest numbered entry.
 */
static const char * const disk_accounting_type_strs[] = {
#define x(t, n, ...) [n] = #t,
BCH_DISK_ACCOUNTING_TYPES()
#undef x
NULL
};
/*
 * Validation hook for KEY_TYPE_accounting keys.
 *
 * No checks are implemented here: every key is accepted and 0 is returned
 * unconditionally. @c, @k, @flags and @err are not used.
 */
int bch2_accounting_invalid(struct bch_fs *c,
			    struct bkey_s_c k,
			    enum bch_validate_flags flags,
			    struct printbuf *err)
{
	return 0;
}
/*
 * Print a description of the accounting key position @k to @out.
 *
 * Known types are printed as the type name, a single space, and then any
 * type-specific fields. A type number at or beyond
 * BCH_DISK_ACCOUNTING_TYPE_NR is printed as "unknown type <n>" and nothing
 * else.
 */
void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k)
{
	const unsigned type = k->type;

	if (type < BCH_DISK_ACCOUNTING_TYPE_NR) {
		prt_str(out, disk_accounting_type_strs[type]);
		prt_str(out, " ");

		switch (type) {
		case BCH_DISK_ACCOUNTING_persistent_reserved:
			prt_printf(out, "replicas=%u", k->persistent_reserved.nr_replicas);
			break;
		case BCH_DISK_ACCOUNTING_replicas:
			bch2_replicas_entry_to_text(out, &k->replicas);
			break;
		case BCH_DISK_ACCOUNTING_dev_data_type:
			prt_printf(out, "dev=%u data_type=", k->dev_data_type.dev);
			bch2_prt_data_type(out, k->dev_data_type.data_type);
			break;
		case BCH_DISK_ACCOUNTING_dev_stripe_buckets:
			prt_printf(out, "dev=%u", k->dev_stripe_buckets.dev);
			break;
		case BCH_DISK_ACCOUNTING_nr_inodes:
		default:
			/* no type-specific fields to print */
			break;
		}
	} else {
		prt_printf(out, "unknown type %u", type);
	}
}
void bch2_accounting_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_accounting acc = bkey_s_c_to_accounting(k);
struct disk_accounting_pos acc_k;
bpos_to_disk_accounting_pos(&acc_k, k.k->p);
bch2_accounting_key_to_text(out, &acc_k);
for (unsigned i = 0; i < bch2_accounting_counters(k.k); i++)
prt_printf(out, " %lli", acc.v->d[i]);
}
void bch2_accounting_swab(struct bkey_s k)
{
for (u64 *p = (u64 *) k.v;
p < (u64 *) bkey_val_end(k);
p++)
*p = swab64(*p);
}

View File

@ -0,0 +1,52 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_DISK_ACCOUNTING_H
#define _BCACHEFS_DISK_ACCOUNTING_H
/*
 * Number of counters stored in the value of accounting key @k: the value
 * size in u64s minus the u64s that precede the d[] array in
 * struct bch_accounting.
 */
static inline unsigned bch2_accounting_counters(const struct bkey *k)
{
	const size_t header_u64s = offsetof(struct bch_accounting, d) / sizeof(u64);

	return bkey_val_u64s(k) - header_u64s;
}
/*
 * Fold the delta @src into @dst.
 *
 * Each counter of @src is added to the counter at the same index in @dst,
 * and @dst takes over @src's version when that version compares greater.
 * Both keys are expected to have the same size (checked with EBUG_ON).
 */
static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst,
					      struct bkey_s_c_accounting src)
{
	EBUG_ON(dst->k.u64s != src.k->u64s);

	const unsigned nr_counters = bch2_accounting_counters(&dst->k);
	unsigned idx;

	for (idx = 0; idx < nr_counters; idx++)
		dst->v.d[idx] += src.v->d[idx];

	if (bversion_cmp(dst->k.version, src.k->version) < 0)
		dst->k.version = src.k->version;
}
/*
 * Convert btree position @p into the accounting key representation @acc.
 *
 * @p is copied into the _pad member, which overlays the typed fields of
 * struct disk_accounting_pos. On big endian builds the copy is additionally
 * run through bch2_bpos_swab().
 *
 * NOTE(review): presumably the swab keeps the typed fields at the same
 * position within the key regardless of host byte order - confirm against
 * bch2_bpos_swab(), which is not visible here.
 */
static inline void bpos_to_disk_accounting_pos(struct disk_accounting_pos *acc, struct bpos p)
{
acc->_pad = p;
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
bch2_bpos_swab(&acc->_pad);
#endif
}
/*
 * Convert accounting key @k back into a btree position.
 *
 * Returns a copy of the _pad member of @k; on big endian builds the copy is
 * run through bch2_bpos_swab() before being returned. @k itself is not
 * modified.
 */
static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *k)
{
struct bpos ret = k->_pad;
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
bch2_bpos_swab(&ret);
#endif
return ret;
}
/*
 * KEY_TYPE_accounting helpers. bch2_accounting_invalid(),
 * bch2_accounting_to_text() and bch2_accounting_swab() are the callbacks
 * referenced by bch2_bkey_ops_accounting; bch2_accounting_key_to_text()
 * prints only the decoded key position.
 */
int bch2_accounting_invalid(struct bch_fs *, struct bkey_s_c,
enum bch_validate_flags, struct printbuf *);
void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *);
void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_accounting_swab(struct bkey_s);
/*
 * bkey_ops for KEY_TYPE_accounting: validation, value printing and byte
 * swapping callbacks, plus a minimum value size of 8.
 * NOTE(review): 8 is presumably bytes, i.e. at least one u64 counter -
 * confirm against the definition of struct bkey_ops.
 */
#define bch2_bkey_ops_accounting ((struct bkey_ops) { \
.key_invalid = bch2_accounting_invalid, \
.val_to_text = bch2_accounting_to_text, \
.swab = bch2_accounting_swab, \
.min_val_size = 8, \
})
#endif /* _BCACHEFS_DISK_ACCOUNTING_H */

View File

@ -0,0 +1,144 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
#define _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
#include "replicas_format.h"
/*
* Disk accounting - KEY_TYPE_accounting - on disk format:
*
* Here, the key has considerably more structure than a typical key (bpos); an
* accounting key is 'struct disk_accounting_pos', which is a union of bpos.
*
* More specifically: a key is just a multiword integer (where word endianness
* matches native byte order), so we're treating bpos as an opaque 20 byte
* integer and mapping struct disk_accounting_pos to that.
*
* This is a type-tagged union of all our various subtypes; a disk accounting
* key can be device counters, replicas counters, et cetera - it's extensible.
*
* The value is a list of u64s or s64s; the number of counters is specific to a
* given accounting type.
*
* Unlike with other key types, updates are _deltas_, and the deltas are not
* resolved until the update to the underlying btree, done by btree write buffer
* flush or journal replay.
*
* Journal replay in particular requires special handling. The journal tracks a
* range of entries which may not yet have been applied to the btree - it does
* not know definitively whether individual entries are dirty and still need to
* be applied.
*
* To handle this, we use the version field of struct bkey, and give every
* accounting update a unique version number - a total ordering in time; the
* version number is derived from the key's position in the journal. Then
* journal replay can compare the version number of the key from the journal
* with the version number of the key in the btree to determine if a key needs
* to be replayed.
*
* For this to work, we must maintain this strict time ordering of updates as
* they are flushed to the btree, both via write buffer flush and via journal
* replay. This has complications for the write buffer code while journal replay
* is still in progress; the write buffer cannot flush any accounting keys to
* the btree until journal replay has finished replaying its accounting keys, or
* the (newer) version number of the keys from the write buffer will cause
* updates from journal replay to be lost.
*/
/*
 * Value of a KEY_TYPE_accounting key: the common bch_val header followed by
 * a flexible array of 64 bit counters. The number of counters is not stored
 * here; it follows from the size of the value.
 */
struct bch_accounting {
struct bch_val v;
/* Counters; declared unsigned, printed as signed by the to_text code */
__u64 d[];
};
/*
 * NOTE(review): presumably the largest number of counters a single accounting
 * key may carry; not referenced elsewhere in the code visible here - confirm
 * against its users.
 */
#define BCH_ACCOUNTING_MAX_COUNTERS 3
/*
 * X-macro list of data types, one x(name, number) entry per type.
 *
 * Users define x() to pick the columns they need, invoke BCH_DATA_TYPES(),
 * then #undef x again.
 */
#define BCH_DATA_TYPES() \
x(free, 0) \
x(sb, 1) \
x(journal, 2) \
x(btree, 3) \
x(user, 4) \
x(cached, 5) \
x(parity, 6) \
x(stripe, 7) \
x(need_gc_gens, 8) \
x(need_discard, 9) \
x(unstriped, 10)
/*
 * One BCH_DATA_<name> enumerator per BCH_DATA_TYPES() entry.
 *
 * The x() expansion used here drops the number column, so enumerators are
 * numbered positionally from 0; that matches the numbers in the list only as
 * long as the list stays dense and in ascending order.
 */
enum bch_data_type {
#define x(t, n) BCH_DATA_##t,
BCH_DATA_TYPES()
#undef x
/* Number of data types; follows the last list entry */
BCH_DATA_NR
};
/*
 * Report whether @type is one of the data types treated as empty:
 * free, need_gc_gens or need_discard. Every other value yields false.
 */
static inline bool data_type_is_empty(enum bch_data_type type)
{
	return type == BCH_DATA_free ||
	       type == BCH_DATA_need_gc_gens ||
	       type == BCH_DATA_need_discard;
}
/*
 * Report whether @type is one of the data types treated as hidden:
 * sb or journal. Every other value yields false.
 */
static inline bool data_type_is_hidden(enum bch_data_type type)
{
	return type == BCH_DATA_sb ||
	       type == BCH_DATA_journal;
}
/*
 * X-macro list of disk accounting key types, one x(name, number) entry per
 * type. The number is what ends up in the type field of
 * struct disk_accounting_pos.
 */
#define BCH_DISK_ACCOUNTING_TYPES() \
x(nr_inodes, 0) \
x(persistent_reserved, 1) \
x(replicas, 2) \
x(dev_data_type, 3) \
x(dev_stripe_buckets, 4)
/*
 * One BCH_DISK_ACCOUNTING_<name> enumerator per BCH_DISK_ACCOUNTING_TYPES()
 * entry, each explicitly assigned the number from the list.
 */
enum disk_accounting_type {
#define x(f, nr) BCH_DISK_ACCOUNTING_##f = nr,
BCH_DISK_ACCOUNTING_TYPES()
#undef x
/* One past the last listed type; used as the bound for valid type numbers */
BCH_DISK_ACCOUNTING_TYPE_NR,
};
/* Key payload for BCH_DISK_ACCOUNTING_nr_inodes: carries no fields */
struct bch_nr_inodes {
};
/* Key payload for BCH_DISK_ACCOUNTING_persistent_reserved: a replica count */
struct bch_persistent_reserved {
__u8 nr_replicas;
};
/*
 * Key payload for BCH_DISK_ACCOUNTING_dev_data_type: a device number and a
 * data type (printed with bch2_prt_data_type() by the to_text code).
 */
struct bch_dev_data_type {
__u8 dev;
__u8 data_type;
};
/* Key payload for BCH_DISK_ACCOUNTING_dev_stripe_buckets: a device number */
struct bch_dev_stripe_buckets {
__u8 dev;
};
/*
 * Structured view of an accounting key's position.
 *
 * The outer union overlays a type-tagged payload with a struct bpos (_pad),
 * which is what the conversion helpers copy to and from.
 */
struct disk_accounting_pos {
union {
struct {
/* Type tag; the to_text code compares it against BCH_DISK_ACCOUNTING_TYPE_NR */
__u8 type;
/* Payload, selected by type */
union {
struct bch_nr_inodes nr_inodes;
struct bch_persistent_reserved persistent_reserved;
struct bch_replicas_entry_v1 replicas;
struct bch_dev_data_type dev_data_type;
struct bch_dev_stripe_buckets dev_stripe_buckets;
};
};
/* The same bytes viewed as a btree position */
struct bpos _pad;
};
};
#endif /* _BCACHEFS_DISK_ACCOUNTING_FORMAT_H */

View File

@ -90,6 +90,7 @@ static void bch2_reconstruct_alloc(struct bch_fs *c)
__set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
bch2_write_super(c);

View File

@ -54,11 +54,32 @@
BCH_FSCK_ERR_subvol_children_not_set) \
x(mi_btree_bitmap, \
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
BCH_FSCK_ERR_btree_bitmap_not_marked)
BCH_FSCK_ERR_btree_bitmap_not_marked) \
x(disk_accounting_v2, \
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
BCH_FSCK_ERR_bkey_version_in_future, \
BCH_FSCK_ERR_dev_usage_buckets_wrong, \
BCH_FSCK_ERR_dev_usage_sectors_wrong, \
BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
BCH_FSCK_ERR_accounting_mismatch)
#define DOWNGRADE_TABLE() \
x(bucket_stripe_sectors, \
0)
0) \
x(disk_accounting_v2, \
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
BCH_FSCK_ERR_dev_usage_buckets_wrong, \
BCH_FSCK_ERR_dev_usage_sectors_wrong, \
BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
BCH_FSCK_ERR_fs_usage_hidden_wrong, \
BCH_FSCK_ERR_fs_usage_btree_wrong, \
BCH_FSCK_ERR_fs_usage_data_wrong, \
BCH_FSCK_ERR_fs_usage_cached_wrong, \
BCH_FSCK_ERR_fs_usage_reserved_wrong, \
BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \
BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \
BCH_FSCK_ERR_fs_usage_replicas_wrong, \
BCH_FSCK_ERR_bkey_version_in_future)
struct upgrade_downgrade_entry {
u64 recovery_passes;