bcachefs: Initial commit

Initially forked from drivers/md/bcache, bcachefs is a new copy-on-write
filesystem with every feature you could possibly want.

Website: https://bcachefs.org

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

parent 0d29a833b7
commit 1c6fdbd8f2
fs/Kconfig
@@ -48,6 +48,7 @@ source "fs/ocfs2/Kconfig"
source "fs/btrfs/Kconfig"
source "fs/nilfs2/Kconfig"
source "fs/f2fs/Kconfig"
source "fs/bcachefs/Kconfig"
source "fs/zonefs/Kconfig"

endif # BLOCK
fs/Makefile
@@ -123,6 +123,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
obj-$(CONFIG_F2FS_FS) += f2fs/
obj-$(CONFIG_BCACHEFS_FS) += bcachefs/
obj-$(CONFIG_CEPH_FS) += ceph/
obj-$(CONFIG_PSTORE) += pstore/
obj-$(CONFIG_EFIVAR_FS) += efivarfs/
52 fs/bcachefs/Kconfig Normal file
@@ -0,0 +1,52 @@

config BCACHEFS_FS
	tristate "bcachefs filesystem support"
	depends on BLOCK
	select EXPORTFS
	select CLOSURES
	select LIBCRC32C
	select FS_POSIX_ACL
	select LZ4_COMPRESS
	select LZ4_DECOMPRESS
	select ZLIB_DEFLATE
	select ZLIB_INFLATE
	select ZSTD_COMPRESS
	select ZSTD_DECOMPRESS
	select CRYPTO_SHA256
	select CRYPTO_CHACHA20
	select CRYPTO_POLY1305
	select KEYS
	help
	The bcachefs filesystem - a modern, copy on write filesystem, with
	support for multiple devices, compression, checksumming, etc.

config BCACHEFS_QUOTA
	bool "bcachefs quota support"
	depends on BCACHEFS_FS
	select QUOTACTL

config BCACHEFS_POSIX_ACL
	bool "bcachefs POSIX ACL support"
	depends on BCACHEFS_FS
	select FS_POSIX_ACL

config BCACHEFS_DEBUG
	bool "bcachefs debugging"
	depends on BCACHEFS_FS
	help
	Enables many extra debugging checks and assertions.

	The resulting code will be significantly slower than normal; you
	probably shouldn't select this option unless you're a developer.

config BCACHEFS_TESTS
	bool "bcachefs unit and performance tests"
	depends on BCACHEFS_FS
	help
	Include some unit and performance tests for the core btree code

config BCACHEFS_NO_LATENCY_ACCT
	bool "disable latency accounting and time stats"
	depends on BCACHEFS_FS
	help
	This disables device latency tracking and time stats, only for performance testing
53 fs/bcachefs/Makefile Normal file
@@ -0,0 +1,53 @@

obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o

bcachefs-y := \
	acl.o \
	alloc.o \
	bkey.o \
	bkey_methods.o \
	bset.o \
	btree_cache.o \
	btree_gc.o \
	btree_io.o \
	btree_iter.o \
	btree_update_interior.o \
	btree_update_leaf.o \
	buckets.o \
	chardev.o \
	checksum.o \
	clock.o \
	compress.o \
	debug.o \
	dirent.o \
	disk_groups.o \
	error.o \
	extents.o \
	fs.o \
	fs-ioctl.o \
	fs-io.o \
	fsck.o \
	inode.o \
	io.o \
	journal.o \
	journal_io.o \
	journal_reclaim.o \
	journal_seq_blacklist.o \
	keylist.o \
	migrate.o \
	move.o \
	movinggc.o \
	opts.o \
	quota.o \
	rebalance.o \
	recovery.o \
	replicas.o \
	siphash.o \
	six.o \
	super.o \
	super-io.o \
	sysfs.o \
	tests.o \
	trace.o \
	util.o \
	xattr.o
387 fs/bcachefs/acl.c Normal file
@@ -0,0 +1,387 @@
// SPDX-License-Identifier: GPL-2.0
#ifdef CONFIG_BCACHEFS_POSIX_ACL

#include "bcachefs.h"

#include <linux/fs.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/sched.h>
#include <linux/slab.h>

#include "acl.h"
#include "fs.h"
#include "xattr.h"

static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long)
{
	return sizeof(bch_acl_header) +
		sizeof(bch_acl_entry_short) * nr_short +
		sizeof(bch_acl_entry) * nr_long;
}

static inline int acl_to_xattr_type(int type)
{
	switch (type) {
	case ACL_TYPE_ACCESS:
		return BCH_XATTR_INDEX_POSIX_ACL_ACCESS;
	case ACL_TYPE_DEFAULT:
		return BCH_XATTR_INDEX_POSIX_ACL_DEFAULT;
	default:
		BUG();
	}
}

/*
 * Convert from filesystem to in-memory representation.
 */
static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size)
{
	const void *p, *end = value + size;
	struct posix_acl *acl;
	struct posix_acl_entry *out;
	unsigned count = 0;

	if (!value)
		return NULL;
	if (size < sizeof(bch_acl_header))
		goto invalid;
	if (((bch_acl_header *)value)->a_version !=
	    cpu_to_le32(BCH_ACL_VERSION))
		goto invalid;

	p = value + sizeof(bch_acl_header);
	while (p < end) {
		const bch_acl_entry *entry = p;

		if (p + sizeof(bch_acl_entry_short) > end)
			goto invalid;

		switch (le16_to_cpu(entry->e_tag)) {
		case ACL_USER_OBJ:
		case ACL_GROUP_OBJ:
		case ACL_MASK:
		case ACL_OTHER:
			p += sizeof(bch_acl_entry_short);
			break;
		case ACL_USER:
		case ACL_GROUP:
			p += sizeof(bch_acl_entry);
			break;
		default:
			goto invalid;
		}

		count++;
	}

	if (p > end)
		goto invalid;

	if (!count)
		return NULL;

	acl = posix_acl_alloc(count, GFP_KERNEL);
	if (!acl)
		return ERR_PTR(-ENOMEM);

	out = acl->a_entries;

	p = value + sizeof(bch_acl_header);
	while (p < end) {
		const bch_acl_entry *in = p;

		out->e_tag = le16_to_cpu(in->e_tag);
		out->e_perm = le16_to_cpu(in->e_perm);

		switch (out->e_tag) {
		case ACL_USER_OBJ:
		case ACL_GROUP_OBJ:
		case ACL_MASK:
		case ACL_OTHER:
			p += sizeof(bch_acl_entry_short);
			break;
		case ACL_USER:
			out->e_uid = make_kuid(&init_user_ns,
					       le32_to_cpu(in->e_id));
			p += sizeof(bch_acl_entry);
			break;
		case ACL_GROUP:
			out->e_gid = make_kgid(&init_user_ns,
					       le32_to_cpu(in->e_id));
			p += sizeof(bch_acl_entry);
			break;
		}

		out++;
	}

	BUG_ON(out != acl->a_entries + acl->a_count);

	return acl;
invalid:
	pr_err("invalid acl entry");
	return ERR_PTR(-EINVAL);
}

#define acl_for_each_entry(acl, acl_e)				\
	for (acl_e = acl->a_entries;				\
	     acl_e < acl->a_entries + acl->a_count;		\
	     acl_e++)

/*
 * Convert from in-memory to filesystem representation.
 */
static struct bkey_i_xattr *
bch2_acl_to_xattr(struct btree_trans *trans,
		  const struct posix_acl *acl,
		  int type)
{
	struct bkey_i_xattr *xattr;
	bch_acl_header *acl_header;
	const struct posix_acl_entry *acl_e;
	void *outptr;
	unsigned nr_short = 0, nr_long = 0, acl_len, u64s;

	acl_for_each_entry(acl, acl_e) {
		switch (acl_e->e_tag) {
		case ACL_USER:
		case ACL_GROUP:
			nr_long++;
			break;
		case ACL_USER_OBJ:
		case ACL_GROUP_OBJ:
		case ACL_MASK:
		case ACL_OTHER:
			nr_short++;
			break;
		default:
			return ERR_PTR(-EINVAL);
		}
	}

	acl_len = bch2_acl_size(nr_short, nr_long);
	u64s = BKEY_U64s + xattr_val_u64s(0, acl_len);

	if (u64s > U8_MAX)
		return ERR_PTR(-E2BIG);

	xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
	if (IS_ERR(xattr))
		return xattr;

	bkey_xattr_init(&xattr->k_i);
	xattr->k.u64s		= u64s;
	xattr->v.x_type		= acl_to_xattr_type(type);
	xattr->v.x_name_len	= 0,
	xattr->v.x_val_len	= cpu_to_le16(acl_len);

	acl_header = xattr_val(&xattr->v);
	acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION);

	outptr = (void *) acl_header + sizeof(*acl_header);

	acl_for_each_entry(acl, acl_e) {
		bch_acl_entry *entry = outptr;

		entry->e_tag = cpu_to_le16(acl_e->e_tag);
		entry->e_perm = cpu_to_le16(acl_e->e_perm);
		switch (acl_e->e_tag) {
		case ACL_USER:
			entry->e_id = cpu_to_le32(
				from_kuid(&init_user_ns, acl_e->e_uid));
			outptr += sizeof(bch_acl_entry);
			break;
		case ACL_GROUP:
			entry->e_id = cpu_to_le32(
				from_kgid(&init_user_ns, acl_e->e_gid));
			outptr += sizeof(bch_acl_entry);
			break;

		case ACL_USER_OBJ:
		case ACL_GROUP_OBJ:
		case ACL_MASK:
		case ACL_OTHER:
			outptr += sizeof(bch_acl_entry_short);
			break;
		}
	}

	BUG_ON(outptr != xattr_val(&xattr->v) + acl_len);

	return xattr;
}

struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,
			       struct dentry *dentry, int type)
{
	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c_xattr xattr;
	struct posix_acl *acl = NULL;

	bch2_trans_init(&trans, c);
retry:
	bch2_trans_begin(&trans);

	iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc,
			&inode->ei_str_hash, inode->v.i_ino,
			&X_SEARCH(acl_to_xattr_type(type), "", 0),
			0);
	if (IS_ERR(iter)) {
		if (PTR_ERR(iter) == -EINTR)
			goto retry;

		if (PTR_ERR(iter) != -ENOENT)
			acl = ERR_CAST(iter);
		goto out;
	}

	xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));

	acl = bch2_acl_from_disk(xattr_val(xattr.v),
			le16_to_cpu(xattr.v->x_val_len));

	if (!IS_ERR(acl))
		set_cached_acl(&inode->v, type, acl);
out:
	bch2_trans_exit(&trans);
	return acl;
}

int bch2_set_acl_trans(struct btree_trans *trans,
		       struct bch_inode_unpacked *inode_u,
		       const struct bch_hash_info *hash_info,
		       struct posix_acl *acl, int type)
{
	int ret;

	if (type == ACL_TYPE_DEFAULT &&
	    !S_ISDIR(inode_u->bi_mode))
		return acl ? -EACCES : 0;

	if (acl) {
		struct bkey_i_xattr *xattr =
			bch2_acl_to_xattr(trans, acl, type);
		if (IS_ERR(xattr))
			return PTR_ERR(xattr);

		ret = __bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
				      inode_u->bi_inum, &xattr->k_i, 0);
	} else {
		struct xattr_search_key search =
			X_SEARCH(acl_to_xattr_type(type), "", 0);

		ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info,
				       inode_u->bi_inum, &search);
	}

	return ret == -ENOENT ? 0 : ret;
}

static int inode_update_for_set_acl_fn(struct bch_inode_info *inode,
				       struct bch_inode_unpacked *bi,
				       void *p)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct timespec64 now = current_time(&inode->v);
	umode_t mode = (unsigned long) p;

	bi->bi_ctime = timespec_to_bch2_time(c, now);
	bi->bi_mode = mode;
	return 0;
}

int bch2_set_acl(struct mnt_idmap *idmap,
		 struct dentry *dentry,
		 struct posix_acl *acl, int type)
{
	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct btree_trans trans;
	struct bch_inode_unpacked inode_u;
	umode_t mode = inode->v.i_mode;
	int ret;

	if (type == ACL_TYPE_ACCESS && acl) {
		ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl);
		if (ret)
			return ret;
	}

	bch2_trans_init(&trans, c);
retry:
	bch2_trans_begin(&trans);

	ret = bch2_set_acl_trans(&trans,
			&inode->ei_inode,
			&inode->ei_str_hash,
			acl, type) ?:
		bch2_write_inode_trans(&trans, inode, &inode_u,
			inode_update_for_set_acl_fn,
			(void *)(unsigned long) mode) ?:
		bch2_trans_commit(&trans, NULL, NULL,
			&inode->ei_journal_seq,
			BTREE_INSERT_ATOMIC|
			BTREE_INSERT_NOUNLOCK);
	if (ret == -EINTR)
		goto retry;
	if (unlikely(ret))
		goto err;

	bch2_inode_update_after_write(c, inode, &inode_u,
				      ATTR_CTIME|ATTR_MODE);

	set_cached_acl(&inode->v, type, acl);
err:
	bch2_trans_exit(&trans);

	return ret;
}

int bch2_acl_chmod(struct btree_trans *trans,
		   struct bch_inode_info *inode,
		   umode_t mode,
		   struct posix_acl **new_acl)
{
	struct btree_iter *iter;
	struct bkey_s_c_xattr xattr;
	struct bkey_i_xattr *new;
	struct posix_acl *acl;
	int ret = 0;

	iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc,
			&inode->ei_str_hash, inode->v.i_ino,
			&X_SEARCH(BCH_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0),
			BTREE_ITER_INTENT);
	if (IS_ERR(iter))
		return PTR_ERR(iter) != -ENOENT ? PTR_ERR(iter) : 0;

	xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));

	acl = bch2_acl_from_disk(xattr_val(xattr.v),
			le16_to_cpu(xattr.v->x_val_len));
	if (IS_ERR_OR_NULL(acl))
		return PTR_ERR(acl);

	ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
	if (ret)
		goto err;

	new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto err;
	}

	bch2_trans_update(trans, iter, &new->k_i, 0);
	*new_acl = acl;
	acl = NULL;
err:
	kfree(acl);
	return ret;
}

#endif /* CONFIG_BCACHEFS_POSIX_ACL */
59 fs/bcachefs/acl.h Normal file
@@ -0,0 +1,59 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_ACL_H
#define _BCACHEFS_ACL_H

struct bch_inode_unpacked;
struct bch_hash_info;
struct bch_inode_info;
struct posix_acl;

#ifdef CONFIG_BCACHEFS_POSIX_ACL

#define BCH_ACL_VERSION	0x0001

typedef struct {
	__le16		e_tag;
	__le16		e_perm;
	__le32		e_id;
} bch_acl_entry;

typedef struct {
	__le16		e_tag;
	__le16		e_perm;
} bch_acl_entry_short;

typedef struct {
	__le32		a_version;
} bch_acl_header;

struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int);

int bch2_set_acl_trans(struct btree_trans *,
		       struct bch_inode_unpacked *,
		       const struct bch_hash_info *,
		       struct posix_acl *, int);
int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int);
int bch2_acl_chmod(struct btree_trans *, struct bch_inode_info *,
		   umode_t, struct posix_acl **);

#else

static inline int bch2_set_acl_trans(struct btree_trans *trans,
				     struct bch_inode_unpacked *inode_u,
				     const struct bch_hash_info *hash_info,
				     struct posix_acl *acl, int type)
{
	return 0;
}

static inline int bch2_acl_chmod(struct btree_trans *trans,
				 struct bch_inode_info *inode,
				 umode_t mode,
				 struct posix_acl **new_acl)
{
	return 0;
}

#endif /* CONFIG_BCACHEFS_POSIX_ACL */

#endif /* _BCACHEFS_ACL_H */
2205 fs/bcachefs/alloc.c Normal file
File diff suppressed because it is too large
141 fs/bcachefs/alloc.h Normal file
@@ -0,0 +1,141 @@
#ifndef _BCACHEFS_ALLOC_H
#define _BCACHEFS_ALLOC_H

#include "bcachefs.h"
#include "alloc_types.h"

struct bkey;
struct bch_dev;
struct bch_fs;
struct bch_devs_list;

const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);

#define bch2_bkey_alloc_ops (struct bkey_ops) {		\
	.key_invalid	= bch2_alloc_invalid,		\
	.val_to_text	= bch2_alloc_to_text,		\
}

struct dev_alloc_list {
	unsigned	nr;
	u8		devs[BCH_SB_MEMBERS_MAX];
};

struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *,
					 struct write_point *,
					 struct bch_devs_mask *);
void bch2_wp_rescale(struct bch_fs *, struct bch_dev *,
		     struct write_point *);

int bch2_alloc_read(struct bch_fs *, struct list_head *);
int bch2_alloc_replay_key(struct bch_fs *, struct bpos);

enum bucket_alloc_ret {
	ALLOC_SUCCESS		= 0,
	OPEN_BUCKETS_EMPTY	= -1,
	FREELIST_EMPTY		= -2,	/* Allocator thread not keeping up */
	NO_DEVICES		= -3,	/* -EROFS */
};

long bch2_bucket_alloc_new_fs(struct bch_dev *);

int bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve, bool,
		      struct closure *);

#define __writepoint_for_each_ptr(_wp, _ob, _i, _start)			\
	for ((_i) = (_start);						\
	     (_i) < (_wp)->nr_ptrs && ((_ob) = (_wp)->ptrs[_i], true);	\
	     (_i)++)

#define writepoint_for_each_ptr_all(_wp, _ob, _i)			\
	__writepoint_for_each_ptr(_wp, _ob, _i, 0)

#define writepoint_for_each_ptr(_wp, _ob, _i)				\
	__writepoint_for_each_ptr(_wp, _ob, _i, wp->first_ptr)

void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);

static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
	if (atomic_dec_and_test(&ob->pin))
		__bch2_open_bucket_put(c, ob);
}

static inline void bch2_open_bucket_put_refs(struct bch_fs *c, u8 *nr, u8 *refs)
{
	unsigned i;

	for (i = 0; i < *nr; i++)
		bch2_open_bucket_put(c, c->open_buckets + refs[i]);

	*nr = 0;
}

static inline void bch2_open_bucket_get(struct bch_fs *c,
					struct write_point *wp,
					u8 *nr, u8 *refs)
{
	struct open_bucket *ob;
	unsigned i;

	writepoint_for_each_ptr(wp, ob, i) {
		atomic_inc(&ob->pin);
		refs[(*nr)++] = ob - c->open_buckets;
	}
}

struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
					     unsigned,
					     struct write_point_specifier,
					     struct bch_devs_list *,
					     unsigned, unsigned,
					     enum alloc_reserve,
					     unsigned,
					     struct closure *);

void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
				    struct bkey_i_extent *, unsigned);
void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);

static inline void bch2_wake_allocator(struct bch_dev *ca)
{
	struct task_struct *p;

	rcu_read_lock();
	p = rcu_dereference(ca->alloc_thread);
	if (p)
		wake_up_process(p);
	rcu_read_unlock();
}

static inline struct write_point_specifier writepoint_hashed(unsigned long v)
{
	return (struct write_point_specifier) { .v = v | 1 };
}

static inline struct write_point_specifier writepoint_ptr(struct write_point *wp)
{
	return (struct write_point_specifier) { .v = (unsigned long) wp };
}

void bch2_recalc_capacity(struct bch_fs *);

void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);

void bch2_dev_allocator_stop(struct bch_dev *);
int bch2_dev_allocator_start(struct bch_dev *);

static inline void writepoint_init(struct write_point *wp,
				   enum bch_data_type type)
{
	mutex_init(&wp->lock);
	wp->type = type;
}

int bch2_alloc_write(struct bch_fs *);
int bch2_fs_allocator_start(struct bch_fs *);
void bch2_fs_allocator_init(struct bch_fs *);

#endif /* _BCACHEFS_ALLOC_H */
90 fs/bcachefs/alloc_types.h Normal file
@@ -0,0 +1,90 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_ALLOC_TYPES_H
#define _BCACHEFS_ALLOC_TYPES_H

#include <linux/mutex.h>
#include <linux/spinlock.h>

#include "clock_types.h"
#include "fifo.h"

/* There's two of these clocks, one for reads and one for writes: */
struct bucket_clock {
	/*
	 * "now" in (read/write) IO time - incremented whenever we do X amount
	 * of reads or writes.
	 *
	 * Goes with the bucket read/write prios: when we read or write to a
	 * bucket we reset the bucket's prio to the current hand; thus hand -
	 * prio = time since bucket was last read/written.
	 *
	 * The units are some amount (bytes/sectors) of data read/written, and
	 * the units can change on the fly if we need to rescale to fit
	 * everything in a u16 - your only guarantee is that the units are
	 * consistent.
	 */
	u16			hand;
	u16			max_last_io;

	int			rw;

	struct io_timer		rescale;
	struct mutex		lock;
};

/* There is one reserve for each type of btree, one for prios and gens
 * and one for moving GC */
enum alloc_reserve {
	RESERVE_ALLOC		= -1,
	RESERVE_BTREE		= 0,
	RESERVE_MOVINGGC	= 1,
	RESERVE_NONE		= 2,
	RESERVE_NR		= 3,
};

typedef FIFO(long)	alloc_fifo;

/* Enough for 16 cache devices, 2 tiers and some left over for pipelining */
#define OPEN_BUCKETS_COUNT	256
#define WRITE_POINT_COUNT	32

struct open_bucket {
	spinlock_t		lock;
	atomic_t		pin;
	u8			freelist;
	bool			valid;
	bool			on_partial_list;
	unsigned		sectors_free;
	struct bch_extent_ptr	ptr;
};

struct write_point {
	struct hlist_node	node;
	struct mutex		lock;
	u64			last_used;
	unsigned long		write_point;
	enum bch_data_type	type;

	u8			nr_ptrs;
	u8			first_ptr;

	/* calculated based on how many pointers we're actually going to use: */
	unsigned		sectors_free;

	struct open_bucket	*ptrs[BCH_REPLICAS_MAX * 2];
	u64			next_alloc[BCH_SB_MEMBERS_MAX];
};

struct write_point_specifier {
	unsigned long		v;
};

struct alloc_heap_entry {
	size_t			bucket;
	size_t			nr;
	unsigned long		key;
};

typedef HEAP(struct alloc_heap_entry) alloc_heap;

#endif /* _BCACHEFS_ALLOC_TYPES_H */
785 fs/bcachefs/bcachefs.h Normal file
@@ -0,0 +1,785 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_H
|
||||
#define _BCACHEFS_H
|
||||
|
||||
/*
|
||||
* SOME HIGH LEVEL CODE DOCUMENTATION:
|
||||
*
|
||||
* Bcache mostly works with cache sets, cache devices, and backing devices.
|
||||
*
|
||||
* Support for multiple cache devices hasn't quite been finished off yet, but
|
||||
* it's about 95% plumbed through. A cache set and its cache devices is sort of
|
||||
* like a md raid array and its component devices. Most of the code doesn't care
|
||||
* about individual cache devices, the main abstraction is the cache set.
|
||||
*
|
||||
* Multiple cache devices is intended to give us the ability to mirror dirty
|
||||
* cached data and metadata, without mirroring clean cached data.
|
||||
*
|
||||
* Backing devices are different, in that they have a lifetime independent of a
|
||||
* cache set. When you register a newly formatted backing device it'll come up
|
||||
* in passthrough mode, and then you can attach and detach a backing device from
|
||||
* a cache set at runtime - while it's mounted and in use. Detaching implicitly
|
||||
* invalidates any cached data for that backing device.
|
||||
*
|
||||
* A cache set can have multiple (many) backing devices attached to it.
|
||||
*
|
||||
* There's also flash only volumes - this is the reason for the distinction
|
||||
* between struct cached_dev and struct bcache_device. A flash only volume
|
||||
* works much like a bcache device that has a backing device, except the
|
||||
* "cached" data is always dirty. The end result is that we get thin
|
||||
* provisioning with very little additional code.
|
||||
*
|
||||
* Flash only volumes work but they're not production ready because the moving
|
||||
* garbage collector needs more work. More on that later.
|
||||
*
|
||||
* BUCKETS/ALLOCATION:
|
||||
*
|
||||
* Bcache is primarily designed for caching, which means that in normal
|
||||
* operation all of our available space will be allocated. Thus, we need an
|
||||
* efficient way of deleting things from the cache so we can write new things to
|
||||
* it.
|
||||
*
|
||||
* To do this, we first divide the cache device up into buckets. A bucket is the
|
||||
* unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+
|
||||
* works efficiently.
|
||||
*
|
||||
* Each bucket has a 16 bit priority, and an 8 bit generation associated with
|
||||
* it. The gens and priorities for all the buckets are stored contiguously and
|
||||
* packed on disk (in a linked list of buckets - aside from the superblock, all
|
||||
* of bcache's metadata is stored in buckets).
|
||||
*
|
||||
* The priority is used to implement an LRU. We reset a bucket's priority when
|
||||
* we allocate it or on cache it, and every so often we decrement the priority
|
||||
* of each bucket. It could be used to implement something more sophisticated,
|
||||
* if anyone ever gets around to it.
|
||||
*
|
||||
* The generation is used for invalidating buckets. Each pointer also has an 8
|
||||
* bit generation embedded in it; for a pointer to be considered valid, its gen
|
||||
* must match the gen of the bucket it points into. Thus, to reuse a bucket all
|
||||
* we have to do is increment its gen (and write its new gen to disk; we batch
|
||||
* this up).
|
||||
*
|
||||
* Bcache is entirely COW - we never write twice to a bucket, even buckets that
|
||||
* contain metadata (including btree nodes).
|
||||
*
|
||||
* THE BTREE:
|
||||
*
|
||||
* Bcache is in large part designed around the btree.
|
||||
*
|
||||
* At a high level, the btree is just an index of key -> ptr tuples.
|
||||
*
|
||||
* Keys represent extents, and thus have a size field. Keys also have a variable
|
||||
* number of pointers attached to them (potentially zero, which is handy for
|
||||
* invalidating the cache).
|
||||
*
|
||||
* The key itself is an inode:offset pair. The inode number corresponds to a
|
||||
* backing device or a flash only volume. The offset is the ending offset of the
|
||||
* extent within the inode - not the starting offset; this makes lookups
|
||||
* slightly more convenient.
|
||||
*
|
||||
* Pointers contain the cache device id, the offset on that device, and an 8 bit
|
||||
* generation number. More on the gen later.
|
||||
*
|
||||
* Index lookups are not fully abstracted - cache lookups in particular are
|
||||
* still somewhat mixed in with the btree code, but things are headed in that
|
||||
* direction.
|
||||
*
|
||||
* Updates are fairly well abstracted, though. There are two different ways of
|
||||
* updating the btree; insert and replace.
|
||||
*
|
||||
* BTREE_INSERT will just take a list of keys and insert them into the btree -
|
||||
* overwriting (possibly only partially) any extents they overlap with. This is
|
||||
* used to update the index after a write.
|
||||
*
|
||||
* BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
|
||||
* overwriting a key that matches another given key. This is used for inserting
|
||||
* data into the cache after a cache miss, and for background writeback, and for
|
||||
* the moving garbage collector.
|
||||
*
|
||||
* There is no "delete" operation; deleting things from the index is
|
||||
* accomplished by either by invalidating pointers (by incrementing a bucket's
|
||||
* gen) or by inserting a key with 0 pointers - which will overwrite anything
|
||||
* previously present at that location in the index.
|
||||
*
|
||||
* This means that there are always stale/invalid keys in the btree. They're
|
||||
* filtered out by the code that iterates through a btree node, and removed when
|
||||
* a btree node is rewritten.
|
||||
*
|
||||
* BTREE NODES:
|
||||
*
|
||||
* Our unit of allocation is a bucket, and we can't arbitrarily allocate and
|
||||
* free smaller than a bucket - so, that's how big our btree nodes are.
|
||||
*
|
||||
* (If buckets are really big we'll only use part of the bucket for a btree node
|
||||
* - no less than 1/4th - but a bucket still contains no more than a single
|
||||
* btree node. I'd actually like to change this, but for now we rely on the
|
||||
* bucket's gen for deleting btree nodes when we rewrite/split a node.)
|
||||
*
|
||||
* Anyways, btree nodes are big - big enough to be inefficient with a textbook
|
||||
* btree implementation.
|
||||
*
|
||||
* The way this is solved is that btree nodes are internally log structured; we
|
||||
* can append new keys to an existing btree node without rewriting it. This
|
||||
* means each set of keys we write is sorted, but the node is not.
|
||||
*
|
||||
* We maintain this log structure in memory - keeping 1Mb of keys sorted would
|
||||
* be expensive, and we have to distinguish between the keys we have written and
|
||||
* the keys we haven't. So to do a lookup in a btree node, we have to search
|
||||
* each sorted set. But we do merge written sets together lazily, so the cost of
|
||||
* these extra searches is quite low (normally most of the keys in a btree node
|
||||
* will be in one big set, and then there'll be one or two sets that are much
|
||||
* smaller).
|
||||
*
|
||||
* This log structure makes bcache's btree more of a hybrid between a
|
||||
* conventional btree and a compacting data structure, with some of the
|
||||
* advantages of both.
|
||||
*
|
||||
* GARBAGE COLLECTION:
|
||||
*
|
||||
* We can't just invalidate any bucket - it might contain dirty data or
|
||||
* metadata. If it once contained dirty data, other writes might overwrite it
|
||||
* later, leaving no valid pointers into that bucket in the index.
|
||||
*
|
||||
* Thus, the primary purpose of garbage collection is to find buckets to reuse.
|
||||
* It also counts how much valid data each bucket currently contains, so that
|
||||
* allocation can reuse buckets sooner when they've been mostly overwritten.
|
||||
*
|
||||
* It also does some things that are really internal to the btree
|
||||
* implementation. If a btree node contains pointers that are stale by more than
|
||||
* some threshold, it rewrites the btree node to avoid the bucket's generation
|
||||
* wrapping around. It also merges adjacent btree nodes if they're empty enough.
|
||||
*
|
||||
* THE JOURNAL:
|
||||
*
|
||||
* Bcache's journal is not necessary for consistency; we always strictly
|
||||
* order metadata writes so that the btree and everything else is consistent on
|
||||
* disk in the event of an unclean shutdown, and in fact bcache had writeback
|
||||
* caching (with recovery from unclean shutdown) before journalling was
|
||||
* implemented.
|
||||
*
|
||||
* Rather, the journal is purely a performance optimization; we can't complete a
|
||||
* write until we've updated the index on disk, otherwise the cache would be
|
||||
* inconsistent in the event of an unclean shutdown. This means that without the
|
||||
* journal, on random write workloads we constantly have to update all the leaf
|
||||
* nodes in the btree, and those writes will be mostly empty (appending at most
|
||||
* a few keys each) - highly inefficient in terms of amount of metadata writes,
|
||||
* and it puts more strain on the various btree resorting/compacting code.
|
||||
*
|
||||
* The journal is just a log of keys we've inserted; on startup we just reinsert
|
||||
* all the keys in the open journal entries. That means that when we're updating
|
||||
* a node in the btree, we can wait until a 4k block of keys fills up before
|
||||
* writing them out.
|
||||
*
|
||||
* For simplicity, we only journal updates to leaf nodes; updates to parent
|
||||
* nodes are rare enough (since our leaf nodes are huge) that it wasn't worth
|
||||
* the complexity to deal with journalling them (in particular, journal replay)
|
||||
* - updates to non leaf nodes just happen synchronously (see btree_split()).
|
||||
*/
|
||||
|
||||
#undef pr_fmt
|
||||
#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__
|
||||
|
||||
#include <linux/backing-dev-defs.h>
|
||||
#include <linux/bug.h>
|
||||
#include <linux/bio.h>
|
||||
#include <linux/closure.h>
|
||||
#include <linux/kobject.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/percpu-refcount.h>
|
||||
#include <linux/percpu-rwsem.h>
|
||||
#include <linux/rhashtable.h>
|
||||
#include <linux/rwsem.h>
|
||||
#include <linux/seqlock.h>
|
||||
#include <linux/shrinker.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/workqueue.h>
|
||||
#include <linux/zstd.h>
|
||||
|
||||
#include "bcachefs_format.h"
|
||||
#include "fifo.h"
|
||||
#include "opts.h"
|
||||
#include "util.h"
|
||||
|
||||
#define dynamic_fault(...) 0
|
||||
#define race_fault(...) 0
|
||||
|
||||
#define bch2_fs_init_fault(name) \
|
||||
dynamic_fault("bcachefs:bch_fs_init:" name)
|
||||
#define bch2_meta_read_fault(name) \
|
||||
dynamic_fault("bcachefs:meta:read:" name)
|
||||
#define bch2_meta_write_fault(name) \
|
||||
dynamic_fault("bcachefs:meta:write:" name)
|
||||
|
||||
#ifdef __KERNEL__
|
||||
#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name)
|
||||
#else
|
||||
#define bch2_fmt(_c, fmt) fmt "\n"
|
||||
#endif
|
||||
|
||||
#define bch_info(c, fmt, ...) \
|
||||
printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
|
||||
#define bch_notice(c, fmt, ...) \
|
||||
printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
|
||||
#define bch_warn(c, fmt, ...) \
|
||||
printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
|
||||
#define bch_err(c, fmt, ...) \
|
||||
printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
|
||||
|
||||
#define bch_verbose(c, fmt, ...) \
|
||||
do { \
|
||||
if ((c)->opts.verbose_recovery) \
|
||||
bch_info(c, fmt, ##__VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
#define pr_verbose_init(opts, fmt, ...) \
|
||||
do { \
|
||||
if (opt_get(opts, verbose_init)) \
|
||||
pr_info(fmt, ##__VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
/* Parameters that are useful for debugging, but should always be compiled in: */
|
||||
#define BCH_DEBUG_PARAMS_ALWAYS() \
|
||||
BCH_DEBUG_PARAM(key_merging_disabled, \
|
||||
"Disables merging of extents") \
|
||||
BCH_DEBUG_PARAM(btree_gc_always_rewrite, \
|
||||
"Causes mark and sweep to compact and rewrite every " \
|
||||
"btree node it traverses") \
|
||||
BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \
|
||||
"Disables rewriting of btree nodes during mark and sweep")\
|
||||
BCH_DEBUG_PARAM(btree_shrinker_disabled, \
|
||||
"Disables the shrinker callback for the btree node cache")
|
||||
|
||||
/* Parameters that should only be compiled in in debug mode: */
|
||||
#define BCH_DEBUG_PARAMS_DEBUG() \
|
||||
BCH_DEBUG_PARAM(expensive_debug_checks, \
|
||||
"Enables various runtime debugging checks that " \
|
||||
"significantly affect performance") \
|
||||
BCH_DEBUG_PARAM(debug_check_bkeys, \
|
||||
"Run bkey_debugcheck (primarily checking GC/allocation "\
|
||||
"information) when iterating over keys") \
|
||||
BCH_DEBUG_PARAM(verify_btree_ondisk, \
|
||||
"Reread btree nodes at various points to verify the " \
|
||||
"mergesort in the read path against modifications " \
|
||||
"done in memory") \
|
||||
BCH_DEBUG_PARAM(journal_seq_verify, \
|
||||
"Store the journal sequence number in the version " \
|
||||
"number of every btree key, and verify that btree " \
|
||||
"update ordering is preserved during recovery") \
|
||||
BCH_DEBUG_PARAM(inject_invalid_keys, \
|
||||
"Store the journal sequence number in the version " \
|
||||
"number of every btree key, and verify that btree " \
|
||||
"update ordering is preserved during recovery") \
|
||||
|
||||
#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL()
|
||||
#else
|
||||
#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
|
||||
#endif
|
||||
|
||||
#define BCH_TIME_STATS() \
|
||||
x(btree_node_mem_alloc) \
|
||||
x(btree_gc) \
|
||||
x(btree_split) \
|
||||
x(btree_sort) \
|
||||
x(btree_read) \
|
||||
x(btree_lock_contended_read) \
|
||||
x(btree_lock_contended_intent) \
|
||||
x(btree_lock_contended_write) \
|
||||
x(data_write) \
|
||||
x(data_read) \
|
||||
x(data_promote) \
|
||||
x(journal_write) \
|
||||
x(journal_delay) \
|
||||
x(journal_blocked) \
|
||||
x(journal_flush_seq)
|
||||
|
||||
enum bch_time_stats {
|
||||
#define x(name) BCH_TIME_##name,
|
||||
BCH_TIME_STATS()
|
||||
#undef x
|
||||
BCH_TIME_STAT_NR
|
||||
};
|
||||
|
||||
#include "alloc_types.h"
|
||||
#include "btree_types.h"
|
||||
#include "buckets_types.h"
|
||||
#include "clock_types.h"
|
||||
#include "journal_types.h"
|
||||
#include "keylist_types.h"
|
||||
#include "quota_types.h"
|
||||
#include "rebalance_types.h"
|
||||
#include "super_types.h"
|
||||
|
||||
/* Number of nodes btree coalesce will try to coalesce at once */
|
||||
#define GC_MERGE_NODES 4U
|
||||
|
||||
/* Maximum number of nodes we might need to allocate atomically: */
|
||||
#define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1))
|
||||
|
||||
/* Size of the freelist we allocate btree nodes from: */
|
||||
#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4)
|
||||
|
||||
struct btree;
|
||||
|
||||
enum gc_phase {
|
||||
GC_PHASE_START,
|
||||
GC_PHASE_SB,
|
||||
|
||||
#define DEF_BTREE_ID(kwd, val, name) GC_PHASE_BTREE_##kwd,
|
||||
DEFINE_BCH_BTREE_IDS()
|
||||
#undef DEF_BTREE_ID
|
||||
|
||||
GC_PHASE_PENDING_DELETE,
|
||||
GC_PHASE_ALLOC,
|
||||
GC_PHASE_DONE
|
||||
};
|
||||
|
||||
struct gc_pos {
|
||||
enum gc_phase phase;
|
||||
struct bpos pos;
|
||||
unsigned level;
|
||||
};
|
||||
|
||||
struct io_count {
|
||||
u64 sectors[2][BCH_DATA_NR];
|
||||
};
|
||||
|
||||
struct bch_dev {
|
||||
struct kobject kobj;
|
||||
struct percpu_ref ref;
|
||||
struct completion ref_completion;
|
||||
struct percpu_ref io_ref;
|
||||
struct completion io_ref_completion;
|
||||
|
||||
struct bch_fs *fs;
|
||||
|
||||
u8 dev_idx;
|
||||
/*
|
||||
* Cached version of this device's member info from superblock
|
||||
* Committed by bch2_write_super() -> bch_fs_mi_update()
|
||||
*/
|
||||
struct bch_member_cpu mi;
|
||||
__uuid_t uuid;
|
||||
char name[BDEVNAME_SIZE];
|
||||
|
||||
struct bch_sb_handle disk_sb;
|
||||
int sb_write_error;
|
||||
|
||||
struct bch_devs_mask self;
|
||||
|
||||
/* biosets used in cloned bios for writing multiple replicas */
|
||||
struct bio_set replica_set;
|
||||
|
||||
/*
|
||||
* Buckets:
|
||||
* Per-bucket arrays are protected by c->usage_lock, bucket_lock and
|
||||
* gc_lock, for device resize - holding any is sufficient for access:
|
||||
* Or rcu_read_lock(), but only for ptr_stale():
|
||||
*/
|
||||
struct bucket_array __rcu *buckets;
|
||||
unsigned long *buckets_dirty;
|
||||
/* most out of date gen in the btree */
|
||||
u8 *oldest_gens;
|
||||
struct rw_semaphore bucket_lock;
|
||||
|
||||
struct bch_dev_usage __percpu *usage_percpu;
|
||||
struct bch_dev_usage usage_cached;
|
||||
|
||||
/* Allocator: */
|
||||
struct task_struct __rcu *alloc_thread;
|
||||
|
||||
/*
|
||||
* free: Buckets that are ready to be used
|
||||
*
|
||||
* free_inc: Incoming buckets - these are buckets that currently have
|
||||
* cached data in them, and we can't reuse them until after we write
|
||||
* their new gen to disk. After prio_write() finishes writing the new
|
||||
* gens/prios, they'll be moved to the free list (and possibly discarded
|
||||
* in the process)
|
||||
*/
|
||||
alloc_fifo free[RESERVE_NR];
|
||||
alloc_fifo free_inc;
|
||||
spinlock_t freelist_lock;
|
||||
size_t nr_invalidated;
|
||||
|
||||
u8 open_buckets_partial[OPEN_BUCKETS_COUNT];
|
||||
unsigned open_buckets_partial_nr;
|
||||
|
||||
size_t fifo_last_bucket;
|
||||
|
||||
/* last calculated minimum prio */
|
||||
u16 max_last_bucket_io[2];
|
||||
|
||||
atomic_long_t saturated_count;
|
||||
size_t inc_gen_needs_gc;
|
||||
size_t inc_gen_really_needs_gc;
|
||||
u64 allocator_journal_seq_flush;
|
||||
bool allocator_invalidating_data;
|
||||
bool allocator_blocked;
|
||||
|
||||
alloc_heap alloc_heap;
|
||||
|
||||
/* Copying GC: */
|
||||
struct task_struct *copygc_thread;
|
||||
copygc_heap copygc_heap;
|
||||
struct bch_pd_controller copygc_pd;
|
||||
struct write_point copygc_write_point;
|
||||
|
||||
atomic64_t rebalance_work;
|
||||
|
||||
struct journal_device journal;
|
||||
|
||||
struct work_struct io_error_work;
|
||||
|
||||
/* The rest of this all shows up in sysfs */
|
||||
atomic64_t cur_latency[2];
|
||||
struct bch2_time_stats io_latency[2];
|
||||
|
||||
#define CONGESTED_MAX 1024
|
||||
atomic_t congested;
|
||||
u64 congested_last;
|
||||
|
||||
struct io_count __percpu *io_done;
|
||||
};
|
||||
|
||||
/*
|
||||
* Flag bits for what phase of startup/shutdown the cache set is at, how we're
|
||||
* shutting down, etc.:
|
||||
*
|
||||
* BCH_FS_UNREGISTERING means we're not just shutting down, we're detaching
|
||||
* all the backing devices first (their cached data gets invalidated, and they
|
||||
* won't automatically reattach).
|
||||
*/
|
||||
enum {
|
||||
/* startup: */
|
||||
BCH_FS_ALLOC_READ_DONE,
|
||||
BCH_FS_ALLOCATOR_STARTED,
|
||||
BCH_FS_INITIAL_GC_DONE,
|
||||
BCH_FS_FSCK_DONE,
|
||||
BCH_FS_STARTED,
|
||||
|
||||
/* shutdown: */
|
||||
BCH_FS_EMERGENCY_RO,
|
||||
BCH_FS_WRITE_DISABLE_COMPLETE,
|
||||
|
||||
/* errors: */
|
||||
BCH_FS_ERROR,
|
||||
BCH_FS_GC_FAILURE,
|
||||
|
||||
/* misc: */
|
||||
BCH_FS_BDEV_MOUNTED,
|
||||
BCH_FS_FSCK_FIXED_ERRORS,
|
||||
BCH_FS_FIXED_GENS,
|
||||
BCH_FS_REBUILD_REPLICAS,
|
||||
BCH_FS_HOLD_BTREE_WRITES,
|
||||
};
|
||||
|
||||
struct btree_debug {
|
||||
unsigned id;
|
||||
struct dentry *btree;
|
||||
struct dentry *btree_format;
|
||||
struct dentry *failed;
|
||||
};
|
||||
|
||||
enum bch_fs_state {
|
||||
BCH_FS_STARTING = 0,
|
||||
BCH_FS_STOPPING,
|
||||
BCH_FS_RO,
|
||||
BCH_FS_RW,
|
||||
};
|
||||
|
||||
struct bch_fs {
|
||||
struct closure cl;
|
||||
|
||||
struct list_head list;
|
||||
struct kobject kobj;
|
||||
struct kobject internal;
|
||||
struct kobject opts_dir;
|
||||
struct kobject time_stats;
|
||||
unsigned long flags;
|
||||
|
||||
int minor;
|
||||
struct device *chardev;
|
||||
struct super_block *vfs_sb;
|
||||
char name[40];
|
||||
|
||||
/* ro/rw, add/remove devices: */
|
||||
struct mutex state_lock;
|
||||
enum bch_fs_state state;
|
||||
|
||||
/* Counts outstanding writes, for clean transition to read-only */
|
||||
struct percpu_ref writes;
|
||||
struct work_struct read_only_work;
|
||||
|
||||
struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX];
|
||||
|
||||
struct bch_replicas_cpu __rcu *replicas;
|
||||
struct bch_replicas_cpu __rcu *replicas_gc;
|
||||
struct mutex replicas_gc_lock;
|
||||
|
||||
struct bch_disk_groups_cpu __rcu *disk_groups;
|
||||
|
||||
struct bch_opts opts;
|
||||
|
||||
/* Updated by bch2_sb_update():*/
|
||||
struct {
|
||||
__uuid_t uuid;
|
||||
__uuid_t user_uuid;
|
||||
|
||||
u16 encoded_extent_max;
|
||||
|
||||
u8 nr_devices;
|
||||
u8 clean;
|
||||
|
||||
u8 encryption_type;
|
||||
|
||||
u64 time_base_lo;
|
||||
u32 time_base_hi;
|
||||
u32 time_precision;
|
||||
u64 features;
|
||||
} sb;
|
||||
|
||||
struct bch_sb_handle disk_sb;
|
||||
|
||||
unsigned short block_bits; /* ilog2(block_size) */
|
||||
|
||||
u16 btree_foreground_merge_threshold;
|
||||
|
||||
struct closure sb_write;
|
||||
struct mutex sb_lock;
|
||||
|
||||
/* BTREE CACHE */
|
||||
struct bio_set btree_bio;
|
||||
|
||||
struct btree_root btree_roots[BTREE_ID_NR];
|
||||
bool btree_roots_dirty;
|
||||
struct mutex btree_root_lock;
|
||||
|
||||
struct btree_cache btree_cache;
|
||||
|
||||
mempool_t btree_reserve_pool;
|
||||
|
||||
/*
|
||||
* Cache of allocated btree nodes - if we allocate a btree node and
|
||||
* don't use it, if we free it that space can't be reused until going
|
||||
* _all_ the way through the allocator (which exposes us to a livelock
|
||||
* when allocating btree reserves fail halfway through) - instead, we
|
||||
* can stick them here:
|
||||
*/
|
||||
struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2];
|
||||
unsigned btree_reserve_cache_nr;
|
||||
struct mutex btree_reserve_cache_lock;
|
||||
|
||||
mempool_t btree_interior_update_pool;
|
||||
struct list_head btree_interior_update_list;
|
||||
struct mutex btree_interior_update_lock;
|
||||
struct closure_waitlist btree_interior_update_wait;
|
||||
|
||||
struct workqueue_struct *wq;
|
||||
/* copygc needs its own workqueue for index updates.. */
|
||||
struct workqueue_struct *copygc_wq;
|
||||
|
||||
/* ALLOCATION */
|
||||
struct delayed_work pd_controllers_update;
|
||||
unsigned pd_controllers_update_seconds;
|
||||
|
||||
struct bch_devs_mask rw_devs[BCH_DATA_NR];
|
||||
|
||||
u64 capacity; /* sectors */
|
||||
|
||||
/*
|
||||
* When capacity _decreases_ (due to a disk being removed), we
|
||||
* increment capacity_gen - this invalidates outstanding reservations
|
||||
* and forces them to be revalidated
|
||||
*/
|
||||
u32 capacity_gen;
|
||||
|
||||
atomic64_t sectors_available;
|
||||
|
||||
struct bch_fs_usage __percpu *usage_percpu;
|
||||
struct bch_fs_usage usage_cached;
|
||||
struct percpu_rw_semaphore usage_lock;
|
||||
|
||||
struct closure_waitlist freelist_wait;
|
||||
|
||||
/*
|
||||
* When we invalidate buckets, we use both the priority and the amount
|
||||
* of good data to determine which buckets to reuse first - to weight
|
||||
* those together consistently we keep track of the smallest nonzero
|
||||
* priority of any bucket.
|
||||
*/
|
||||
struct bucket_clock bucket_clock[2];
|
||||
|
||||
struct io_clock io_clock[2];
|
||||
|
||||
/* ALLOCATOR */
|
||||
spinlock_t freelist_lock;
|
||||
u8 open_buckets_freelist;
|
||||
u8 open_buckets_nr_free;
|
||||
struct closure_waitlist open_buckets_wait;
|
||||
struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];
|
||||
|
||||
struct write_point btree_write_point;
|
||||
struct write_point rebalance_write_point;
|
||||
|
||||
struct write_point write_points[WRITE_POINT_COUNT];
|
||||
struct hlist_head write_points_hash[WRITE_POINT_COUNT];
|
||||
struct mutex write_points_hash_lock;
|
||||
|
||||
/* GARBAGE COLLECTION */
|
||||
struct task_struct *gc_thread;
|
||||
atomic_t kick_gc;
|
||||
unsigned long gc_count;
|
||||
|
||||
/*
|
||||
* Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos]
|
||||
* has been marked by GC.
|
||||
*
|
||||
* gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.)
|
||||
*
|
||||
* gc_cur_phase == GC_PHASE_DONE indicates that gc is finished/not
|
||||
* currently running, and gc marks are currently valid
|
||||
*
|
||||
* Protected by gc_pos_lock. Only written to by GC thread, so GC thread
|
||||
* can read without a lock.
|
||||
*/
|
||||
seqcount_t gc_pos_lock;
|
||||
struct gc_pos gc_pos;
|
||||
|
||||
/*
|
||||
* The allocation code needs gc_mark in struct bucket to be correct, but
|
||||
* it's not while a gc is in progress.
|
||||
*/
|
||||
struct rw_semaphore gc_lock;
|
||||
|
||||
/* IO PATH */
|
||||
struct bio_set bio_read;
|
||||
struct bio_set bio_read_split;
|
||||
struct bio_set bio_write;
|
||||
struct mutex bio_bounce_pages_lock;
|
||||
mempool_t bio_bounce_pages;
|
||||
struct rhashtable promote_table;
|
||||
|
||||
mempool_t compression_bounce[2];
|
||||
mempool_t compress_workspace[BCH_COMPRESSION_NR];
|
||||
mempool_t decompress_workspace;
|
||||
ZSTD_parameters zstd_params;
|
||||
|
||||
struct crypto_shash *sha256;
|
||||
struct crypto_sync_skcipher *chacha20;
|
||||
struct crypto_shash *poly1305;
|
||||
|
||||
atomic64_t key_version;
|
||||
|
||||
/* REBALANCE */
|
||||
struct bch_fs_rebalance rebalance;
|
||||
|
||||
/* VFS IO PATH - fs-io.c */
|
||||
struct bio_set writepage_bioset;
|
||||
struct bio_set dio_write_bioset;
|
||||
struct bio_set dio_read_bioset;
|
||||
|
||||
struct bio_list btree_write_error_list;
|
||||
struct work_struct btree_write_error_work;
|
||||
spinlock_t btree_write_error_lock;
|
||||
|
||||
/* ERRORS */
|
||||
struct list_head fsck_errors;
|
||||
struct mutex fsck_error_lock;
|
||||
bool fsck_alloc_err;
|
||||
|
||||
/* FILESYSTEM */
|
||||
atomic_long_t nr_inodes;
|
||||
|
||||
/* QUOTAS */
|
||||
struct bch_memquota_type quotas[QTYP_NR];
|
||||
|
||||
/* DEBUG JUNK */
|
||||
struct dentry *debug;
|
||||
struct btree_debug btree_debug[BTREE_ID_NR];
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
struct btree *verify_data;
|
||||
struct btree_node *verify_ondisk;
|
||||
struct mutex verify_lock;
|
||||
#endif
|
||||
|
||||
u64 unused_inode_hint;
|
||||
|
||||
/*
|
||||
* A btree node on disk could have too many bsets for an iterator to fit
|
||||
* on the stack - have to dynamically allocate them
|
||||
*/
|
||||
mempool_t fill_iter;
|
||||
|
||||
mempool_t btree_bounce_pool;
|
||||
|
||||
struct journal journal;
|
||||
|
||||
unsigned bucket_journal_seq;
|
||||
|
||||
/* The rest of this all shows up in sysfs */
|
||||
atomic_long_t read_realloc_races;
|
||||
atomic_long_t extent_migrate_done;
|
||||
atomic_long_t extent_migrate_raced;
|
||||
|
||||
unsigned btree_gc_periodic:1;
|
||||
unsigned copy_gc_enabled:1;
|
||||
bool promote_whole_extents;
|
||||
|
||||
#define BCH_DEBUG_PARAM(name, description) bool name;
|
||||
BCH_DEBUG_PARAMS_ALL()
|
||||
#undef BCH_DEBUG_PARAM
|
||||
|
||||
struct bch2_time_stats times[BCH_TIME_STAT_NR];
|
||||
};
|
||||
|
||||
static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
|
||||
{
|
||||
#ifndef NO_BCACHEFS_FS
|
||||
if (c->vfs_sb)
|
||||
c->vfs_sb->s_bdi->ra_pages = ra_pages;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline bool bch2_fs_running(struct bch_fs *c)
|
||||
{
|
||||
return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
|
||||
}
|
||||
|
||||
static inline unsigned bucket_bytes(const struct bch_dev *ca)
|
||||
{
|
||||
return ca->mi.bucket_size << 9;
|
||||
}
|
||||
|
||||
static inline unsigned block_bytes(const struct bch_fs *c)
|
||||
{
|
||||
return c->opts.block_size << 9;
|
||||
}
|
||||
|
||||
static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, u64 time)
|
||||
{
|
||||
return ns_to_timespec64(time * c->sb.time_precision + c->sb.time_base_lo);
|
||||
}
|
||||
|
||||
static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts)
|
||||
{
|
||||
s64 ns = timespec64_to_ns(&ts) - c->sb.time_base_lo;
|
||||
|
||||
if (c->sb.time_precision == 1)
|
||||
return ns;
|
||||
|
||||
return div_s64(ns, c->sb.time_precision);
|
||||
}
|
||||
|
||||
static inline s64 bch2_current_time(struct bch_fs *c)
|
||||
{
|
||||
struct timespec64 now;
|
||||
|
||||
ktime_get_real_ts64(&now);
|
||||
return timespec_to_bch2_time(c, now);
|
||||
}
|
||||
|
||||
#endif /* _BCACHEFS_H */
|
1498 fs/bcachefs/bcachefs_format.h Normal file
File diff suppressed because it is too large
310 fs/bcachefs/bcachefs_ioctl.h Normal file
@@ -0,0 +1,310 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_IOCTL_H
|
||||
#define _BCACHEFS_IOCTL_H
|
||||
|
||||
#include <linux/uuid.h>
|
||||
#include <asm/ioctl.h>
|
||||
#include "bcachefs_format.h"
|
||||
|
||||
/*
|
||||
* Flags common to multiple ioctls:
|
||||
*/
|
||||
#define BCH_FORCE_IF_DATA_LOST (1 << 0)
|
||||
#define BCH_FORCE_IF_METADATA_LOST (1 << 1)
|
||||
#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2)
|
||||
#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3)
|
||||
|
||||
#define BCH_FORCE_IF_DEGRADED \
|
||||
(BCH_FORCE_IF_DATA_DEGRADED| \
|
||||
BCH_FORCE_IF_METADATA_DEGRADED)
|
||||
|
||||
/*
|
||||
* If cleared, ioctls that refer to a device pass it as a pointer to a pathname
|
||||
* (e.g. /dev/sda1); if set, the dev field is the device's index within the
|
||||
* filesystem:
|
||||
*/
|
||||
#define BCH_BY_INDEX (1 << 4)
|
||||
|
||||
/*
|
||||
* For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem
|
||||
* wide superblock:
|
||||
*/
|
||||
#define BCH_READ_DEV (1 << 5)
|
||||
|
||||
/* global control dev: */
|
||||
|
||||
/* These are currently broken, and probably unnecessary: */
|
||||
#if 0
|
||||
#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble)
|
||||
#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental)
|
||||
|
||||
struct bch_ioctl_assemble {
|
||||
__u32 flags;
|
||||
__u32 nr_devs;
|
||||
__u64 pad;
|
||||
__u64 devs[];
|
||||
};
|
||||
|
||||
struct bch_ioctl_incremental {
|
||||
__u32 flags;
|
||||
__u64 pad;
|
||||
__u64 dev;
|
||||
};
|
||||
#endif
|
||||
|
||||
/* filesystem ioctls: */
|
||||
|
||||
#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid)
|
||||
|
||||
/* These only make sense when we also have incremental assembly */
|
||||
#if 0
|
||||
#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start)
|
||||
#define BCH_IOCTL_STOP _IO(0xbc, 3)
|
||||
#endif
|
||||
|
||||
#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk)
|
||||
#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk)
|
||||
#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk)
|
||||
#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk)
|
||||
#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state)
|
||||
#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data)
|
||||
#define BCH_IOCTL_USAGE _IOWR(0xbc, 11, struct bch_ioctl_usage)
|
||||
#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super)
|
||||
#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx)
|
||||
#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 13, struct bch_ioctl_disk_resize)
|
||||
|
||||
/*
|
||||
* BCH_IOCTL_QUERY_UUID: get filesystem UUID
|
||||
*
|
||||
* Returns user visible UUID, not internal UUID (which may not ever be changed);
|
||||
* the filesystem's sysfs directory may be found under /sys/fs/bcachefs with
|
||||
* this UUID.
|
||||
*/
|
||||
struct bch_ioctl_query_uuid {
|
||||
__uuid_t uuid;
|
||||
};
|
||||
|
||||
#if 0
|
||||
struct bch_ioctl_start {
|
||||
__u32 flags;
|
||||
__u32 pad;
|
||||
};
|
||||
#endif
|
||||
|
||||
/*
|
||||
* BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem
|
||||
*
|
||||
* The specified device must not be open or in use. On success, the new device
|
||||
* will be an online member of the filesystem just like any other member.
|
||||
*
|
||||
* The device must first be prepared by userspace by formatting with a bcachefs
|
||||
* superblock, which is only used for passing in superblock options/parameters
|
||||
* for that device (in struct bch_member). The new device's superblock should
|
||||
* not claim to be a member of any existing filesystem - UUIDs on it will be
|
||||
* ignored.
|
||||
*/
|
||||
|
||||
/*
|
||||
* BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem
|
||||
*
|
||||
* Any data present on @dev will be permanently deleted, and @dev will be
|
||||
* removed from its slot in the filesystem's list of member devices. The device
|
||||
* may be either online or offline.
|
||||
*
|
||||
* Will fail if removing @dev would leave us with insufficient read write devices
|
||||
* or degraded/unavailable data, unless the appropriate BCH_FORCE_IF_* flags are
|
||||
* set.
|
||||
*/
|
||||
|
||||
/*
|
||||
* BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem
|
||||
* but is not open (e.g. because we started in degraded mode), bring it online
|
||||
*
|
||||
* all existing data on @dev will be available once the device is online,
|
||||
* exactly as if @dev was present when the filesystem was first mounted
|
||||
*/
|
||||
|
||||
/*
|
||||
* BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that
|
||||
* block device, without removing it from the filesystem (so it can be brought
|
||||
* back online later)
|
||||
*
|
||||
* Data present on @dev will be unavailable while @dev is offline (unless
|
||||
* replicated), but will still be intact and untouched if @dev is brought back
|
||||
* online
|
||||
*
|
||||
* Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would
|
||||
* leave us with insufficient read write devices or degraded/unavailable data,
|
||||
* unless the appropriate BCH_FORCE_IF_* flags are set.
|
||||
*/
|
||||
|
||||
struct bch_ioctl_disk {
|
||||
__u32 flags;
|
||||
__u32 pad;
|
||||
__u64 dev;
|
||||
};
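/*
 * A userspace sketch of the above, assuming @fs_fd accepts these ioctls:
 * offline member device 0 by index, even if that leaves data degraded.
 */
#if 0
static int offline_dev0_sketch(int fs_fd)
{
	struct bch_ioctl_disk i = {
		.flags	= BCH_BY_INDEX|BCH_FORCE_IF_DEGRADED,
		.dev	= 0,	/* index within the filesystem, per BCH_BY_INDEX */
	};

	return ioctl(fs_fd, BCH_IOCTL_DISK_OFFLINE, &i);
}
#endif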
|
||||
|
||||
/*
|
||||
* BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
|
||||
*
|
||||
* @new_state - one of the bch_member_state states (rw, ro, failed,
|
||||
* spare)
|
||||
*
|
||||
* Will refuse to change member state if we would then have insufficient devices
|
||||
* to write to, or if it would result in degraded data (when @new_state is
|
||||
* failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set.
|
||||
*/
|
||||
struct bch_ioctl_disk_set_state {
|
||||
__u32 flags;
|
||||
__u8 new_state;
|
||||
__u8 pad[3];
|
||||
__u64 dev;
|
||||
};
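/*
 * Sketch of setting a member read-only by pathname (BCH_BY_INDEX not set, so
 * @dev carries a pointer); the state constant is assumed to be the RO value
 * of enum bch_member_state from bcachefs_format.h:
 */
#if 0
static int set_dev_ro_sketch(int fs_fd, const char *path)
{
	struct bch_ioctl_disk_set_state i = {
		.new_state	= BCH_MEMBER_STATE_RO,	/* assumed constant name */
		.dev		= (__u64) (unsigned long) path,
	};

	return ioctl(fs_fd, BCH_IOCTL_DISK_SET_STATE, &i);
}
#endif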
|
||||
|
||||
enum bch_data_ops {
|
||||
BCH_DATA_OP_SCRUB = 0,
|
||||
BCH_DATA_OP_REREPLICATE = 1,
|
||||
BCH_DATA_OP_MIGRATE = 2,
|
||||
BCH_DATA_OP_NR = 3,
|
||||
};
|
||||
|
||||
/*
|
||||
* BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g.
|
||||
* scrub, rereplicate, migrate).
|
||||
*
|
||||
* This ioctl kicks off a job in the background, and returns a file descriptor.
|
||||
* Reading from the file descriptor returns a struct bch_ioctl_data_event,
|
||||
* indicating current progress, and closing the file descriptor will stop the
|
||||
* job. The file descriptor is O_CLOEXEC.
|
||||
*/
|
||||
struct bch_ioctl_data {
|
||||
__u32 op;
|
||||
__u32 flags;
|
||||
|
||||
struct bpos start;
|
||||
struct bpos end;
|
||||
|
||||
union {
|
||||
struct {
|
||||
__u32 dev;
|
||||
__u32 pad;
|
||||
} migrate;
|
||||
struct {
|
||||
__u64 pad[8];
|
||||
};
|
||||
};
|
||||
} __attribute__((packed, aligned(8)));
|
||||
|
||||
enum bch_data_event {
|
||||
BCH_DATA_EVENT_PROGRESS = 0,
|
||||
/* XXX: add an event for reporting errors */
|
||||
BCH_DATA_EVENT_NR = 1,
|
||||
};
|
||||
|
||||
struct bch_ioctl_data_progress {
|
||||
__u8 data_type;
|
||||
__u8 btree_id;
|
||||
__u8 pad[2];
|
||||
struct bpos pos;
|
||||
|
||||
__u64 sectors_done;
|
||||
__u64 sectors_total;
|
||||
} __attribute__((packed, aligned(8)));
|
||||
|
||||
struct bch_ioctl_data_event {
|
||||
__u8 type;
|
||||
__u8 pad[7];
|
||||
union {
|
||||
struct bch_ioctl_data_progress p;
|
||||
__u64 pad2[15];
|
||||
};
|
||||
} __attribute__((packed, aligned(8)));
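/*
 * Sketch of driving a data job from userspace, assuming @fs_fd accepts these
 * ioctls and POS_MIN/POS_MAX are the bpos constants from bcachefs_format.h:
 * start a rereplicate over the whole keyspace, sample progress once, then
 * stop the job by closing its file descriptor.
 */
#if 0
static __u64 rereplicate_sketch(int fs_fd)
{
	struct bch_ioctl_data job = {
		.op	= BCH_DATA_OP_REREPLICATE,
		.start	= POS_MIN,
		.end	= POS_MAX,
	};
	struct bch_ioctl_data_event e;
	int job_fd = ioctl(fs_fd, BCH_IOCTL_DATA, &job);

	if (job_fd < 0)
		return 0;

	/* each read() reports current progress; one sample is taken here: */
	if (read(job_fd, &e, sizeof(e)) != sizeof(e) ||
	    e.type != BCH_DATA_EVENT_PROGRESS)
		e.p.sectors_done = 0;

	close(job_fd);		/* closing the fd stops the job */
	return e.p.sectors_done;
}
#endif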
|
||||
|
||||
struct bch_ioctl_dev_usage {
|
||||
__u8 state;
|
||||
__u8 alive;
|
||||
__u8 pad[6];
|
||||
__u32 dev;
|
||||
|
||||
__u32 bucket_size;
|
||||
__u64 nr_buckets;
|
||||
|
||||
__u64 buckets[BCH_DATA_NR];
|
||||
__u64 sectors[BCH_DATA_NR];
|
||||
};
|
||||
|
||||
struct bch_ioctl_fs_usage {
|
||||
__u64 capacity;
|
||||
__u64 used;
|
||||
__u64 online_reserved;
|
||||
__u64 persistent_reserved[BCH_REPLICAS_MAX];
|
||||
__u64 sectors[BCH_DATA_NR][BCH_REPLICAS_MAX];
|
||||
};
|
||||
|
||||
/*
|
||||
* BCH_IOCTL_USAGE: query filesystem disk space usage
|
||||
*
|
||||
* Returns disk space usage broken out by data type, number of replicas, and
|
||||
* by component device
|
||||
*
|
||||
* @nr_devices - number of devices userspace allocated space for in @devs
|
||||
*
|
||||
* On success, @fs and @devs will be filled out appropriately and devs[i].alive
|
||||
* will indicate if a device was present in that slot
|
||||
*
|
||||
* Returns -ERANGE if @nr_devices was too small
|
||||
*/
|
||||
struct bch_ioctl_usage {
|
||||
__u16 nr_devices;
|
||||
__u16 pad[3];
|
||||
|
||||
struct bch_ioctl_fs_usage fs;
|
||||
struct bch_ioctl_dev_usage devs[0];
|
||||
};
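/*
 * Sketch of querying usage from userspace: the caller sizes devs[] and
 * retries with a larger @nr if the ioctl fails with ERANGE.
 */
#if 0
static struct bch_ioctl_usage *query_usage_sketch(int fs_fd, unsigned nr)
{
	struct bch_ioctl_usage *u =
		calloc(1, sizeof(*u) + nr * sizeof(u->devs[0]));

	if (!u)
		return NULL;

	u->nr_devices = nr;

	if (ioctl(fs_fd, BCH_IOCTL_USAGE, u)) {
		free(u);	/* retry with a larger @nr if errno == ERANGE */
		return NULL;
	}

	return u;
}
#endif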
|
||||
|
||||
/*
|
||||
* BCH_IOCTL_READ_SUPER: read filesystem superblock
|
||||
*
|
||||
* Equivalent to reading the superblock directly from the block device, except
|
||||
* avoids racing with the kernel writing the superblock or having to figure out
|
||||
* which block device to read
|
||||
*
|
||||
* @sb - buffer to read into
|
||||
* @size - size of userspace allocated buffer
|
||||
* @dev - device to read superblock for, if BCH_READ_DEV flag is
|
||||
* specified
|
||||
*
|
||||
* Returns -ERANGE if buffer provided is too small
|
||||
*/
|
||||
struct bch_ioctl_read_super {
|
||||
__u32 flags;
|
||||
__u32 pad;
|
||||
__u64 dev;
|
||||
__u64 size;
|
||||
__u64 sb;
|
||||
};
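/*
 * Sketch of reading the filesystem-wide superblock into a caller supplied
 * buffer; setting BCH_READ_DEV|BCH_BY_INDEX in .flags and an index in .dev
 * would return that device's superblock instead.
 */
#if 0
static int read_super_sketch(int fs_fd, void *buf, size_t len)
{
	struct bch_ioctl_read_super i = {
		.size	= len,
		.sb	= (__u64) (unsigned long) buf,
	};

	/* fails with ERANGE if @len is too small for the superblock */
	return ioctl(fs_fd, BCH_IOCTL_READ_SUPER, &i);
}
#endif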
|
||||
|
||||
/*
|
||||
* BCH_IOCTL_DISK_GET_IDX: given a path to a block device, query the filesystem to
|
||||
* determine if the disk is an (online) member - if so, returns the device's index
|
||||
*
|
||||
* Returns -ENOENT if not found
|
||||
*/
|
||||
struct bch_ioctl_disk_get_idx {
|
||||
__u64 dev;
|
||||
};
|
||||
|
||||
/*
|
||||
* BCH_IOCTL_DISK_RESIZE: resize filesystem on a device
|
||||
*
|
||||
* @dev - member to resize
|
||||
* @nbuckets - new number of buckets
|
||||
*/
|
||||
struct bch_ioctl_disk_resize {
|
||||
__u32 flags;
|
||||
__u32 pad;
|
||||
__u64 dev;
|
||||
__u64 nbuckets;
|
||||
};
|
||||
|
||||
#endif /* _BCACHEFS_IOCTL_H */
|
1164
fs/bcachefs/bkey.c
Normal file
1164
fs/bcachefs/bkey.c
Normal file
File diff suppressed because it is too large
627
fs/bcachefs/bkey.h
Normal file
627
fs/bcachefs/bkey.h
Normal file
@ -0,0 +1,627 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_BKEY_H
|
||||
#define _BCACHEFS_BKEY_H
|
||||
|
||||
#include <linux/bug.h>
|
||||
#include "bcachefs_format.h"
|
||||
|
||||
#include "util.h"
|
||||
#include "vstructs.h"
|
||||
|
||||
#if 0
|
||||
|
||||
/*
|
||||
* compiled unpack functions are disabled, pending a new interface for
|
||||
* dynamically allocating executable memory:
|
||||
*/
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
#define HAVE_BCACHEFS_COMPILED_UNPACK 1
|
||||
#endif
|
||||
#endif
|
||||
|
||||
void bch2_to_binary(char *, const u64 *, unsigned);
|
||||
|
||||
/* bkey with split value, const */
|
||||
struct bkey_s_c {
|
||||
const struct bkey *k;
|
||||
const struct bch_val *v;
|
||||
};
|
||||
|
||||
/* bkey with split value */
|
||||
struct bkey_s {
|
||||
union {
|
||||
struct {
|
||||
struct bkey *k;
|
||||
struct bch_val *v;
|
||||
};
|
||||
struct bkey_s_c s_c;
|
||||
};
|
||||
};
|
||||
|
||||
#define bkey_next(_k) vstruct_next(_k)
|
||||
|
||||
static inline unsigned bkey_val_u64s(const struct bkey *k)
|
||||
{
|
||||
return k->u64s - BKEY_U64s;
|
||||
}
|
||||
|
||||
static inline size_t bkey_val_bytes(const struct bkey *k)
|
||||
{
|
||||
return bkey_val_u64s(k) * sizeof(u64);
|
||||
}
|
||||
|
||||
static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
|
||||
{
|
||||
k->u64s = BKEY_U64s + val_u64s;
|
||||
}
|
||||
|
||||
static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
|
||||
{
|
||||
k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64));
|
||||
}
|
||||
|
||||
#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_DELETED)
|
||||
|
||||
#define bkey_whiteout(_k) \
|
||||
((_k)->type == KEY_TYPE_DELETED || (_k)->type == KEY_TYPE_DISCARD)
|
||||
|
||||
#define bkey_packed_typecheck(_k) \
|
||||
({ \
|
||||
BUILD_BUG_ON(!type_is(_k, struct bkey *) && \
|
||||
!type_is(_k, struct bkey_packed *)); \
|
||||
type_is(_k, struct bkey_packed *); \
|
||||
})
|
||||
|
||||
enum bkey_lr_packed {
|
||||
BKEY_PACKED_BOTH,
|
||||
BKEY_PACKED_RIGHT,
|
||||
BKEY_PACKED_LEFT,
|
||||
BKEY_PACKED_NONE,
|
||||
};
|
||||
|
||||
#define bkey_lr_packed_typecheck(_l, _r) \
|
||||
(!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1))
|
||||
|
||||
#define bkey_lr_packed(_l, _r) \
|
||||
((_l)->format + ((_r)->format << 1))
|
||||
|
||||
#define bkey_copy(_dst, _src) \
|
||||
do { \
|
||||
BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) && \
|
||||
!type_is(_dst, struct bkey_packed *)); \
|
||||
BUILD_BUG_ON(!type_is(_src, struct bkey_i *) && \
|
||||
!type_is(_src, struct bkey_packed *)); \
|
||||
EBUG_ON((u64 *) (_dst) > (u64 *) (_src) && \
|
||||
(u64 *) (_dst) < (u64 *) (_src) + \
|
||||
((struct bkey *) (_src))->u64s); \
|
||||
\
|
||||
__memmove_u64s_down((_dst), (_src), \
|
||||
((struct bkey *) (_src))->u64s); \
|
||||
} while (0)
|
||||
|
||||
struct btree;
|
||||
|
||||
struct bkey_format_state {
|
||||
u64 field_min[BKEY_NR_FIELDS];
|
||||
u64 field_max[BKEY_NR_FIELDS];
|
||||
};
|
||||
|
||||
void bch2_bkey_format_init(struct bkey_format_state *);
|
||||
void bch2_bkey_format_add_key(struct bkey_format_state *, const struct bkey *);
|
||||
void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
|
||||
struct bkey_format bch2_bkey_format_done(struct bkey_format_state *);
|
||||
const char *bch2_bkey_format_validate(struct bkey_format *);
|
||||
|
||||
__pure
|
||||
unsigned bch2_bkey_greatest_differing_bit(const struct btree *,
|
||||
const struct bkey_packed *,
|
||||
const struct bkey_packed *);
|
||||
__pure
|
||||
unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *);
|
||||
|
||||
__pure
|
||||
int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *,
|
||||
const struct bkey_packed *,
|
||||
const struct btree *);
|
||||
|
||||
__pure
|
||||
int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *,
|
||||
const struct bkey_packed *,
|
||||
const struct bpos *);
|
||||
|
||||
__pure
|
||||
int __bch2_bkey_cmp_packed(const struct bkey_packed *,
|
||||
const struct bkey_packed *,
|
||||
const struct btree *);
|
||||
|
||||
__pure
|
||||
int __bch2_bkey_cmp_left_packed(const struct btree *,
|
||||
const struct bkey_packed *,
|
||||
const struct bpos *);
|
||||
|
||||
static inline __pure
|
||||
int bkey_cmp_left_packed(const struct btree *b,
|
||||
const struct bkey_packed *l, const struct bpos *r)
|
||||
{
|
||||
return __bch2_bkey_cmp_left_packed(b, l, r);
|
||||
}
|
||||
|
||||
/*
|
||||
* we prefer to pass bpos by ref, but it's often enough terribly convenient to
|
||||
* pass it by val... as much as I hate c++, const ref would be nice here:
|
||||
*/
|
||||
__pure __flatten
|
||||
static inline int bkey_cmp_left_packed_byval(const struct btree *b,
|
||||
const struct bkey_packed *l,
|
||||
struct bpos r)
|
||||
{
|
||||
return bkey_cmp_left_packed(b, l, &r);
|
||||
}
|
||||
|
||||
/*
|
||||
* If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to
|
||||
* skip dispatching on k->format:
|
||||
*/
|
||||
#define bkey_cmp_packed(_b, _l, _r) \
|
||||
({ \
|
||||
int _cmp; \
|
||||
\
|
||||
switch (bkey_lr_packed_typecheck(_l, _r)) { \
|
||||
case BKEY_PACKED_NONE: \
|
||||
_cmp = bkey_cmp(((struct bkey *) (_l))->p, \
|
||||
((struct bkey *) (_r))->p); \
|
||||
break; \
|
||||
case BKEY_PACKED_LEFT: \
|
||||
_cmp = bkey_cmp_left_packed((_b), \
|
||||
(struct bkey_packed *) (_l), \
|
||||
&((struct bkey *) (_r))->p); \
|
||||
break; \
|
||||
case BKEY_PACKED_RIGHT: \
|
||||
_cmp = -bkey_cmp_left_packed((_b), \
|
||||
(struct bkey_packed *) (_r), \
|
||||
&((struct bkey *) (_l))->p); \
|
||||
break; \
|
||||
case BKEY_PACKED_BOTH: \
|
||||
_cmp = __bch2_bkey_cmp_packed((void *) (_l), \
|
||||
(void *) (_r), (_b)); \
|
||||
break; \
|
||||
} \
|
||||
_cmp; \
|
||||
})
|
||||
|
||||
#if 1
|
||||
static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
|
||||
{
|
||||
if (l.inode != r.inode)
|
||||
return l.inode < r.inode ? -1 : 1;
|
||||
if (l.offset != r.offset)
|
||||
return l.offset < r.offset ? -1 : 1;
|
||||
if (l.snapshot != r.snapshot)
|
||||
return l.snapshot < r.snapshot ? -1 : 1;
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
int bkey_cmp(struct bpos l, struct bpos r);
|
||||
#endif
|
||||
|
||||
static inline struct bpos bpos_min(struct bpos l, struct bpos r)
|
||||
{
|
||||
return bkey_cmp(l, r) < 0 ? l : r;
|
||||
}
|
||||
|
||||
void bch2_bpos_swab(struct bpos *);
|
||||
void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
|
||||
|
||||
static __always_inline int bversion_cmp(struct bversion l, struct bversion r)
|
||||
{
|
||||
return (l.hi > r.hi) - (l.hi < r.hi) ?:
|
||||
(l.lo > r.lo) - (l.lo < r.lo);
|
||||
}
|
||||
|
||||
#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 })
|
||||
#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL })
|
||||
|
||||
static __always_inline int bversion_zero(struct bversion v)
|
||||
{
|
||||
return !bversion_cmp(v, ZERO_VERSION);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
/* statement expressions confusing unlikely()? */
|
||||
#define bkey_packed(_k) \
|
||||
({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \
|
||||
(_k)->format != KEY_FORMAT_CURRENT; })
|
||||
#else
|
||||
#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* It's safe to treat an unpacked bkey as a packed one, but not the reverse
|
||||
*/
|
||||
static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k)
|
||||
{
|
||||
return (struct bkey_packed *) k;
|
||||
}
|
||||
|
||||
static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k)
|
||||
{
|
||||
return (const struct bkey_packed *) k;
|
||||
}
|
||||
|
||||
static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k)
|
||||
{
|
||||
return bkey_packed(k) ? NULL : (struct bkey_i *) k;
|
||||
}
|
||||
|
||||
static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k)
|
||||
{
|
||||
return bkey_packed(k) ? NULL : (const struct bkey *) k;
|
||||
}
|
||||
|
||||
static inline unsigned bkey_format_key_bits(const struct bkey_format *format)
|
||||
{
|
||||
return format->bits_per_field[BKEY_FIELD_INODE] +
|
||||
format->bits_per_field[BKEY_FIELD_OFFSET] +
|
||||
format->bits_per_field[BKEY_FIELD_SNAPSHOT];
|
||||
}
|
||||
|
||||
static inline struct bpos bkey_successor(struct bpos p)
|
||||
{
|
||||
struct bpos ret = p;
|
||||
|
||||
if (!++ret.offset)
|
||||
BUG_ON(!++ret.inode);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline struct bpos bkey_predecessor(struct bpos p)
|
||||
{
|
||||
struct bpos ret = p;
|
||||
|
||||
if (!ret.offset--)
|
||||
BUG_ON(!ret.inode--);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline u64 bkey_start_offset(const struct bkey *k)
|
||||
{
|
||||
return k->p.offset - k->size;
|
||||
}
|
||||
|
||||
static inline struct bpos bkey_start_pos(const struct bkey *k)
|
||||
{
|
||||
return (struct bpos) {
|
||||
.inode = k->p.inode,
|
||||
.offset = bkey_start_offset(k),
|
||||
.snapshot = k->p.snapshot,
|
||||
};
|
||||
}
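/*
 * Note: for extents, k->p is the *end* position and k->size the length in
 * sectors - e.g. an extent with p.offset == 24 and size == 8 covers sectors
 * [16, 24), so bkey_start_offset() returns 16.
 */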
|
||||
|
||||
/* Packed helpers */
|
||||
|
||||
static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
|
||||
const struct bkey_packed *k)
|
||||
{
|
||||
unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s;
|
||||
|
||||
EBUG_ON(k->u64s < ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
|
||||
const struct bkey_packed *k)
|
||||
{
|
||||
return bkeyp_key_u64s(format, k) * sizeof(u64);
|
||||
}
|
||||
|
||||
static inline unsigned bkeyp_val_u64s(const struct bkey_format *format,
|
||||
const struct bkey_packed *k)
|
||||
{
|
||||
return k->u64s - bkeyp_key_u64s(format, k);
|
||||
}
|
||||
|
||||
static inline size_t bkeyp_val_bytes(const struct bkey_format *format,
|
||||
const struct bkey_packed *k)
|
||||
{
|
||||
return bkeyp_val_u64s(format, k) * sizeof(u64);
|
||||
}
|
||||
|
||||
static inline void set_bkeyp_val_u64s(const struct bkey_format *format,
|
||||
struct bkey_packed *k, unsigned val_u64s)
|
||||
{
|
||||
k->u64s = bkeyp_key_u64s(format, k) + val_u64s;
|
||||
}
|
||||
|
||||
#define bkeyp_val(_format, _k) \
|
||||
((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k)))
|
||||
|
||||
extern const struct bkey_format bch2_bkey_format_current;
|
||||
|
||||
bool bch2_bkey_transform(const struct bkey_format *,
|
||||
struct bkey_packed *,
|
||||
const struct bkey_format *,
|
||||
const struct bkey_packed *);
|
||||
|
||||
struct bkey __bch2_bkey_unpack_key(const struct bkey_format *,
|
||||
const struct bkey_packed *);
|
||||
|
||||
#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
|
||||
struct bpos __bkey_unpack_pos(const struct bkey_format *,
|
||||
const struct bkey_packed *);
|
||||
#endif
|
||||
|
||||
bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *,
|
||||
const struct bkey_format *);
|
||||
|
||||
enum bkey_pack_pos_ret {
|
||||
BKEY_PACK_POS_EXACT,
|
||||
BKEY_PACK_POS_SMALLER,
|
||||
BKEY_PACK_POS_FAIL,
|
||||
};
|
||||
|
||||
enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos,
|
||||
const struct btree *);
|
||||
|
||||
static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in,
|
||||
const struct btree *b)
|
||||
{
|
||||
return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT;
|
||||
}
|
||||
|
||||
void bch2_bkey_unpack(const struct btree *, struct bkey_i *,
|
||||
const struct bkey_packed *);
|
||||
bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *,
|
||||
const struct bkey_format *);
|
||||
|
||||
static inline u64 bkey_field_max(const struct bkey_format *f,
|
||||
enum bch_bkey_fields nr)
|
||||
{
|
||||
return f->bits_per_field[nr] < 64
|
||||
? (le64_to_cpu(f->field_offset[nr]) +
|
||||
~(~0ULL << f->bits_per_field[nr]))
|
||||
: U64_MAX;
|
||||
}
|
||||
|
||||
#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
|
||||
|
||||
int bch2_compile_bkey_format(const struct bkey_format *, void *);
|
||||
|
||||
#else
|
||||
|
||||
static inline int bch2_compile_bkey_format(const struct bkey_format *format,
|
||||
void *out) { return 0; }
|
||||
|
||||
#endif
|
||||
|
||||
static inline void bkey_reassemble(struct bkey_i *dst,
|
||||
struct bkey_s_c src)
|
||||
{
|
||||
BUG_ON(bkey_packed(src.k));
|
||||
dst->k = *src.k;
|
||||
memcpy_u64s(&dst->v, src.v, bkey_val_u64s(src.k));
|
||||
}
|
||||
|
||||
#define bkey_s_null ((struct bkey_s) { .k = NULL })
|
||||
#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL })
|
||||
|
||||
#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) })
|
||||
#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) })
|
||||
|
||||
static inline struct bkey_s bkey_to_s(struct bkey *k)
|
||||
{
|
||||
return (struct bkey_s) { .k = k, .v = NULL };
|
||||
}
|
||||
|
||||
static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
|
||||
{
|
||||
return (struct bkey_s_c) { .k = k, .v = NULL };
|
||||
}
|
||||
|
||||
static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
|
||||
{
|
||||
return (struct bkey_s) { .k = &k->k, .v = &k->v };
|
||||
}
|
||||
|
||||
static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
|
||||
{
|
||||
return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
|
||||
}
|
||||
|
||||
/*
|
||||
* For a given type of value (e.g. struct bch_extent), generates the types for
|
||||
* bkey + bch_extent - inline, split, split const - and also all the conversion
|
||||
* functions, which also check that the value is of the correct type.
|
||||
*
|
||||
* We use anonymous unions for upcasting - e.g. converting from e.g. a
|
||||
* bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
|
||||
* functions.
|
||||
*/
|
||||
#define __BKEY_VAL_ACCESSORS(name, nr, _assert) \
|
||||
struct bkey_s_c_##name { \
|
||||
union { \
|
||||
struct { \
|
||||
const struct bkey *k; \
|
||||
const struct bch_##name *v; \
|
||||
}; \
|
||||
struct bkey_s_c s_c; \
|
||||
}; \
|
||||
}; \
|
||||
\
|
||||
struct bkey_s_##name { \
|
||||
union { \
|
||||
struct { \
|
||||
struct bkey *k; \
|
||||
struct bch_##name *v; \
|
||||
}; \
|
||||
struct bkey_s_c_##name c; \
|
||||
struct bkey_s s; \
|
||||
struct bkey_s_c s_c; \
|
||||
}; \
|
||||
}; \
|
||||
\
|
||||
static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
|
||||
{ \
|
||||
_assert(k->k.type, nr); \
|
||||
return container_of(&k->k, struct bkey_i_##name, k); \
|
||||
} \
|
||||
\
|
||||
static inline const struct bkey_i_##name * \
|
||||
bkey_i_to_##name##_c(const struct bkey_i *k) \
|
||||
{ \
|
||||
_assert(k->k.type, nr); \
|
||||
return container_of(&k->k, struct bkey_i_##name, k); \
|
||||
} \
|
||||
\
|
||||
static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \
|
||||
{ \
|
||||
_assert(k.k->type, nr); \
|
||||
return (struct bkey_s_##name) { \
|
||||
.k = k.k, \
|
||||
.v = container_of(k.v, struct bch_##name, v), \
|
||||
}; \
|
||||
} \
|
||||
\
|
||||
static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
|
||||
{ \
|
||||
_assert(k.k->type, nr); \
|
||||
return (struct bkey_s_c_##name) { \
|
||||
.k = k.k, \
|
||||
.v = container_of(k.v, struct bch_##name, v), \
|
||||
}; \
|
||||
} \
|
||||
\
|
||||
static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
|
||||
{ \
|
||||
return (struct bkey_s_##name) { \
|
||||
.k = &k->k, \
|
||||
.v = &k->v, \
|
||||
}; \
|
||||
} \
|
||||
\
|
||||
static inline struct bkey_s_c_##name \
|
||||
name##_i_to_s_c(const struct bkey_i_##name *k) \
|
||||
{ \
|
||||
return (struct bkey_s_c_##name) { \
|
||||
.k = &k->k, \
|
||||
.v = &k->v, \
|
||||
}; \
|
||||
} \
|
||||
\
|
||||
static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \
|
||||
{ \
|
||||
_assert(k->k.type, nr); \
|
||||
return (struct bkey_s_##name) { \
|
||||
.k = &k->k, \
|
||||
.v = container_of(&k->v, struct bch_##name, v), \
|
||||
}; \
|
||||
} \
|
||||
\
|
||||
static inline struct bkey_s_c_##name \
|
||||
bkey_i_to_s_c_##name(const struct bkey_i *k) \
|
||||
{ \
|
||||
_assert(k->k.type, nr); \
|
||||
return (struct bkey_s_c_##name) { \
|
||||
.k = &k->k, \
|
||||
.v = container_of(&k->v, struct bch_##name, v), \
|
||||
}; \
|
||||
} \
|
||||
\
|
||||
static inline struct bch_##name * \
|
||||
bkey_p_##name##_val(const struct bkey_format *f, \
|
||||
struct bkey_packed *k) \
|
||||
{ \
|
||||
return container_of(bkeyp_val(f, k), struct bch_##name, v); \
|
||||
} \
|
||||
\
|
||||
static inline const struct bch_##name * \
|
||||
bkey_p_c_##name##_val(const struct bkey_format *f, \
|
||||
const struct bkey_packed *k) \
|
||||
{ \
|
||||
return container_of(bkeyp_val(f, k), struct bch_##name, v); \
|
||||
} \
|
||||
\
|
||||
static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
|
||||
{ \
|
||||
struct bkey_i_##name *k = \
|
||||
container_of(&_k->k, struct bkey_i_##name, k); \
|
||||
\
|
||||
bkey_init(&k->k); \
|
||||
memset(&k->v, 0, sizeof(k->v)); \
|
||||
k->k.type = nr; \
|
||||
set_bkey_val_bytes(&k->k, sizeof(k->v)); \
|
||||
\
|
||||
return k; \
|
||||
}
|
||||
|
||||
#define __BKEY_VAL_ASSERT(_type, _nr) EBUG_ON(_type != _nr)
|
||||
|
||||
#define BKEY_VAL_ACCESSORS(name, _nr) \
|
||||
static inline void __bch_##name##_assert(u8 type, u8 nr) \
|
||||
{ \
|
||||
EBUG_ON(type != _nr); \
|
||||
} \
|
||||
\
|
||||
__BKEY_VAL_ACCESSORS(name, _nr, __bch_##name##_assert)
|
||||
|
||||
BKEY_VAL_ACCESSORS(cookie, KEY_TYPE_COOKIE);
|
||||
|
||||
static inline void __bch2_extent_assert(u8 type, u8 nr)
|
||||
{
|
||||
EBUG_ON(type != BCH_EXTENT && type != BCH_EXTENT_CACHED);
|
||||
}
|
||||
|
||||
__BKEY_VAL_ACCESSORS(extent, BCH_EXTENT, __bch2_extent_assert);
|
||||
BKEY_VAL_ACCESSORS(reservation, BCH_RESERVATION);
|
||||
|
||||
BKEY_VAL_ACCESSORS(inode, BCH_INODE_FS);
|
||||
BKEY_VAL_ACCESSORS(inode_blockdev, BCH_INODE_BLOCKDEV);
|
||||
BKEY_VAL_ACCESSORS(inode_generation, BCH_INODE_GENERATION);
|
||||
|
||||
BKEY_VAL_ACCESSORS(dirent, BCH_DIRENT);
|
||||
|
||||
BKEY_VAL_ACCESSORS(xattr, BCH_XATTR);
|
||||
|
||||
BKEY_VAL_ACCESSORS(alloc, BCH_ALLOC);
|
||||
|
||||
BKEY_VAL_ACCESSORS(quota, BCH_QUOTA);
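/*
 * Usage sketch for the generated accessors (assuming a key already known to
 * hold a dirent; field names per struct bch_dirent in bcachefs_format.h):
 */
#if 0
static u8 dirent_type_sketch(struct bkey_s_c k)
{
	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);

	/* d.k is the key, d.v the typed struct bch_dirent value: */
	return d.v->d_type;
}
#endif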
|
||||
|
||||
/* byte order helpers */
|
||||
|
||||
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
||||
|
||||
static inline unsigned high_word_offset(const struct bkey_format *f)
|
||||
{
|
||||
return f->key_u64s - 1;
|
||||
}
|
||||
|
||||
#define high_bit_offset 0
|
||||
#define nth_word(p, n) ((p) - (n))
|
||||
|
||||
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
|
||||
static inline unsigned high_word_offset(const struct bkey_format *f)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define high_bit_offset KEY_PACKED_BITS_START
|
||||
#define nth_word(p, n) ((p) + (n))
|
||||
|
||||
#else
|
||||
#error edit for your odd byteorder.
|
||||
#endif
|
||||
|
||||
#define high_word(f, k) ((k)->_data + high_word_offset(f))
|
||||
#define next_word(p) nth_word(p, 1)
|
||||
#define prev_word(p) nth_word(p, -1)
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
void bch2_bkey_pack_test(void);
|
||||
#else
|
||||
static inline void bch2_bkey_pack_test(void) {}
|
||||
#endif
|
||||
|
||||
#endif /* _BCACHEFS_BKEY_H */
|
192
fs/bcachefs/bkey_methods.c
Normal file
192
fs/bcachefs/bkey_methods.c
Normal file
@ -0,0 +1,192 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "bkey_methods.h"
|
||||
#include "btree_types.h"
|
||||
#include "alloc.h"
|
||||
#include "dirent.h"
|
||||
#include "error.h"
|
||||
#include "extents.h"
|
||||
#include "inode.h"
|
||||
#include "quota.h"
|
||||
#include "xattr.h"
|
||||
|
||||
const struct bkey_ops bch2_bkey_ops[] = {
|
||||
[BKEY_TYPE_EXTENTS] = bch2_bkey_extent_ops,
|
||||
[BKEY_TYPE_INODES] = bch2_bkey_inode_ops,
|
||||
[BKEY_TYPE_DIRENTS] = bch2_bkey_dirent_ops,
|
||||
[BKEY_TYPE_XATTRS] = bch2_bkey_xattr_ops,
|
||||
[BKEY_TYPE_ALLOC] = bch2_bkey_alloc_ops,
|
||||
[BKEY_TYPE_QUOTAS] = bch2_bkey_quota_ops,
|
||||
[BKEY_TYPE_BTREE] = bch2_bkey_btree_ops,
|
||||
};
|
||||
|
||||
const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type,
|
||||
struct bkey_s_c k)
|
||||
{
|
||||
const struct bkey_ops *ops = &bch2_bkey_ops[type];
|
||||
|
||||
switch (k.k->type) {
|
||||
case KEY_TYPE_DELETED:
|
||||
case KEY_TYPE_DISCARD:
|
||||
return NULL;
|
||||
|
||||
case KEY_TYPE_ERROR:
|
||||
return bkey_val_bytes(k.k) != 0
|
||||
? "value size should be zero"
|
||||
: NULL;
|
||||
|
||||
case KEY_TYPE_COOKIE:
|
||||
return bkey_val_bytes(k.k) != sizeof(struct bch_cookie)
|
||||
? "incorrect value size"
|
||||
: NULL;
|
||||
|
||||
default:
|
||||
if (k.k->type < KEY_TYPE_GENERIC_NR)
|
||||
return "invalid type";
|
||||
|
||||
return ops->key_invalid(c, k);
|
||||
}
|
||||
}
|
||||
|
||||
const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
|
||||
struct bkey_s_c k)
|
||||
{
|
||||
const struct bkey_ops *ops = &bch2_bkey_ops[type];
|
||||
|
||||
if (k.k->u64s < BKEY_U64s)
|
||||
return "u64s too small";
|
||||
|
||||
if (!ops->is_extents) {
|
||||
if (k.k->size)
|
||||
return "nonzero size field";
|
||||
} else {
|
||||
if ((k.k->size == 0) != bkey_deleted(k.k))
|
||||
return "bad size field";
|
||||
}
|
||||
|
||||
if (ops->is_extents &&
|
||||
!k.k->size &&
|
||||
!bkey_deleted(k.k))
|
||||
return "zero size field";
|
||||
|
||||
if (k.k->p.snapshot)
|
||||
return "nonzero snapshot";
|
||||
|
||||
if (type != BKEY_TYPE_BTREE &&
|
||||
!bkey_cmp(k.k->p, POS_MAX))
|
||||
return "POS_MAX key";
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
|
||||
struct bkey_s_c k)
|
||||
{
|
||||
return __bch2_bkey_invalid(c, type, k) ?:
|
||||
bch2_bkey_val_invalid(c, type, k);
|
||||
}
|
||||
|
||||
const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
|
||||
{
|
||||
if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0)
|
||||
return "key before start of btree node";
|
||||
|
||||
if (bkey_cmp(k.k->p, b->data->max_key) > 0)
|
||||
return "key past end of btree node";
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
|
||||
{
|
||||
enum bkey_type type = btree_node_type(b);
|
||||
const struct bkey_ops *ops = &bch2_bkey_ops[type];
|
||||
const char *invalid;
|
||||
|
||||
BUG_ON(!k.k->u64s);
|
||||
|
||||
invalid = bch2_bkey_invalid(c, type, k) ?:
|
||||
bch2_bkey_in_btree_node(b, k);
|
||||
if (invalid) {
|
||||
char buf[160];
|
||||
|
||||
bch2_bkey_val_to_text(c, type, buf, sizeof(buf), k);
|
||||
bch2_fs_bug(c, "invalid bkey %s: %s", buf, invalid);
|
||||
return;
|
||||
}
|
||||
|
||||
if (k.k->type >= KEY_TYPE_GENERIC_NR &&
|
||||
ops->key_debugcheck)
|
||||
ops->key_debugcheck(c, b, k);
|
||||
}
|
||||
|
||||
#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
|
||||
|
||||
int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
|
||||
{
|
||||
char *out = buf, *end = buf + size;
|
||||
|
||||
p("u64s %u type %u ", k->u64s, k->type);
|
||||
|
||||
if (bkey_cmp(k->p, POS_MAX))
|
||||
p("%llu:%llu", k->p.inode, k->p.offset);
|
||||
else
|
||||
p("POS_MAX");
|
||||
|
||||
p(" snap %u len %u ver %llu", k->p.snapshot, k->size, k->version.lo);
|
||||
|
||||
return out - buf;
|
||||
}
|
||||
|
||||
int bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
|
||||
char *buf, size_t size, struct bkey_s_c k)
|
||||
{
|
||||
const struct bkey_ops *ops = &bch2_bkey_ops[type];
|
||||
char *out = buf, *end = buf + size;
|
||||
|
||||
switch (k.k->type) {
|
||||
case KEY_TYPE_DELETED:
|
||||
p(" deleted");
|
||||
break;
|
||||
case KEY_TYPE_DISCARD:
|
||||
p(" discard");
|
||||
break;
|
||||
case KEY_TYPE_ERROR:
|
||||
p(" error");
|
||||
break;
|
||||
case KEY_TYPE_COOKIE:
|
||||
p(" cookie");
|
||||
break;
|
||||
default:
|
||||
if (k.k->type >= KEY_TYPE_GENERIC_NR && ops->val_to_text)
|
||||
ops->val_to_text(c, buf, size, k);
|
||||
break;
|
||||
}
|
||||
|
||||
return out - buf;
|
||||
}
|
||||
|
||||
int bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
|
||||
char *buf, size_t size, struct bkey_s_c k)
|
||||
{
|
||||
char *out = buf, *end = buf + size;
|
||||
|
||||
out += bch2_bkey_to_text(out, end - out, k.k);
|
||||
out += scnprintf(out, end - out, ": ");
|
||||
out += bch2_val_to_text(c, type, out, end - out, k);
|
||||
|
||||
return out - buf;
|
||||
}
|
||||
|
||||
void bch2_bkey_swab(enum bkey_type type,
|
||||
const struct bkey_format *f,
|
||||
struct bkey_packed *k)
|
||||
{
|
||||
const struct bkey_ops *ops = &bch2_bkey_ops[type];
|
||||
|
||||
bch2_bkey_swab_key(f, k);
|
||||
|
||||
if (ops->swab)
|
||||
ops->swab(f, k);
|
||||
}
|
87
fs/bcachefs/bkey_methods.h
Normal file
87
fs/bcachefs/bkey_methods.h
Normal file
@ -0,0 +1,87 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_BKEY_METHODS_H
|
||||
#define _BCACHEFS_BKEY_METHODS_H
|
||||
|
||||
#include "bkey.h"
|
||||
|
||||
#define DEF_BTREE_ID(kwd, val, name) BKEY_TYPE_##kwd = val,
|
||||
|
||||
enum bkey_type {
|
||||
DEFINE_BCH_BTREE_IDS()
|
||||
BKEY_TYPE_BTREE,
|
||||
};
|
||||
|
||||
#undef DEF_BTREE_ID
|
||||
|
||||
/* Type of a key in btree @id at level @level: */
|
||||
static inline enum bkey_type bkey_type(unsigned level, enum btree_id id)
|
||||
{
|
||||
return level ? BKEY_TYPE_BTREE : (enum bkey_type) id;
|
||||
}
|
||||
|
||||
static inline bool btree_type_has_ptrs(enum bkey_type type)
|
||||
{
|
||||
switch (type) {
|
||||
case BKEY_TYPE_BTREE:
|
||||
case BKEY_TYPE_EXTENTS:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
struct bch_fs;
|
||||
struct btree;
|
||||
struct bkey;
|
||||
|
||||
enum merge_result {
|
||||
BCH_MERGE_NOMERGE,
|
||||
|
||||
/*
|
||||
* The keys were mergeable, but would have overflowed size - so instead
|
||||
* l was changed to the maximum size, and both keys were modified:
|
||||
*/
|
||||
BCH_MERGE_PARTIAL,
|
||||
BCH_MERGE_MERGE,
|
||||
};
|
||||
|
||||
typedef bool (*key_filter_fn)(struct bch_fs *, struct btree *,
|
||||
struct bkey_s);
|
||||
typedef enum merge_result (*key_merge_fn)(struct bch_fs *,
|
||||
struct btree *,
|
||||
struct bkey_i *, struct bkey_i *);
|
||||
|
||||
struct bkey_ops {
|
||||
/* Returns reason for being invalid if invalid, else NULL: */
|
||||
const char * (*key_invalid)(const struct bch_fs *,
|
||||
struct bkey_s_c);
|
||||
void (*key_debugcheck)(struct bch_fs *, struct btree *,
|
||||
struct bkey_s_c);
|
||||
void (*val_to_text)(struct bch_fs *, char *,
|
||||
size_t, struct bkey_s_c);
|
||||
void (*swab)(const struct bkey_format *, struct bkey_packed *);
|
||||
key_filter_fn key_normalize;
|
||||
key_merge_fn key_merge;
|
||||
bool is_extents;
|
||||
};
|
||||
|
||||
const char *bch2_bkey_val_invalid(struct bch_fs *, enum bkey_type,
|
||||
struct bkey_s_c);
|
||||
const char *__bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
|
||||
const char *bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
|
||||
const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c);
|
||||
|
||||
void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
|
||||
|
||||
int bch2_bkey_to_text(char *, size_t, const struct bkey *);
|
||||
int bch2_val_to_text(struct bch_fs *, enum bkey_type,
|
||||
char *, size_t, struct bkey_s_c);
|
||||
int bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
|
||||
char *, size_t, struct bkey_s_c);
|
||||
|
||||
void bch2_bkey_swab(enum bkey_type, const struct bkey_format *,
|
||||
struct bkey_packed *);
|
||||
|
||||
extern const struct bkey_ops bch2_bkey_ops[];
|
||||
|
||||
#endif /* _BCACHEFS_BKEY_METHODS_H */
|
1849
fs/bcachefs/bset.c
Normal file
1849
fs/bcachefs/bset.c
Normal file
File diff suppressed because it is too large
668
fs/bcachefs/bset.h
Normal file
668
fs/bcachefs/bset.h
Normal file
@ -0,0 +1,668 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_BSET_H
|
||||
#define _BCACHEFS_BSET_H
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/types.h>
|
||||
|
||||
#include "bcachefs_format.h"
|
||||
#include "bkey.h"
|
||||
#include "bkey_methods.h"
|
||||
#include "btree_types.h"
|
||||
#include "util.h" /* for time_stats */
|
||||
#include "vstructs.h"
|
||||
|
||||
/*
|
||||
* BKEYS:
|
||||
*
|
||||
* A bkey contains a key, a size field, a variable number of pointers, and some
|
||||
* ancillary flag bits.
|
||||
*
|
||||
* We use two different functions for validating bkeys, bkey_invalid and
|
||||
* bkey_deleted().
|
||||
*
|
||||
* The one exception to the rule that ptr_invalid() filters out invalid keys is
|
||||
* that it also filters out keys of size 0 - these are keys that have been
|
||||
* completely overwritten. It'd be safe to delete these in memory while leaving
|
||||
* them on disk, just unnecessary work - so we filter them out when resorting
|
||||
* instead.
|
||||
*
|
||||
* We can't filter out stale keys when we're resorting, because garbage
|
||||
* collection needs to find them to ensure bucket gens don't wrap around -
|
||||
* unless we're rewriting the btree node those stale keys still exist on disk.
|
||||
*
|
||||
* We also implement functions here for removing some number of sectors from the
|
||||
* front or the back of a bkey - this is mainly used for fixing overlapping
|
||||
* extents, by removing the overlapping sectors from the older key.
|
||||
*
|
||||
* BSETS:
|
||||
*
|
||||
* A bset is an array of bkeys laid out contiguously in memory in sorted order,
|
||||
* along with a header. A btree node is made up of a number of these, written at
|
||||
* different times.
|
||||
*
|
||||
* There could be many of them on disk, but we never allow there to be more than
|
||||
* 4 in memory - we lazily resort as needed.
|
||||
*
|
||||
* We implement code here for creating and maintaining auxiliary search trees
|
||||
* (described below) for searching an individual bset, and on top of that we
|
||||
* implement a btree iterator.
|
||||
*
|
||||
* BTREE ITERATOR:
|
||||
*
|
||||
* Most of the code in bcache doesn't care about an individual bset - it needs
|
||||
* to search entire btree nodes and iterate over them in sorted order.
|
||||
*
|
||||
* The btree iterator code serves both functions; it iterates through the keys
|
||||
* in a btree node in sorted order, starting from either keys after a specific
|
||||
* point (if you pass it a search key) or the start of the btree node.
|
||||
*
|
||||
* AUXILIARY SEARCH TREES:
|
||||
*
|
||||
* Since keys are variable length, we can't use a binary search on a bset - we
|
||||
* wouldn't be able to find the start of the next key. But binary searches are
|
||||
* slow anyways, due to terrible cache behaviour; bcache originally used binary
|
||||
* searches and that code topped out at under 50k lookups/second.
|
||||
*
|
||||
* So we need to construct some sort of lookup table. Since we only insert keys
|
||||
* into the last (unwritten) set, most of the keys within a given btree node are
|
||||
* usually in sets that are mostly constant. We use two different types of
|
||||
* lookup tables to take advantage of this.
|
||||
*
|
||||
* Both lookup tables share in common that they don't index every key in the
|
||||
* set; they index one key every BSET_CACHELINE bytes, and then a linear search
|
||||
* is used for the rest.
|
||||
*
|
||||
* For sets that have been written to disk and are no longer being inserted
|
||||
* into, we construct a binary search tree in an array - traversing a binary
|
||||
* search tree in an array gives excellent locality of reference and is very
|
||||
* fast, since both children of any node are adjacent to each other in memory
|
||||
* (and their grandchildren, and great grandchildren...) - this means
|
||||
* prefetching can be used to great effect.
|
||||
*
|
||||
* It's quite useful performance wise to keep these nodes small - not just
|
||||
* because they're more likely to be in L2, but also because we can prefetch
|
||||
* more nodes on a single cacheline and thus prefetch more iterations in advance
|
||||
* when traversing this tree.
|
||||
*
|
||||
* Nodes in the auxiliary search tree must contain both a key to compare against
|
||||
* (we don't want to fetch the key from the set, that would defeat the purpose),
|
||||
* and a pointer to the key. We use a few tricks to compress both of these.
|
||||
*
|
||||
* To compress the pointer, we take advantage of the fact that one node in the
|
||||
* search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
|
||||
* a function (to_inorder()) that takes the index of a node in a binary tree and
|
||||
* returns what its index would be in an inorder traversal, so we only have to
|
||||
* store the low bits of the offset.
|
||||
*
|
||||
* The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
|
||||
* compress that, we take advantage of the fact that when we're traversing the
|
||||
* search tree at every iteration we know that both our search key and the key
|
||||
* we're looking for lie within some range - bounded by our previous
|
||||
* comparisons. (We special case the start of a search so that this is true even
|
||||
* at the root of the tree).
|
||||
*
|
||||
* So if we know the key we're looking for is between a and b, and a and b don't
|
||||
* differ higher than bit 50, we don't need to check anything higher than bit
|
||||
* 50.
|
||||
*
|
||||
* We don't usually need the rest of the bits, either; we only need enough bits
|
||||
* to partition the key range we're currently checking. Consider key n - the
|
||||
* key our auxiliary search tree node corresponds to, and key p, the key
|
||||
* immediately preceding n. The lowest bit we need to store in the auxiliary
|
||||
* search tree is the highest bit that differs between n and p.
|
||||
*
|
||||
* Note that this could be bit 0 - we might sometimes need all 80 bits to do the
|
||||
* comparison. But we'd really like our nodes in the auxiliary search tree to be
|
||||
* of fixed size.
|
||||
*
|
||||
* The solution is to make them fixed size, and when we're constructing a node
|
||||
* check if p and n differed in the bits we needed them to. If they don't we
|
||||
* flag that node, and when doing lookups we fall back to comparing against the
|
||||
* real key. As long as this doesn't happen too often (and it seems to reliably
|
||||
* happen a bit less than 1% of the time), we win - even on failures, that key
|
||||
* is then more likely to be in cache than if we were doing binary searches all
|
||||
* the way, since we're touching so much less memory.
|
||||
*
|
||||
* The keys in the auxiliary search tree are stored in (software) floating
|
||||
* point, with an exponent and a mantissa. The exponent needs to be big enough
|
||||
* to address all the bits in the original key, but the number of bits in the
|
||||
* mantissa is somewhat arbitrary; more bits just gets us fewer failures.
|
||||
*
|
||||
* We need 7 bits for the exponent and 3 bits for the key's offset (since keys
|
||||
* are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
|
||||
* We need one node per 128 bytes in the btree node, which means the auxiliary
|
||||
* search trees take up 3% as much memory as the btree itself.
|
||||
*
|
||||
* Constructing these auxiliary search trees is moderately expensive, and we
|
||||
* don't want to be constantly rebuilding the search tree for the last set
|
||||
* whenever we insert another key into it. For the unwritten set, we use a much
|
||||
* simpler lookup table - it's just a flat array, so index i in the lookup table
|
||||
* corresponds to the i'th range of BSET_CACHELINE bytes in the set. Indexing
|
||||
* within each byte range works the same as with the auxiliary search trees.
|
||||
*
|
||||
* These are much easier to keep up to date when we insert a key - we do it
|
||||
* somewhat lazily; when we shift a key up we usually just increment the pointer
|
||||
* to it, only when it would overflow do we go to the trouble of finding the
|
||||
* first key in that range of bytes again.
|
||||
*/
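/*
 * A standalone sketch of the "binary search tree in an array" idea above,
 * over a plain sorted array of u64s rather than real bkeys or the floating
 * point encoding: node i's children live at 2*i and 2*i + 1, so a lookup
 * walks contiguous, easily prefetched memory instead of hopping around the
 * way a conventional binary search does. Not the actual implementation,
 * just an illustration of the layout.
 */
#if 0
/*
 * @tree is 1-indexed, in "heap" order; returns the index of the first
 * element >= @search, or 0 if no such element exists:
 */
static unsigned array_tree_search(const u64 *tree, unsigned nr, u64 search)
{
	unsigned i = 1;

	while (i <= nr)
		i = i * 2 + (tree[i] < search);

	/* back out of the trailing run of right-descents: */
	i >>= __builtin_ctz(~i) + 1;
	return i;
}
#endif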
|
||||
|
||||
extern bool bch2_expensive_debug_checks;
|
||||
|
||||
static inline bool btree_keys_expensive_checks(const struct btree *b)
|
||||
{
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
return bch2_expensive_debug_checks || *b->expensive_debug_checks;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
enum bset_aux_tree_type {
|
||||
BSET_NO_AUX_TREE,
|
||||
BSET_RO_AUX_TREE,
|
||||
BSET_RW_AUX_TREE,
|
||||
};
|
||||
|
||||
#define BSET_TREE_NR_TYPES 3
|
||||
|
||||
#define BSET_NO_AUX_TREE_VAL (U16_MAX)
|
||||
#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1)
|
||||
|
||||
static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t)
|
||||
{
|
||||
switch (t->extra) {
|
||||
case BSET_NO_AUX_TREE_VAL:
|
||||
EBUG_ON(t->size);
|
||||
return BSET_NO_AUX_TREE;
|
||||
case BSET_RW_AUX_TREE_VAL:
|
||||
EBUG_ON(!t->size);
|
||||
return BSET_RW_AUX_TREE;
|
||||
default:
|
||||
EBUG_ON(!t->size);
|
||||
return BSET_RO_AUX_TREE;
|
||||
}
|
||||
}
|
||||
|
||||
typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *);
|
||||
|
||||
static inline void
|
||||
__bkey_unpack_key_format_checked(const struct btree *b,
|
||||
struct bkey *dst,
|
||||
const struct bkey_packed *src)
|
||||
{
|
||||
#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
|
||||
{
|
||||
compiled_unpack_fn unpack_fn = b->aux_data;
|
||||
unpack_fn(dst, src);
|
||||
|
||||
if (btree_keys_expensive_checks(b)) {
|
||||
struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
|
||||
|
||||
/*
|
||||
* hack around a harmless race when compacting whiteouts
|
||||
* for a write:
|
||||
*/
|
||||
dst2.needs_whiteout = dst->needs_whiteout;
|
||||
|
||||
BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
|
||||
}
|
||||
}
|
||||
#else
|
||||
*dst = __bch2_bkey_unpack_key(&b->format, src);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline struct bkey
|
||||
bkey_unpack_key_format_checked(const struct btree *b,
|
||||
const struct bkey_packed *src)
|
||||
{
|
||||
struct bkey dst;
|
||||
|
||||
__bkey_unpack_key_format_checked(b, &dst, src);
|
||||
return dst;
|
||||
}
|
||||
|
||||
static inline void __bkey_unpack_key(const struct btree *b,
|
||||
struct bkey *dst,
|
||||
const struct bkey_packed *src)
|
||||
{
|
||||
if (likely(bkey_packed(src)))
|
||||
__bkey_unpack_key_format_checked(b, dst, src);
|
||||
else
|
||||
*dst = *packed_to_bkey_c(src);
|
||||
}
|
||||
|
||||
/**
|
||||
* bkey_unpack_key -- unpack just the key, not the value
|
||||
*/
|
||||
static inline struct bkey bkey_unpack_key(const struct btree *b,
|
||||
const struct bkey_packed *src)
|
||||
{
|
||||
return likely(bkey_packed(src))
|
||||
? bkey_unpack_key_format_checked(b, src)
|
||||
: *packed_to_bkey_c(src);
|
||||
}
|
||||
|
||||
static inline struct bpos
|
||||
bkey_unpack_pos_format_checked(const struct btree *b,
|
||||
const struct bkey_packed *src)
|
||||
{
|
||||
#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
|
||||
return bkey_unpack_key_format_checked(b, src).p;
|
||||
#else
|
||||
return __bkey_unpack_pos(&b->format, src);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline struct bpos bkey_unpack_pos(const struct btree *b,
|
||||
const struct bkey_packed *src)
|
||||
{
|
||||
return likely(bkey_packed(src))
|
||||
? bkey_unpack_pos_format_checked(b, src)
|
||||
: packed_to_bkey_c(src)->p;
|
||||
}
|
||||
|
||||
/* Disassembled bkeys */
|
||||
|
||||
static inline struct bkey_s_c bkey_disassemble(struct btree *b,
|
||||
const struct bkey_packed *k,
|
||||
struct bkey *u)
|
||||
{
|
||||
__bkey_unpack_key(b, u, k);
|
||||
|
||||
return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), };
|
||||
}
|
||||
|
||||
/* non const version: */
|
||||
static inline struct bkey_s __bkey_disassemble(struct btree *b,
|
||||
struct bkey_packed *k,
|
||||
struct bkey *u)
|
||||
{
|
||||
__bkey_unpack_key(b, u, k);
|
||||
|
||||
return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };
|
||||
}
|
||||
|
||||
#define for_each_bset(_b, _t) \
|
||||
for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
|
||||
|
||||
static inline bool bset_has_ro_aux_tree(struct bset_tree *t)
|
||||
{
|
||||
return bset_aux_tree_type(t) == BSET_RO_AUX_TREE;
|
||||
}
|
||||
|
||||
static inline bool bset_has_rw_aux_tree(struct bset_tree *t)
|
||||
{
|
||||
return bset_aux_tree_type(t) == BSET_RW_AUX_TREE;
|
||||
}
|
||||
|
||||
static inline void bch2_bset_set_no_aux_tree(struct btree *b,
|
||||
struct bset_tree *t)
|
||||
{
|
||||
BUG_ON(t < b->set);
|
||||
|
||||
for (; t < b->set + ARRAY_SIZE(b->set); t++) {
|
||||
t->size = 0;
|
||||
t->extra = BSET_NO_AUX_TREE_VAL;
|
||||
t->aux_data_offset = U16_MAX;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void btree_node_set_format(struct btree *b,
|
||||
struct bkey_format f)
|
||||
{
|
||||
int len;
|
||||
|
||||
b->format = f;
|
||||
b->nr_key_bits = bkey_format_key_bits(&f);
|
||||
|
||||
len = bch2_compile_bkey_format(&b->format, b->aux_data);
|
||||
BUG_ON(len < 0 || len > U8_MAX);
|
||||
|
||||
b->unpack_fn_len = len;
|
||||
|
||||
bch2_bset_set_no_aux_tree(b, b->set);
|
||||
}
|
||||
|
||||
static inline struct bset *bset_next_set(struct btree *b,
|
||||
unsigned block_bytes)
|
||||
{
|
||||
struct bset *i = btree_bset_last(b);
|
||||
|
||||
EBUG_ON(!is_power_of_2(block_bytes));
|
||||
|
||||
return ((void *) i) + round_up(vstruct_bytes(i), block_bytes);
|
||||
}
|
||||
|
||||
void bch2_btree_keys_free(struct btree *);
|
||||
int bch2_btree_keys_alloc(struct btree *, unsigned, gfp_t);
|
||||
void bch2_btree_keys_init(struct btree *, bool *);
|
||||
|
||||
void bch2_bset_init_first(struct btree *, struct bset *);
|
||||
void bch2_bset_init_next(struct bch_fs *, struct btree *,
|
||||
struct btree_node_entry *);
|
||||
void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
|
||||
void bch2_bset_fix_invalidated_key(struct btree *, struct bset_tree *,
|
||||
struct bkey_packed *);
|
||||
|
||||
void bch2_bset_insert(struct btree *, struct btree_node_iter *,
|
||||
struct bkey_packed *, struct bkey_i *, unsigned);
|
||||
void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned);
|
||||
|
||||
/* Bkey utility code */
|
||||
|
||||
/* packed or unpacked */
|
||||
static inline int bkey_cmp_p_or_unp(const struct btree *b,
|
||||
const struct bkey_packed *l,
|
||||
const struct bkey_packed *r_packed,
|
||||
struct bpos *r)
|
||||
{
|
||||
EBUG_ON(r_packed && !bkey_packed(r_packed));
|
||||
|
||||
if (unlikely(!bkey_packed(l)))
|
||||
return bkey_cmp(packed_to_bkey_c(l)->p, *r);
|
||||
|
||||
if (likely(r_packed))
|
||||
return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b);
|
||||
|
||||
return __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
|
||||
}
|
||||
|
||||
/* Returns true if @k is after iterator position @pos */
|
||||
static inline bool btree_iter_pos_cmp_packed(const struct btree *b,
|
||||
struct bpos *pos,
|
||||
const struct bkey_packed *k,
|
||||
bool strictly_greater)
|
||||
{
|
||||
int cmp = bkey_cmp_left_packed(b, k, pos);
|
||||
|
||||
return cmp > 0 ||
|
||||
(cmp == 0 && !strictly_greater && !bkey_deleted(k));
|
||||
}
|
||||
|
||||
static inline bool btree_iter_pos_cmp_p_or_unp(const struct btree *b,
|
||||
struct bpos pos,
|
||||
const struct bkey_packed *pos_packed,
|
||||
const struct bkey_packed *k,
|
||||
bool strictly_greater)
|
||||
{
|
||||
int cmp = bkey_cmp_p_or_unp(b, k, pos_packed, &pos);
|
||||
|
||||
return cmp > 0 ||
|
||||
(cmp == 0 && !strictly_greater && !bkey_deleted(k));
|
||||
}
|
||||
|
||||
struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *);
|
||||
|
||||
struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *,
|
||||
struct bkey_packed *, unsigned);
|
||||
|
||||
static inline struct bkey_packed *
|
||||
bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
|
||||
{
|
||||
return bch2_bkey_prev_filter(b, t, k, 0);
|
||||
}
|
||||
|
||||
static inline struct bkey_packed *
|
||||
bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
|
||||
{
|
||||
return bch2_bkey_prev_filter(b, t, k, KEY_TYPE_DISCARD + 1);
|
||||
}
|
||||
|
||||
enum bch_extent_overlap {
|
||||
BCH_EXTENT_OVERLAP_ALL = 0,
|
||||
BCH_EXTENT_OVERLAP_BACK = 1,
|
||||
BCH_EXTENT_OVERLAP_FRONT = 2,
|
||||
BCH_EXTENT_OVERLAP_MIDDLE = 3,
|
||||
};
|
||||
|
||||
/* Returns how k overlaps with m */
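/*
 * The low bit is set when k starts after m starts, the high bit when k ends
 * before m ends, which maps onto the enum: 0 = k covers all of m, 1 = k
 * overlaps the back of m, 2 = the front of m, 3 = k sits in the middle of m.
 */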
|
||||
static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
|
||||
const struct bkey *m)
|
||||
{
|
||||
int cmp1 = bkey_cmp(k->p, m->p) < 0;
|
||||
int cmp2 = bkey_cmp(bkey_start_pos(k),
|
||||
bkey_start_pos(m)) > 0;
|
||||
|
||||
return (cmp1 << 1) + cmp2;
|
||||
}
|
||||
|
||||
/* Btree key iteration */
|
||||
|
||||
static inline void __bch2_btree_node_iter_init(struct btree_node_iter *iter,
|
||||
bool is_extents)
|
||||
{
|
||||
iter->is_extents = is_extents;
|
||||
memset(iter->data, 0, sizeof(iter->data));
|
||||
}
|
||||
|
||||
void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *,
|
||||
const struct bkey_packed *,
|
||||
const struct bkey_packed *);
|
||||
void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *,
|
||||
struct bpos, bool, bool);
|
||||
void bch2_btree_node_iter_init_from_start(struct btree_node_iter *,
|
||||
struct btree *, bool);
|
||||
struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *,
|
||||
struct btree *,
|
||||
struct bset_tree *);
|
||||
|
||||
void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *);
|
||||
void bch2_btree_node_iter_set_drop(struct btree_node_iter *,
|
||||
struct btree_node_iter_set *);
|
||||
void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *);
|
||||
|
||||
#define btree_node_iter_for_each(_iter, _set) \
|
||||
for (_set = (_iter)->data; \
|
||||
_set < (_iter)->data + ARRAY_SIZE((_iter)->data) && \
|
||||
(_set)->k != (_set)->end; \
|
||||
_set++)
|
||||
|
||||
static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter,
|
||||
unsigned i)
|
||||
{
|
||||
return iter->data[i].k == iter->data[i].end;
|
||||
}
|
||||
|
||||
static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter)
|
||||
{
|
||||
return __btree_node_iter_set_end(iter, 0);
|
||||
}
|
||||
|
||||
static inline int __btree_node_iter_cmp(bool is_extents,
|
||||
struct btree *b,
|
||||
struct bkey_packed *l,
|
||||
struct bkey_packed *r)
|
||||
{
|
||||
/*
|
||||
* For non extents, when keys compare equal the deleted keys have to
|
||||
* come first - so that bch2_btree_node_iter_next_check() can detect
|
||||
* duplicate nondeleted keys (and possibly other reasons?)
|
||||
*
|
||||
* For extents, bkey_deleted() is used as a proxy for k->size == 0, so
|
||||
* deleted keys have to sort last.
|
||||
*/
|
||||
return bkey_cmp_packed(b, l, r)
|
||||
?: (is_extents
|
||||
? (int) bkey_deleted(l) - (int) bkey_deleted(r)
|
||||
: (int) bkey_deleted(r) - (int) bkey_deleted(l))
|
||||
?: (l > r) - (l < r);
|
||||
}
|
||||
|
||||
static inline int btree_node_iter_cmp(struct btree_node_iter *iter,
|
||||
struct btree *b,
|
||||
struct btree_node_iter_set l,
|
||||
struct btree_node_iter_set r)
|
||||
{
|
||||
return __btree_node_iter_cmp(iter->is_extents, b,
|
||||
__btree_node_offset_to_key(b, l.k),
|
||||
__btree_node_offset_to_key(b, r.k));
|
||||
}
|
||||
|
||||
static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter,
|
||||
struct btree *b,
|
||||
const struct bkey_packed *k,
|
||||
const struct bkey_packed *end)
|
||||
{
|
||||
if (k != end) {
|
||||
struct btree_node_iter_set *pos;
|
||||
|
||||
btree_node_iter_for_each(iter, pos)
|
||||
;
|
||||
|
||||
BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data));
|
||||
*pos = (struct btree_node_iter_set) {
|
||||
__btree_node_key_to_offset(b, k),
|
||||
__btree_node_key_to_offset(b, end)
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
static inline struct bkey_packed *
|
||||
__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter,
|
||||
struct btree *b)
|
||||
{
|
||||
return __btree_node_offset_to_key(b, iter->data->k);
|
||||
}
|
||||
|
||||
static inline struct bkey_packed *
|
||||
bch2_btree_node_iter_peek_filter(struct btree_node_iter *iter,
|
||||
struct btree *b,
|
||||
unsigned min_key_type)
|
||||
{
|
||||
while (!bch2_btree_node_iter_end(iter)) {
|
||||
struct bkey_packed *k = __bch2_btree_node_iter_peek_all(iter, b);
|
||||
|
||||
if (k->type >= min_key_type)
|
||||
return k;
|
||||
|
||||
bch2_btree_node_iter_advance(iter, b);
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline struct bkey_packed *
|
||||
bch2_btree_node_iter_peek_all(struct btree_node_iter *iter,
|
||||
struct btree *b)
|
||||
{
|
||||
return bch2_btree_node_iter_peek_filter(iter, b, 0);
|
||||
}
|
||||
|
||||
static inline struct bkey_packed *
|
||||
bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b)
|
||||
{
|
||||
return bch2_btree_node_iter_peek_filter(iter, b, KEY_TYPE_DISCARD + 1);
|
||||
}
|
||||
|
||||
static inline struct bkey_packed *
|
||||
bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b)
|
||||
{
|
||||
struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b);
|
||||
|
||||
if (ret)
|
||||
bch2_btree_node_iter_advance(iter, b);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *,
|
||||
struct btree *, unsigned);
|
||||
|
||||
static inline struct bkey_packed *
|
||||
bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct btree *b)
|
||||
{
|
||||
return bch2_btree_node_iter_prev_filter(iter, b, 0);
|
||||
}
|
||||
|
||||
static inline struct bkey_packed *
|
||||
bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b)
|
||||
{
|
||||
return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_DISCARD + 1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Iterates over all _live_ keys - skipping deleted (and potentially
|
||||
* overlapping) keys
|
||||
*/
|
||||
#define for_each_btree_node_key(b, k, iter, _is_extents) \
|
||||
for (bch2_btree_node_iter_init_from_start((iter), (b), (_is_extents));\
|
||||
((k) = bch2_btree_node_iter_peek(iter, b)); \
|
||||
bch2_btree_node_iter_advance(iter, b))
|
||||
|
||||
struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
|
||||
struct btree *,
|
||||
struct bkey *);
|
||||
|
||||
#define for_each_btree_node_key_unpack(b, k, iter, _is_extents, unpacked)\
|
||||
for (bch2_btree_node_iter_init_from_start((iter), (b), (_is_extents));\
|
||||
(k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\
|
||||
bch2_btree_node_iter_advance(iter, b))
|
||||
|
||||
/* Accounting: */
|
||||
|
||||
static inline void btree_keys_account_key(struct btree_nr_keys *n,
|
||||
unsigned bset,
|
||||
struct bkey_packed *k,
|
||||
int sign)
|
||||
{
|
||||
n->live_u64s += k->u64s * sign;
|
||||
n->bset_u64s[bset] += k->u64s * sign;
|
||||
|
||||
if (bkey_packed(k))
|
||||
n->packed_keys += sign;
|
||||
else
|
||||
n->unpacked_keys += sign;
|
||||
}
|
||||
|
||||
#define btree_keys_account_key_add(_nr, _bset_idx, _k) \
|
||||
btree_keys_account_key(_nr, _bset_idx, _k, 1)
|
||||
#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \
|
||||
btree_keys_account_key(_nr, _bset_idx, _k, -1)
|
||||
|
||||
struct bset_stats {
|
||||
struct {
|
||||
size_t nr, bytes;
|
||||
} sets[BSET_TREE_NR_TYPES];
|
||||
|
||||
size_t floats;
|
||||
size_t failed_unpacked;
|
||||
size_t failed_prev;
|
||||
size_t failed_overflow;
|
||||
};
|
||||
|
||||
void bch2_btree_keys_stats(struct btree *, struct bset_stats *);
|
||||
int bch2_bkey_print_bfloat(struct btree *, struct bkey_packed *,
|
||||
char *, size_t);
|
||||
|
||||
/* Debug stuff */
|
||||
|
||||
void bch2_dump_bset(struct btree *, struct bset *, unsigned);
|
||||
void bch2_dump_btree_node(struct btree *);
|
||||
void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *);
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
|
||||
void __bch2_verify_btree_nr_keys(struct btree *);
|
||||
void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *);
|
||||
void bch2_verify_key_order(struct btree *, struct btree_node_iter *,
|
||||
struct bkey_packed *);
|
||||
|
||||
#else
|
||||
|
||||
static inline void __bch2_verify_btree_nr_keys(struct btree *b) {}
|
||||
static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
|
||||
struct btree *b) {}
|
||||
static inline void bch2_verify_key_order(struct btree *b,
|
||||
struct btree_node_iter *iter,
|
||||
struct bkey_packed *where) {}
|
||||
#endif
|
||||
|
||||
static inline void bch2_verify_btree_nr_keys(struct btree *b)
|
||||
{
|
||||
if (btree_keys_expensive_checks(b))
|
||||
__bch2_verify_btree_nr_keys(b);
|
||||
}
|
||||
|
||||
#endif /* _BCACHEFS_BSET_H */
941
fs/bcachefs/btree_cache.c
Normal file
@ -0,0 +1,941 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "btree_cache.h"
|
||||
#include "btree_io.h"
|
||||
#include "btree_iter.h"
|
||||
#include "btree_locking.h"
|
||||
#include "debug.h"
|
||||
#include "extents.h"
|
||||
#include "trace.h"
|
||||
|
||||
#include <linux/prefetch.h>
|
||||
|
||||
#define DEF_BTREE_ID(kwd, val, name) name,
|
||||
|
||||
const char * const bch2_btree_ids[] = {
|
||||
DEFINE_BCH_BTREE_IDS()
|
||||
NULL
|
||||
};
|
||||
|
||||
#undef DEF_BTREE_ID
|
||||
|
||||
void bch2_recalc_btree_reserve(struct bch_fs *c)
|
||||
{
|
||||
unsigned i, reserve = 16;
|
||||
|
||||
if (!c->btree_roots[0].b)
|
||||
reserve += 8;
|
||||
|
||||
for (i = 0; i < BTREE_ID_NR; i++)
|
||||
if (c->btree_roots[i].b)
|
||||
reserve += min_t(unsigned, 1,
|
||||
c->btree_roots[i].b->level) * 8;
|
||||
|
||||
c->btree_cache.reserve = reserve;
|
||||
}
|
||||
|
||||
static inline unsigned btree_cache_can_free(struct btree_cache *bc)
|
||||
{
|
||||
return max_t(int, 0, bc->used - bc->reserve);
|
||||
}
|
||||
|
||||
static void __btree_node_data_free(struct bch_fs *c, struct btree *b)
|
||||
{
|
||||
EBUG_ON(btree_node_write_in_flight(b));
|
||||
|
||||
kvpfree(b->data, btree_bytes(c));
|
||||
b->data = NULL;
|
||||
bch2_btree_keys_free(b);
|
||||
}
|
||||
|
||||
static void btree_node_data_free(struct bch_fs *c, struct btree *b)
|
||||
{
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
|
||||
__btree_node_data_free(c, b);
|
||||
bc->used--;
|
||||
list_move(&b->list, &bc->freed);
|
||||
}
|
||||
|
||||
static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
|
||||
const void *obj)
|
||||
{
|
||||
const struct btree *b = obj;
|
||||
const u64 *v = arg->key;
|
||||
|
||||
return PTR_HASH(&b->key) == *v ? 0 : 1;
|
||||
}
|
||||
|
||||
static const struct rhashtable_params bch_btree_cache_params = {
|
||||
.head_offset = offsetof(struct btree, hash),
|
||||
.key_offset = offsetof(struct btree, key.v),
|
||||
.key_len = sizeof(struct bch_extent_ptr),
|
||||
.obj_cmpfn = bch2_btree_cache_cmp_fn,
|
||||
};
|
||||
|
||||
static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
|
||||
{
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
|
||||
b->data = kvpmalloc(btree_bytes(c), gfp);
|
||||
if (!b->data)
|
||||
goto err;
|
||||
|
||||
if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp))
|
||||
goto err;
|
||||
|
||||
bc->used++;
|
||||
list_move(&b->list, &bc->freeable);
|
||||
return;
|
||||
err:
|
||||
kvpfree(b->data, btree_bytes(c));
|
||||
b->data = NULL;
|
||||
list_move(&b->list, &bc->freed);
|
||||
}
|
||||
|
||||
static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
|
||||
{
|
||||
struct btree *b = kzalloc(sizeof(struct btree), gfp);
|
||||
if (!b)
|
||||
return NULL;
|
||||
|
||||
bkey_extent_init(&b->key);
|
||||
six_lock_init(&b->lock);
|
||||
lockdep_set_novalidate_class(&b->lock);
|
||||
INIT_LIST_HEAD(&b->list);
|
||||
INIT_LIST_HEAD(&b->write_blocked);
|
||||
|
||||
btree_node_data_alloc(c, b, gfp);
|
||||
return b->data ? b : NULL;
|
||||
}
|
||||
|
||||
/* Btree in memory cache - hash table */
|
||||
|
||||
void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
|
||||
{
|
||||
rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
|
||||
|
||||
/* Cause future lookups for this node to fail: */
|
||||
bkey_i_to_extent(&b->key)->v._data[0] = 0;
|
||||
}
|
||||
|
||||
int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
|
||||
{
|
||||
return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
|
||||
bch_btree_cache_params);
|
||||
}
|
||||
|
||||
int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
|
||||
unsigned level, enum btree_id id)
|
||||
{
|
||||
int ret;
|
||||
|
||||
b->level = level;
|
||||
b->btree_id = id;
|
||||
|
||||
mutex_lock(&bc->lock);
|
||||
ret = __bch2_btree_node_hash_insert(bc, b);
|
||||
if (!ret)
|
||||
list_add(&b->list, &bc->live);
|
||||
mutex_unlock(&bc->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
__flatten
|
||||
static inline struct btree *btree_cache_find(struct btree_cache *bc,
|
||||
const struct bkey_i *k)
|
||||
{
|
||||
return rhashtable_lookup_fast(&bc->table, &PTR_HASH(k),
|
||||
bch_btree_cache_params);
|
||||
}
|
||||
|
||||
/*
|
||||
* this version is for btree nodes that have already been freed (we're not
|
||||
* reaping a real btree node)
|
||||
*/
|
||||
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
|
||||
{
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
int ret = 0;
|
||||
|
||||
lockdep_assert_held(&bc->lock);
|
||||
|
||||
if (!six_trylock_intent(&b->lock))
|
||||
return -ENOMEM;
|
||||
|
||||
if (!six_trylock_write(&b->lock))
|
||||
goto out_unlock_intent;
|
||||
|
||||
if (btree_node_noevict(b))
|
||||
goto out_unlock;
|
||||
|
||||
if (!btree_node_may_write(b))
|
||||
goto out_unlock;
|
||||
|
||||
if (btree_node_dirty(b) ||
|
||||
btree_node_write_in_flight(b) ||
|
||||
btree_node_read_in_flight(b)) {
|
||||
if (!flush)
|
||||
goto out_unlock;
|
||||
|
||||
wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
|
||||
TASK_UNINTERRUPTIBLE);
|
||||
|
||||
/*
|
||||
* Using the underscore version because we don't want to compact
|
||||
* bsets after the write, since this node is about to be evicted
|
||||
* - unless btree verify mode is enabled, since it runs out of
|
||||
* the post write cleanup:
|
||||
*/
|
||||
if (verify_btree_ondisk(c))
|
||||
bch2_btree_node_write(c, b, SIX_LOCK_intent);
|
||||
else
|
||||
__bch2_btree_node_write(c, b, SIX_LOCK_read);
|
||||
|
||||
/* wait for any in flight btree write */
|
||||
btree_node_wait_on_io(b);
|
||||
}
|
||||
out:
|
||||
if (PTR_HASH(&b->key) && !ret)
|
||||
trace_btree_node_reap(c, b);
|
||||
return ret;
|
||||
out_unlock:
|
||||
six_unlock_write(&b->lock);
|
||||
out_unlock_intent:
|
||||
six_unlock_intent(&b->lock);
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
|
||||
{
|
||||
return __btree_node_reclaim(c, b, false);
|
||||
}
|
||||
|
||||
static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
|
||||
{
|
||||
return __btree_node_reclaim(c, b, true);
|
||||
}
|
||||
|
||||
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
|
||||
struct shrink_control *sc)
|
||||
{
|
||||
struct bch_fs *c = container_of(shrink, struct bch_fs,
|
||||
btree_cache.shrink);
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
struct btree *b, *t;
|
||||
unsigned long nr = sc->nr_to_scan;
|
||||
unsigned long can_free;
|
||||
unsigned long touched = 0;
|
||||
unsigned long freed = 0;
|
||||
unsigned i;
|
||||
|
||||
if (btree_shrinker_disabled(c))
|
||||
return SHRINK_STOP;
|
||||
|
||||
/* Return -1 if we can't do anything right now */
|
||||
if (sc->gfp_mask & __GFP_IO)
|
||||
mutex_lock(&bc->lock);
|
||||
else if (!mutex_trylock(&bc->lock))
|
||||
return -1;
|
||||
|
||||
/*
|
||||
* It's _really_ critical that we don't free too many btree nodes - we
|
||||
* have to always leave ourselves a reserve. The reserve is how we
|
||||
* guarantee that allocating memory for a new btree node can always
|
||||
* succeed, so that inserting keys into the btree can always succeed and
|
||||
* IO can always make forward progress:
|
||||
*/
|
||||
nr /= btree_pages(c);
|
||||
can_free = btree_cache_can_free(bc);
|
||||
nr = min_t(unsigned long, nr, can_free);
|
||||
|
||||
i = 0;
|
||||
list_for_each_entry_safe(b, t, &bc->freeable, list) {
|
||||
touched++;
|
||||
|
||||
if (freed >= nr)
|
||||
break;
|
||||
|
||||
if (++i > 3 &&
|
||||
!btree_node_reclaim(c, b)) {
|
||||
btree_node_data_free(c, b);
|
||||
six_unlock_write(&b->lock);
|
||||
six_unlock_intent(&b->lock);
|
||||
freed++;
|
||||
}
|
||||
}
|
||||
restart:
|
||||
list_for_each_entry_safe(b, t, &bc->live, list) {
|
||||
touched++;
|
||||
|
||||
if (freed >= nr) {
|
||||
/* Save position */
|
||||
if (&t->list != &bc->live)
|
||||
list_move_tail(&bc->live, &t->list);
|
||||
break;
|
||||
}
|
||||
|
||||
if (!btree_node_accessed(b) &&
|
||||
!btree_node_reclaim(c, b)) {
|
||||
/* can't call bch2_btree_node_hash_remove under lock */
|
||||
freed++;
|
||||
if (&t->list != &bc->live)
|
||||
list_move_tail(&bc->live, &t->list);
|
||||
|
||||
btree_node_data_free(c, b);
|
||||
mutex_unlock(&bc->lock);
|
||||
|
||||
bch2_btree_node_hash_remove(bc, b);
|
||||
six_unlock_write(&b->lock);
|
||||
six_unlock_intent(&b->lock);
|
||||
|
||||
if (freed >= nr)
|
||||
goto out;
|
||||
|
||||
if (sc->gfp_mask & __GFP_IO)
|
||||
mutex_lock(&bc->lock);
|
||||
else if (!mutex_trylock(&bc->lock))
|
||||
goto out;
|
||||
goto restart;
|
||||
} else
|
||||
clear_btree_node_accessed(b);
|
||||
}
|
||||
|
||||
mutex_unlock(&bc->lock);
|
||||
out:
|
||||
return (unsigned long) freed * btree_pages(c);
|
||||
}
|
||||
|
||||
static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
|
||||
struct shrink_control *sc)
|
||||
{
|
||||
struct bch_fs *c = container_of(shrink, struct bch_fs,
|
||||
btree_cache.shrink);
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
|
||||
if (btree_shrinker_disabled(c))
|
||||
return 0;
|
||||
|
||||
return btree_cache_can_free(bc) * btree_pages(c);
|
||||
}
|
||||
|
||||
void bch2_fs_btree_cache_exit(struct bch_fs *c)
|
||||
{
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
struct btree *b;
|
||||
unsigned i;
|
||||
|
||||
if (bc->shrink.list.next)
|
||||
unregister_shrinker(&bc->shrink);
|
||||
|
||||
mutex_lock(&bc->lock);
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
if (c->verify_data)
|
||||
list_move(&c->verify_data->list, &bc->live);
|
||||
|
||||
kvpfree(c->verify_ondisk, btree_bytes(c));
|
||||
#endif
|
||||
|
||||
for (i = 0; i < BTREE_ID_NR; i++)
|
||||
if (c->btree_roots[i].b)
|
||||
list_add(&c->btree_roots[i].b->list, &bc->live);
|
||||
|
||||
list_splice(&bc->freeable, &bc->live);
|
||||
|
||||
while (!list_empty(&bc->live)) {
|
||||
b = list_first_entry(&bc->live, struct btree, list);
|
||||
|
||||
BUG_ON(btree_node_read_in_flight(b) ||
|
||||
btree_node_write_in_flight(b));
|
||||
|
||||
if (btree_node_dirty(b))
|
||||
bch2_btree_complete_write(c, b, btree_current_write(b));
|
||||
clear_btree_node_dirty(b);
|
||||
|
||||
btree_node_data_free(c, b);
|
||||
}
|
||||
|
||||
while (!list_empty(&bc->freed)) {
|
||||
b = list_first_entry(&bc->freed, struct btree, list);
|
||||
list_del(&b->list);
|
||||
kfree(b);
|
||||
}
|
||||
|
||||
mutex_unlock(&bc->lock);
|
||||
|
||||
if (bc->table_init_done)
|
||||
rhashtable_destroy(&bc->table);
|
||||
}
|
||||
|
||||
int bch2_fs_btree_cache_init(struct bch_fs *c)
|
||||
{
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
unsigned i;
|
||||
int ret = 0;
|
||||
|
||||
pr_verbose_init(c->opts, "");
|
||||
|
||||
ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
bc->table_init_done = true;
|
||||
|
||||
bch2_recalc_btree_reserve(c);
|
||||
|
||||
for (i = 0; i < bc->reserve; i++)
|
||||
if (!btree_node_mem_alloc(c, GFP_KERNEL)) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
list_splice_init(&bc->live, &bc->freeable);
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
mutex_init(&c->verify_lock);
|
||||
|
||||
c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
|
||||
if (!c->verify_ondisk) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
c->verify_data = btree_node_mem_alloc(c, GFP_KERNEL);
|
||||
if (!c->verify_data) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
list_del_init(&c->verify_data->list);
|
||||
#endif
|
||||
|
||||
bc->shrink.count_objects = bch2_btree_cache_count;
|
||||
bc->shrink.scan_objects = bch2_btree_cache_scan;
|
||||
bc->shrink.seeks = 4;
|
||||
bc->shrink.batch = btree_pages(c) * 2;
|
||||
register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
|
||||
out:
|
||||
pr_verbose_init(c->opts, "ret %i", ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
|
||||
{
|
||||
mutex_init(&bc->lock);
|
||||
INIT_LIST_HEAD(&bc->live);
|
||||
INIT_LIST_HEAD(&bc->freeable);
|
||||
INIT_LIST_HEAD(&bc->freed);
|
||||
}
|
||||
|
||||
/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which
 * bch2_btree_cache_cannibalize_lock() will take. This means every time we
 * unlock the root of the btree, we need to release this lock if we have it
 * held.
 */
void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;

	if (bc->alloc_lock == current) {
		trace_btree_node_cannibalize_unlock(c);
		bc->alloc_lock = NULL;
		closure_wake_up(&bc->alloc_wait);
	}
}

int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
{
	struct btree_cache *bc = &c->btree_cache;
	struct task_struct *old;

	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current)
		goto success;

	if (!cl) {
		trace_btree_node_cannibalize_lock_fail(c);
		return -ENOMEM;
	}

	closure_wait(&bc->alloc_wait, cl);

	/* Try again, after adding ourselves to waitlist */
	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current) {
		/* We raced */
		closure_wake_up(&bc->alloc_wait);
		goto success;
	}

	trace_btree_node_cannibalize_lock_fail(c);
	return -EAGAIN;

success:
	trace_btree_node_cannibalize_lock(c);
	return 0;
}
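
As a rough sketch of the intended calling convention (illustrative only, not code taken from this commit's call sites; closure_init_stack() is from the closures library this filesystem already depends on):

	struct closure cl;
	struct btree *b;
	int ret;

	closure_init_stack(&cl);

	/* may return -EAGAIN after putting @cl on bc->alloc_wait */
	ret = bch2_btree_cache_cannibalize_lock(c, &cl);
	if (ret)
		return ret;

	b = bch2_btree_node_mem_alloc(c);	/* may now steal from bc->live */
	...
	bch2_btree_cache_cannibalize_unlock(c);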
|
||||
|
||||
static struct btree *btree_node_cannibalize(struct bch_fs *c)
|
||||
{
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
struct btree *b;
|
||||
|
||||
list_for_each_entry_reverse(b, &bc->live, list)
|
||||
if (!btree_node_reclaim(c, b))
|
||||
return b;
|
||||
|
||||
while (1) {
|
||||
list_for_each_entry_reverse(b, &bc->live, list)
|
||||
if (!btree_node_write_and_reclaim(c, b))
|
||||
return b;
|
||||
|
||||
/*
|
||||
* Rare case: all nodes were intent-locked.
|
||||
* Just busy-wait.
|
||||
*/
|
||||
WARN_ONCE(1, "btree cache cannibalize failed\n");
|
||||
cond_resched();
|
||||
}
|
||||
}
|
||||
|
||||
struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
|
||||
{
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
struct btree *b;
|
||||
u64 start_time = local_clock();
|
||||
|
||||
mutex_lock(&bc->lock);
|
||||
|
||||
/*
|
||||
* btree_free() doesn't free memory; it sticks the node on the end of
|
||||
* the list. Check if there's any freed nodes there:
|
||||
*/
|
||||
list_for_each_entry(b, &bc->freeable, list)
|
||||
if (!btree_node_reclaim(c, b))
|
||||
goto out_unlock;
|
||||
|
||||
/*
|
||||
* We never free struct btree itself, just the memory that holds the on
|
||||
* disk node. Check the freed list before allocating a new one:
|
||||
*/
|
||||
list_for_each_entry(b, &bc->freed, list)
|
||||
if (!btree_node_reclaim(c, b)) {
|
||||
btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO);
|
||||
if (b->data)
|
||||
goto out_unlock;
|
||||
|
||||
six_unlock_write(&b->lock);
|
||||
six_unlock_intent(&b->lock);
|
||||
goto err;
|
||||
}
|
||||
|
||||
b = btree_node_mem_alloc(c, __GFP_NOWARN|GFP_NOIO);
|
||||
if (!b)
|
||||
goto err;
|
||||
|
||||
BUG_ON(!six_trylock_intent(&b->lock));
|
||||
BUG_ON(!six_trylock_write(&b->lock));
|
||||
out_unlock:
|
||||
BUG_ON(btree_node_hashed(b));
|
||||
BUG_ON(btree_node_write_in_flight(b));
|
||||
|
||||
list_del_init(&b->list);
|
||||
mutex_unlock(&bc->lock);
|
||||
out:
|
||||
b->flags = 0;
|
||||
b->written = 0;
|
||||
b->nsets = 0;
|
||||
b->sib_u64s[0] = 0;
|
||||
b->sib_u64s[1] = 0;
|
||||
b->whiteout_u64s = 0;
|
||||
b->uncompacted_whiteout_u64s = 0;
|
||||
bch2_btree_keys_init(b, &c->expensive_debug_checks);
|
||||
|
||||
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
|
||||
start_time);
|
||||
|
||||
return b;
|
||||
err:
|
||||
/* Try to cannibalize another cached btree node: */
|
||||
if (bc->alloc_lock == current) {
|
||||
b = btree_node_cannibalize(c);
|
||||
list_del_init(&b->list);
|
||||
mutex_unlock(&bc->lock);
|
||||
|
||||
bch2_btree_node_hash_remove(bc, b);
|
||||
|
||||
trace_btree_node_cannibalize(c);
|
||||
goto out;
|
||||
}
|
||||
|
||||
mutex_unlock(&bc->lock);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
/* Slowpath, don't want it inlined into btree_iter_traverse() */
|
||||
static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
|
||||
struct btree_iter *iter,
|
||||
const struct bkey_i *k,
|
||||
unsigned level,
|
||||
enum six_lock_type lock_type,
|
||||
bool sync)
|
||||
{
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
struct btree *b;
|
||||
|
||||
/*
|
||||
* Parent node must be locked, else we could read in a btree node that's
|
||||
* been freed:
|
||||
*/
|
||||
BUG_ON(!btree_node_locked(iter, level + 1));
|
||||
BUG_ON(level >= BTREE_MAX_DEPTH);
|
||||
|
||||
b = bch2_btree_node_mem_alloc(c);
|
||||
if (IS_ERR(b))
|
||||
return b;
|
||||
|
||||
bkey_copy(&b->key, k);
|
||||
if (bch2_btree_node_hash_insert(bc, b, level, iter->btree_id)) {
|
||||
/* raced with another fill: */
|
||||
|
||||
/* mark as unhashed... */
|
||||
bkey_i_to_extent(&b->key)->v._data[0] = 0;
|
||||
|
||||
mutex_lock(&bc->lock);
|
||||
list_add(&b->list, &bc->freeable);
|
||||
mutex_unlock(&bc->lock);
|
||||
|
||||
six_unlock_write(&b->lock);
|
||||
six_unlock_intent(&b->lock);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the btree node wasn't cached, we can't drop our lock on
|
||||
* the parent until after it's added to the cache - because
|
||||
* otherwise we could race with a btree_split() freeing the node
|
||||
* we're trying to lock.
|
||||
*
|
||||
* But the deadlock described below doesn't exist in this case,
|
||||
* so it's safe to not drop the parent lock until here:
|
||||
*/
|
||||
if (btree_node_read_locked(iter, level + 1))
|
||||
btree_node_unlock(iter, level + 1);
|
||||
|
||||
bch2_btree_node_read(c, b, sync);
|
||||
|
||||
six_unlock_write(&b->lock);
|
||||
|
||||
if (!sync) {
|
||||
six_unlock_intent(&b->lock);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (lock_type == SIX_LOCK_read)
|
||||
six_lock_downgrade(&b->lock);
|
||||
|
||||
return b;
|
||||
}
|
||||
|
||||
/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * If IO is necessary and running under generic_make_request, returns -EAGAIN.
 *
 * The btree node will have either a read or an intent lock held, depending on
 * @lock_type.
 */
|
||||
struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
|
||||
const struct bkey_i *k, unsigned level,
|
||||
enum six_lock_type lock_type,
|
||||
bool may_drop_locks)
|
||||
{
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
struct btree *b;
|
||||
struct bset_tree *t;
|
||||
|
||||
/*
|
||||
* XXX: locking optimization
|
||||
*
|
||||
* we can make the locking looser here - caller can drop lock on parent
|
||||
* node before locking child node (and potentially blocking): we just
|
||||
* have to have bch2_btree_node_fill() call relock on the parent and
|
||||
* return -EINTR if that fails
|
||||
*/
|
||||
EBUG_ON(!btree_node_locked(iter, level + 1));
|
||||
EBUG_ON(level >= BTREE_MAX_DEPTH);
|
||||
retry:
|
||||
rcu_read_lock();
|
||||
b = btree_cache_find(bc, k);
|
||||
rcu_read_unlock();
|
||||
|
||||
if (unlikely(!b)) {
|
||||
/*
|
||||
* We must have the parent locked to call bch2_btree_node_fill(),
|
||||
* else we could read in a btree node from disk that's been
|
||||
* freed:
|
||||
*/
|
||||
b = bch2_btree_node_fill(c, iter, k, level, lock_type, true);
|
||||
|
||||
/* We raced and found the btree node in the cache */
|
||||
if (!b)
|
||||
goto retry;
|
||||
|
||||
if (IS_ERR(b))
|
||||
return b;
|
||||
} else {
|
||||
/*
|
||||
* There's a potential deadlock with splits and insertions into
|
||||
* interior nodes we have to avoid:
|
||||
*
|
||||
* The other thread might be holding an intent lock on the node
|
||||
* we want, and they want to update its parent node so they're
|
||||
* going to upgrade their intent lock on the parent node to a
|
||||
* write lock.
|
||||
*
|
||||
* But if we're holding a read lock on the parent, and we're
|
||||
* trying to get the intent lock they're holding, we deadlock.
|
||||
*
|
||||
* So to avoid this we drop the read locks on parent nodes when
|
||||
* we're starting to take intent locks - and handle the race.
|
||||
*
|
||||
* The race is that they might be about to free the node we
|
||||
* want, and dropping our read lock on the parent node lets them
|
||||
* update the parent marking the node we want as freed, and then
|
||||
* free it:
|
||||
*
|
||||
* To guard against this, btree nodes are evicted from the cache
|
||||
* when they're freed - and PTR_HASH() is zeroed out, which we
|
||||
* check for after we lock the node.
|
||||
*
|
||||
* Then, bch2_btree_node_relock() on the parent will fail - because
|
||||
* the parent was modified, when the pointer to the node we want
|
||||
* was removed - and we'll bail out:
|
||||
*/
|
||||
if (btree_node_read_locked(iter, level + 1))
|
||||
btree_node_unlock(iter, level + 1);
|
||||
|
||||
if (!btree_node_lock(b, k->k.p, level, iter,
|
||||
lock_type, may_drop_locks))
|
||||
return ERR_PTR(-EINTR);
|
||||
|
||||
if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) ||
|
||||
b->level != level ||
|
||||
race_fault())) {
|
||||
six_unlock_type(&b->lock, lock_type);
|
||||
if (bch2_btree_node_relock(iter, level + 1))
|
||||
goto retry;
|
||||
|
||||
return ERR_PTR(-EINTR);
|
||||
}
|
||||
}
|
||||
|
||||
wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
|
||||
TASK_UNINTERRUPTIBLE);
|
||||
|
||||
prefetch(b->aux_data);
|
||||
|
||||
for_each_bset(b, t) {
|
||||
void *p = (u64 *) b->aux_data + t->aux_data_offset;
|
||||
|
||||
prefetch(p + L1_CACHE_BYTES * 0);
|
||||
prefetch(p + L1_CACHE_BYTES * 1);
|
||||
prefetch(p + L1_CACHE_BYTES * 2);
|
||||
}
|
||||
|
||||
/* avoid atomic set bit if it's not needed: */
|
||||
if (btree_node_accessed(b))
|
||||
set_btree_node_accessed(b);
|
||||
|
||||
if (unlikely(btree_node_read_error(b))) {
|
||||
six_unlock_type(&b->lock, lock_type);
|
||||
return ERR_PTR(-EIO);
|
||||
}
|
||||
|
||||
EBUG_ON(b->btree_id != iter->btree_id ||
|
||||
BTREE_NODE_LEVEL(b->data) != level ||
|
||||
bkey_cmp(b->data->max_key, k->k.p));
|
||||
|
||||
return b;
|
||||
}
|
||||
|
||||
struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
|
||||
struct btree_iter *iter,
|
||||
struct btree *b,
|
||||
bool may_drop_locks,
|
||||
enum btree_node_sibling sib)
|
||||
{
|
||||
struct btree *parent;
|
||||
struct btree_node_iter node_iter;
|
||||
struct bkey_packed *k;
|
||||
BKEY_PADDED(k) tmp;
|
||||
struct btree *ret = NULL;
|
||||
unsigned level = b->level;
|
||||
|
||||
parent = btree_iter_node(iter, level + 1);
|
||||
if (!parent)
|
||||
return NULL;
|
||||
|
||||
if (!bch2_btree_node_relock(iter, level + 1))
|
||||
goto out_upgrade;
|
||||
|
||||
node_iter = iter->l[parent->level].iter;
|
||||
|
||||
k = bch2_btree_node_iter_peek_all(&node_iter, parent);
|
||||
BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p));
|
||||
|
||||
k = sib == btree_prev_sib
|
||||
? bch2_btree_node_iter_prev(&node_iter, parent)
|
||||
: (bch2_btree_node_iter_advance(&node_iter, parent),
|
||||
bch2_btree_node_iter_peek(&node_iter, parent));
|
||||
if (!k)
|
||||
goto out;
|
||||
|
||||
bch2_bkey_unpack(parent, &tmp.k, k);
|
||||
|
||||
ret = bch2_btree_node_get(c, iter, &tmp.k, level,
|
||||
SIX_LOCK_intent, may_drop_locks);
|
||||
|
||||
if (PTR_ERR_OR_ZERO(ret) == -EINTR && may_drop_locks) {
|
||||
struct btree_iter *linked;
|
||||
|
||||
if (!bch2_btree_node_relock(iter, level + 1))
|
||||
goto out_upgrade;
|
||||
|
||||
/*
|
||||
* We might have got -EINTR because trylock failed, and we're
|
||||
* holding other locks that would cause us to deadlock:
|
||||
*/
|
||||
for_each_linked_btree_iter(iter, linked)
|
||||
if (btree_iter_cmp(iter, linked) < 0)
|
||||
__bch2_btree_iter_unlock(linked);
|
||||
|
||||
if (sib == btree_prev_sib)
|
||||
btree_node_unlock(iter, level);
|
||||
|
||||
ret = bch2_btree_node_get(c, iter, &tmp.k, level,
|
||||
SIX_LOCK_intent, may_drop_locks);
|
||||
|
||||
/*
|
||||
* before btree_iter_relock() calls btree_iter_verify_locks():
|
||||
*/
|
||||
if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
|
||||
btree_node_unlock(iter, level + 1);
|
||||
|
||||
if (!bch2_btree_node_relock(iter, level)) {
|
||||
btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
|
||||
|
||||
if (!IS_ERR(ret)) {
|
||||
six_unlock_intent(&ret->lock);
|
||||
ret = ERR_PTR(-EINTR);
|
||||
}
|
||||
}
|
||||
|
||||
bch2_btree_iter_relock(iter);
|
||||
}
|
||||
out:
|
||||
if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
|
||||
btree_node_unlock(iter, level + 1);
|
||||
|
||||
bch2_btree_iter_verify_locks(iter);
|
||||
|
||||
BUG_ON((!may_drop_locks || !IS_ERR(ret)) &&
|
||||
(iter->uptodate >= BTREE_ITER_NEED_RELOCK ||
|
||||
!btree_node_locked(iter, level)));
|
||||
|
||||
if (!IS_ERR_OR_NULL(ret)) {
|
||||
struct btree *n1 = ret, *n2 = b;
|
||||
|
||||
if (sib != btree_prev_sib)
|
||||
swap(n1, n2);
|
||||
|
||||
BUG_ON(bkey_cmp(btree_type_successor(n1->btree_id,
|
||||
n1->key.k.p),
|
||||
n2->data->min_key));
|
||||
}
|
||||
|
||||
return ret;
|
||||
out_upgrade:
|
||||
if (may_drop_locks)
|
||||
bch2_btree_iter_upgrade(iter, level + 2, true);
|
||||
ret = ERR_PTR(-EINTR);
|
||||
goto out;
|
||||
}
|
||||
|
||||
void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
|
||||
const struct bkey_i *k, unsigned level)
|
||||
{
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
struct btree *b;
|
||||
|
||||
BUG_ON(!btree_node_locked(iter, level + 1));
|
||||
BUG_ON(level >= BTREE_MAX_DEPTH);
|
||||
|
||||
rcu_read_lock();
|
||||
b = btree_cache_find(bc, k);
|
||||
rcu_read_unlock();
|
||||
|
||||
if (b)
|
||||
return;
|
||||
|
||||
bch2_btree_node_fill(c, iter, k, level, SIX_LOCK_read, false);
|
||||
}
|
||||
|
||||
int bch2_print_btree_node(struct bch_fs *c, struct btree *b,
|
||||
char *buf, size_t len)
|
||||
{
|
||||
const struct bkey_format *f = &b->format;
|
||||
struct bset_stats stats;
|
||||
char ptrs[100];
|
||||
|
||||
memset(&stats, 0, sizeof(stats));
|
||||
|
||||
bch2_val_to_text(c, BKEY_TYPE_BTREE, ptrs, sizeof(ptrs),
|
||||
bkey_i_to_s_c(&b->key));
|
||||
bch2_btree_keys_stats(b, &stats);
|
||||
|
||||
return scnprintf(buf, len,
|
||||
"l %u %llu:%llu - %llu:%llu:\n"
|
||||
" ptrs: %s\n"
|
||||
" format: u64s %u fields %u %u %u %u %u\n"
|
||||
" unpack fn len: %u\n"
|
||||
" bytes used %zu/%zu (%zu%% full)\n"
|
||||
" sib u64s: %u, %u (merge threshold %zu)\n"
|
||||
" nr packed keys %u\n"
|
||||
" nr unpacked keys %u\n"
|
||||
" floats %zu\n"
|
||||
" failed unpacked %zu\n"
|
||||
" failed prev %zu\n"
|
||||
" failed overflow %zu\n",
|
||||
b->level,
|
||||
b->data->min_key.inode,
|
||||
b->data->min_key.offset,
|
||||
b->data->max_key.inode,
|
||||
b->data->max_key.offset,
|
||||
ptrs,
|
||||
f->key_u64s,
|
||||
f->bits_per_field[0],
|
||||
f->bits_per_field[1],
|
||||
f->bits_per_field[2],
|
||||
f->bits_per_field[3],
|
||||
f->bits_per_field[4],
|
||||
b->unpack_fn_len,
|
||||
b->nr.live_u64s * sizeof(u64),
|
||||
btree_bytes(c) - sizeof(struct btree_node),
|
||||
b->nr.live_u64s * 100 / btree_max_u64s(c),
|
||||
b->sib_u64s[0],
|
||||
b->sib_u64s[1],
|
||||
BTREE_FOREGROUND_MERGE_THRESHOLD(c),
|
||||
b->nr.packed_keys,
|
||||
b->nr.unpacked_keys,
|
||||
stats.floats,
|
||||
stats.failed_unpacked,
|
||||
stats.failed_prev,
|
||||
stats.failed_overflow);
|
||||
}
91
fs/bcachefs/btree_cache.h
Normal file
@ -0,0 +1,91 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_BTREE_CACHE_H
|
||||
#define _BCACHEFS_BTREE_CACHE_H
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "btree_types.h"
|
||||
#include "extents.h"
|
||||
|
||||
struct btree_iter;
|
||||
|
||||
extern const char * const bch2_btree_ids[];
|
||||
|
||||
void bch2_recalc_btree_reserve(struct bch_fs *);
|
||||
|
||||
void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
|
||||
int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
|
||||
int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
|
||||
unsigned, enum btree_id);
|
||||
|
||||
void bch2_btree_cache_cannibalize_unlock(struct bch_fs *);
|
||||
int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
|
||||
|
||||
struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
|
||||
|
||||
struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
|
||||
const struct bkey_i *, unsigned,
|
||||
enum six_lock_type, bool);
|
||||
|
||||
struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
|
||||
struct btree *, bool,
|
||||
enum btree_node_sibling);
|
||||
|
||||
void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *,
|
||||
const struct bkey_i *, unsigned);
|
||||
|
||||
void bch2_fs_btree_cache_exit(struct bch_fs *);
|
||||
int bch2_fs_btree_cache_init(struct bch_fs *);
|
||||
void bch2_fs_btree_cache_init_early(struct btree_cache *);
|
||||
|
||||
#define PTR_HASH(_k) (bkey_i_to_extent_c(_k)->v._data[0])
|
||||
|
||||
/* is btree node in hash table? */
|
||||
static inline bool btree_node_hashed(struct btree *b)
|
||||
{
|
||||
return bkey_extent_is_data(&b->key.k) && PTR_HASH(&b->key);
|
||||
}
|
||||
|
||||
#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \
|
||||
for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \
|
||||
&(_c)->btree_cache.table), \
|
||||
_iter = 0; _iter < (_tbl)->size; _iter++) \
|
||||
rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash)

static inline size_t btree_bytes(struct bch_fs *c)
{
	return c->opts.btree_node_size << 9;
}

static inline size_t btree_max_u64s(struct bch_fs *c)
{
	return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64);
}

static inline size_t btree_page_order(struct bch_fs *c)
{
	return get_order(btree_bytes(c));
}

static inline size_t btree_pages(struct bch_fs *c)
{
	return 1 << btree_page_order(c);
}

static inline unsigned btree_blocks(struct bch_fs *c)
{
	return c->opts.btree_node_size >> c->block_bits;
}
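
To make these helpers concrete, one illustrative configuration (the numbers are an assumption for the example, not defaults taken from this commit): btree_node_size == 512 (it is shifted by 9, so it is in 512-byte sectors, i.e. 256KiB nodes), 4KiB pages, and block_bits == 3 (4KiB blocks):

	btree_bytes(c)      = 512 << 9          = 262144 bytes (256KiB)
	btree_max_u64s(c)   = (262144 - sizeof(struct btree_node)) / 8
	btree_page_order(c) = get_order(262144) = 6    (64 pages of 4KiB)
	btree_pages(c)      = 1 << 6            = 64
	btree_blocks(c)     = 512 >> 3          = 64   (4KiB blocks per node)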
|
||||
|
||||
#define BTREE_SPLIT_THRESHOLD(c) (btree_blocks(c) * 3 / 4)
|
||||
|
||||
#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3)
|
||||
#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \
|
||||
(BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \
|
||||
(BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2))
|
||||
|
||||
#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->btree_id].b)
|
||||
|
||||
int bch2_print_btree_node(struct bch_fs *, struct btree *,
|
||||
char *, size_t);
|
||||
|
||||
#endif /* _BCACHEFS_BTREE_CACHE_H */
1099
fs/bcachefs/btree_gc.c
Normal file
File diff suppressed because it is too large
113
fs/bcachefs/btree_gc.h
Normal file
@ -0,0 +1,113 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_BTREE_GC_H
|
||||
#define _BCACHEFS_BTREE_GC_H
|
||||
|
||||
#include "btree_types.h"
|
||||
|
||||
enum bkey_type;
|
||||
|
||||
void bch2_coalesce(struct bch_fs *);
|
||||
void bch2_gc(struct bch_fs *);
|
||||
void bch2_gc_thread_stop(struct bch_fs *);
|
||||
int bch2_gc_thread_start(struct bch_fs *);
|
||||
int bch2_initial_gc(struct bch_fs *, struct list_head *);
|
||||
u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *, struct bkey_s_c);
|
||||
int bch2_btree_mark_key_initial(struct bch_fs *, enum bkey_type,
|
||||
struct bkey_s_c);
|
||||
void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned);
|
||||
|
||||
/*
|
||||
* For concurrent mark and sweep (with other index updates), we define a total
|
||||
* ordering of _all_ references GC walks:
|
||||
*
|
||||
* Note that some references will have the same GC position as others - e.g.
|
||||
* everything within the same btree node; in those cases we're relying on
|
||||
* whatever locking exists for where those references live, i.e. the write lock
|
||||
* on a btree node.
|
||||
*
|
||||
* That locking is also required to ensure GC doesn't pass the updater in
|
||||
* between the updater adding/removing the reference and updating the GC marks;
|
||||
* without that, we would at best double count sometimes.
|
||||
*
|
||||
* That part is important - whenever calling bch2_mark_pointers(), a lock _must_
|
||||
* be held that prevents GC from passing the position the updater is at.
|
||||
*
|
||||
* (What about the start of gc, when we're clearing all the marks? GC clears the
|
||||
* mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc
|
||||
* position inside its cmpxchg loop, so crap magically works).
|
||||
*/
|
||||
|
||||
/* Position of (the start of) a gc phase: */
|
||||
static inline struct gc_pos gc_phase(enum gc_phase phase)
|
||||
{
|
||||
return (struct gc_pos) {
|
||||
.phase = phase,
|
||||
.pos = POS_MIN,
|
||||
.level = 0,
|
||||
};
|
||||
}
|
||||
|
||||
static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
|
||||
{
|
||||
if (l.phase != r.phase)
|
||||
return l.phase < r.phase ? -1 : 1;
|
||||
if (bkey_cmp(l.pos, r.pos))
|
||||
return bkey_cmp(l.pos, r.pos);
|
||||
if (l.level != r.level)
|
||||
return l.level < r.level ? -1 : 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline struct gc_pos gc_pos_btree(enum btree_id id,
|
||||
struct bpos pos, unsigned level)
|
||||
{
|
||||
return (struct gc_pos) {
|
||||
.phase = GC_PHASE_BTREE_EXTENTS + id,
|
||||
.pos = pos,
|
||||
.level = level,
|
||||
};
|
||||
}
|
||||
|
||||
/*
|
||||
* GC position of the pointers within a btree node: note, _not_ for &b->key
|
||||
* itself, that lives in the parent node:
|
||||
*/
|
||||
static inline struct gc_pos gc_pos_btree_node(struct btree *b)
|
||||
{
|
||||
return gc_pos_btree(b->btree_id, b->key.k.p, b->level);
|
||||
}
|
||||
|
||||
/*
|
||||
* GC position of the pointer to a btree root: we don't use
|
||||
* gc_pos_pointer_to_btree_node() here to avoid a potential race with
|
||||
* btree_split() increasing the tree depth - the new root will have level > the
|
||||
* old root and thus have a greater gc position than the old root, but that
|
||||
* would be incorrect since once gc has marked the root it's not coming back.
|
||||
*/
|
||||
static inline struct gc_pos gc_pos_btree_root(enum btree_id id)
|
||||
{
|
||||
return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH);
|
||||
}
|
||||
|
||||
static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob)
|
||||
{
|
||||
return (struct gc_pos) {
|
||||
.phase = GC_PHASE_ALLOC,
|
||||
.pos = POS(ob ? ob - c->open_buckets : 0, 0),
|
||||
};
|
||||
}
|
||||
|
||||
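/*
 * Returns true if gc's current position is still strictly before @pos, i.e.
 * this pass of gc has not yet marked the references at @pos:
 */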
static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos)
{
	unsigned seq;
	bool ret;

	do {
		seq = read_seqcount_begin(&c->gc_pos_lock);
		ret = gc_pos_cmp(c->gc_pos, pos) < 0;
	} while (read_seqcount_retry(&c->gc_pos_lock, seq));

	return ret;
}
|
||||
|
||||
#endif /* _BCACHEFS_BTREE_GC_H */
2095
fs/bcachefs/btree_io.c
Normal file
File diff suppressed because it is too large
197
fs/bcachefs/btree_io.h
Normal file
@ -0,0 +1,197 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_BTREE_IO_H
|
||||
#define _BCACHEFS_BTREE_IO_H
|
||||
|
||||
#include "bset.h"
|
||||
#include "extents.h"
|
||||
#include "io_types.h"
|
||||
|
||||
struct bch_fs;
|
||||
struct btree_write;
|
||||
struct btree;
|
||||
struct btree_iter;
|
||||
|
||||
struct btree_read_bio {
|
||||
struct bch_fs *c;
|
||||
u64 start_time;
|
||||
unsigned have_ioref:1;
|
||||
struct extent_pick_ptr pick;
|
||||
struct work_struct work;
|
||||
struct bio bio;
|
||||
};
|
||||
|
||||
struct btree_write_bio {
|
||||
void *data;
|
||||
struct work_struct work;
|
||||
struct bch_write_bio wbio;
|
||||
};
|
||||
|
||||
static inline void btree_node_io_unlock(struct btree *b)
|
||||
{
|
||||
EBUG_ON(!btree_node_write_in_flight(b));
|
||||
clear_btree_node_write_in_flight(b);
|
||||
wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
|
||||
}
|
||||
|
||||
static inline void btree_node_io_lock(struct btree *b)
|
||||
{
|
||||
wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
|
||||
TASK_UNINTERRUPTIBLE);
|
||||
}
|
||||
|
||||
static inline void btree_node_wait_on_io(struct btree *b)
|
||||
{
|
||||
wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
|
||||
TASK_UNINTERRUPTIBLE);
|
||||
}
|
||||
|
||||
static inline bool btree_node_may_write(struct btree *b)
|
||||
{
|
||||
return list_empty_careful(&b->write_blocked) &&
|
||||
!b->will_make_reachable;
|
||||
}
|
||||
|
||||
enum compact_mode {
|
||||
COMPACT_LAZY,
|
||||
COMPACT_WRITTEN,
|
||||
COMPACT_WRITTEN_NO_WRITE_LOCK,
|
||||
};
|
||||
|
||||
bool __bch2_compact_whiteouts(struct bch_fs *, struct btree *, enum compact_mode);
|
||||
|
||||
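/*
 * Compaction heuristic: a bset is worth lazily compacting once more than a
 * third of its u64s (and at least 128 u64s, i.e. 1KiB) are dead:
 */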
static inline unsigned should_compact_bset_lazy(struct btree *b, struct bset_tree *t)
{
	unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s);
	unsigned dead_u64s = bset_u64s - b->nr.bset_u64s[t - b->set];

	return dead_u64s > 128 && dead_u64s * 3 > bset_u64s;
}
|
||||
|
||||
static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b)
|
||||
{
|
||||
struct bset_tree *t;
|
||||
|
||||
for_each_bset(b, t)
|
||||
if (should_compact_bset_lazy(b, t))
|
||||
return __bch2_compact_whiteouts(c, b, COMPACT_LAZY);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
|
||||
|
||||
void bch2_btree_build_aux_trees(struct btree *);
|
||||
void bch2_btree_init_next(struct bch_fs *, struct btree *,
|
||||
struct btree_iter *);
|
||||
|
||||
int bch2_btree_node_read_done(struct bch_fs *, struct btree *, bool);
|
||||
void bch2_btree_node_read(struct bch_fs *, struct btree *, bool);
|
||||
int bch2_btree_root_read(struct bch_fs *, enum btree_id,
|
||||
const struct bkey_i *, unsigned);
|
||||
|
||||
void bch2_btree_complete_write(struct bch_fs *, struct btree *,
|
||||
struct btree_write *);
|
||||
void bch2_btree_write_error_work(struct work_struct *);
|
||||
|
||||
void __bch2_btree_node_write(struct bch_fs *, struct btree *,
|
||||
enum six_lock_type);
|
||||
bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
|
||||
|
||||
void bch2_btree_node_write(struct bch_fs *, struct btree *,
|
||||
enum six_lock_type);
|
||||
|
||||
/*
|
||||
* btree_node_dirty() can be cleared with only a read lock,
|
||||
* and for bch2_btree_node_write_cond() we want to set need_write iff it's
|
||||
* still dirty:
|
||||
*/
|
||||
static inline void set_btree_node_need_write_if_dirty(struct btree *b)
|
||||
{
|
||||
unsigned long old, new, v = READ_ONCE(b->flags);
|
||||
|
||||
do {
|
||||
old = new = v;
|
||||
|
||||
if (!(old & (1 << BTREE_NODE_dirty)))
|
||||
return;
|
||||
|
||||
new |= (1 << BTREE_NODE_need_write);
|
||||
} while ((v = cmpxchg(&b->flags, old, new)) != old);
|
||||
}
|
||||
|
||||
#define bch2_btree_node_write_cond(_c, _b, cond) \
|
||||
do { \
|
||||
while ((_b)->written && btree_node_dirty(_b) && (cond)) { \
|
||||
if (!btree_node_may_write(_b)) { \
|
||||
set_btree_node_need_write_if_dirty(_b); \
|
||||
break; \
|
||||
} \
|
||||
\
|
||||
if (!btree_node_write_in_flight(_b)) { \
|
||||
bch2_btree_node_write(_c, _b, SIX_LOCK_read); \
|
||||
break; \
|
||||
} \
|
||||
\
|
||||
six_unlock_read(&(_b)->lock); \
|
||||
btree_node_wait_on_io(_b); \
|
||||
btree_node_lock_type(c, b, SIX_LOCK_read); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
void bch2_btree_flush_all_reads(struct bch_fs *);
|
||||
void bch2_btree_flush_all_writes(struct bch_fs *);
|
||||
void bch2_btree_verify_flushed(struct bch_fs *);
|
||||
ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *);
|
||||
|
||||
/* Sorting */
|
||||
|
||||
struct btree_node_iter_large {
|
||||
u8 is_extents;
|
||||
u16 used;
|
||||
|
||||
struct btree_node_iter_set data[MAX_BSETS];
|
||||
};
|
||||
|
||||
static inline void
|
||||
__bch2_btree_node_iter_large_init(struct btree_node_iter_large *iter,
|
||||
bool is_extents)
|
||||
{
|
||||
iter->used = 0;
|
||||
iter->is_extents = is_extents;
|
||||
}
|
||||
|
||||
void bch2_btree_node_iter_large_advance(struct btree_node_iter_large *,
|
||||
struct btree *);
|
||||
|
||||
void bch2_btree_node_iter_large_push(struct btree_node_iter_large *,
|
||||
struct btree *,
|
||||
const struct bkey_packed *,
|
||||
const struct bkey_packed *);
|
||||
|
||||
static inline bool bch2_btree_node_iter_large_end(struct btree_node_iter_large *iter)
|
||||
{
|
||||
return !iter->used;
|
||||
}
|
||||
|
||||
static inline struct bkey_packed *
|
||||
bch2_btree_node_iter_large_peek_all(struct btree_node_iter_large *iter,
|
||||
struct btree *b)
|
||||
{
|
||||
return bch2_btree_node_iter_large_end(iter)
|
||||
? NULL
|
||||
: __btree_node_offset_to_key(b, iter->data->k);
|
||||
}
|
||||
|
||||
static inline struct bkey_packed *
|
||||
bch2_btree_node_iter_large_next_all(struct btree_node_iter_large *iter,
|
||||
struct btree *b)
|
||||
{
|
||||
struct bkey_packed *ret = bch2_btree_node_iter_large_peek_all(iter, b);
|
||||
|
||||
if (ret)
|
||||
bch2_btree_node_iter_large_advance(iter, b);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
#endif /* _BCACHEFS_BTREE_IO_H */
1844
fs/bcachefs/btree_iter.c
Normal file
File diff suppressed because it is too large
314
fs/bcachefs/btree_iter.h
Normal file
@ -0,0 +1,314 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_BTREE_ITER_H
|
||||
#define _BCACHEFS_BTREE_ITER_H
|
||||
|
||||
#include "btree_types.h"
|
||||
|
||||
static inline void btree_iter_set_dirty(struct btree_iter *iter,
|
||||
enum btree_iter_uptodate u)
|
||||
{
|
||||
iter->uptodate = max_t(unsigned, iter->uptodate, u);
|
||||
}
|
||||
|
||||
static inline struct btree *btree_iter_node(struct btree_iter *iter,
|
||||
unsigned level)
|
||||
{
|
||||
return level < BTREE_MAX_DEPTH ? iter->l[level].b : NULL;
|
||||
}
|
||||
|
||||
static inline struct btree *btree_node_parent(struct btree_iter *iter,
|
||||
struct btree *b)
|
||||
{
|
||||
return btree_iter_node(iter, b->level + 1);
|
||||
}
|
||||
|
||||
static inline bool btree_iter_linked(const struct btree_iter *iter)
|
||||
{
|
||||
return iter->next != iter;
|
||||
}
|
||||
|
||||
static inline bool __iter_has_node(const struct btree_iter *iter,
|
||||
const struct btree *b)
|
||||
{
|
||||
/*
|
||||
* We don't compare the low bits of the lock sequence numbers because
|
||||
* @iter might have taken a write lock on @b, and we don't want to skip
|
||||
* the linked iterator if the sequence numbers were equal before taking
|
||||
* that write lock. The lock sequence number is incremented by taking
|
||||
* and releasing write locks and is even when unlocked:
|
||||
*/
|
||||
|
||||
return iter->l[b->level].b == b &&
|
||||
iter->lock_seq[b->level] >> 1 == b->lock.state.seq >> 1;
|
||||
}
|
||||
|
||||
static inline struct btree_iter *
|
||||
__next_linked_iter(struct btree_iter *iter, struct btree_iter *linked)
|
||||
{
|
||||
return linked->next != iter ? linked->next : NULL;
|
||||
}
|
||||
|
||||
static inline struct btree_iter *
|
||||
__next_iter_with_node(struct btree_iter *iter, struct btree *b,
|
||||
struct btree_iter *linked)
|
||||
{
|
||||
while (linked && !__iter_has_node(linked, b))
|
||||
linked = __next_linked_iter(iter, linked);
|
||||
|
||||
return linked;
|
||||
}
|
||||
|
||||
/**
|
||||
* for_each_btree_iter - iterate over all iterators linked with @_iter,
|
||||
* including @_iter
|
||||
*/
|
||||
#define for_each_btree_iter(_iter, _linked) \
|
||||
for ((_linked) = (_iter); (_linked); \
|
||||
(_linked) = __next_linked_iter(_iter, _linked))
|
||||
|
||||
/**
|
||||
* for_each_btree_iter_with_node - iterate over all iterators linked with @_iter
|
||||
* that also point to @_b
|
||||
*
|
||||
* @_b is assumed to be locked by @_iter
|
||||
*
|
||||
* Filters out iterators that don't have a valid btree_node iterator for @_b -
|
||||
* i.e. iterators for which bch2_btree_node_relock() would not succeed.
|
||||
*/
|
||||
#define for_each_btree_iter_with_node(_iter, _b, _linked) \
|
||||
for ((_linked) = (_iter); \
|
||||
((_linked) = __next_iter_with_node(_iter, _b, _linked)); \
|
||||
(_linked) = __next_linked_iter(_iter, _linked))
|
||||
|
||||
/**
|
||||
* for_each_linked_btree_iter - iterate over all iterators linked with @_iter,
|
||||
* _not_ including @_iter
|
||||
*/
|
||||
#define for_each_linked_btree_iter(_iter, _linked) \
|
||||
for ((_linked) = (_iter)->next; \
|
||||
(_linked) != (_iter); \
|
||||
(_linked) = (_linked)->next)
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
void bch2_btree_iter_verify(struct btree_iter *, struct btree *);
|
||||
void bch2_btree_iter_verify_locks(struct btree_iter *);
|
||||
#else
|
||||
static inline void bch2_btree_iter_verify(struct btree_iter *iter,
|
||||
struct btree *b) {}
|
||||
static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {}
|
||||
#endif
|
||||
|
||||
void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *,
|
||||
struct btree_node_iter *, struct bset_tree *,
|
||||
struct bkey_packed *, unsigned, unsigned);
|
||||
|
||||
int bch2_btree_iter_unlock(struct btree_iter *);
|
||||
|
||||
bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned);
|
||||
bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned);
|
||||
|
||||
static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter,
|
||||
unsigned new_locks_want,
|
||||
bool may_drop_locks)
|
||||
{
|
||||
new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
|
||||
|
||||
return iter->locks_want < new_locks_want
|
||||
? (may_drop_locks
|
||||
? __bch2_btree_iter_upgrade(iter, new_locks_want)
|
||||
: __bch2_btree_iter_upgrade_nounlock(iter, new_locks_want))
|
||||
: iter->uptodate <= BTREE_ITER_NEED_PEEK;
|
||||
}
|
||||
|
||||
void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned);
|
||||
|
||||
static inline void bch2_btree_iter_downgrade(struct btree_iter *iter)
|
||||
{
|
||||
if (iter->locks_want > (iter->flags & BTREE_ITER_INTENT) ? 1 : 0)
|
||||
__bch2_btree_iter_downgrade(iter, 0);
|
||||
}
|
||||
|
||||
void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *);
|
||||
void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *);
|
||||
|
||||
void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *);
|
||||
|
||||
int __must_check bch2_btree_iter_traverse(struct btree_iter *);
|
||||
|
||||
struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
|
||||
struct btree *bch2_btree_iter_next_node(struct btree_iter *, unsigned);
|
||||
|
||||
struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *);
|
||||
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
|
||||
struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
|
||||
|
||||
struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);
|
||||
struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *);
|
||||
|
||||
void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos);
|
||||
void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos);
|
||||
|
||||
void __bch2_btree_iter_init(struct btree_iter *, struct bch_fs *,
|
||||
enum btree_id, struct bpos,
|
||||
unsigned , unsigned, unsigned);
|
||||
|
||||
static inline void bch2_btree_iter_init(struct btree_iter *iter,
|
||||
struct bch_fs *c, enum btree_id btree_id,
|
||||
struct bpos pos, unsigned flags)
|
||||
{
|
||||
__bch2_btree_iter_init(iter, c, btree_id, pos,
|
||||
flags & BTREE_ITER_INTENT ? 1 : 0, 0,
|
||||
(btree_id == BTREE_ID_EXTENTS
|
||||
? BTREE_ITER_IS_EXTENTS : 0)|flags);
|
||||
}
|
||||
|
||||
void bch2_btree_iter_link(struct btree_iter *, struct btree_iter *);
|
||||
void bch2_btree_iter_unlink(struct btree_iter *);
|
||||
void bch2_btree_iter_copy(struct btree_iter *, struct btree_iter *);
|
||||
|
||||
static inline struct bpos btree_type_successor(enum btree_id id,
|
||||
struct bpos pos)
|
||||
{
|
||||
if (id == BTREE_ID_INODES) {
|
||||
pos.inode++;
|
||||
pos.offset = 0;
|
||||
} else if (id != BTREE_ID_EXTENTS) {
|
||||
pos = bkey_successor(pos);
|
||||
}
|
||||
|
||||
return pos;
|
||||
}
|
||||
|
||||
static inline struct bpos btree_type_predecessor(enum btree_id id,
|
||||
struct bpos pos)
|
||||
{
|
||||
if (id == BTREE_ID_INODES) {
|
||||
--pos.inode;
|
||||
pos.offset = 0;
|
||||
} else /* if (id != BTREE_ID_EXTENTS) */ {
|
||||
pos = bkey_predecessor(pos);
|
||||
}
|
||||
|
||||
return pos;
|
||||
}
|
||||
|
||||
static inline int __btree_iter_cmp(enum btree_id id,
|
||||
struct bpos pos,
|
||||
const struct btree_iter *r)
|
||||
{
|
||||
if (id != r->btree_id)
|
||||
return id < r->btree_id ? -1 : 1;
|
||||
return bkey_cmp(pos, r->pos);
|
||||
}
|
||||
|
||||
static inline int btree_iter_cmp(const struct btree_iter *l,
|
||||
const struct btree_iter *r)
|
||||
{
|
||||
return __btree_iter_cmp(l->btree_id, l->pos, r);
|
||||
}
|
||||
|
||||
/*
|
||||
* Unlocks before scheduling
|
||||
* Note: does not revalidate iterator
|
||||
*/
|
||||
static inline void bch2_btree_iter_cond_resched(struct btree_iter *iter)
|
||||
{
|
||||
if (need_resched()) {
|
||||
bch2_btree_iter_unlock(iter);
|
||||
schedule();
|
||||
} else if (race_fault()) {
|
||||
bch2_btree_iter_unlock(iter);
|
||||
}
|
||||
}
|
||||
|
||||
#define __for_each_btree_node(_iter, _c, _btree_id, _start, \
|
||||
_locks_want, _depth, _flags, _b) \
|
||||
for (__bch2_btree_iter_init((_iter), (_c), (_btree_id), _start, \
|
||||
_locks_want, _depth, \
|
||||
_flags|BTREE_ITER_NODES), \
|
||||
_b = bch2_btree_iter_peek_node(_iter); \
|
||||
(_b); \
|
||||
(_b) = bch2_btree_iter_next_node(_iter, _depth))
|
||||
|
||||
#define for_each_btree_node(_iter, _c, _btree_id, _start, _flags, _b) \
|
||||
__for_each_btree_node(_iter, _c, _btree_id, _start, 0, 0, _flags, _b)
|
||||
|
||||
static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter,
|
||||
unsigned flags)
|
||||
{
|
||||
return flags & BTREE_ITER_SLOTS
|
||||
? bch2_btree_iter_peek_slot(iter)
|
||||
: bch2_btree_iter_peek(iter);
|
||||
}
|
||||
|
||||
static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter,
|
||||
unsigned flags)
|
||||
{
|
||||
bch2_btree_iter_cond_resched(iter);
|
||||
|
||||
return flags & BTREE_ITER_SLOTS
|
||||
? bch2_btree_iter_next_slot(iter)
|
||||
: bch2_btree_iter_next(iter);
|
||||
}
|
||||
|
||||
#define for_each_btree_key(_iter, _c, _btree_id, _start, _flags, _k)	\
	for (bch2_btree_iter_init((_iter), (_c), (_btree_id),		\
				  (_start), (_flags)),			\
	     (_k) = __bch2_btree_iter_peek(_iter, _flags);		\
	     !IS_ERR_OR_NULL((_k).k);					\
	     (_k) = __bch2_btree_iter_next(_iter, _flags))
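
A minimal sketch of walking a btree with this macro (illustrative only; it assumes a struct bch_fs *c and elides the error handling a real caller would add):

	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, 0, k) {
		/* inspect k.k / k.v here */
	}
	ret = bch2_btree_iter_unlock(&iter);

The loop terminates once the iterator yields a NULL or error key (the IS_ERR_OR_NULL check above); bch2_btree_iter_unlock() drops the remaining node locks, and its int return is presumably how callers pick up an error that cut the walk short (an assumption, since btree_iter.c's diff is suppressed above).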
|
||||
|
||||
#define for_each_btree_key_continue(_iter, _flags, _k) \
|
||||
for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \
|
||||
!IS_ERR_OR_NULL((_k).k); \
|
||||
(_k) = __bch2_btree_iter_next(_iter, _flags))
|
||||
|
||||
static inline int btree_iter_err(struct bkey_s_c k)
|
||||
{
|
||||
return PTR_ERR_OR_ZERO(k.k);
|
||||
}
|
||||
|
||||
/* new multiple iterator interface: */
|
||||
|
||||
int bch2_trans_preload_iters(struct btree_trans *);
|
||||
void bch2_trans_iter_free(struct btree_trans *,
|
||||
struct btree_iter *);
|
||||
|
||||
struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id,
|
||||
struct bpos, unsigned, u64);
|
||||
struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *,
|
||||
struct btree_iter *, u64);
|
||||
|
||||
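/*
 * The 64 bit id passed to __bch2_trans_get_iter()/__bch2_trans_copy_iter()
 * below identifies the call site: the caller's return address ends up in the
 * high 32 bits and the current instruction pointer in the low 32 bits (the
 * first shift of the zero-initialized ret is a no-op):
 */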
static __always_inline u64 __btree_iter_id(void)
{
	u64 ret = 0;

	ret <<= 32;
	ret |= _RET_IP_ & U32_MAX;
	ret <<= 32;
	ret |= _THIS_IP_ & U32_MAX;
	return ret;
}
|
||||
|
||||
static __always_inline struct btree_iter *
|
||||
bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id,
|
||||
struct bpos pos, unsigned flags)
|
||||
{
|
||||
return __bch2_trans_get_iter(trans, btree_id, pos, flags,
|
||||
__btree_iter_id());
|
||||
}
|
||||
|
||||
static __always_inline struct btree_iter *
|
||||
bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src)
|
||||
{
|
||||
|
||||
return __bch2_trans_copy_iter(trans, src, __btree_iter_id());
|
||||
}
|
||||
|
||||
void *bch2_trans_kmalloc(struct btree_trans *, size_t);
|
||||
int bch2_trans_unlock(struct btree_trans *);
|
||||
void bch2_trans_begin(struct btree_trans *);
|
||||
void bch2_trans_init(struct btree_trans *, struct bch_fs *);
|
||||
int bch2_trans_exit(struct btree_trans *);
|
||||
|
||||
#endif /* _BCACHEFS_BTREE_ITER_H */
196
fs/bcachefs/btree_locking.h
Normal file
@ -0,0 +1,196 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_BTREE_LOCKING_H
|
||||
#define _BCACHEFS_BTREE_LOCKING_H
|
||||
|
||||
/*
|
||||
* Only for internal btree use:
|
||||
*
|
||||
* The btree iterator tracks what locks it wants to take, and what locks it
|
||||
* currently has - here we have wrappers for locking/unlocking btree nodes and
|
||||
* updating the iterator state
|
||||
*/
|
||||
|
||||
#include "btree_iter.h"
|
||||
#include "btree_io.h"
|
||||
#include "six.h"
|
||||
|
||||
/* matches six lock types */
|
||||
enum btree_node_locked_type {
|
||||
BTREE_NODE_UNLOCKED = -1,
|
||||
BTREE_NODE_READ_LOCKED = SIX_LOCK_read,
|
||||
BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent,
|
||||
};
|
||||
|
||||
static inline int btree_node_locked_type(struct btree_iter *iter,
|
||||
unsigned level)
|
||||
{
|
||||
/*
|
||||
* We're relying on the fact that if nodes_intent_locked is set
|
||||
* nodes_locked must be set as well, so that we can compute without
|
||||
* branches:
|
||||
*/
|
||||
return BTREE_NODE_UNLOCKED +
|
||||
((iter->nodes_locked >> level) & 1) +
|
||||
((iter->nodes_intent_locked >> level) & 1);
|
||||
}
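/*
 * Worked example of the branchless computation above: given the invariant
 * that an intent lock also sets the corresponding nodes_locked bit,
 *
 *	locked bit	intent bit	result
 *	    0		    0		-1  BTREE_NODE_UNLOCKED
 *	    1		    0		 0  BTREE_NODE_READ_LOCKED
 *	    1		    1		 1  BTREE_NODE_INTENT_LOCKED
 */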
|
||||
|
||||
static inline bool btree_node_intent_locked(struct btree_iter *iter,
|
||||
unsigned level)
|
||||
{
|
||||
return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED;
|
||||
}
|
||||
|
||||
static inline bool btree_node_read_locked(struct btree_iter *iter,
|
||||
unsigned level)
|
||||
{
|
||||
return btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED;
|
||||
}
|
||||
|
||||
static inline bool btree_node_locked(struct btree_iter *iter, unsigned level)
|
||||
{
|
||||
return iter->nodes_locked & (1 << level);
|
||||
}
|
||||
|
||||
static inline void mark_btree_node_unlocked(struct btree_iter *iter,
|
||||
unsigned level)
|
||||
{
|
||||
iter->nodes_locked &= ~(1 << level);
|
||||
iter->nodes_intent_locked &= ~(1 << level);
|
||||
}
|
||||
|
||||
static inline void mark_btree_node_locked(struct btree_iter *iter,
|
||||
unsigned level,
|
||||
enum six_lock_type type)
|
||||
{
|
||||
/* relying on this to avoid a branch */
|
||||
BUILD_BUG_ON(SIX_LOCK_read != 0);
|
||||
BUILD_BUG_ON(SIX_LOCK_intent != 1);
|
||||
|
||||
iter->nodes_locked |= 1 << level;
|
||||
iter->nodes_intent_locked |= type << level;
|
||||
}
|
||||
|
||||
static inline void mark_btree_node_intent_locked(struct btree_iter *iter,
|
||||
unsigned level)
|
||||
{
|
||||
mark_btree_node_locked(iter, level, SIX_LOCK_intent);
|
||||
}
|
||||
|
||||
static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level)
|
||||
{
|
||||
return level < iter->locks_want
|
||||
? SIX_LOCK_intent
|
||||
: SIX_LOCK_read;
|
||||
}
|
||||
|
||||
static inline enum btree_node_locked_type
|
||||
btree_lock_want(struct btree_iter *iter, int level)
|
||||
{
|
||||
if (level < iter->level)
|
||||
return BTREE_NODE_UNLOCKED;
|
||||
if (level < iter->locks_want)
|
||||
return BTREE_NODE_INTENT_LOCKED;
|
||||
if (level == iter->level)
|
||||
return BTREE_NODE_READ_LOCKED;
|
||||
return BTREE_NODE_UNLOCKED;
|
||||
}
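/*
 * Example (sketch): with iter->level == 1 and iter->locks_want == 1,
 * btree_lock_want() returns BTREE_NODE_UNLOCKED for level 0,
 * BTREE_NODE_READ_LOCKED for level 1 and BTREE_NODE_UNLOCKED above;
 * raising locks_want to 2 turns level 1 into BTREE_NODE_INTENT_LOCKED.
 */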
|
||||
|
||||
static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
|
||||
{
|
||||
int lock_type = btree_node_locked_type(iter, level);
|
||||
|
||||
EBUG_ON(level >= BTREE_MAX_DEPTH);
|
||||
|
||||
if (lock_type != BTREE_NODE_UNLOCKED)
|
||||
six_unlock_type(&iter->l[level].b->lock, lock_type);
|
||||
mark_btree_node_unlocked(iter, level);
|
||||
}
|
||||
|
||||
static inline void __bch2_btree_iter_unlock(struct btree_iter *iter)
|
||||
{
|
||||
btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
|
||||
|
||||
while (iter->nodes_locked)
|
||||
btree_node_unlock(iter, __ffs(iter->nodes_locked));
|
||||
}
|
||||
|
||||
static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
|
||||
{
|
||||
switch (type) {
|
||||
case SIX_LOCK_read:
|
||||
return BCH_TIME_btree_lock_contended_read;
|
||||
case SIX_LOCK_intent:
|
||||
return BCH_TIME_btree_lock_contended_intent;
|
||||
case SIX_LOCK_write:
|
||||
return BCH_TIME_btree_lock_contended_write;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* wrapper around six locks that just traces lock contended time
|
||||
*/
|
||||
static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b,
|
||||
enum six_lock_type type)
|
||||
{
|
||||
u64 start_time = local_clock();
|
||||
|
||||
six_lock_type(&b->lock, type, NULL, NULL);
|
||||
bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time);
|
||||
}
|
||||
|
||||
static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b,
|
||||
enum six_lock_type type)
|
||||
{
|
||||
if (!six_trylock_type(&b->lock, type))
|
||||
__btree_node_lock_type(c, b, type);
|
||||
}
|
||||
|
||||
bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned,
|
||||
struct btree_iter *, enum six_lock_type, bool);
|
||||
|
||||
static inline bool btree_node_lock(struct btree *b, struct bpos pos,
|
||||
unsigned level,
|
||||
struct btree_iter *iter,
|
||||
enum six_lock_type type,
|
||||
bool may_drop_locks)
|
||||
{
|
||||
EBUG_ON(level >= BTREE_MAX_DEPTH);
|
||||
|
||||
return likely(six_trylock_type(&b->lock, type)) ||
|
||||
__bch2_btree_node_lock(b, pos, level, iter,
|
||||
type, may_drop_locks);
|
||||
}
|
||||
|
||||
bool __bch2_btree_node_relock(struct btree_iter *, unsigned);
|
||||
|
||||
static inline bool bch2_btree_node_relock(struct btree_iter *iter,
|
||||
unsigned level)
|
||||
{
|
||||
EBUG_ON(btree_node_locked(iter, level) &&
|
||||
btree_node_locked_type(iter, level) !=
|
||||
__btree_lock_want(iter, level));
|
||||
|
||||
return likely(btree_node_locked(iter, level)) ||
|
||||
__bch2_btree_node_relock(iter, level);
|
||||
}
|
||||
|
||||
bool bch2_btree_iter_relock(struct btree_iter *);
|
||||
|
||||
void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
|
||||
|
||||
void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
|
||||
|
||||
static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
|
||||
{
|
||||
EBUG_ON(iter->l[b->level].b != b);
|
||||
EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq);
|
||||
|
||||
if (!six_trylock_write(&b->lock))
|
||||
__bch2_btree_node_lock_write(b, iter);
|
||||
}
|
||||
|
||||
#endif /* _BCACHEFS_BTREE_LOCKING_H */
|
||||
|
||||
|
479
fs/bcachefs/btree_types.h
Normal file
@ -0,0 +1,479 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_BTREE_TYPES_H
|
||||
#define _BCACHEFS_BTREE_TYPES_H
|
||||
|
||||
#include <linux/list.h>
|
||||
#include <linux/rhashtable.h>
|
||||
|
||||
#include "bkey_methods.h"
|
||||
#include "journal_types.h"
|
||||
#include "six.h"
|
||||
|
||||
struct open_bucket;
|
||||
struct btree_update;
|
||||
|
||||
#define MAX_BSETS 3U
|
||||
|
||||
struct btree_nr_keys {
|
||||
|
||||
/*
|
||||
* Amount of live metadata (i.e. size of node after a compaction) in
|
||||
* units of u64s
|
||||
*/
|
||||
u16 live_u64s;
|
||||
u16 bset_u64s[MAX_BSETS];
|
||||
|
||||
/* live keys only: */
|
||||
u16 packed_keys;
|
||||
u16 unpacked_keys;
|
||||
};
|
||||
|
||||
struct bset_tree {
|
||||
/*
|
||||
* We construct a binary tree in an array as if the array
|
||||
* started at 1, so that things line up on the same cachelines
|
||||
* better: see comments in bset.c at cacheline_to_bkey() for
|
||||
* details
|
||||
*/
|
||||
|
||||
/* size of the binary tree and prev array */
|
||||
u16 size;
|
||||
|
||||
/* function of size - precalculated for to_inorder() */
|
||||
u16 extra;
|
||||
|
||||
u16 data_offset;
|
||||
u16 aux_data_offset;
|
||||
u16 end_offset;
|
||||
|
||||
struct bpos max_key;
|
||||
};
|
||||
|
||||
struct btree_write {
|
||||
struct journal_entry_pin journal;
|
||||
struct closure_waitlist wait;
|
||||
};
|
||||
|
||||
struct btree_ob_ref {
|
||||
u8 nr;
|
||||
u8 refs[BCH_REPLICAS_MAX];
|
||||
};
|
||||
|
||||
struct btree_alloc {
|
||||
struct btree_ob_ref ob;
|
||||
BKEY_PADDED(k);
|
||||
};
|
||||
|
||||
struct btree {
|
||||
/* Hottest entries first */
|
||||
struct rhash_head hash;
|
||||
|
||||
/* Key/pointer for this btree node */
|
||||
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
|
||||
|
||||
struct six_lock lock;
|
||||
|
||||
unsigned long flags;
|
||||
u16 written;
|
||||
u8 level;
|
||||
u8 btree_id;
|
||||
u8 nsets;
|
||||
u8 nr_key_bits;
|
||||
|
||||
struct bkey_format format;
|
||||
|
||||
struct btree_node *data;
|
||||
void *aux_data;
|
||||
|
||||
/*
|
||||
* Sets of sorted keys - the real btree node - plus a binary search tree
|
||||
*
|
||||
* set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
|
||||
* to the memory we have allocated for this btree node. Additionally,
|
||||
* set[0]->data points to the entire btree node as it exists on disk.
|
||||
*/
|
||||
struct bset_tree set[MAX_BSETS];
|
||||
|
||||
struct btree_nr_keys nr;
|
||||
u16 sib_u64s[2];
|
||||
u16 whiteout_u64s;
|
||||
u16 uncompacted_whiteout_u64s;
|
||||
u8 page_order;
|
||||
u8 unpack_fn_len;
|
||||
|
||||
/*
|
||||
* XXX: add a delete sequence number, so when bch2_btree_node_relock()
|
||||
* fails because the lock sequence number has changed - i.e. the
|
||||
* contents were modified - we can still relock the node if it's still
|
||||
* the one we want, without redoing the traversal
|
||||
*/
|
||||
|
||||
/*
|
||||
* For asynchronous splits/interior node updates:
|
||||
* When we do a split, we allocate new child nodes and update the parent
|
||||
* node to point to them: we update the parent in memory immediately,
|
||||
* but then we must wait until the children have been written out before
|
||||
* the update to the parent can be written - this is a list of the
|
||||
* btree_updates that are blocking this node from being
|
||||
* written:
|
||||
*/
|
||||
struct list_head write_blocked;
|
||||
|
||||
/*
|
||||
* Also for asynchronous splits/interior node updates:
|
||||
* If a btree node isn't reachable yet, we don't want to kick off
|
||||
* another write - because that write also won't yet be reachable and
|
||||
* marking it as completed before it's reachable would be incorrect:
|
||||
*/
|
||||
unsigned long will_make_reachable;
|
||||
|
||||
struct btree_ob_ref ob;
|
||||
|
||||
/* lru list */
|
||||
struct list_head list;
|
||||
|
||||
struct btree_write writes[2];
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
bool *expensive_debug_checks;
|
||||
#endif
|
||||
};
|
||||
|
||||
struct btree_cache {
|
||||
struct rhashtable table;
|
||||
bool table_init_done;
|
||||
/*
|
||||
* We never free a struct btree, except on shutdown - we just put it on
|
||||
* the btree_cache_freed list and reuse it later. This simplifies the
|
||||
* code, and it doesn't cost us much memory as the memory usage is
|
||||
* dominated by buffers that hold the actual btree node data and those
|
||||
* can be freed - and the number of struct btrees allocated is
|
||||
* effectively bounded.
|
||||
*
|
||||
* btree_cache_freeable effectively is a small cache - we use it because
|
||||
* high order page allocations can be rather expensive, and it's quite
|
||||
* common to delete and allocate btree nodes in quick succession. It
|
||||
* should never grow past ~2-3 nodes in practice.
|
||||
*/
|
||||
struct mutex lock;
|
||||
struct list_head live;
|
||||
struct list_head freeable;
|
||||
struct list_head freed;
|
||||
|
||||
/* Number of elements in live + freeable lists */
|
||||
unsigned used;
|
||||
unsigned reserve;
|
||||
struct shrinker shrink;
|
||||
|
||||
/*
|
||||
* If we need to allocate memory for a new btree node and that
|
||||
* allocation fails, we can cannibalize another node in the btree cache
|
||||
* to satisfy the allocation - lock to guarantee only one thread does
|
||||
* this at a time:
|
||||
*/
|
||||
struct task_struct *alloc_lock;
|
||||
struct closure_waitlist alloc_wait;
|
||||
};
|
||||
|
||||
struct btree_node_iter {
|
||||
u8 is_extents;
|
||||
|
||||
struct btree_node_iter_set {
|
||||
u16 k, end;
|
||||
} data[MAX_BSETS];
|
||||
};
|
||||
|
||||
enum btree_iter_type {
|
||||
BTREE_ITER_KEYS,
|
||||
BTREE_ITER_SLOTS,
|
||||
BTREE_ITER_NODES,
|
||||
};
|
||||
|
||||
#define BTREE_ITER_TYPE ((1 << 2) - 1)
|
||||
|
||||
#define BTREE_ITER_INTENT (1 << 2)
|
||||
#define BTREE_ITER_PREFETCH (1 << 3)
|
||||
/*
|
||||
* Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
|
||||
* @pos or the first key strictly greater than @pos
|
||||
*/
|
||||
#define BTREE_ITER_IS_EXTENTS (1 << 4)
|
||||
/*
|
||||
* indicates we need to call bch2_btree_iter_traverse() to revalidate iterator:
|
||||
*/
|
||||
#define BTREE_ITER_AT_END_OF_LEAF (1 << 5)
|
||||
#define BTREE_ITER_ERROR (1 << 6)
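/*
 * Example (sketch): the iterator type and flags are ORed together when an
 * iterator is initialized - e.g. an update path that wants to see empty
 * slots might pass BTREE_ITER_SLOTS|BTREE_ITER_INTENT, and a long
 * sequential scan might add BTREE_ITER_PREFETCH.
 */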
|
||||
|
||||
enum btree_iter_uptodate {
|
||||
BTREE_ITER_UPTODATE = 0,
|
||||
BTREE_ITER_NEED_PEEK = 1,
|
||||
BTREE_ITER_NEED_RELOCK = 2,
|
||||
BTREE_ITER_NEED_TRAVERSE = 3,
|
||||
};
|
||||
|
||||
/*
|
||||
* @pos - iterator's current position
|
||||
* @level - current btree depth
|
||||
* @locks_want - btree level below which we start taking intent locks
|
||||
* @nodes_locked - bitmask indicating which nodes in @nodes are locked
|
||||
* @nodes_intent_locked - bitmask indicating which locks are intent locks
|
||||
*/
|
||||
struct btree_iter {
|
||||
struct bch_fs *c;
|
||||
struct bpos pos;
|
||||
|
||||
u8 flags;
|
||||
enum btree_iter_uptodate uptodate:4;
|
||||
enum btree_id btree_id:4;
|
||||
unsigned level:4,
|
||||
locks_want:4,
|
||||
nodes_locked:4,
|
||||
nodes_intent_locked:4;
|
||||
|
||||
struct btree_iter_level {
|
||||
struct btree *b;
|
||||
struct btree_node_iter iter;
|
||||
} l[BTREE_MAX_DEPTH];
|
||||
|
||||
u32 lock_seq[BTREE_MAX_DEPTH];
|
||||
|
||||
/*
|
||||
* Current unpacked key - so that bch2_btree_iter_next()/
|
||||
* bch2_btree_iter_next_slot() can correctly advance pos.
|
||||
*/
|
||||
struct bkey k;
|
||||
|
||||
/*
|
||||
* Circular linked list of linked iterators: linked iterators share
|
||||
* locks (e.g. two linked iterators may have the same node intent
|
||||
* locked, or read and write locked, at the same time), and insertions
|
||||
* through one iterator won't invalidate the other linked iterators.
|
||||
*/
|
||||
|
||||
/* Must come last: */
|
||||
struct btree_iter *next;
|
||||
};
|
||||
|
||||
#define BTREE_ITER_MAX 8
|
||||
|
||||
struct btree_insert_entry {
|
||||
struct btree_iter *iter;
|
||||
struct bkey_i *k;
|
||||
unsigned extra_res;
|
||||
/*
|
||||
* true if entire key was inserted - can only be false for
|
||||
* extents
|
||||
*/
|
||||
bool done;
|
||||
};
|
||||
|
||||
struct btree_trans {
|
||||
struct bch_fs *c;
|
||||
|
||||
u8 nr_iters;
|
||||
u8 iters_live;
|
||||
u8 iters_linked;
|
||||
u8 nr_updates;
|
||||
|
||||
unsigned mem_top;
|
||||
unsigned mem_bytes;
|
||||
void *mem;
|
||||
|
||||
struct btree_iter *iters;
|
||||
u64 iter_ids[BTREE_ITER_MAX];
|
||||
|
||||
struct btree_insert_entry updates[BTREE_ITER_MAX];
|
||||
|
||||
struct btree_iter iters_onstack[2];
|
||||
};
|
||||
|
||||
#define BTREE_FLAG(flag) \
|
||||
static inline bool btree_node_ ## flag(struct btree *b) \
|
||||
{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
|
||||
\
|
||||
static inline void set_btree_node_ ## flag(struct btree *b) \
|
||||
{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \
|
||||
\
|
||||
static inline void clear_btree_node_ ## flag(struct btree *b) \
|
||||
{ clear_bit(BTREE_NODE_ ## flag, &b->flags); }
|
||||
|
||||
enum btree_flags {
|
||||
BTREE_NODE_read_in_flight,
|
||||
BTREE_NODE_read_error,
|
||||
BTREE_NODE_dirty,
|
||||
BTREE_NODE_need_write,
|
||||
BTREE_NODE_noevict,
|
||||
BTREE_NODE_write_idx,
|
||||
BTREE_NODE_accessed,
|
||||
BTREE_NODE_write_in_flight,
|
||||
BTREE_NODE_just_written,
|
||||
BTREE_NODE_dying,
|
||||
BTREE_NODE_fake,
|
||||
};
|
||||
|
||||
BTREE_FLAG(read_in_flight);
|
||||
BTREE_FLAG(read_error);
|
||||
BTREE_FLAG(dirty);
|
||||
BTREE_FLAG(need_write);
|
||||
BTREE_FLAG(noevict);
|
||||
BTREE_FLAG(write_idx);
|
||||
BTREE_FLAG(accessed);
|
||||
BTREE_FLAG(write_in_flight);
|
||||
BTREE_FLAG(just_written);
|
||||
BTREE_FLAG(dying);
|
||||
BTREE_FLAG(fake);
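/*
 * Example (sketch): BTREE_FLAG(dirty) above generates the three accessors
 * used throughout the btree code:
 *
 *	btree_node_dirty(b)		test_bit(BTREE_NODE_dirty, &b->flags)
 *	set_btree_node_dirty(b)		set_bit(BTREE_NODE_dirty, &b->flags)
 *	clear_btree_node_dirty(b)	clear_bit(BTREE_NODE_dirty, &b->flags)
 */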
|
||||
|
||||
static inline struct btree_write *btree_current_write(struct btree *b)
|
||||
{
|
||||
return b->writes + btree_node_write_idx(b);
|
||||
}
|
||||
|
||||
static inline struct btree_write *btree_prev_write(struct btree *b)
|
||||
{
|
||||
return b->writes + (btree_node_write_idx(b) ^ 1);
|
||||
}
|
||||
|
||||
static inline struct bset_tree *bset_tree_last(struct btree *b)
|
||||
{
|
||||
EBUG_ON(!b->nsets);
|
||||
return b->set + b->nsets - 1;
|
||||
}
|
||||
|
||||
static inline struct bset *bset(const struct btree *b,
|
||||
const struct bset_tree *t)
|
||||
{
|
||||
return (void *) b->data + t->data_offset * sizeof(u64);
|
||||
}
|
||||
|
||||
static inline struct bset *btree_bset_first(struct btree *b)
|
||||
{
|
||||
return bset(b, b->set);
|
||||
}
|
||||
|
||||
static inline struct bset *btree_bset_last(struct btree *b)
|
||||
{
|
||||
return bset(b, bset_tree_last(b));
|
||||
}
|
||||
|
||||
static inline u16
|
||||
__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k)
|
||||
{
|
||||
size_t ret = (u64 *) k - (u64 *) b->data - 1;
|
||||
|
||||
EBUG_ON(ret > U16_MAX);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline struct bkey_packed *
|
||||
__btree_node_offset_to_key(const struct btree *b, u16 k)
|
||||
{
|
||||
return (void *) ((u64 *) b->data + k + 1);
|
||||
}
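/*
 * Example (sketch): the two helpers above are inverses - for any packed key
 * k in node b,
 *
 *	__btree_node_offset_to_key(b, __btree_node_key_to_offset(b, k)) == k
 *
 * offsets are counted in u64s from b->data, biased by one u64.
 */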
|
||||
|
||||
#define btree_bkey_first(_b, _t) (bset(_b, _t)->start)
|
||||
|
||||
#define btree_bkey_last(_b, _t) \
|
||||
({ \
|
||||
EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \
|
||||
vstruct_last(bset(_b, _t))); \
|
||||
\
|
||||
__btree_node_offset_to_key(_b, (_t)->end_offset); \
|
||||
})
|
||||
|
||||
static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
|
||||
{
|
||||
t->end_offset =
|
||||
__btree_node_key_to_offset(b, vstruct_last(bset(b, t)));
|
||||
btree_bkey_last(b, t);
|
||||
}
|
||||
|
||||
static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
|
||||
const struct bset *i)
|
||||
{
|
||||
t->data_offset = (u64 *) i - (u64 *) b->data;
|
||||
|
||||
EBUG_ON(bset(b, t) != i);
|
||||
|
||||
set_btree_bset_end(b, t);
|
||||
}
|
||||
|
||||
static inline unsigned bset_byte_offset(struct btree *b, void *i)
|
||||
{
|
||||
return i - (void *) b->data;
|
||||
}
|
||||
|
||||
/* Type of keys @b contains: */
|
||||
static inline enum bkey_type btree_node_type(struct btree *b)
|
||||
{
|
||||
return b->level ? BKEY_TYPE_BTREE : b->btree_id;
|
||||
}
|
||||
|
||||
static inline const struct bkey_ops *btree_node_ops(struct btree *b)
|
||||
{
|
||||
return &bch2_bkey_ops[btree_node_type(b)];
|
||||
}
|
||||
|
||||
static inline bool btree_node_has_ptrs(struct btree *b)
|
||||
{
|
||||
return btree_type_has_ptrs(btree_node_type(b));
|
||||
}
|
||||
|
||||
static inline bool btree_node_is_extents(struct btree *b)
|
||||
{
|
||||
return btree_node_type(b) == BKEY_TYPE_EXTENTS;
|
||||
}
|
||||
|
||||
struct btree_root {
|
||||
struct btree *b;
|
||||
|
||||
struct btree_update *as;
|
||||
|
||||
/* On disk root - see async splits: */
|
||||
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
|
||||
u8 level;
|
||||
u8 alive;
|
||||
};
|
||||
|
||||
/*
|
||||
* Optional hook that will be called just prior to a btree node update, when
|
||||
* we're holding the write lock and we know what key is about to be overwritten:
|
||||
*/
|
||||
|
||||
struct btree_iter;
|
||||
struct btree_node_iter;
|
||||
|
||||
enum btree_insert_ret {
|
||||
BTREE_INSERT_OK,
|
||||
/* extent spanned multiple leaf nodes: have to traverse to next node: */
|
||||
BTREE_INSERT_NEED_TRAVERSE,
|
||||
/* write lock held for too long */
|
||||
BTREE_INSERT_NEED_RESCHED,
|
||||
/* leaf node needs to be split */
|
||||
BTREE_INSERT_BTREE_NODE_FULL,
|
||||
BTREE_INSERT_JOURNAL_RES_FULL,
|
||||
BTREE_INSERT_ENOSPC,
|
||||
BTREE_INSERT_NEED_GC_LOCK,
|
||||
};
|
||||
|
||||
struct extent_insert_hook {
|
||||
enum btree_insert_ret
|
||||
(*fn)(struct extent_insert_hook *, struct bpos, struct bpos,
|
||||
struct bkey_s_c, const struct bkey_i *);
|
||||
};
|
||||
|
||||
enum btree_gc_coalesce_fail_reason {
|
||||
BTREE_GC_COALESCE_FAIL_RESERVE_GET,
|
||||
BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
|
||||
BTREE_GC_COALESCE_FAIL_FORMAT_FITS,
|
||||
};
|
||||
|
||||
enum btree_node_sibling {
|
||||
btree_prev_sib,
|
||||
btree_next_sib,
|
||||
};
|
||||
|
||||
typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *,
|
||||
struct btree *,
|
||||
struct btree_node_iter *);
|
||||
|
||||
#endif /* _BCACHEFS_BTREE_TYPES_H */
|
168
fs/bcachefs/btree_update.h
Normal file
@ -0,0 +1,168 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_BTREE_UPDATE_H
|
||||
#define _BCACHEFS_BTREE_UPDATE_H
|
||||
|
||||
#include "btree_iter.h"
|
||||
#include "journal.h"
|
||||
|
||||
struct bch_fs;
|
||||
struct btree;
|
||||
struct btree_insert;
|
||||
|
||||
void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *,
|
||||
struct btree_iter *);
|
||||
bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
|
||||
struct btree_node_iter *, struct bkey_i *);
|
||||
void bch2_btree_journal_key(struct btree_insert *trans, struct btree_iter *,
|
||||
struct bkey_i *);
|
||||
|
||||
/* Normal update interface: */
|
||||
|
||||
struct btree_insert {
|
||||
struct bch_fs *c;
|
||||
struct disk_reservation *disk_res;
|
||||
struct journal_res journal_res;
|
||||
u64 *journal_seq;
|
||||
struct extent_insert_hook *hook;
|
||||
unsigned flags;
|
||||
bool did_work;
|
||||
|
||||
unsigned short nr;
|
||||
struct btree_insert_entry *entries;
|
||||
};
|
||||
|
||||
int __bch2_btree_insert_at(struct btree_insert *);
|
||||
|
||||
#define BTREE_INSERT_ENTRY(_iter, _k) \
|
||||
((struct btree_insert_entry) { \
|
||||
.iter = (_iter), \
|
||||
.k = (_k), \
|
||||
.done = false, \
|
||||
})
|
||||
|
||||
#define BTREE_INSERT_ENTRY_EXTRA_RES(_iter, _k, _extra) \
|
||||
((struct btree_insert_entry) { \
|
||||
.iter = (_iter), \
|
||||
.k = (_k), \
|
||||
.extra_res = (_extra), \
|
||||
.done = false, \
|
||||
})
|
||||
|
||||
/**
|
||||
* bch2_btree_insert_at - insert one or more keys at iterator positions
|
||||
* @iter: btree iterator
|
||||
* @insert_key: key to insert
|
||||
* @disk_res: disk reservation
|
||||
* @hook: extent insert callback
|
||||
*
|
||||
* Return values:
|
||||
* -EINTR: locking changed, this function should be called again. Only returned
|
||||
* if passed BTREE_INSERT_ATOMIC.
|
||||
* -EROFS: filesystem read only
|
||||
* -EIO: journal or btree node IO error
|
||||
*/
|
||||
#define bch2_btree_insert_at(_c, _disk_res, _hook, \
|
||||
_journal_seq, _flags, ...) \
|
||||
__bch2_btree_insert_at(&(struct btree_insert) { \
|
||||
.c = (_c), \
|
||||
.disk_res = (_disk_res), \
|
||||
.journal_seq = (_journal_seq), \
|
||||
.hook = (_hook), \
|
||||
.flags = (_flags), \
|
||||
.nr = COUNT_ARGS(__VA_ARGS__), \
|
||||
.entries = (struct btree_insert_entry[]) { \
|
||||
__VA_ARGS__ \
|
||||
}})
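/*
 * Example (illustrative sketch): inserting a single key through the wrapper
 * above, with no disk reservation or insert hook; iter is assumed to be
 * positioned at bkey_start_pos(&k->k):
 *
 *	ret = bch2_btree_insert_at(c, NULL, NULL, &journal_seq,
 *				   BTREE_INSERT_NOFAIL,
 *				   BTREE_INSERT_ENTRY(&iter, k));
 */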
|
||||
|
||||
enum {
|
||||
__BTREE_INSERT_ATOMIC,
|
||||
__BTREE_INSERT_NOUNLOCK,
|
||||
__BTREE_INSERT_NOFAIL,
|
||||
__BTREE_INSERT_USE_RESERVE,
|
||||
__BTREE_INSERT_USE_ALLOC_RESERVE,
|
||||
__BTREE_INSERT_JOURNAL_REPLAY,
|
||||
__BTREE_INSERT_NOWAIT,
|
||||
__BTREE_INSERT_GC_LOCK_HELD,
|
||||
__BCH_HASH_SET_MUST_CREATE,
|
||||
__BCH_HASH_SET_MUST_REPLACE,
|
||||
};
|
||||
|
||||
/*
|
||||
* Don't drop/retake locks before doing btree update, instead return -EINTR if
|
||||
* we had to drop locks for any reason
|
||||
*/
|
||||
#define BTREE_INSERT_ATOMIC (1 << __BTREE_INSERT_ATOMIC)
|
||||
|
||||
/*
|
||||
* Don't drop locks _after_ successfully updating btree:
|
||||
*/
|
||||
#define BTREE_INSERT_NOUNLOCK (1 << __BTREE_INSERT_NOUNLOCK)
|
||||
|
||||
/* Don't check for -ENOSPC: */
|
||||
#define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL)
|
||||
|
||||
/* for copygc, or when merging btree nodes */
|
||||
#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE)
|
||||
#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE)
|
||||
|
||||
/*
|
||||
* Insert is for journal replay: don't get journal reservations, or mark extents
|
||||
* (bch2_mark_key)
|
||||
*/
|
||||
#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY)
|
||||
|
||||
/* Don't block on allocation failure (for new btree nodes): */
|
||||
#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT)
|
||||
#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD)
|
||||
|
||||
#define BCH_HASH_SET_MUST_CREATE (1 << __BCH_HASH_SET_MUST_CREATE)
|
||||
#define BCH_HASH_SET_MUST_REPLACE (1 << __BCH_HASH_SET_MUST_REPLACE)
|
||||
|
||||
int bch2_btree_delete_at(struct btree_iter *, unsigned);
|
||||
|
||||
int bch2_btree_insert_list_at(struct btree_iter *, struct keylist *,
|
||||
struct disk_reservation *,
|
||||
struct extent_insert_hook *, u64 *, unsigned);
|
||||
|
||||
int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
|
||||
struct disk_reservation *,
|
||||
struct extent_insert_hook *, u64 *, int flags);
|
||||
|
||||
int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
|
||||
struct bpos, struct bpos, struct bversion,
|
||||
struct disk_reservation *,
|
||||
struct extent_insert_hook *, u64 *);
|
||||
|
||||
int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
|
||||
__le64, unsigned);
|
||||
int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
|
||||
struct btree *, struct bkey_i_extent *);
|
||||
|
||||
/* new transactional interface: */
|
||||
|
||||
void bch2_trans_update(struct btree_trans *, struct btree_iter *,
|
||||
struct bkey_i *, unsigned);
|
||||
int bch2_trans_commit(struct btree_trans *,
|
||||
struct disk_reservation *,
|
||||
struct extent_insert_hook *,
|
||||
u64 *, unsigned);
|
||||
|
||||
#define bch2_trans_do(_c, _journal_seq, _flags, _do) \
|
||||
({ \
|
||||
struct btree_trans trans; \
|
||||
int _ret; \
|
||||
\
|
||||
bch2_trans_init(&trans, (_c)); \
|
||||
\
|
||||
do { \
|
||||
bch2_trans_begin(&trans); \
|
||||
\
|
||||
_ret = (_do) ?: bch2_trans_commit(&trans, NULL, NULL, \
|
||||
(_journal_seq), (_flags)); \
|
||||
} while (_ret == -EINTR); \
|
||||
\
|
||||
bch2_trans_exit(&trans); \
|
||||
_ret; \
|
||||
})
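/*
 * Example (illustrative sketch): bch2_trans_do() wraps the begin/commit/
 * retry-on--EINTR loop around a caller supplied expression that references
 * the local 'trans'; do_one_update() is a placeholder that queues updates
 * with bch2_trans_update():
 *
 *	ret = bch2_trans_do(c, &journal_seq, BTREE_INSERT_ATOMIC,
 *			    do_one_update(&trans, inum));
 */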
|
||||
|
||||
#endif /* _BCACHEFS_BTREE_UPDATE_H */
|
2171
fs/bcachefs/btree_update_interior.c
Normal file
File diff suppressed because it is too large
374
fs/bcachefs/btree_update_interior.h
Normal file
@ -0,0 +1,374 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H
|
||||
#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H
|
||||
|
||||
#include "btree_cache.h"
|
||||
#include "btree_locking.h"
|
||||
#include "btree_update.h"
|
||||
|
||||
struct btree_reserve {
|
||||
struct disk_reservation disk_res;
|
||||
unsigned nr;
|
||||
struct btree *b[BTREE_RESERVE_MAX];
|
||||
};
|
||||
|
||||
void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *);
|
||||
bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *,
|
||||
struct bkey_format *);
|
||||
|
||||
/* Btree node freeing/allocation: */
|
||||
|
||||
/*
|
||||
* Tracks a btree node that has been (or is about to be) freed in memory, but
|
||||
* has _not_ yet been freed on disk (because the write that makes the new
|
||||
* node(s) visible and frees the old hasn't completed yet)
|
||||
*/
|
||||
struct pending_btree_node_free {
|
||||
bool index_update_done;
|
||||
|
||||
__le64 seq;
|
||||
enum btree_id btree_id;
|
||||
unsigned level;
|
||||
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
|
||||
};
|
||||
|
||||
/*
|
||||
* Tracks an in progress split/rewrite of a btree node and the update to the
|
||||
* parent node:
|
||||
*
|
||||
* When we split/rewrite a node, we do all the updates in memory without
|
||||
* waiting for any writes to complete - we allocate the new node(s) and update
|
||||
* the parent node, possibly recursively up to the root.
|
||||
*
|
||||
* The end result is that we have one or more new nodes being written -
|
||||
* possibly several, if there were multiple splits - and then a write (updating
|
||||
* an interior node) which will make all these new nodes visible.
|
||||
*
|
||||
* Additionally, as we split/rewrite nodes we free the old nodes - but the old
|
||||
* nodes can't be freed (their space on disk can't be reclaimed) until the
|
||||
* update to the interior node that makes the new node visible completes -
|
||||
* until then, the old nodes are still reachable on disk.
|
||||
*
|
||||
*/
|
||||
struct btree_update {
|
||||
struct closure cl;
|
||||
struct bch_fs *c;
|
||||
|
||||
struct list_head list;
|
||||
|
||||
/* What kind of update are we doing? */
|
||||
enum {
|
||||
BTREE_INTERIOR_NO_UPDATE,
|
||||
BTREE_INTERIOR_UPDATING_NODE,
|
||||
BTREE_INTERIOR_UPDATING_ROOT,
|
||||
BTREE_INTERIOR_UPDATING_AS,
|
||||
} mode;
|
||||
|
||||
unsigned must_rewrite:1;
|
||||
unsigned nodes_written:1;
|
||||
|
||||
enum btree_id btree_id;
|
||||
|
||||
struct btree_reserve *reserve;
|
||||
|
||||
/*
|
||||
* BTREE_INTERIOR_UPDATING_NODE:
|
||||
* The update that made the new nodes visible was a regular update to an
|
||||
* existing interior node - @b. We can't write out the update to @b
|
||||
* until the new nodes we created are finished writing, so we block @b
|
||||
* from writing by putting this btree_interior update on the
|
||||
* @b->write_blocked list with @write_blocked_list:
|
||||
*/
|
||||
struct btree *b;
|
||||
struct list_head write_blocked_list;
|
||||
|
||||
/*
|
||||
* BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now
|
||||
* we're blocking another btree_update
|
||||
* @parent_as - btree_update that's waiting on our nodes to finish
|
||||
* writing, before it can make new nodes visible on disk
|
||||
* @wait - list of child btree_updates that are waiting on this
|
||||
* btree_update to make all the new nodes visible before they can free
|
||||
* their old btree nodes
|
||||
*/
|
||||
struct btree_update *parent_as;
|
||||
struct closure_waitlist wait;
|
||||
|
||||
/*
|
||||
* We may be freeing nodes that were dirty, and thus had journal entries
|
||||
* pinned: we need to transfer the oldest of those pins to the
|
||||
* btree_update operation, and release it when the new node(s)
|
||||
* are all persistent and reachable:
|
||||
*/
|
||||
struct journal_entry_pin journal;
|
||||
|
||||
u64 journal_seq;
|
||||
|
||||
/*
|
||||
* Nodes being freed:
|
||||
* Protected by c->btree_node_pending_free_lock
|
||||
*/
|
||||
struct pending_btree_node_free pending[BTREE_MAX_DEPTH + GC_MERGE_NODES];
|
||||
unsigned nr_pending;
|
||||
|
||||
/* New nodes, that will be made reachable by this update: */
|
||||
struct btree *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES];
|
||||
unsigned nr_new_nodes;
|
||||
|
||||
/* Only here to reduce stack usage on recursive splits: */
|
||||
struct keylist parent_keys;
|
||||
/*
|
||||
* Enough room for btree_split's keys without realloc - btree node
|
||||
* pointers never have crc/compression info, so we only need to account
|
||||
* for the pointers for three keys
|
||||
*/
|
||||
u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
|
||||
};
|
||||
|
||||
#define for_each_pending_btree_node_free(c, as, p) \
|
||||
list_for_each_entry(as, &c->btree_interior_update_list, list) \
|
||||
for (p = as->pending; p < as->pending + as->nr_pending; p++)
|
||||
|
||||
void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *,
|
||||
struct btree_iter *);
|
||||
void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *);
|
||||
void bch2_btree_open_bucket_put(struct bch_fs *, struct btree *);
|
||||
|
||||
struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
|
||||
struct btree *,
|
||||
struct bkey_format);
|
||||
|
||||
void bch2_btree_update_done(struct btree_update *);
|
||||
struct btree_update *
|
||||
bch2_btree_update_start(struct bch_fs *, enum btree_id, unsigned,
|
||||
unsigned, struct closure *);
|
||||
|
||||
void bch2_btree_interior_update_will_free_node(struct btree_update *,
|
||||
struct btree *);
|
||||
|
||||
void bch2_btree_insert_node(struct btree_update *, struct btree *,
|
||||
struct btree_iter *, struct keylist *,
|
||||
unsigned);
|
||||
int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned);
|
||||
|
||||
void __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *,
|
||||
unsigned, unsigned, enum btree_node_sibling);
|
||||
|
||||
static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c,
|
||||
struct btree_iter *iter,
|
||||
unsigned level, unsigned flags,
|
||||
enum btree_node_sibling sib)
|
||||
{
|
||||
struct btree *b;
|
||||
|
||||
/*
|
||||
* iterators are inconsistent when they hit end of leaf, until
|
||||
* traversed again
|
||||
*
|
||||
* XXX inconsistent how?
|
||||
*/
|
||||
if (iter->flags & BTREE_ITER_AT_END_OF_LEAF)
|
||||
return;
|
||||
|
||||
if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE)
|
||||
return;
|
||||
|
||||
if (!bch2_btree_node_relock(iter, level))
|
||||
return;
|
||||
|
||||
b = iter->l[level].b;
|
||||
if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
|
||||
return;
|
||||
|
||||
__bch2_foreground_maybe_merge(c, iter, level, flags, sib);
|
||||
}
|
||||
|
||||
static inline void bch2_foreground_maybe_merge(struct bch_fs *c,
|
||||
struct btree_iter *iter,
|
||||
unsigned level,
|
||||
unsigned flags)
|
||||
{
|
||||
bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
|
||||
btree_prev_sib);
|
||||
bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
|
||||
btree_next_sib);
|
||||
}
|
||||
|
||||
void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
|
||||
void bch2_btree_root_alloc(struct bch_fs *, enum btree_id);
|
||||
|
||||
static inline unsigned btree_update_reserve_required(struct bch_fs *c,
|
||||
struct btree *b)
|
||||
{
|
||||
unsigned depth = btree_node_root(c, b)->level + 1;
|
||||
|
||||
/*
|
||||
* Number of nodes we might have to allocate in a worst case btree
|
||||
* split operation - we split all the way up to the root, then allocate
|
||||
* a new root, unless we're already at max depth:
|
||||
*/
|
||||
if (depth < BTREE_MAX_DEPTH)
|
||||
return (depth - b->level) * 2 + 1;
|
||||
else
|
||||
return (depth - b->level) * 2 - 1;
|
||||
}
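/*
 * Worked example (sketch): with the root at level 2 (depth == 3) and a
 * split starting from a leaf (b->level == 0), the worst case is
 * (3 - 0) * 2 + 1 = 7 reserved nodes: two at each of levels 0-2 plus a
 * new root.
 */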
|
||||
|
||||
static inline void btree_node_reset_sib_u64s(struct btree *b)
|
||||
{
|
||||
b->sib_u64s[0] = b->nr.live_u64s;
|
||||
b->sib_u64s[1] = b->nr.live_u64s;
|
||||
}
|
||||
|
||||
static inline void *btree_data_end(struct bch_fs *c, struct btree *b)
|
||||
{
|
||||
return (void *) b->data + btree_bytes(c);
|
||||
}
|
||||
|
||||
static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c,
|
||||
struct btree *b)
|
||||
{
|
||||
return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
|
||||
}
|
||||
|
||||
static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c,
|
||||
struct btree *b)
|
||||
{
|
||||
return btree_data_end(c, b);
|
||||
}
|
||||
|
||||
static inline void *write_block(struct btree *b)
|
||||
{
|
||||
return (void *) b->data + (b->written << 9);
|
||||
}
|
||||
|
||||
static inline bool bset_written(struct btree *b, struct bset *i)
|
||||
{
|
||||
return (void *) i < write_block(b);
|
||||
}
|
||||
|
||||
static inline bool bset_unwritten(struct btree *b, struct bset *i)
|
||||
{
|
||||
return (void *) i > write_block(b);
|
||||
}
|
||||
|
||||
static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
|
||||
struct btree *b,
|
||||
void *end)
|
||||
{
|
||||
ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
|
||||
b->whiteout_u64s +
|
||||
b->uncompacted_whiteout_u64s;
|
||||
ssize_t total = c->opts.btree_node_size << 6;
|
||||
|
||||
return total - used;
|
||||
}
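/*
 * Note (sketch, assuming btree_node_size is in 512 byte sectors): the shift
 * above converts sectors to u64s - 512 / sizeof(u64) == 64, i.e. << 6.
 */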
|
||||
|
||||
static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
|
||||
struct btree *b)
|
||||
{
|
||||
ssize_t remaining = __bch_btree_u64s_remaining(c, b,
|
||||
btree_bkey_last(b, bset_tree_last(b)));
|
||||
|
||||
BUG_ON(remaining < 0);
|
||||
|
||||
if (bset_written(b, btree_bset_last(b)))
|
||||
return 0;
|
||||
|
||||
return remaining;
|
||||
}
|
||||
|
||||
static inline unsigned btree_write_set_buffer(struct btree *b)
|
||||
{
|
||||
/*
|
||||
* Could buffer up larger amounts of keys for btrees with larger keys,
|
||||
* pending benchmarking:
|
||||
*/
|
||||
return 4 << 10;
|
||||
}
|
||||
|
||||
static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
|
||||
struct btree *b)
|
||||
{
|
||||
struct bset *i = btree_bset_last(b);
|
||||
struct btree_node_entry *bne = max(write_block(b),
|
||||
(void *) btree_bkey_last(b, bset_tree_last(b)));
|
||||
ssize_t remaining_space =
|
||||
__bch_btree_u64s_remaining(c, b, &bne->keys.start[0]);
|
||||
|
||||
if (unlikely(bset_written(b, i))) {
|
||||
if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
|
||||
return bne;
|
||||
} else {
|
||||
if (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) &&
|
||||
remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3))
|
||||
return bne;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t,
|
||||
struct bkey_packed *k)
|
||||
{
|
||||
if (bset_written(b, bset(b, t))) {
|
||||
EBUG_ON(b->uncompacted_whiteout_u64s <
|
||||
bkeyp_key_u64s(&b->format, k));
|
||||
b->uncompacted_whiteout_u64s -=
|
||||
bkeyp_key_u64s(&b->format, k);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void reserve_whiteout(struct btree *b, struct bset_tree *t,
|
||||
struct bkey_packed *k)
|
||||
{
|
||||
if (bset_written(b, bset(b, t))) {
|
||||
BUG_ON(!k->needs_whiteout);
|
||||
b->uncompacted_whiteout_u64s +=
|
||||
bkeyp_key_u64s(&b->format, k);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* write lock must be held on @b (else the dirty bset that we were going to
|
||||
* insert into could be written out from under us)
|
||||
*/
|
||||
static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
|
||||
struct btree *b, unsigned u64s)
|
||||
{
|
||||
if (unlikely(btree_node_fake(b)))
|
||||
return false;
|
||||
|
||||
if (btree_node_is_extents(b)) {
|
||||
/* The insert key might split an existing key
|
||||
* (bch2_insert_fixup_extent() -> BCH_EXTENT_OVERLAP_MIDDLE case):
|
||||
*/
|
||||
u64s += BKEY_EXTENT_U64s_MAX;
|
||||
}
|
||||
|
||||
return u64s <= bch_btree_keys_u64s_remaining(c, b);
|
||||
}
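/*
 * Example (sketch): on an extents leaf, inserting into the middle of an
 * existing extent splits it in two, so a key of N u64s only "fits" if there
 * is room for N + BKEY_EXTENT_U64s_MAX u64s - which is what the helper
 * above reserves before checking remaining space.
 */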
|
||||
|
||||
static inline bool journal_res_insert_fits(struct btree_insert *trans,
|
||||
struct btree_insert_entry *insert)
|
||||
{
|
||||
unsigned u64s = 0;
|
||||
struct btree_insert_entry *i;
|
||||
|
||||
/*
|
||||
* If we didn't get a journal reservation, we're in journal replay and
|
||||
* we're not journalling updates:
|
||||
*/
|
||||
if (!trans->journal_res.ref)
|
||||
return true;
|
||||
|
||||
for (i = insert; i < trans->entries + trans->nr; i++)
|
||||
u64s += jset_u64s(i->k->k.u64s + i->extra_res);
|
||||
|
||||
return u64s <= trans->journal_res.u64s;
|
||||
}
|
||||
|
||||
ssize_t bch2_btree_updates_print(struct bch_fs *, char *);
|
||||
|
||||
size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *);
|
||||
|
||||
#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */
|
737
fs/bcachefs/btree_update_leaf.c
Normal file
@ -0,0 +1,737 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "btree_update.h"
|
||||
#include "btree_update_interior.h"
|
||||
#include "btree_io.h"
|
||||
#include "btree_iter.h"
|
||||
#include "btree_locking.h"
|
||||
#include "debug.h"
|
||||
#include "extents.h"
|
||||
#include "journal.h"
|
||||
#include "journal_reclaim.h"
|
||||
#include "keylist.h"
|
||||
#include "trace.h"
|
||||
|
||||
#include <linux/sort.h>
|
||||
|
||||
/* Inserting into a given leaf node (last stage of insert): */
|
||||
|
||||
/* Handle overwrites and do insert, for non extents: */
|
||||
bool bch2_btree_bset_insert_key(struct btree_iter *iter,
|
||||
struct btree *b,
|
||||
struct btree_node_iter *node_iter,
|
||||
struct bkey_i *insert)
|
||||
{
|
||||
const struct bkey_format *f = &b->format;
|
||||
struct bkey_packed *k;
|
||||
struct bset_tree *t;
|
||||
unsigned clobber_u64s;
|
||||
|
||||
EBUG_ON(btree_node_just_written(b));
|
||||
EBUG_ON(bset_written(b, btree_bset_last(b)));
|
||||
EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
|
||||
EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 ||
|
||||
bkey_cmp(insert->k.p, b->data->max_key) > 0);
|
||||
|
||||
k = bch2_btree_node_iter_peek_all(node_iter, b);
|
||||
if (k && !bkey_cmp_packed(b, k, &insert->k)) {
|
||||
BUG_ON(bkey_whiteout(k));
|
||||
|
||||
t = bch2_bkey_to_bset(b, k);
|
||||
|
||||
if (bset_unwritten(b, bset(b, t)) &&
|
||||
bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k) &&
|
||||
!bkey_whiteout(&insert->k)) {
|
||||
k->type = insert->k.type;
|
||||
memcpy_u64s(bkeyp_val(f, k), &insert->v,
|
||||
bkey_val_u64s(&insert->k));
|
||||
return true;
|
||||
}
|
||||
|
||||
insert->k.needs_whiteout = k->needs_whiteout;
|
||||
|
||||
btree_keys_account_key_drop(&b->nr, t - b->set, k);
|
||||
|
||||
if (t == bset_tree_last(b)) {
|
||||
clobber_u64s = k->u64s;
|
||||
|
||||
/*
|
||||
* If we're deleting, and the key we're deleting doesn't
|
||||
* need a whiteout (it wasn't overwriting a key that had
|
||||
* been written to disk) - just delete it:
|
||||
*/
|
||||
if (bkey_whiteout(&insert->k) && !k->needs_whiteout) {
|
||||
bch2_bset_delete(b, k, clobber_u64s);
|
||||
bch2_btree_node_iter_fix(iter, b, node_iter, t,
|
||||
k, clobber_u64s, 0);
|
||||
return true;
|
||||
}
|
||||
|
||||
goto overwrite;
|
||||
}
|
||||
|
||||
k->type = KEY_TYPE_DELETED;
|
||||
bch2_btree_node_iter_fix(iter, b, node_iter, t, k,
|
||||
k->u64s, k->u64s);
|
||||
|
||||
if (bkey_whiteout(&insert->k)) {
|
||||
reserve_whiteout(b, t, k);
|
||||
return true;
|
||||
} else {
|
||||
k->needs_whiteout = false;
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* Deleting, but the key to delete wasn't found - nothing to do:
|
||||
*/
|
||||
if (bkey_whiteout(&insert->k))
|
||||
return false;
|
||||
|
||||
insert->k.needs_whiteout = false;
|
||||
}
|
||||
|
||||
t = bset_tree_last(b);
|
||||
k = bch2_btree_node_iter_bset_pos(node_iter, b, t);
|
||||
clobber_u64s = 0;
|
||||
overwrite:
|
||||
bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
|
||||
if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k))
|
||||
bch2_btree_node_iter_fix(iter, b, node_iter, t, k,
|
||||
clobber_u64s, k->u64s);
|
||||
return true;
|
||||
}
|
||||
|
||||
static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
|
||||
unsigned i, u64 seq)
|
||||
{
|
||||
struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
||||
struct btree_write *w = container_of(pin, struct btree_write, journal);
|
||||
struct btree *b = container_of(w, struct btree, writes[i]);
|
||||
|
||||
btree_node_lock_type(c, b, SIX_LOCK_read);
|
||||
bch2_btree_node_write_cond(c, b,
|
||||
(btree_current_write(b) == w &&
|
||||
w->journal.pin_list == journal_seq_pin(j, seq)));
|
||||
six_unlock_read(&b->lock);
|
||||
}
|
||||
|
||||
static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
|
||||
{
|
||||
return __btree_node_flush(j, pin, 0, seq);
|
||||
}
|
||||
|
||||
static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
|
||||
{
|
||||
return __btree_node_flush(j, pin, 1, seq);
|
||||
}
|
||||
|
||||
void bch2_btree_journal_key(struct btree_insert *trans,
|
||||
struct btree_iter *iter,
|
||||
struct bkey_i *insert)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct journal *j = &c->journal;
|
||||
struct btree *b = iter->l[0].b;
|
||||
struct btree_write *w = btree_current_write(b);
|
||||
|
||||
EBUG_ON(iter->level || b->level);
|
||||
EBUG_ON(trans->journal_res.ref !=
|
||||
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
|
||||
|
||||
if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
|
||||
u64 seq = trans->journal_res.seq;
|
||||
bool needs_whiteout = insert->k.needs_whiteout;
|
||||
|
||||
/* ick */
|
||||
insert->k.needs_whiteout = false;
|
||||
bch2_journal_add_keys(j, &trans->journal_res,
|
||||
iter->btree_id, insert);
|
||||
insert->k.needs_whiteout = needs_whiteout;
|
||||
|
||||
bch2_journal_set_has_inode(j, &trans->journal_res,
|
||||
insert->k.p.inode);
|
||||
|
||||
if (trans->journal_seq)
|
||||
*trans->journal_seq = seq;
|
||||
btree_bset_last(b)->journal_seq = cpu_to_le64(seq);
|
||||
}
|
||||
|
||||
if (unlikely(!journal_pin_active(&w->journal))) {
|
||||
u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
|
||||
? trans->journal_res.seq
|
||||
: j->replay_journal_seq;
|
||||
|
||||
bch2_journal_pin_add(j, seq, &w->journal,
|
||||
btree_node_write_idx(b) == 0
|
||||
? btree_node_flush0
|
||||
: btree_node_flush1);
|
||||
}
|
||||
|
||||
if (unlikely(!btree_node_dirty(b)))
|
||||
set_btree_node_dirty(b);
|
||||
}
|
||||
|
||||
static enum btree_insert_ret
|
||||
bch2_insert_fixup_key(struct btree_insert *trans,
|
||||
struct btree_insert_entry *insert)
|
||||
{
|
||||
struct btree_iter *iter = insert->iter;
|
||||
struct btree_iter_level *l = &iter->l[0];
|
||||
|
||||
EBUG_ON(iter->level);
|
||||
EBUG_ON(insert->k->k.u64s >
|
||||
bch_btree_keys_u64s_remaining(trans->c, l->b));
|
||||
|
||||
if (bch2_btree_bset_insert_key(iter, l->b, &l->iter,
|
||||
insert->k))
|
||||
bch2_btree_journal_key(trans, iter, insert->k);
|
||||
|
||||
trans->did_work = true;
|
||||
return BTREE_INSERT_OK;
|
||||
}
|
||||
|
||||
/**
|
||||
* btree_insert_key_leaf - insert a single key into a leaf node
|
||||
*/
|
||||
static enum btree_insert_ret
|
||||
btree_insert_key_leaf(struct btree_insert *trans,
|
||||
struct btree_insert_entry *insert)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct btree_iter *iter = insert->iter;
|
||||
struct btree *b = iter->l[0].b;
|
||||
enum btree_insert_ret ret;
|
||||
int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
|
||||
int old_live_u64s = b->nr.live_u64s;
|
||||
int live_u64s_added, u64s_added;
|
||||
|
||||
ret = !btree_node_is_extents(b)
|
||||
? bch2_insert_fixup_key(trans, insert)
|
||||
: bch2_insert_fixup_extent(trans, insert);
|
||||
|
||||
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
|
||||
u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
|
||||
|
||||
if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
|
||||
b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
|
||||
if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
|
||||
b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
|
||||
|
||||
if (u64s_added > live_u64s_added &&
|
||||
bch2_maybe_compact_whiteouts(c, b))
|
||||
bch2_btree_iter_reinit_node(iter, b);
|
||||
|
||||
trace_btree_insert_key(c, b, insert->k);
|
||||
return ret;
|
||||
}
|
||||
|
||||
#define trans_for_each_entry(trans, i) \
|
||||
for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++)
|
||||
|
||||
/*
|
||||
* We sort transaction entries so that if multiple iterators point to the same
|
||||
* leaf node they'll be adjacent:
|
||||
*/
|
||||
static bool same_leaf_as_prev(struct btree_insert *trans,
|
||||
struct btree_insert_entry *i)
|
||||
{
|
||||
return i != trans->entries &&
|
||||
i[0].iter->l[0].b == i[-1].iter->l[0].b;
|
||||
}
|
||||
|
||||
static inline struct btree_insert_entry *trans_next_leaf(struct btree_insert *trans,
|
||||
struct btree_insert_entry *i)
|
||||
{
|
||||
struct btree *b = i->iter->l[0].b;
|
||||
|
||||
do {
|
||||
i++;
|
||||
} while (i < trans->entries + trans->nr && b == i->iter->l[0].b);
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
#define trans_for_each_leaf(trans, i) \
|
||||
for ((i) = (trans)->entries; \
|
||||
(i) < (trans)->entries + (trans)->nr; \
|
||||
(i) = trans_next_leaf(trans, i))
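/*
 * Example (sketch): after the bubble_sort() in __bch2_btree_insert_at(),
 * entries hitting the same leaf are adjacent - for updates A, B on leaf X
 * and C on leaf Y, trans_for_each_entry() visits A, B, C while
 * trans_for_each_leaf() visits only A and C, one entry per distinct leaf.
 */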
|
||||
|
||||
inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
|
||||
struct btree_iter *iter)
|
||||
{
|
||||
bch2_btree_node_lock_write(b, iter);
|
||||
|
||||
if (btree_node_just_written(b) &&
|
||||
bch2_btree_post_write_cleanup(c, b))
|
||||
bch2_btree_iter_reinit_node(iter, b);
|
||||
|
||||
/*
|
||||
* If the last bset has been written, or if it's gotten too big - start
|
||||
* a new bset to insert into:
|
||||
*/
|
||||
if (want_new_bset(c, b))
|
||||
bch2_btree_init_next(c, b, iter);
|
||||
}
|
||||
|
||||
static void multi_lock_write(struct bch_fs *c, struct btree_insert *trans)
|
||||
{
|
||||
struct btree_insert_entry *i;
|
||||
|
||||
trans_for_each_leaf(trans, i)
|
||||
bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter);
|
||||
}
|
||||
|
||||
static void multi_unlock_write(struct btree_insert *trans)
|
||||
{
|
||||
struct btree_insert_entry *i;
|
||||
|
||||
trans_for_each_leaf(trans, i)
|
||||
bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter);
|
||||
}
|
||||
|
||||
static inline int btree_trans_cmp(struct btree_insert_entry l,
|
||||
struct btree_insert_entry r)
|
||||
{
|
||||
return btree_iter_cmp(l.iter, r.iter);
|
||||
}
|
||||
|
||||
/* Normal update interface: */
|
||||
|
||||
/*
|
||||
* Get journal reservation, take write locks, and attempt to do btree update(s):
|
||||
*/
|
||||
static inline int do_btree_insert_at(struct btree_insert *trans,
|
||||
struct btree_iter **split,
|
||||
bool *cycle_gc_lock)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct btree_insert_entry *i;
|
||||
unsigned u64s;
|
||||
int ret;
|
||||
|
||||
trans_for_each_entry(trans, i) {
|
||||
BUG_ON(i->done);
|
||||
BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK);
|
||||
}
|
||||
|
||||
u64s = 0;
|
||||
trans_for_each_entry(trans, i)
|
||||
u64s += jset_u64s(i->k->k.u64s + i->extra_res);
|
||||
|
||||
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
|
||||
|
||||
ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)
|
||||
? bch2_journal_res_get(&c->journal,
|
||||
&trans->journal_res,
|
||||
u64s, u64s)
|
||||
: 0;
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
multi_lock_write(c, trans);
|
||||
|
||||
if (race_fault()) {
|
||||
ret = -EINTR;
|
||||
goto out;
|
||||
}
|
||||
|
||||
u64s = 0;
|
||||
trans_for_each_entry(trans, i) {
|
||||
/* Multiple inserts might go to the same leaf: */
|
||||
if (!same_leaf_as_prev(trans, i))
|
||||
u64s = 0;
|
||||
|
||||
/*
|
||||
* bch2_btree_node_insert_fits() must be called under write lock:
|
||||
* with only an intent lock, another thread can still call
|
||||
* bch2_btree_node_write(), converting an unwritten bset to a
|
||||
* written one
|
||||
*/
|
||||
u64s += i->k->k.u64s + i->extra_res;
|
||||
if (!bch2_btree_node_insert_fits(c,
|
||||
i->iter->l[0].b, u64s)) {
|
||||
ret = -EINTR;
|
||||
*split = i->iter;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
|
||||
if (journal_seq_verify(c))
|
||||
trans_for_each_entry(trans, i)
|
||||
i->k->k.version.lo = trans->journal_res.seq;
|
||||
else if (inject_invalid_keys(c))
|
||||
trans_for_each_entry(trans, i)
|
||||
i->k->k.version = MAX_VERSION;
|
||||
}
|
||||
|
||||
trans_for_each_entry(trans, i) {
|
||||
switch (btree_insert_key_leaf(trans, i)) {
|
||||
case BTREE_INSERT_OK:
|
||||
i->done = true;
|
||||
break;
|
||||
case BTREE_INSERT_JOURNAL_RES_FULL:
|
||||
case BTREE_INSERT_NEED_TRAVERSE:
|
||||
case BTREE_INSERT_NEED_RESCHED:
|
||||
ret = -EINTR;
|
||||
break;
|
||||
case BTREE_INSERT_BTREE_NODE_FULL:
|
||||
ret = -EINTR;
|
||||
*split = i->iter;
|
||||
break;
|
||||
case BTREE_INSERT_ENOSPC:
|
||||
ret = -ENOSPC;
|
||||
break;
|
||||
case BTREE_INSERT_NEED_GC_LOCK:
|
||||
ret = -EINTR;
|
||||
*cycle_gc_lock = true;
|
||||
break;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
|
||||
/*
|
||||
* If we did some work (i.e. inserted part of an extent),
|
||||
* we have to do all the other updates as well:
|
||||
*/
|
||||
if (!trans->did_work && (ret || *split))
|
||||
break;
|
||||
}
|
||||
out:
|
||||
multi_unlock_write(trans);
|
||||
bch2_journal_res_put(&c->journal, &trans->journal_res);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void btree_insert_entry_checks(struct bch_fs *c,
|
||||
struct btree_insert_entry *i)
|
||||
{
|
||||
BUG_ON(i->iter->level);
|
||||
BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
|
||||
BUG_ON(debug_check_bkeys(c) &&
|
||||
!bkey_deleted(&i->k->k) &&
|
||||
bch2_bkey_invalid(c, (enum bkey_type) i->iter->btree_id,
|
||||
bkey_i_to_s_c(i->k)));
|
||||
}
|
||||
|
||||
/**
|
||||
* __bch2_btree_insert_at - insert keys at given iterator positions
*
* This is the main entry point for btree updates.
|
||||
*
|
||||
* Return values:
|
||||
* -EINTR: locking changed, this function should be called again. Only returned
|
||||
* if passed BTREE_INSERT_ATOMIC.
|
||||
* -EROFS: filesystem read only
|
||||
* -EIO: journal or btree node IO error
|
||||
*/
|
||||
int __bch2_btree_insert_at(struct btree_insert *trans)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct btree_insert_entry *i;
|
||||
struct btree_iter *linked, *split = NULL;
|
||||
bool cycle_gc_lock = false;
|
||||
unsigned flags;
|
||||
int ret;
|
||||
|
||||
BUG_ON(!trans->nr);
|
||||
|
||||
for_each_btree_iter(trans->entries[0].iter, linked)
|
||||
bch2_btree_iter_verify_locks(linked);
|
||||
|
||||
/* for the sake of sanity: */
|
||||
BUG_ON(trans->nr > 1 && !(trans->flags & BTREE_INSERT_ATOMIC));
|
||||
|
||||
trans_for_each_entry(trans, i)
|
||||
btree_insert_entry_checks(c, i);
|
||||
|
||||
bubble_sort(trans->entries, trans->nr, btree_trans_cmp);
|
||||
|
||||
if (unlikely(!percpu_ref_tryget(&c->writes)))
|
||||
return -EROFS;
|
||||
retry:
|
||||
split = NULL;
|
||||
cycle_gc_lock = false;
|
||||
|
||||
trans_for_each_entry(trans, i) {
|
||||
if (!bch2_btree_iter_upgrade(i->iter, 1, true)) {
|
||||
ret = -EINTR;
|
||||
goto err;
|
||||
}
|
||||
|
||||
if (i->iter->flags & BTREE_ITER_ERROR) {
|
||||
ret = -EIO;
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
|
||||
ret = do_btree_insert_at(trans, &split, &cycle_gc_lock);
|
||||
if (unlikely(ret))
|
||||
goto err;
|
||||
|
||||
trans_for_each_leaf(trans, i)
|
||||
bch2_foreground_maybe_merge(c, i->iter, 0, trans->flags);
|
||||
|
||||
trans_for_each_entry(trans, i)
|
||||
bch2_btree_iter_downgrade(i->iter);
|
||||
out:
|
||||
percpu_ref_put(&c->writes);
|
||||
|
||||
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
|
||||
/* make sure we didn't drop or screw up locks: */
|
||||
for_each_btree_iter(trans->entries[0].iter, linked) {
|
||||
bch2_btree_iter_verify_locks(linked);
|
||||
BUG_ON((trans->flags & BTREE_INSERT_NOUNLOCK) &&
|
||||
trans->did_work &&
|
||||
linked->uptodate >= BTREE_ITER_NEED_RELOCK);
|
||||
}
|
||||
|
||||
/* make sure we didn't lose an error: */
|
||||
if (!ret)
|
||||
trans_for_each_entry(trans, i)
|
||||
BUG_ON(!i->done);
|
||||
}
|
||||
|
||||
BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR);
|
||||
|
||||
return ret;
|
||||
err:
|
||||
flags = trans->flags;
|
||||
|
||||
/*
|
||||
* BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree
|
||||
* update; if we haven't done anything yet it doesn't apply
|
||||
*/
|
||||
if (!trans->did_work)
|
||||
flags &= ~BTREE_INSERT_NOUNLOCK;
|
||||
|
||||
if (split) {
|
||||
ret = bch2_btree_split_leaf(c, split, flags);
|
||||
|
||||
/*
|
||||
* if the split succeeded without dropping locks the insert will
|
||||
* still be atomic (in the BTREE_INSERT_ATOMIC sense, what the
|
||||
* caller peeked() and is overwriting won't have changed)
|
||||
*/
|
||||
#if 0
|
||||
/*
|
||||
* XXX:
|
||||
* split -> btree node merging (of parent node) might still drop
|
||||
* locks when we're not passing it BTREE_INSERT_NOUNLOCK
|
||||
*/
|
||||
if (!ret && !trans->did_work)
|
||||
goto retry;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* don't care if we got ENOSPC because we told split it
|
||||
* couldn't block:
|
||||
*/
|
||||
if (!ret || (flags & BTREE_INSERT_NOUNLOCK))
|
||||
ret = -EINTR;
|
||||
}
|
||||
|
||||
if (cycle_gc_lock) {
|
||||
if (!down_read_trylock(&c->gc_lock)) {
|
||||
if (flags & BTREE_INSERT_NOUNLOCK)
|
||||
goto out;
|
||||
|
||||
bch2_btree_iter_unlock(trans->entries[0].iter);
|
||||
down_read(&c->gc_lock);
|
||||
}
|
||||
up_read(&c->gc_lock);
|
||||
}
|
||||
|
||||
if (ret == -EINTR) {
|
||||
if (flags & BTREE_INSERT_NOUNLOCK)
|
||||
goto out;
|
||||
|
||||
trans_for_each_entry(trans, i) {
|
||||
int ret2 = bch2_btree_iter_traverse(i->iter);
|
||||
if (ret2) {
|
||||
ret = ret2;
|
||||
goto out;
|
||||
}
|
||||
|
||||
BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK);
|
||||
}
|
||||
|
||||
/*
|
||||
* BTREE_INSERT_ATOMIC means we have to return -EINTR if we
|
||||
* dropped locks:
|
||||
*/
|
||||
if (!(flags & BTREE_INSERT_ATOMIC))
|
||||
goto retry;
|
||||
}
|
||||
|
||||
goto out;
|
||||
}
|
||||
|
||||
void bch2_trans_update(struct btree_trans *trans,
|
||||
struct btree_iter *iter,
|
||||
struct bkey_i *k,
|
||||
unsigned extra_journal_res)
|
||||
{
|
||||
struct btree_insert_entry *i;
|
||||
|
||||
BUG_ON(trans->nr_updates >= ARRAY_SIZE(trans->updates));
|
||||
|
||||
i = &trans->updates[trans->nr_updates++];
|
||||
|
||||
*i = (struct btree_insert_entry) {
|
||||
.iter = iter,
|
||||
.k = k,
|
||||
.extra_res = extra_journal_res,
|
||||
};
|
||||
|
||||
btree_insert_entry_checks(trans->c, i);
|
||||
}
|
||||
|
||||
int bch2_trans_commit(struct btree_trans *trans,
|
||||
struct disk_reservation *disk_res,
|
||||
struct extent_insert_hook *hook,
|
||||
u64 *journal_seq,
|
||||
unsigned flags)
|
||||
{
|
||||
struct btree_insert insert = {
|
||||
.c = trans->c,
|
||||
.disk_res = disk_res,
|
||||
.journal_seq = journal_seq,
|
||||
.flags = flags,
|
||||
.nr = trans->nr_updates,
|
||||
.entries = trans->updates,
|
||||
};
|
||||
|
||||
if (!trans->nr_updates)
|
||||
return 0;
|
||||
|
||||
trans->nr_updates = 0;
|
||||
|
||||
return __bch2_btree_insert_at(&insert);
|
||||
}
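/*
 * Illustrative sketch (not part of the original commit): the intended calling
 * pattern for the transaction interface above - queue one or more updates
 * with bch2_trans_update(), then flush them with bch2_trans_commit().
 * Assumes @trans and @iter were initialized by the caller.
 */
#if 0
static int example_trans_insert(struct btree_trans *trans,
				struct btree_iter *iter,
				struct bkey_i *k)
{
	/* no extra journal reservation beyond the key itself: */
	bch2_trans_update(trans, iter, k, 0);

	/* no disk reservation, insert hook or journal_seq in this sketch: */
	return bch2_trans_commit(trans, NULL, NULL, NULL,
				 BTREE_INSERT_NOFAIL);
}
#endif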
|
||||
|
||||
int bch2_btree_delete_at(struct btree_iter *iter, unsigned flags)
|
||||
{
|
||||
struct bkey_i k;
|
||||
|
||||
bkey_init(&k.k);
|
||||
k.k.p = iter->pos;
|
||||
|
||||
return bch2_btree_insert_at(iter->c, NULL, NULL, NULL,
|
||||
BTREE_INSERT_NOFAIL|
|
||||
BTREE_INSERT_USE_RESERVE|flags,
|
||||
BTREE_INSERT_ENTRY(iter, &k));
|
||||
}
|
||||
|
||||
int bch2_btree_insert_list_at(struct btree_iter *iter,
|
||||
struct keylist *keys,
|
||||
struct disk_reservation *disk_res,
|
||||
struct extent_insert_hook *hook,
|
||||
u64 *journal_seq, unsigned flags)
|
||||
{
|
||||
BUG_ON(flags & BTREE_INSERT_ATOMIC);
|
||||
BUG_ON(bch2_keylist_empty(keys));
|
||||
bch2_verify_keylist_sorted(keys);
|
||||
|
||||
while (!bch2_keylist_empty(keys)) {
|
||||
int ret = bch2_btree_insert_at(iter->c, disk_res, hook,
|
||||
journal_seq, flags,
|
||||
BTREE_INSERT_ENTRY(iter, bch2_keylist_front(keys)));
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
bch2_keylist_pop_front(keys);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* bch2_btree_insert - insert a single key into a btree
|
||||
* @c: pointer to struct bch_fs
|
||||
* @id: btree to insert into
|
||||
* @k: key to insert
|
||||
* @hook: insert callback
|
||||
*/
|
||||
int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
|
||||
struct bkey_i *k,
|
||||
struct disk_reservation *disk_res,
|
||||
struct extent_insert_hook *hook,
|
||||
u64 *journal_seq, int flags)
|
||||
{
|
||||
struct btree_iter iter;
|
||||
int ret;
|
||||
|
||||
bch2_btree_iter_init(&iter, c, id, bkey_start_pos(&k->k),
|
||||
BTREE_ITER_INTENT);
|
||||
ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, flags,
|
||||
BTREE_INSERT_ENTRY(&iter, k));
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* bch2_btree_delete_range - delete everything within a given range
|
||||
*
|
||||
* Range is a half open interval - [start, end)
|
||||
*/
|
||||
int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
|
||||
struct bpos start,
|
||||
struct bpos end,
|
||||
struct bversion version,
|
||||
struct disk_reservation *disk_res,
|
||||
struct extent_insert_hook *hook,
|
||||
u64 *journal_seq)
|
||||
{
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
int ret = 0;
|
||||
|
||||
bch2_btree_iter_init(&iter, c, id, start,
|
||||
BTREE_ITER_INTENT);
|
||||
|
||||
while ((k = bch2_btree_iter_peek(&iter)).k &&
|
||||
!(ret = btree_iter_err(k))) {
|
||||
unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
|
||||
/* really shouldn't be using a bare, unpadded bkey_i */
|
||||
struct bkey_i delete;
|
||||
|
||||
if (bkey_cmp(iter.pos, end) >= 0)
|
||||
break;
|
||||
|
||||
bkey_init(&delete.k);
|
||||
|
||||
/*
|
||||
* For extents, iter.pos won't necessarily be the same as
|
||||
* bkey_start_pos(k.k) (for non extents they always will be the
|
||||
* same). It's important that we delete starting from iter.pos
|
||||
* because the range we want to delete could start in the middle
|
||||
* of k.
|
||||
*
|
||||
* (bch2_btree_iter_peek() does guarantee that iter.pos >=
|
||||
* bkey_start_pos(k.k)).
|
||||
*/
|
||||
delete.k.p = iter.pos;
|
||||
delete.k.version = version;
|
||||
|
||||
if (iter.flags & BTREE_ITER_IS_EXTENTS) {
|
||||
/* create the biggest key we can */
|
||||
bch2_key_resize(&delete.k, max_sectors);
|
||||
bch2_cut_back(end, &delete.k);
|
||||
}
|
||||
|
||||
ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq,
|
||||
BTREE_INSERT_NOFAIL,
|
||||
BTREE_INSERT_ENTRY(&iter, &delete));
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
bch2_btree_iter_cond_resched(&iter);
|
||||
}
|
||||
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
return ret;
|
||||
}
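/*
 * Illustrative sketch (not part of the original commit): deleting every key
 * in [start, end) in the extents btree, with no disk reservation, insert
 * hook or journal_seq. BTREE_ID_EXTENTS and ZERO_VERSION are assumed to be
 * the usual definitions from the bcachefs headers.
 */
#if 0
static int example_delete_range(struct bch_fs *c,
				struct bpos start, struct bpos end)
{
	return bch2_btree_delete_range(c, BTREE_ID_EXTENTS, start, end,
				       ZERO_VERSION, NULL, NULL, NULL);
}
#endif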
|
975
fs/bcachefs/buckets.c
Normal file
@@ -0,0 +1,975 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Code for manipulating bucket marks for garbage collection.
|
||||
*
|
||||
* Copyright 2014 Datera, Inc.
|
||||
*
|
||||
* Bucket states:
|
||||
* - free bucket: mark == 0
|
||||
* The bucket contains no data and will not be read
|
||||
*
|
||||
* - allocator bucket: owned_by_allocator == 1
|
||||
* The bucket is on a free list, or it is an open bucket
|
||||
*
|
||||
* - cached bucket: owned_by_allocator == 0 &&
|
||||
* dirty_sectors == 0 &&
|
||||
* cached_sectors > 0
|
||||
* The bucket contains data but may be safely discarded as there are
|
||||
* enough replicas of the data on other cache devices, or it has been
|
||||
* written back to the backing device
|
||||
*
|
||||
* - dirty bucket: owned_by_allocator == 0 &&
|
||||
* dirty_sectors > 0
|
||||
* The bucket contains data that we must not discard (either the only copy,
|
||||
* or one of the 'main copies' for data requiring multiple replicas)
|
||||
*
|
||||
* - metadata bucket: owned_by_allocator == 0 && is_metadata == 1
|
||||
* This is a btree node, journal or gen/prio bucket
|
||||
*
|
||||
* Lifecycle:
|
||||
*
|
||||
* bucket invalidated => bucket on freelist => open bucket =>
|
||||
* [dirty bucket =>] cached bucket => bucket invalidated => ...
|
||||
*
|
||||
* Note that cache promotion can skip the dirty bucket step, as data
|
||||
* is copied from a deeper tier to a shallower tier, onto a cached
|
||||
* bucket.
|
||||
* Note also that a cached bucket can spontaneously become dirty --
|
||||
* see below.
|
||||
*
|
||||
* Only a traversal of the key space can determine whether a bucket is
|
||||
* truly dirty or cached.
|
||||
*
|
||||
* Transitions:
|
||||
*
|
||||
* - free => allocator: bucket was invalidated
|
||||
* - cached => allocator: bucket was invalidated
|
||||
*
|
||||
* - allocator => dirty: open bucket was filled up
|
||||
* - allocator => cached: open bucket was filled up
|
||||
* - allocator => metadata: metadata was allocated
|
||||
*
|
||||
* - dirty => cached: dirty sectors were copied to a deeper tier
|
||||
* - dirty => free: dirty sectors were overwritten or moved (copy gc)
|
||||
* - cached => free: cached sectors were overwritten
|
||||
*
|
||||
* - metadata => free: metadata was freed
|
||||
*
|
||||
* Oddities:
|
||||
* - cached => dirty: a device was removed so formerly replicated data
|
||||
* is no longer sufficiently replicated
|
||||
* - free => cached: cannot happen
|
||||
* - free => dirty: cannot happen
|
||||
* - free => metadata: cannot happen
|
||||
*/
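/*
 * Illustrative sketch (not part of the original commit) of the bucket state
 * classification described above, expressed in terms of struct bucket_mark;
 * the metadata check against BCH_DATA_USER is an assumption for the sketch.
 */
#if 0
static const char *example_bucket_state(struct bucket_mark m)
{
	if (m.owned_by_allocator)
		return "allocator";
	if (m.data_type && m.data_type != BCH_DATA_USER)
		return "metadata";
	if (m.dirty_sectors)
		return "dirty";
	if (m.cached_sectors)
		return "cached";
	return "free";
}
#endif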
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "alloc.h"
|
||||
#include "btree_gc.h"
|
||||
#include "buckets.h"
|
||||
#include "error.h"
|
||||
#include "movinggc.h"
|
||||
#include "trace.h"
|
||||
|
||||
#include <linux/preempt.h>
|
||||
|
||||
#ifdef DEBUG_BUCKETS
|
||||
|
||||
#define lg_local_lock lg_global_lock
|
||||
#define lg_local_unlock lg_global_unlock
|
||||
|
||||
static void bch2_fs_stats_verify(struct bch_fs *c)
|
||||
{
|
||||
struct bch_fs_usage stats =
|
||||
__bch2_fs_usage_read(c);
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(stats.s); i++) {
|
||||
if ((s64) stats.s[i].data[S_META] < 0)
|
||||
panic("replicas %u meta underflow: %lli\n",
|
||||
i + 1, stats.s[i].data[S_META]);
|
||||
|
||||
if ((s64) stats.s[i].data[S_DIRTY] < 0)
|
||||
panic("replicas %u dirty underflow: %lli\n",
|
||||
i + 1, stats.s[i].data[S_DIRTY]);
|
||||
|
||||
if ((s64) stats.s[i].persistent_reserved < 0)
|
||||
panic("replicas %u reserved underflow: %lli\n",
|
||||
i + 1, stats.s[i].persistent_reserved);
|
||||
}
|
||||
|
||||
if ((s64) stats.online_reserved < 0)
|
||||
panic("sectors_online_reserved underflow: %lli\n",
|
||||
stats.online_reserved);
|
||||
}
|
||||
|
||||
static void bch2_dev_stats_verify(struct bch_dev *ca)
|
||||
{
|
||||
struct bch_dev_usage stats =
|
||||
__bch2_dev_usage_read(ca);
|
||||
u64 n = ca->mi.nbuckets - ca->mi.first_bucket;
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(stats.buckets); i++)
|
||||
BUG_ON(stats.buckets[i] > n);
|
||||
BUG_ON(stats.buckets_alloc > n);
|
||||
BUG_ON(stats.buckets_unavailable > n);
|
||||
}
|
||||
|
||||
static void bch2_disk_reservations_verify(struct bch_fs *c, int flags)
|
||||
{
|
||||
if (!(flags & BCH_DISK_RESERVATION_NOFAIL)) {
|
||||
u64 used = __bch2_fs_sectors_used(c);
|
||||
u64 cached = 0;
|
||||
u64 avail = atomic64_read(&c->sectors_available);
|
||||
int cpu;
|
||||
|
||||
for_each_possible_cpu(cpu)
|
||||
cached += per_cpu_ptr(c->usage_percpu, cpu)->available_cache;
|
||||
|
||||
if (used + avail + cached > c->capacity)
|
||||
panic("used %llu avail %llu cached %llu capacity %llu\n",
|
||||
used, avail, cached, c->capacity);
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static void bch2_fs_stats_verify(struct bch_fs *c) {}
|
||||
static void bch2_dev_stats_verify(struct bch_dev *ca) {}
|
||||
static void bch2_disk_reservations_verify(struct bch_fs *c, int flags) {}
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Clear journal_seq_valid for buckets for which it's not needed, to prevent
|
||||
* wraparound:
|
||||
*/
|
||||
void bch2_bucket_seq_cleanup(struct bch_fs *c)
|
||||
{
|
||||
u16 last_seq_ondisk = c->journal.last_seq_ondisk;
|
||||
struct bch_dev *ca;
|
||||
struct bucket_array *buckets;
|
||||
struct bucket *g;
|
||||
struct bucket_mark m;
|
||||
unsigned i;
|
||||
|
||||
for_each_member_device(ca, c, i) {
|
||||
down_read(&ca->bucket_lock);
|
||||
buckets = bucket_array(ca);
|
||||
|
||||
for_each_bucket(g, buckets) {
|
||||
bucket_cmpxchg(g, m, ({
|
||||
if (!m.journal_seq_valid ||
|
||||
bucket_needs_journal_commit(m, last_seq_ondisk))
|
||||
break;
|
||||
|
||||
m.journal_seq_valid = 0;
|
||||
}));
|
||||
}
|
||||
up_read(&ca->bucket_lock);
|
||||
}
|
||||
}
|
||||
|
||||
#define bch2_usage_add(_acc, _stats) \
|
||||
do { \
|
||||
typeof(_acc) _a = (_acc), _s = (_stats); \
|
||||
unsigned i; \
|
||||
\
|
||||
for (i = 0; i < sizeof(*_a) / sizeof(u64); i++) \
|
||||
((u64 *) (_a))[i] += ((u64 *) (_s))[i]; \
|
||||
} while (0)
|
||||
|
||||
#define bch2_usage_read_raw(_stats) \
|
||||
({ \
|
||||
typeof(*this_cpu_ptr(_stats)) _acc; \
|
||||
int cpu; \
|
||||
\
|
||||
memset(&_acc, 0, sizeof(_acc)); \
|
||||
\
|
||||
for_each_possible_cpu(cpu) \
|
||||
bch2_usage_add(&_acc, per_cpu_ptr((_stats), cpu)); \
|
||||
\
|
||||
_acc; \
|
||||
})
|
||||
|
||||
#define bch2_usage_read_cached(_c, _cached, _uncached) \
|
||||
({ \
|
||||
typeof(_cached) _ret; \
|
||||
unsigned _seq; \
|
||||
\
|
||||
do { \
|
||||
_seq = read_seqcount_begin(&(_c)->gc_pos_lock); \
|
||||
_ret = (_c)->gc_pos.phase == GC_PHASE_DONE \
|
||||
? bch2_usage_read_raw(_uncached) \
|
||||
: (_cached); \
|
||||
} while (read_seqcount_retry(&(_c)->gc_pos_lock, _seq)); \
|
||||
\
|
||||
_ret; \
|
||||
})
|
||||
|
||||
struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca)
|
||||
{
|
||||
return bch2_usage_read_raw(ca->usage_percpu);
|
||||
}
|
||||
|
||||
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
return bch2_usage_read_cached(c, ca->usage_cached, ca->usage_percpu);
|
||||
}
|
||||
|
||||
struct bch_fs_usage
|
||||
__bch2_fs_usage_read(struct bch_fs *c)
|
||||
{
|
||||
return bch2_usage_read_raw(c->usage_percpu);
|
||||
}
|
||||
|
||||
struct bch_fs_usage
|
||||
bch2_fs_usage_read(struct bch_fs *c)
|
||||
{
|
||||
return bch2_usage_read_cached(c,
|
||||
c->usage_cached,
|
||||
c->usage_percpu);
|
||||
}
|
||||
|
||||
struct fs_usage_sum {
|
||||
u64 data;
|
||||
u64 reserved;
|
||||
};
|
||||
|
||||
static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats)
|
||||
{
|
||||
struct fs_usage_sum sum = { 0 };
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(stats.s); i++) {
|
||||
sum.data += (stats.s[i].data[S_META] +
|
||||
stats.s[i].data[S_DIRTY]) * (i + 1);
|
||||
sum.reserved += stats.s[i].persistent_reserved * (i + 1);
|
||||
}
|
||||
|
||||
sum.reserved += stats.online_reserved;
|
||||
return sum;
|
||||
}
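/*
 * Worked example (illustrative, not in the original): with
 * s[0].data[S_DIRTY] == 100 and s[1].data[S_DIRTY] == 50, i.e. 100 sectors
 * stored once and 50 sectors stored twice, sum.data is
 * 100 * 1 + 50 * 2 = 200 sectors of raw disk space.
 */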
|
||||
|
||||
#define RESERVE_FACTOR 6
|
||||
|
||||
static u64 reserve_factor(u64 r)
|
||||
{
|
||||
return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
|
||||
}
|
||||
|
||||
static u64 avail_factor(u64 r)
|
||||
{
|
||||
return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
|
||||
}
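/*
 * Worked example (illustrative, not in the original): with RESERVE_FACTOR = 6,
 * reserve_factor(6400) = 6400 + 6400/64 = 6500, and
 * avail_factor(6500) = (6500 * 64) / 65 = 6400, so the two functions are
 * (approximate) inverses of each other.
 */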
|
||||
|
||||
u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats)
|
||||
{
|
||||
struct fs_usage_sum sum = __fs_usage_sum(stats);
|
||||
|
||||
return sum.data + reserve_factor(sum.reserved);
|
||||
}
|
||||
|
||||
u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats)
|
||||
{
|
||||
return min(c->capacity, __bch2_fs_sectors_used(c, stats));
|
||||
}
|
||||
|
||||
u64 bch2_fs_sectors_free(struct bch_fs *c, struct bch_fs_usage stats)
|
||||
{
|
||||
return avail_factor(c->capacity - bch2_fs_sectors_used(c, stats));
|
||||
}
|
||||
|
||||
static inline int is_unavailable_bucket(struct bucket_mark m)
|
||||
{
|
||||
return !is_available_bucket(m);
|
||||
}
|
||||
|
||||
static inline int is_fragmented_bucket(struct bucket_mark m,
|
||||
struct bch_dev *ca)
|
||||
{
|
||||
if (!m.owned_by_allocator &&
|
||||
m.data_type == BCH_DATA_USER &&
|
||||
bucket_sectors_used(m))
|
||||
return max_t(int, 0, (int) ca->mi.bucket_size -
|
||||
bucket_sectors_used(m));
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline enum bch_data_type bucket_type(struct bucket_mark m)
|
||||
{
|
||||
return m.cached_sectors && !m.dirty_sectors
|
||||
? BCH_DATA_CACHED
|
||||
: m.data_type;
|
||||
}
|
||||
|
||||
static bool bucket_became_unavailable(struct bch_fs *c,
|
||||
struct bucket_mark old,
|
||||
struct bucket_mark new)
|
||||
{
|
||||
return is_available_bucket(old) &&
|
||||
!is_available_bucket(new) &&
|
||||
(!c || c->gc_pos.phase == GC_PHASE_DONE);
|
||||
}
|
||||
|
||||
void bch2_fs_usage_apply(struct bch_fs *c,
|
||||
struct bch_fs_usage *stats,
|
||||
struct disk_reservation *disk_res,
|
||||
struct gc_pos gc_pos)
|
||||
{
|
||||
struct fs_usage_sum sum = __fs_usage_sum(*stats);
|
||||
s64 added = sum.data + sum.reserved;
|
||||
|
||||
/*
|
||||
* Not allowed to reduce sectors_available except by getting a
|
||||
* reservation:
|
||||
*/
|
||||
BUG_ON(added > (s64) (disk_res ? disk_res->sectors : 0));
|
||||
|
||||
if (added > 0) {
|
||||
disk_res->sectors -= added;
|
||||
stats->online_reserved -= added;
|
||||
}
|
||||
|
||||
percpu_down_read(&c->usage_lock);
|
||||
preempt_disable();
|
||||
/* online_reserved not subject to gc: */
|
||||
this_cpu_add(c->usage_percpu->online_reserved, stats->online_reserved);
|
||||
stats->online_reserved = 0;
|
||||
|
||||
if (!gc_will_visit(c, gc_pos))
|
||||
bch2_usage_add(this_cpu_ptr(c->usage_percpu), stats);
|
||||
|
||||
bch2_fs_stats_verify(c);
|
||||
preempt_enable();
|
||||
percpu_up_read(&c->usage_lock);
|
||||
|
||||
memset(stats, 0, sizeof(*stats));
|
||||
}
|
||||
|
||||
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
|
||||
struct bucket_mark old, struct bucket_mark new)
|
||||
{
|
||||
struct bch_dev_usage *dev_usage;
|
||||
|
||||
if (c)
|
||||
percpu_rwsem_assert_held(&c->usage_lock);
|
||||
|
||||
if (old.data_type && new.data_type &&
|
||||
old.data_type != new.data_type) {
|
||||
BUG_ON(!c);
|
||||
bch2_fs_inconsistent(c,
|
||||
"different types of data in same bucket: %s, %s",
|
||||
bch2_data_types[old.data_type],
|
||||
bch2_data_types[new.data_type]);
|
||||
}
|
||||
|
||||
preempt_disable();
|
||||
dev_usage = this_cpu_ptr(ca->usage_percpu);
|
||||
|
||||
dev_usage->buckets[bucket_type(old)]--;
|
||||
dev_usage->buckets[bucket_type(new)]++;
|
||||
|
||||
dev_usage->buckets_alloc +=
|
||||
(int) new.owned_by_allocator - (int) old.owned_by_allocator;
|
||||
dev_usage->buckets_unavailable +=
|
||||
is_unavailable_bucket(new) - is_unavailable_bucket(old);
|
||||
|
||||
dev_usage->sectors[old.data_type] -= old.dirty_sectors;
|
||||
dev_usage->sectors[new.data_type] += new.dirty_sectors;
|
||||
dev_usage->sectors[BCH_DATA_CACHED] +=
|
||||
(int) new.cached_sectors - (int) old.cached_sectors;
|
||||
dev_usage->sectors_fragmented +=
|
||||
is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca);
|
||||
preempt_enable();
|
||||
|
||||
if (!is_available_bucket(old) && is_available_bucket(new))
|
||||
bch2_wake_allocator(ca);
|
||||
|
||||
bch2_dev_stats_verify(ca);
|
||||
}
|
||||
|
||||
#define bucket_data_cmpxchg(c, ca, g, new, expr) \
|
||||
({ \
|
||||
struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
|
||||
\
|
||||
bch2_dev_usage_update(c, ca, _old, new); \
|
||||
_old; \
|
||||
})
|
||||
|
||||
bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
|
||||
size_t b, struct bucket_mark *old)
|
||||
{
|
||||
struct bucket *g;
|
||||
struct bucket_mark new;
|
||||
|
||||
percpu_rwsem_assert_held(&c->usage_lock);
|
||||
|
||||
g = bucket(ca, b);
|
||||
|
||||
*old = bucket_data_cmpxchg(c, ca, g, new, ({
|
||||
if (!is_available_bucket(new))
|
||||
return false;
|
||||
|
||||
new.owned_by_allocator = 1;
|
||||
new.data_type = 0;
|
||||
new.cached_sectors = 0;
|
||||
new.dirty_sectors = 0;
|
||||
new.gen++;
|
||||
}));
|
||||
|
||||
if (!old->owned_by_allocator && old->cached_sectors)
|
||||
trace_invalidate(ca, bucket_to_sector(ca, b),
|
||||
old->cached_sectors);
|
||||
return true;
|
||||
}
|
||||
|
||||
void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
|
||||
size_t b, bool owned_by_allocator,
|
||||
struct gc_pos pos, unsigned flags)
|
||||
{
|
||||
struct bucket *g;
|
||||
struct bucket_mark old, new;
|
||||
|
||||
percpu_rwsem_assert_held(&c->usage_lock);
|
||||
g = bucket(ca, b);
|
||||
|
||||
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
|
||||
gc_will_visit(c, pos))
|
||||
return;
|
||||
|
||||
old = bucket_data_cmpxchg(c, ca, g, new, ({
|
||||
new.owned_by_allocator = owned_by_allocator;
|
||||
}));
|
||||
|
||||
BUG_ON(!owned_by_allocator && !old.owned_by_allocator &&
|
||||
c->gc_pos.phase == GC_PHASE_DONE);
|
||||
}
|
||||
|
||||
#define saturated_add(ca, dst, src, max) \
|
||||
do { \
|
||||
BUG_ON((int) (dst) + (src) < 0); \
|
||||
if ((dst) == (max)) \
|
||||
; \
|
||||
else if ((dst) + (src) <= (max)) \
|
||||
dst += (src); \
|
||||
else { \
|
||||
dst = (max); \
|
||||
trace_sectors_saturated(ca); \
|
||||
} \
|
||||
} while (0)
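/*
 * Worked example (illustrative, not in the original): with dst == 32000,
 * src == 1000 and max == GC_MAX_SECTORS_USED (32767), 32000 + 1000 exceeds
 * the max, so dst is clamped to 32767 and trace_sectors_saturated() fires;
 * a later btree GC recomputes the real count.
 */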
|
||||
|
||||
void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
|
||||
size_t b, enum bch_data_type type,
|
||||
unsigned sectors, struct gc_pos pos,
|
||||
unsigned flags)
|
||||
{
|
||||
struct bucket *g;
|
||||
struct bucket_mark old, new;
|
||||
|
||||
BUG_ON(!type);
|
||||
|
||||
if (likely(c)) {
|
||||
percpu_rwsem_assert_held(&c->usage_lock);
|
||||
|
||||
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
|
||||
gc_will_visit(c, pos))
|
||||
return;
|
||||
}
|
||||
|
||||
rcu_read_lock();
|
||||
|
||||
g = bucket(ca, b);
|
||||
old = bucket_data_cmpxchg(c, ca, g, new, ({
|
||||
saturated_add(ca, new.dirty_sectors, sectors,
|
||||
GC_MAX_SECTORS_USED);
|
||||
new.data_type = type;
|
||||
}));
|
||||
|
||||
rcu_read_unlock();
|
||||
|
||||
BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
|
||||
bucket_became_unavailable(c, old, new));
|
||||
}
|
||||
|
||||
/* Reverting this until the copygc + compression issue is fixed: */
|
||||
|
||||
static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors)
|
||||
{
|
||||
if (!sectors)
|
||||
return 0;
|
||||
|
||||
return max(1U, DIV_ROUND_UP(sectors * crc.compressed_size,
|
||||
crc.uncompressed_size));
|
||||
}
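/*
 * Worked example (illustrative, not in the original): for an extent with
 * crc.compressed_size == 8 and crc.uncompressed_size == 16, accounting for
 * 4 uncompressed sectors charges DIV_ROUND_UP(4 * 8, 16) = 2 sectors of
 * actual disk usage.
 */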
|
||||
|
||||
/*
|
||||
* Checking against gc's position has to be done here, inside the cmpxchg()
|
||||
* loop, to avoid racing with the start of gc clearing all the marks - GC does
|
||||
* that with the gc pos seqlock held.
|
||||
*/
|
||||
static void bch2_mark_pointer(struct bch_fs *c,
|
||||
struct bkey_s_c_extent e,
|
||||
const struct bch_extent_ptr *ptr,
|
||||
struct bch_extent_crc_unpacked crc,
|
||||
s64 sectors, enum s_alloc type,
|
||||
struct bch_fs_usage *stats,
|
||||
u64 journal_seq, unsigned flags)
|
||||
{
|
||||
struct bucket_mark old, new;
|
||||
unsigned saturated;
|
||||
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
|
||||
struct bucket *g = PTR_BUCKET(ca, ptr);
|
||||
enum bch_data_type data_type = type == S_META
|
||||
? BCH_DATA_BTREE : BCH_DATA_USER;
|
||||
u64 v;
|
||||
|
||||
if (crc.compression_type) {
|
||||
unsigned old_sectors, new_sectors;
|
||||
|
||||
if (sectors > 0) {
|
||||
old_sectors = 0;
|
||||
new_sectors = sectors;
|
||||
} else {
|
||||
old_sectors = e.k->size;
|
||||
new_sectors = e.k->size + sectors;
|
||||
}
|
||||
|
||||
sectors = -__disk_sectors(crc, old_sectors)
|
||||
+__disk_sectors(crc, new_sectors);
|
||||
}
|
||||
|
||||
if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) {
|
||||
if (journal_seq)
|
||||
bucket_cmpxchg(g, new, ({
|
||||
new.journal_seq_valid = 1;
|
||||
new.journal_seq = journal_seq;
|
||||
}));
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
v = atomic64_read(&g->_mark.v);
|
||||
do {
|
||||
new.v.counter = old.v.counter = v;
|
||||
saturated = 0;
|
||||
|
||||
/*
|
||||
* Check this after reading bucket mark to guard against
|
||||
* the allocator invalidating a bucket after we've already
|
||||
* checked the gen
|
||||
*/
|
||||
if (gen_after(new.gen, ptr->gen)) {
|
||||
BUG_ON(!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags));
|
||||
EBUG_ON(!ptr->cached &&
|
||||
test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
|
||||
return;
|
||||
}
|
||||
|
||||
if (!ptr->cached &&
|
||||
new.dirty_sectors == GC_MAX_SECTORS_USED &&
|
||||
sectors < 0)
|
||||
saturated = -sectors;
|
||||
|
||||
if (ptr->cached)
|
||||
saturated_add(ca, new.cached_sectors, sectors,
|
||||
GC_MAX_SECTORS_USED);
|
||||
else
|
||||
saturated_add(ca, new.dirty_sectors, sectors,
|
||||
GC_MAX_SECTORS_USED);
|
||||
|
||||
if (!new.dirty_sectors &&
|
||||
!new.cached_sectors) {
|
||||
new.data_type = 0;
|
||||
|
||||
if (journal_seq) {
|
||||
new.journal_seq_valid = 1;
|
||||
new.journal_seq = journal_seq;
|
||||
}
|
||||
} else {
|
||||
new.data_type = data_type;
|
||||
}
|
||||
|
||||
if (flags & BCH_BUCKET_MARK_NOATOMIC) {
|
||||
g->_mark = new;
|
||||
break;
|
||||
}
|
||||
} while ((v = atomic64_cmpxchg(&g->_mark.v,
|
||||
old.v.counter,
|
||||
new.v.counter)) != old.v.counter);
|
||||
|
||||
bch2_dev_usage_update(c, ca, old, new);
|
||||
|
||||
BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
|
||||
bucket_became_unavailable(c, old, new));
|
||||
|
||||
if (saturated &&
|
||||
atomic_long_add_return(saturated,
|
||||
&ca->saturated_count) >=
|
||||
bucket_to_sector(ca, ca->free_inc.size)) {
|
||||
if (c->gc_thread) {
|
||||
trace_gc_sectors_saturated(c);
|
||||
wake_up_process(c->gc_thread);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
|
||||
s64 sectors, bool metadata,
|
||||
struct gc_pos pos,
|
||||
struct bch_fs_usage *stats,
|
||||
u64 journal_seq, unsigned flags)
|
||||
{
|
||||
/*
|
||||
* synchronization w.r.t. GC:
|
||||
*
|
||||
* Normally, bucket sector counts/marks are updated on the fly, as
|
||||
* references are added/removed from the btree, the lists of buckets the
|
||||
* allocator owns, other metadata buckets, etc.
|
||||
*
|
||||
* When GC is in progress and going to mark this reference, we do _not_
|
||||
* mark this reference here, to avoid double counting - GC will count it
|
||||
* when it gets to it.
|
||||
*
|
||||
* To know whether we should mark a given reference (GC either isn't
|
||||
* running, or has already marked references at this position) we
|
||||
* construct a total order for everything GC walks. Then, we can simply
|
||||
* compare the position of the reference we're marking - @pos - with
|
||||
* GC's current position. If GC is going to mark this reference, GC's
|
||||
* current position will be less than @pos; if GC's current position is
|
||||
* greater than @pos GC has either already walked this position, or
|
||||
* isn't running.
|
||||
*
|
||||
* To avoid racing with GC's position changing, we have to deal with
|
||||
* - GC's position being set to GC_POS_MIN when GC starts:
|
||||
* usage_lock guards against this
|
||||
* - GC's position overtaking @pos: we guard against this with
|
||||
* whatever lock protects the data structure the reference lives in
|
||||
* (e.g. the btree node lock, or the relevant allocator lock).
|
||||
*/
|
||||
|
||||
percpu_down_read(&c->usage_lock);
|
||||
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
|
||||
gc_will_visit(c, pos))
|
||||
flags |= BCH_BUCKET_MARK_GC_WILL_VISIT;
|
||||
|
||||
if (!stats)
|
||||
stats = this_cpu_ptr(c->usage_percpu);
|
||||
|
||||
switch (k.k->type) {
|
||||
case BCH_EXTENT:
|
||||
case BCH_EXTENT_CACHED: {
|
||||
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
|
||||
const struct bch_extent_ptr *ptr;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
enum s_alloc type = metadata ? S_META : S_DIRTY;
|
||||
unsigned replicas = 0;
|
||||
|
||||
BUG_ON(metadata && bkey_extent_is_cached(e.k));
|
||||
BUG_ON(!sectors);
|
||||
|
||||
extent_for_each_ptr_crc(e, ptr, crc) {
|
||||
bch2_mark_pointer(c, e, ptr, crc, sectors, type,
|
||||
stats, journal_seq, flags);
|
||||
replicas += !ptr->cached;
|
||||
}
|
||||
|
||||
if (replicas) {
|
||||
BUG_ON(replicas - 1 > ARRAY_SIZE(stats->s));
|
||||
stats->s[replicas - 1].data[type] += sectors;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case BCH_RESERVATION: {
|
||||
struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
|
||||
|
||||
if (r.v->nr_replicas) {
|
||||
BUG_ON(r.v->nr_replicas - 1 > ARRAY_SIZE(stats->s));
|
||||
stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
percpu_up_read(&c->usage_lock);
|
||||
}
|
||||
|
||||
/* Disk reservations: */
|
||||
|
||||
static u64 __recalc_sectors_available(struct bch_fs *c)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
for_each_possible_cpu(cpu)
|
||||
per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0;
|
||||
|
||||
return bch2_fs_sectors_free(c, bch2_fs_usage_read(c));
|
||||
}
|
||||
|
||||
/* Used by gc when it's starting: */
|
||||
void bch2_recalc_sectors_available(struct bch_fs *c)
|
||||
{
|
||||
percpu_down_write(&c->usage_lock);
|
||||
atomic64_set(&c->sectors_available, __recalc_sectors_available(c));
|
||||
percpu_up_write(&c->usage_lock);
|
||||
}
|
||||
|
||||
void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
|
||||
{
|
||||
percpu_down_read(&c->usage_lock);
|
||||
this_cpu_sub(c->usage_percpu->online_reserved,
|
||||
res->sectors);
|
||||
|
||||
bch2_fs_stats_verify(c);
|
||||
percpu_up_read(&c->usage_lock);
|
||||
|
||||
res->sectors = 0;
|
||||
}
|
||||
|
||||
#define SECTORS_CACHE 1024
|
||||
|
||||
int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
|
||||
unsigned sectors, int flags)
|
||||
{
|
||||
struct bch_fs_usage *stats;
|
||||
u64 old, v, get;
|
||||
s64 sectors_available;
|
||||
int ret;
|
||||
|
||||
percpu_down_read(&c->usage_lock);
|
||||
preempt_disable();
|
||||
stats = this_cpu_ptr(c->usage_percpu);
|
||||
|
||||
if (sectors <= stats->available_cache)
|
||||
goto out;
|
||||
|
||||
v = atomic64_read(&c->sectors_available);
|
||||
do {
|
||||
old = v;
|
||||
get = min((u64) sectors + SECTORS_CACHE, old);
|
||||
|
||||
if (get < sectors) {
|
||||
preempt_enable();
|
||||
percpu_up_read(&c->usage_lock);
|
||||
goto recalculate;
|
||||
}
|
||||
} while ((v = atomic64_cmpxchg(&c->sectors_available,
|
||||
old, old - get)) != old);
|
||||
|
||||
stats->available_cache += get;
|
||||
|
||||
out:
|
||||
stats->available_cache -= sectors;
|
||||
stats->online_reserved += sectors;
|
||||
res->sectors += sectors;
|
||||
|
||||
bch2_disk_reservations_verify(c, flags);
|
||||
bch2_fs_stats_verify(c);
|
||||
preempt_enable();
|
||||
percpu_up_read(&c->usage_lock);
|
||||
return 0;
|
||||
|
||||
recalculate:
|
||||
/*
|
||||
* GC recalculates sectors_available when it starts, so that hopefully
|
||||
* we don't normally end up blocking here:
|
||||
*/
|
||||
|
||||
/*
|
||||
* Unfortunately, we can be called from extent_insert_fixup() with btree
|
||||
* locks held:
|
||||
*/
|
||||
|
||||
if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) {
|
||||
if (!(flags & BCH_DISK_RESERVATION_BTREE_LOCKS_HELD))
|
||||
down_read(&c->gc_lock);
|
||||
else if (!down_read_trylock(&c->gc_lock))
|
||||
return -EINTR;
|
||||
}
|
||||
|
||||
percpu_down_write(&c->usage_lock);
|
||||
sectors_available = __recalc_sectors_available(c);
|
||||
|
||||
if (sectors <= sectors_available ||
|
||||
(flags & BCH_DISK_RESERVATION_NOFAIL)) {
|
||||
atomic64_set(&c->sectors_available,
|
||||
max_t(s64, 0, sectors_available - sectors));
|
||||
stats->online_reserved += sectors;
|
||||
res->sectors += sectors;
|
||||
ret = 0;
|
||||
|
||||
bch2_disk_reservations_verify(c, flags);
|
||||
} else {
|
||||
atomic64_set(&c->sectors_available, sectors_available);
|
||||
ret = -ENOSPC;
|
||||
}
|
||||
|
||||
bch2_fs_stats_verify(c);
|
||||
percpu_up_write(&c->usage_lock);
|
||||
|
||||
if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD))
|
||||
up_read(&c->gc_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Startup/shutdown: */
|
||||
|
||||
static void buckets_free_rcu(struct rcu_head *rcu)
|
||||
{
|
||||
struct bucket_array *buckets =
|
||||
container_of(rcu, struct bucket_array, rcu);
|
||||
|
||||
kvpfree(buckets,
|
||||
sizeof(struct bucket_array) +
|
||||
buckets->nbuckets * sizeof(struct bucket));
|
||||
}
|
||||
|
||||
int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
|
||||
{
|
||||
struct bucket_array *buckets = NULL, *old_buckets = NULL;
|
||||
unsigned long *buckets_dirty = NULL;
|
||||
u8 *oldest_gens = NULL;
|
||||
alloc_fifo free[RESERVE_NR];
|
||||
alloc_fifo free_inc;
|
||||
alloc_heap alloc_heap;
|
||||
copygc_heap copygc_heap;
|
||||
|
||||
size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
|
||||
ca->mi.bucket_size / c->opts.btree_node_size);
|
||||
/* XXX: these should be tunable */
|
||||
size_t reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9);
|
||||
size_t copygc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7);
|
||||
size_t free_inc_reserve = copygc_reserve / 2;
|
||||
bool resize = ca->buckets != NULL,
|
||||
start_copygc = ca->copygc_thread != NULL;
|
||||
int ret = -ENOMEM;
|
||||
unsigned i;
|
||||
|
||||
memset(&free, 0, sizeof(free));
|
||||
memset(&free_inc, 0, sizeof(free_inc));
|
||||
memset(&alloc_heap, 0, sizeof(alloc_heap));
|
||||
memset(&copygc_heap, 0, sizeof(copygc_heap));
|
||||
|
||||
if (!(buckets = kvpmalloc(sizeof(struct bucket_array) +
|
||||
nbuckets * sizeof(struct bucket),
|
||||
GFP_KERNEL|__GFP_ZERO)) ||
|
||||
!(oldest_gens = kvpmalloc(nbuckets * sizeof(u8),
|
||||
GFP_KERNEL|__GFP_ZERO)) ||
|
||||
!(buckets_dirty = kvpmalloc(BITS_TO_LONGS(nbuckets) *
|
||||
sizeof(unsigned long),
|
||||
GFP_KERNEL|__GFP_ZERO)) ||
|
||||
!init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) ||
|
||||
!init_fifo(&free[RESERVE_MOVINGGC],
|
||||
copygc_reserve, GFP_KERNEL) ||
|
||||
!init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
|
||||
!init_fifo(&free_inc, free_inc_reserve, GFP_KERNEL) ||
|
||||
!init_heap(&alloc_heap, free_inc_reserve, GFP_KERNEL) ||
|
||||
!init_heap(&copygc_heap, copygc_reserve, GFP_KERNEL))
|
||||
goto err;
|
||||
|
||||
buckets->first_bucket = ca->mi.first_bucket;
|
||||
buckets->nbuckets = nbuckets;
|
||||
|
||||
bch2_copygc_stop(ca);
|
||||
|
||||
if (resize) {
|
||||
down_write(&c->gc_lock);
|
||||
down_write(&ca->bucket_lock);
|
||||
percpu_down_write(&c->usage_lock);
|
||||
}
|
||||
|
||||
old_buckets = bucket_array(ca);
|
||||
|
||||
if (resize) {
|
||||
size_t n = min(buckets->nbuckets, old_buckets->nbuckets);
|
||||
|
||||
memcpy(buckets->b,
|
||||
old_buckets->b,
|
||||
n * sizeof(struct bucket));
|
||||
memcpy(oldest_gens,
|
||||
ca->oldest_gens,
|
||||
n * sizeof(u8));
|
||||
memcpy(buckets_dirty,
|
||||
ca->buckets_dirty,
|
||||
BITS_TO_LONGS(n) * sizeof(unsigned long));
|
||||
}
|
||||
|
||||
rcu_assign_pointer(ca->buckets, buckets);
|
||||
buckets = old_buckets;
|
||||
|
||||
swap(ca->oldest_gens, oldest_gens);
|
||||
swap(ca->buckets_dirty, buckets_dirty);
|
||||
|
||||
if (resize)
|
||||
percpu_up_write(&c->usage_lock);
|
||||
|
||||
spin_lock(&c->freelist_lock);
|
||||
for (i = 0; i < RESERVE_NR; i++) {
|
||||
fifo_move(&free[i], &ca->free[i]);
|
||||
swap(ca->free[i], free[i]);
|
||||
}
|
||||
fifo_move(&free_inc, &ca->free_inc);
|
||||
swap(ca->free_inc, free_inc);
|
||||
spin_unlock(&c->freelist_lock);
|
||||
|
||||
/* with gc lock held, alloc_heap can't be in use: */
|
||||
swap(ca->alloc_heap, alloc_heap);
|
||||
|
||||
/* and we shut down copygc: */
|
||||
swap(ca->copygc_heap, copygc_heap);
|
||||
|
||||
nbuckets = ca->mi.nbuckets;
|
||||
|
||||
if (resize) {
|
||||
up_write(&ca->bucket_lock);
|
||||
up_write(&c->gc_lock);
|
||||
}
|
||||
|
||||
if (start_copygc &&
|
||||
bch2_copygc_start(c, ca))
|
||||
bch_err(ca, "error restarting copygc thread");
|
||||
|
||||
ret = 0;
|
||||
err:
|
||||
free_heap(&copygc_heap);
|
||||
free_heap(&alloc_heap);
|
||||
free_fifo(&free_inc);
|
||||
for (i = 0; i < RESERVE_NR; i++)
|
||||
free_fifo(&free[i]);
|
||||
kvpfree(buckets_dirty,
|
||||
BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
|
||||
kvpfree(oldest_gens,
|
||||
nbuckets * sizeof(u8));
|
||||
if (buckets)
|
||||
call_rcu(&old_buckets->rcu, buckets_free_rcu);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void bch2_dev_buckets_free(struct bch_dev *ca)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
free_heap(&ca->copygc_heap);
|
||||
free_heap(&ca->alloc_heap);
|
||||
free_fifo(&ca->free_inc);
|
||||
for (i = 0; i < RESERVE_NR; i++)
|
||||
free_fifo(&ca->free[i]);
|
||||
kvpfree(ca->buckets_dirty,
|
||||
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
|
||||
kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
|
||||
kvpfree(rcu_dereference_protected(ca->buckets, 1),
|
||||
sizeof(struct bucket_array) +
|
||||
ca->mi.nbuckets * sizeof(struct bucket));
|
||||
|
||||
free_percpu(ca->usage_percpu);
|
||||
}
|
||||
|
||||
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
if (!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)))
|
||||
return -ENOMEM;
|
||||
|
||||
return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
|
||||
}
|
276
fs/bcachefs/buckets.h
Normal file
@@ -0,0 +1,276 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* Code for manipulating bucket marks for garbage collection.
|
||||
*
|
||||
* Copyright 2014 Datera, Inc.
|
||||
*/
|
||||
|
||||
#ifndef _BUCKETS_H
|
||||
#define _BUCKETS_H
|
||||
|
||||
#include "buckets_types.h"
|
||||
#include "super.h"
|
||||
|
||||
#define for_each_bucket(_b, _buckets) \
|
||||
for (_b = (_buckets)->b + (_buckets)->first_bucket; \
|
||||
_b < (_buckets)->b + (_buckets)->nbuckets; _b++)
|
||||
|
||||
#define bucket_cmpxchg(g, new, expr) \
|
||||
({ \
|
||||
u64 _v = atomic64_read(&(g)->_mark.v); \
|
||||
struct bucket_mark _old; \
|
||||
\
|
||||
do { \
|
||||
(new).v.counter = _old.v.counter = _v; \
|
||||
expr; \
|
||||
} while ((_v = atomic64_cmpxchg(&(g)->_mark.v, \
|
||||
_old.v.counter, \
|
||||
(new).v.counter)) != _old.v.counter);\
|
||||
_old; \
|
||||
})
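/*
 * Illustrative sketch (not part of the original commit): typical use of
 * bucket_cmpxchg() - @expr modifies @new, and the macro loops until the
 * cmpxchg succeeds, returning the mark that was replaced:
 */
#if 0
static void example_add_cached_sectors(struct bucket *g, unsigned sectors)
{
	struct bucket_mark new, old;

	old = bucket_cmpxchg(g, new, ({
		new.cached_sectors += sectors;
	}));
	(void) old;
}
#endif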
|
||||
|
||||
static inline struct bucket_array *bucket_array(struct bch_dev *ca)
|
||||
{
|
||||
return rcu_dereference_check(ca->buckets,
|
||||
!ca->fs ||
|
||||
percpu_rwsem_is_held(&ca->fs->usage_lock) ||
|
||||
lockdep_is_held(&ca->fs->gc_lock) ||
|
||||
lockdep_is_held(&ca->bucket_lock));
|
||||
}
|
||||
|
||||
static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
|
||||
{
|
||||
struct bucket_array *buckets = bucket_array(ca);
|
||||
|
||||
BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
|
||||
return buckets->b + b;
|
||||
}
|
||||
|
||||
static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca,
|
||||
size_t b, int rw)
|
||||
{
|
||||
bucket(ca, b)->io_time[rw] = c->bucket_clock[rw].hand;
|
||||
}
|
||||
|
||||
static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
|
||||
{
|
||||
return c->bucket_clock[rw].hand - g->io_time[rw];
|
||||
}
|
||||
|
||||
/*
|
||||
* bucket_gc_gen() returns the difference between the bucket's current gen and
|
||||
* the oldest gen of any pointer into that bucket in the btree.
|
||||
*/
|
||||
|
||||
static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b)
|
||||
{
|
||||
return bucket(ca, b)->mark.gen - ca->oldest_gens[b];
|
||||
}
|
||||
|
||||
static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
|
||||
const struct bch_extent_ptr *ptr)
|
||||
{
|
||||
return sector_to_bucket(ca, ptr->offset);
|
||||
}
|
||||
|
||||
static inline struct bucket *PTR_BUCKET(struct bch_dev *ca,
|
||||
const struct bch_extent_ptr *ptr)
|
||||
{
|
||||
return bucket(ca, PTR_BUCKET_NR(ca, ptr));
|
||||
}
|
||||
|
||||
static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca,
|
||||
const struct bch_extent_ptr *ptr)
|
||||
{
|
||||
struct bucket_mark m;
|
||||
|
||||
rcu_read_lock();
|
||||
m = READ_ONCE(bucket(ca, PTR_BUCKET_NR(ca, ptr))->mark);
|
||||
rcu_read_unlock();
|
||||
|
||||
return m;
|
||||
}
|
||||
|
||||
static inline int gen_cmp(u8 a, u8 b)
|
||||
{
|
||||
return (s8) (a - b);
|
||||
}
|
||||
|
||||
static inline int gen_after(u8 a, u8 b)
|
||||
{
|
||||
int r = gen_cmp(a, b);
|
||||
|
||||
return r > 0 ? r : 0;
|
||||
}
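/*
 * Worked example (illustrative, not in the original): generation numbers are
 * 8 bit and wrap, so they are compared with signed 8 bit arithmetic:
 * gen_cmp(2, 250) = (s8) (2 - 250) = (s8) 8 = 8, i.e. gen 2 is 8 generations
 * newer than gen 250 despite the wraparound.
 */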
|
||||
|
||||
/**
|
||||
* ptr_stale() - check if a pointer points into a bucket that has been
|
||||
* invalidated.
|
||||
*/
|
||||
static inline u8 ptr_stale(struct bch_dev *ca,
|
||||
const struct bch_extent_ptr *ptr)
|
||||
{
|
||||
return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen);
|
||||
}
|
||||
|
||||
/* bucket gc marks */
|
||||
|
||||
/* The dirty and cached sector counts saturate. If this occurs,
|
||||
* reference counting alone will not free the bucket, and a btree
|
||||
* GC must be performed. */
|
||||
#define GC_MAX_SECTORS_USED ((1U << 15) - 1)
|
||||
|
||||
static inline unsigned bucket_sectors_used(struct bucket_mark mark)
|
||||
{
|
||||
return mark.dirty_sectors + mark.cached_sectors;
|
||||
}
|
||||
|
||||
static inline bool bucket_unused(struct bucket_mark mark)
|
||||
{
|
||||
return !mark.owned_by_allocator &&
|
||||
!mark.data_type &&
|
||||
!bucket_sectors_used(mark);
|
||||
}
|
||||
|
||||
/* Device usage: */
|
||||
|
||||
struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *);
|
||||
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *);
|
||||
|
||||
static inline u64 __dev_buckets_available(struct bch_dev *ca,
|
||||
struct bch_dev_usage stats)
|
||||
{
|
||||
u64 total = ca->mi.nbuckets - ca->mi.first_bucket;
|
||||
|
||||
if (WARN_ONCE(stats.buckets_unavailable > total,
|
||||
"buckets_unavailable overflow (%llu > %llu)\n",
|
||||
stats.buckets_unavailable, total))
|
||||
return 0;
|
||||
|
||||
return total - stats.buckets_unavailable;
|
||||
}
|
||||
|
||||
/*
|
||||
* Number of reclaimable buckets - only for use by the allocator thread:
|
||||
*/
|
||||
static inline u64 dev_buckets_available(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
return __dev_buckets_available(ca, bch2_dev_usage_read(c, ca));
|
||||
}
|
||||
|
||||
static inline u64 __dev_buckets_free(struct bch_dev *ca,
|
||||
struct bch_dev_usage stats)
|
||||
{
|
||||
return __dev_buckets_available(ca, stats) +
|
||||
fifo_used(&ca->free[RESERVE_NONE]) +
|
||||
fifo_used(&ca->free_inc);
|
||||
}
|
||||
|
||||
static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
return __dev_buckets_free(ca, bch2_dev_usage_read(c, ca));
|
||||
}
|
||||
|
||||
/* Filesystem usage: */
|
||||
|
||||
static inline enum bch_data_type s_alloc_to_data_type(enum s_alloc s)
|
||||
{
|
||||
switch (s) {
|
||||
case S_META:
|
||||
return BCH_DATA_BTREE;
|
||||
case S_DIRTY:
|
||||
return BCH_DATA_USER;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *);
|
||||
struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *);
|
||||
void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
|
||||
struct disk_reservation *, struct gc_pos);
|
||||
|
||||
u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
|
||||
u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
|
||||
u64 bch2_fs_sectors_free(struct bch_fs *, struct bch_fs_usage);
|
||||
|
||||
static inline bool is_available_bucket(struct bucket_mark mark)
|
||||
{
|
||||
return (!mark.owned_by_allocator &&
|
||||
!mark.dirty_sectors &&
|
||||
!mark.nouse);
|
||||
}
|
||||
|
||||
static inline bool bucket_needs_journal_commit(struct bucket_mark m,
|
||||
u16 last_seq_ondisk)
|
||||
{
|
||||
return m.journal_seq_valid &&
|
||||
((s16) m.journal_seq - (s16) last_seq_ondisk > 0);
|
||||
}
|
||||
|
||||
void bch2_bucket_seq_cleanup(struct bch_fs *);
|
||||
|
||||
bool bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *,
|
||||
size_t, struct bucket_mark *);
|
||||
void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *,
|
||||
size_t, bool, struct gc_pos, unsigned);
|
||||
void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
|
||||
size_t, enum bch_data_type, unsigned,
|
||||
struct gc_pos, unsigned);
|
||||
|
||||
#define BCH_BUCKET_MARK_NOATOMIC (1 << 0)
|
||||
#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 1)
|
||||
#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 2)
|
||||
#define BCH_BUCKET_MARK_GC_LOCK_HELD (1 << 3)
|
||||
|
||||
void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, struct gc_pos,
|
||||
struct bch_fs_usage *, u64, unsigned);
|
||||
|
||||
void bch2_recalc_sectors_available(struct bch_fs *);
|
||||
|
||||
void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *);
|
||||
|
||||
static inline void bch2_disk_reservation_put(struct bch_fs *c,
|
||||
struct disk_reservation *res)
|
||||
{
|
||||
if (res->sectors)
|
||||
__bch2_disk_reservation_put(c, res);
|
||||
}
|
||||
|
||||
#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
|
||||
#define BCH_DISK_RESERVATION_GC_LOCK_HELD (1 << 1)
|
||||
#define BCH_DISK_RESERVATION_BTREE_LOCKS_HELD (1 << 2)
|
||||
|
||||
int bch2_disk_reservation_add(struct bch_fs *,
|
||||
struct disk_reservation *,
|
||||
unsigned, int);
|
||||
|
||||
static inline struct disk_reservation
|
||||
bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
|
||||
{
|
||||
return (struct disk_reservation) {
|
||||
.sectors = 0,
|
||||
#if 0
|
||||
/* not used yet: */
|
||||
.gen = c->capacity_gen,
|
||||
#endif
|
||||
.nr_replicas = nr_replicas,
|
||||
};
|
||||
}
|
||||
|
||||
static inline int bch2_disk_reservation_get(struct bch_fs *c,
|
||||
struct disk_reservation *res,
|
||||
unsigned sectors,
|
||||
unsigned nr_replicas,
|
||||
int flags)
|
||||
{
|
||||
*res = bch2_disk_reservation_init(c, nr_replicas);
|
||||
|
||||
return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags);
|
||||
}
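/*
 * Illustrative sketch (not part of the original commit): reserving space for
 * an 8 sector write replicated twice, then releasing the reservation.
 */
#if 0
static int example_reserve(struct bch_fs *c)
{
	struct disk_reservation res;
	int ret;

	ret = bch2_disk_reservation_get(c, &res, 8, 2, 0);
	if (ret) /* e.g. -ENOSPC */
		return ret;

	/* ... do the replicated write ... */

	bch2_disk_reservation_put(c, &res);
	return 0;
}
#endif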
|
||||
|
||||
int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64);
|
||||
void bch2_dev_buckets_free(struct bch_dev *);
|
||||
int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *);
|
||||
|
||||
#endif /* _BUCKETS_H */
|
96
fs/bcachefs/buckets_types.h
Normal file
@@ -0,0 +1,96 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BUCKETS_TYPES_H
|
||||
#define _BUCKETS_TYPES_H
|
||||
|
||||
#include "util.h"
|
||||
|
||||
struct bucket_mark {
|
||||
union {
|
||||
struct {
|
||||
atomic64_t v;
|
||||
};
|
||||
|
||||
struct {
|
||||
u8 gen;
|
||||
u8 data_type:3,
|
||||
gen_valid:1,
|
||||
owned_by_allocator:1,
|
||||
nouse:1,
|
||||
journal_seq_valid:1;
|
||||
u16 dirty_sectors;
|
||||
u16 cached_sectors;
|
||||
|
||||
/*
|
||||
* low bits of journal sequence number when this bucket was most
|
||||
* recently modified: if journal_seq_valid is set, this bucket
|
||||
* can't be reused until the journal sequence number written to
|
||||
* disk is >= the bucket's journal sequence number:
|
||||
*/
|
||||
u16 journal_seq;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
struct bucket {
|
||||
union {
|
||||
struct bucket_mark _mark;
|
||||
const struct bucket_mark mark;
|
||||
};
|
||||
|
||||
u16 io_time[2];
|
||||
};
|
||||
|
||||
struct bucket_array {
|
||||
struct rcu_head rcu;
|
||||
u16 first_bucket;
|
||||
size_t nbuckets;
|
||||
struct bucket b[];
|
||||
};
|
||||
|
||||
struct bch_dev_usage {
|
||||
u64 buckets[BCH_DATA_NR];
|
||||
u64 buckets_alloc;
|
||||
u64 buckets_unavailable;
|
||||
|
||||
/* _compressed_ sectors: */
|
||||
u64 sectors[BCH_DATA_NR];
|
||||
u64 sectors_fragmented;
|
||||
};
|
||||
|
||||
/* kill, switch to bch_data_type? */
|
||||
enum s_alloc {
|
||||
S_META,
|
||||
S_DIRTY,
|
||||
S_ALLOC_NR,
|
||||
};
|
||||
|
||||
struct bch_fs_usage {
|
||||
/* all fields are in units of 512 byte sectors: */
|
||||
/* _uncompressed_ sectors: */
|
||||
u64 online_reserved;
|
||||
u64 available_cache;
|
||||
|
||||
struct {
|
||||
u64 data[S_ALLOC_NR];
|
||||
u64 persistent_reserved;
|
||||
} s[BCH_REPLICAS_MAX];
|
||||
};
|
||||
|
||||
/*
|
||||
* A reservation for space on disk:
|
||||
*/
|
||||
struct disk_reservation {
|
||||
u64 sectors;
|
||||
u32 gen;
|
||||
unsigned nr_replicas;
|
||||
};
|
||||
|
||||
struct copygc_heap_entry {
|
||||
u8 gen;
|
||||
u32 sectors;
|
||||
u64 offset;
|
||||
};
|
||||
|
||||
typedef HEAP(struct copygc_heap_entry) copygc_heap;
|
||||
|
||||
#endif /* _BUCKETS_TYPES_H */
|
663
fs/bcachefs/chardev.c
Normal file
@@ -0,0 +1,663 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#ifndef NO_BCACHEFS_CHARDEV
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "alloc.h"
|
||||
#include "bcachefs_ioctl.h"
|
||||
#include "buckets.h"
|
||||
#include "chardev.h"
|
||||
#include "move.h"
|
||||
#include "super.h"
|
||||
#include "super-io.h"
|
||||
|
||||
#include <linux/anon_inodes.h>
|
||||
#include <linux/cdev.h>
|
||||
#include <linux/device.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/ioctl.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/major.h>
|
||||
#include <linux/sched/task.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/uaccess.h>
|
||||
|
||||
/* returns with ref on ca->ref */
|
||||
static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
|
||||
unsigned flags)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
|
||||
if (flags & BCH_BY_INDEX) {
|
||||
if (dev >= c->sb.nr_devices)
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
rcu_read_lock();
|
||||
ca = rcu_dereference(c->devs[dev]);
|
||||
if (ca)
|
||||
percpu_ref_get(&ca->ref);
|
||||
rcu_read_unlock();
|
||||
|
||||
if (!ca)
|
||||
return ERR_PTR(-EINVAL);
|
||||
} else {
|
||||
char *path;
|
||||
|
||||
path = strndup_user((const char __user *)
|
||||
(unsigned long) dev, PATH_MAX);
|
||||
if (IS_ERR(path))
|
||||
return ERR_CAST(path);
|
||||
|
||||
ca = bch2_dev_lookup(c, path);
|
||||
kfree(path);
|
||||
}
|
||||
|
||||
return ca;
|
||||
}
|
||||
|
||||
#if 0
|
||||
static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
|
||||
{
|
||||
struct bch_ioctl_assemble arg;
|
||||
struct bch_fs *c;
|
||||
u64 *user_devs = NULL;
|
||||
char **devs = NULL;
|
||||
unsigned i;
|
||||
int ret = -EFAULT;
|
||||
|
||||
if (copy_from_user(&arg, user_arg, sizeof(arg)))
|
||||
return -EFAULT;
|
||||
|
||||
if (arg.flags || arg.pad)
|
||||
return -EINVAL;
|
||||
|
||||
user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL);
|
||||
if (!user_devs)
|
||||
return -ENOMEM;
|
||||
|
||||
devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL);
|
||||
|
||||
if (copy_from_user(user_devs, user_arg->devs,
|
||||
sizeof(u64) * arg.nr_devs))
|
||||
goto err;
|
||||
|
||||
for (i = 0; i < arg.nr_devs; i++) {
|
||||
devs[i] = strndup_user((const char __user *)(unsigned long)
|
||||
user_devs[i],
|
||||
PATH_MAX);
|
||||
if (!devs[i]) {
|
||||
ret = -ENOMEM;
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
|
||||
c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty());
|
||||
ret = PTR_ERR_OR_ZERO(c);
|
||||
if (!ret)
|
||||
closure_put(&c->cl);
|
||||
err:
|
||||
if (devs)
|
||||
for (i = 0; i < arg.nr_devs; i++)
|
||||
kfree(devs[i]);
|
||||
kfree(devs);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg)
|
||||
{
|
||||
struct bch_ioctl_incremental arg;
|
||||
const char *err;
|
||||
char *path;
|
||||
|
||||
if (copy_from_user(&arg, user_arg, sizeof(arg)))
|
||||
return -EFAULT;
|
||||
|
||||
if (arg.flags || arg.pad)
|
||||
return -EINVAL;
|
||||
|
||||
path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
|
||||
if (!path)
|
||||
return -ENOMEM;
|
||||
|
||||
err = bch2_fs_open_incremental(path);
|
||||
kfree(path);
|
||||
|
||||
if (err) {
|
||||
pr_err("Could not register bcachefs devices: %s", err);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
static long bch2_global_ioctl(unsigned cmd, void __user *arg)
|
||||
{
|
||||
switch (cmd) {
|
||||
#if 0
|
||||
case BCH_IOCTL_ASSEMBLE:
|
||||
return bch2_ioctl_assemble(arg);
|
||||
case BCH_IOCTL_INCREMENTAL:
|
||||
return bch2_ioctl_incremental(arg);
|
||||
#endif
|
||||
default:
|
||||
return -ENOTTY;
|
||||
}
|
||||
}
|
||||
|
||||
static long bch2_ioctl_query_uuid(struct bch_fs *c,
|
||||
struct bch_ioctl_query_uuid __user *user_arg)
|
||||
{
|
||||
return copy_to_user(&user_arg->uuid,
|
||||
&c->sb.user_uuid,
|
||||
sizeof(c->sb.user_uuid));
|
||||
}
|
||||
|
||||
#if 0
|
||||
static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg)
|
||||
{
|
||||
if (arg.flags || arg.pad)
|
||||
return -EINVAL;
|
||||
|
||||
return bch2_fs_start(c) ? -EIO : 0;
|
||||
}
|
||||
|
||||
static long bch2_ioctl_stop(struct bch_fs *c)
|
||||
{
|
||||
bch2_fs_stop(c);
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
|
||||
{
|
||||
char *path;
|
||||
int ret;
|
||||
|
||||
if (arg.flags || arg.pad)
|
||||
return -EINVAL;
|
||||
|
||||
path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
|
||||
if (!path)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = bch2_dev_add(c, path);
|
||||
kfree(path);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
|
||||
if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
|
||||
BCH_FORCE_IF_METADATA_LOST|
|
||||
BCH_FORCE_IF_DEGRADED|
|
||||
BCH_BY_INDEX)) ||
|
||||
arg.pad)
|
||||
return -EINVAL;
|
||||
|
||||
ca = bch2_device_lookup(c, arg.dev, arg.flags);
|
||||
if (IS_ERR(ca))
|
||||
return PTR_ERR(ca);
|
||||
|
||||
return bch2_dev_remove(c, ca, arg.flags);
|
||||
}
|
||||
|
||||
static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg)
|
||||
{
|
||||
char *path;
|
||||
int ret;
|
||||
|
||||
if (arg.flags || arg.pad)
|
||||
return -EINVAL;
|
||||
|
||||
path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
|
||||
if (!path)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = bch2_dev_online(c, path);
|
||||
kfree(path);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
int ret;
|
||||
|
||||
if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
|
||||
BCH_FORCE_IF_METADATA_LOST|
|
||||
BCH_FORCE_IF_DEGRADED|
|
||||
BCH_BY_INDEX)) ||
|
||||
arg.pad)
|
||||
return -EINVAL;
|
||||
|
||||
ca = bch2_device_lookup(c, arg.dev, arg.flags);
|
||||
if (IS_ERR(ca))
|
||||
return PTR_ERR(ca);
|
||||
|
||||
ret = bch2_dev_offline(c, ca, arg.flags);
|
||||
percpu_ref_put(&ca->ref);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static long bch2_ioctl_disk_set_state(struct bch_fs *c,
|
||||
struct bch_ioctl_disk_set_state arg)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
int ret;
|
||||
|
||||
if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
|
||||
BCH_FORCE_IF_METADATA_LOST|
|
||||
BCH_FORCE_IF_DEGRADED|
|
||||
BCH_BY_INDEX)) ||
|
||||
arg.pad[0] || arg.pad[1] || arg.pad[2])
|
||||
return -EINVAL;
|
||||
|
||||
ca = bch2_device_lookup(c, arg.dev, arg.flags);
|
||||
if (IS_ERR(ca))
|
||||
return PTR_ERR(ca);
|
||||
|
||||
ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags);
|
||||
|
||||
percpu_ref_put(&ca->ref);
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct bch_data_ctx {
|
||||
struct bch_fs *c;
|
||||
struct bch_ioctl_data arg;
|
||||
struct bch_move_stats stats;
|
||||
|
||||
int ret;
|
||||
|
||||
struct task_struct *thread;
|
||||
};
|
||||
|
||||
static int bch2_data_thread(void *arg)
|
||||
{
|
||||
struct bch_data_ctx *ctx = arg;
|
||||
|
||||
ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
|
||||
|
||||
ctx->stats.data_type = U8_MAX;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int bch2_data_job_release(struct inode *inode, struct file *file)
|
||||
{
|
||||
struct bch_data_ctx *ctx = file->private_data;
|
||||
|
||||
kthread_stop(ctx->thread);
|
||||
put_task_struct(ctx->thread);
|
||||
kfree(ctx);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
|
||||
size_t len, loff_t *ppos)
|
||||
{
|
||||
struct bch_data_ctx *ctx = file->private_data;
|
||||
struct bch_fs *c = ctx->c;
|
||||
struct bch_ioctl_data_event e = {
|
||||
.type = BCH_DATA_EVENT_PROGRESS,
|
||||
.p.data_type = ctx->stats.data_type,
|
||||
.p.btree_id = ctx->stats.iter.btree_id,
|
||||
.p.pos = ctx->stats.iter.pos,
|
||||
.p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
|
||||
.p.sectors_total = bch2_fs_sectors_used(c, bch2_fs_usage_read(c)),
|
||||
};
|
||||
|
||||
if (len < sizeof(e))
|
||||
return -EINVAL;
|
||||
|
||||
return copy_to_user(buf, &e, sizeof(e)) ?: sizeof(e);
|
||||
}
|
||||
|
||||
static const struct file_operations bcachefs_data_ops = {
|
||||
.release = bch2_data_job_release,
|
||||
.read = bch2_data_job_read,
|
||||
.llseek = no_llseek,
|
||||
};
|
||||
|
||||
static long bch2_ioctl_data(struct bch_fs *c,
|
||||
struct bch_ioctl_data arg)
|
||||
{
|
||||
struct bch_data_ctx *ctx = NULL;
|
||||
struct file *file = NULL;
|
||||
unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK;
|
||||
int ret, fd = -1;
|
||||
|
||||
if (arg.op >= BCH_DATA_OP_NR || arg.flags)
|
||||
return -EINVAL;
|
||||
|
||||
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
|
||||
if (!ctx)
|
||||
return -ENOMEM;
|
||||
|
||||
ctx->c = c;
|
||||
ctx->arg = arg;
|
||||
|
||||
ctx->thread = kthread_create(bch2_data_thread, ctx, "[bcachefs]");
|
||||
if (IS_ERR(ctx->thread)) {
|
||||
ret = PTR_ERR(ctx->thread);
|
||||
goto err;
|
||||
}
|
||||
|
||||
ret = get_unused_fd_flags(flags);
|
||||
if (ret < 0)
|
||||
goto err;
|
||||
fd = ret;
|
||||
|
||||
file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags);
|
||||
if (IS_ERR(file)) {
|
||||
ret = PTR_ERR(file);
|
||||
goto err;
|
||||
}
|
||||
|
||||
fd_install(fd, file);
|
||||
|
||||
get_task_struct(ctx->thread);
|
||||
wake_up_process(ctx->thread);
|
||||
|
||||
return fd;
|
||||
err:
|
||||
if (fd >= 0)
|
||||
put_unused_fd(fd);
|
||||
if (!IS_ERR_OR_NULL(ctx->thread))
|
||||
kthread_stop(ctx->thread);
|
||||
kfree(ctx);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static long bch2_ioctl_usage(struct bch_fs *c,
|
||||
struct bch_ioctl_usage __user *user_arg)
|
||||
{
|
||||
struct bch_ioctl_usage arg;
|
||||
struct bch_dev *ca;
|
||||
unsigned i, j;
|
||||
int ret;
|
||||
|
||||
if (!test_bit(BCH_FS_STARTED, &c->flags))
|
||||
return -EINVAL;
|
||||
|
||||
if (copy_from_user(&arg, user_arg, sizeof(arg)))
|
||||
return -EFAULT;
|
||||
|
||||
for (i = 0; i < arg.nr_devices; i++) {
|
||||
struct bch_ioctl_dev_usage dst = { .alive = 0 };
|
||||
|
||||
ret = copy_to_user(&user_arg->devs[i], &dst, sizeof(dst));
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
{
|
||||
struct bch_fs_usage src = bch2_fs_usage_read(c);
|
||||
struct bch_ioctl_fs_usage dst = {
|
||||
.capacity = c->capacity,
|
||||
.used = bch2_fs_sectors_used(c, src),
|
||||
.online_reserved = src.online_reserved,
|
||||
};
|
||||
|
||||
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
|
||||
dst.persistent_reserved[i] =
|
||||
src.s[i].persistent_reserved;
|
||||
|
||||
for (j = 0; j < S_ALLOC_NR; j++)
|
||||
dst.sectors[s_alloc_to_data_type(j)][i] =
|
||||
src.s[i].data[j];
|
||||
}
|
||||
|
||||
ret = copy_to_user(&user_arg->fs, &dst, sizeof(dst));
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
for_each_member_device(ca, c, i) {
|
||||
struct bch_dev_usage src = bch2_dev_usage_read(c, ca);
|
||||
struct bch_ioctl_dev_usage dst = {
|
||||
.alive = 1,
|
||||
.state = ca->mi.state,
|
||||
.bucket_size = ca->mi.bucket_size,
|
||||
.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket,
|
||||
};
|
||||
|
||||
if (ca->dev_idx >= arg.nr_devices) {
|
||||
percpu_ref_put(&ca->ref);
|
||||
return -ERANGE;
|
||||
}
|
||||
|
||||
if (percpu_ref_tryget(&ca->io_ref)) {
|
||||
dst.dev = huge_encode_dev(ca->disk_sb.bdev->bd_dev);
|
||||
percpu_ref_put(&ca->io_ref);
|
||||
}
|
||||
|
||||
for (j = 0; j < BCH_DATA_NR; j++) {
|
||||
dst.buckets[j] = src.buckets[j];
|
||||
dst.sectors[j] = src.sectors[j];
|
||||
}
|
||||
|
||||
ret = copy_to_user(&user_arg->devs[i], &dst, sizeof(dst));
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
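The two passes above first mark every slot of the caller's devs[] array as not alive and then fill in the live member devices, so userspace only has to size the array generously. A hedged sketch of the intended call pattern, assuming the UAPI layout of struct bch_ioctl_usage (an nr_devices count plus a trailing devs[] array):

	/* Hypothetical userspace caller; -ERANGE means nr_devices was too small: */
	unsigned nr = 64;
	struct bch_ioctl_usage *u = calloc(1, sizeof(*u) + nr * sizeof(u->devs[0]));

	u->nr_devices = nr;
	if (ioctl(ctl_fd, BCH_IOCTL_USAGE, u) == 0)
		printf("capacity %llu, used %llu sectors\n",
		       (unsigned long long) u->fs.capacity,
		       (unsigned long long) u->fs.used);
	free(u);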
|
||||
|
||||
static long bch2_ioctl_read_super(struct bch_fs *c,
|
||||
struct bch_ioctl_read_super arg)
|
||||
{
|
||||
struct bch_dev *ca = NULL;
|
||||
struct bch_sb *sb;
|
||||
int ret = 0;
|
||||
|
||||
if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) ||
|
||||
arg.pad)
|
||||
return -EINVAL;
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
|
||||
if (arg.flags & BCH_READ_DEV) {
|
||||
ca = bch2_device_lookup(c, arg.dev, arg.flags);
|
||||
|
||||
if (IS_ERR(ca)) {
|
||||
ret = PTR_ERR(ca);
|
||||
goto err;
|
||||
}
|
||||
|
||||
sb = ca->disk_sb.sb;
|
||||
} else {
|
||||
sb = c->disk_sb.sb;
|
||||
}
|
||||
|
||||
if (vstruct_bytes(sb) > arg.size) {
|
||||
ret = -ERANGE;
|
||||
goto err;
|
||||
}
|
||||
|
||||
ret = copy_to_user((void __user *)(unsigned long)arg.sb,
|
||||
sb, vstruct_bytes(sb));
|
||||
err:
|
||||
if (ca)
|
||||
percpu_ref_put(&ca->ref);
|
||||
mutex_unlock(&c->sb_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
|
||||
struct bch_ioctl_disk_get_idx arg)
|
||||
{
|
||||
dev_t dev = huge_decode_dev(arg.dev);
|
||||
struct bch_dev *ca;
|
||||
unsigned i;
|
||||
|
||||
for_each_online_member(ca, c, i)
|
||||
if (ca->disk_sb.bdev->bd_dev == dev) {
|
||||
percpu_ref_put(&ca->io_ref);
|
||||
return i;
|
||||
}
|
||||
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
static long bch2_ioctl_disk_resize(struct bch_fs *c,
|
||||
struct bch_ioctl_disk_resize arg)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
int ret;
|
||||
|
||||
if ((arg.flags & ~BCH_BY_INDEX) ||
|
||||
arg.pad)
|
||||
return -EINVAL;
|
||||
|
||||
ca = bch2_device_lookup(c, arg.dev, arg.flags);
|
||||
if (IS_ERR(ca))
|
||||
return PTR_ERR(ca);
|
||||
|
||||
ret = bch2_dev_resize(c, ca, arg.nbuckets);
|
||||
|
||||
percpu_ref_put(&ca->ref);
|
||||
return ret;
|
||||
}
|
||||
|
||||
#define BCH_IOCTL(_name, _argtype) \
|
||||
do { \
|
||||
_argtype i; \
|
||||
\
|
||||
if (copy_from_user(&i, arg, sizeof(i))) \
|
||||
return -EFAULT; \
|
||||
return bch2_ioctl_##_name(c, i); \
|
||||
} while (0)
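The BCH_IOCTL() helper above bundles the copy_from_user() of the ioctl argument with the call into the per-command handler. As a readability aid (a sketch, not additional code), this is approximately what a single case in the dispatcher below expands to after preprocessing:

	/* Sketch: expansion of
	 *   case BCH_IOCTL_DISK_RESIZE:
	 *       BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
	 */
	case BCH_IOCTL_DISK_RESIZE:
		do {
			struct bch_ioctl_disk_resize i;

			if (copy_from_user(&i, arg, sizeof(i)))
				return -EFAULT;
			return bch2_ioctl_disk_resize(c, i);
		} while (0);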
|
||||
|
||||
long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
|
||||
{
|
||||
/* ioctls that don't require admin cap: */
|
||||
switch (cmd) {
|
||||
case BCH_IOCTL_QUERY_UUID:
|
||||
return bch2_ioctl_query_uuid(c, arg);
|
||||
case BCH_IOCTL_USAGE:
|
||||
return bch2_ioctl_usage(c, arg);
|
||||
}
|
||||
|
||||
if (!capable(CAP_SYS_ADMIN))
|
||||
return -EPERM;
|
||||
|
||||
switch (cmd) {
|
||||
#if 0
|
||||
case BCH_IOCTL_START:
|
||||
BCH_IOCTL(start, struct bch_ioctl_start);
|
||||
case BCH_IOCTL_STOP:
|
||||
return bch2_ioctl_stop(c);
|
||||
#endif
|
||||
case BCH_IOCTL_READ_SUPER:
|
||||
BCH_IOCTL(read_super, struct bch_ioctl_read_super);
|
||||
case BCH_IOCTL_DISK_GET_IDX:
|
||||
BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx);
|
||||
}
|
||||
|
||||
if (!test_bit(BCH_FS_STARTED, &c->flags))
|
||||
return -EINVAL;
|
||||
|
||||
/* ioctls that do require admin cap: */
|
||||
switch (cmd) {
|
||||
case BCH_IOCTL_DISK_ADD:
|
||||
BCH_IOCTL(disk_add, struct bch_ioctl_disk);
|
||||
case BCH_IOCTL_DISK_REMOVE:
|
||||
BCH_IOCTL(disk_remove, struct bch_ioctl_disk);
|
||||
case BCH_IOCTL_DISK_ONLINE:
|
||||
BCH_IOCTL(disk_online, struct bch_ioctl_disk);
|
||||
case BCH_IOCTL_DISK_OFFLINE:
|
||||
BCH_IOCTL(disk_offline, struct bch_ioctl_disk);
|
||||
case BCH_IOCTL_DISK_SET_STATE:
|
||||
BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state);
|
||||
case BCH_IOCTL_DATA:
|
||||
BCH_IOCTL(data, struct bch_ioctl_data);
|
||||
case BCH_IOCTL_DISK_RESIZE:
|
||||
BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
|
||||
|
||||
default:
|
||||
return -ENOTTY;
|
||||
}
|
||||
}
|
||||
|
||||
static DEFINE_IDR(bch_chardev_minor);
|
||||
|
||||
static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v)
|
||||
{
|
||||
unsigned minor = iminor(file_inode(filp));
|
||||
struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL;
|
||||
void __user *arg = (void __user *) v;
|
||||
|
||||
return c
|
||||
? bch2_fs_ioctl(c, cmd, arg)
|
||||
: bch2_global_ioctl(cmd, arg);
|
||||
}
|
||||
|
||||
static const struct file_operations bch_chardev_fops = {
|
||||
.owner = THIS_MODULE,
|
||||
.unlocked_ioctl = bch2_chardev_ioctl,
|
||||
.open = nonseekable_open,
|
||||
};
|
||||
|
||||
static int bch_chardev_major;
|
||||
static struct class *bch_chardev_class;
|
||||
static struct device *bch_chardev;
|
||||
|
||||
void bch2_fs_chardev_exit(struct bch_fs *c)
|
||||
{
|
||||
if (!IS_ERR_OR_NULL(c->chardev))
|
||||
device_unregister(c->chardev);
|
||||
if (c->minor >= 0)
|
||||
idr_remove(&bch_chardev_minor, c->minor);
|
||||
}
|
||||
|
||||
int bch2_fs_chardev_init(struct bch_fs *c)
|
||||
{
|
||||
c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL);
|
||||
if (c->minor < 0)
|
||||
return c->minor;
|
||||
|
||||
c->chardev = device_create(bch_chardev_class, NULL,
|
||||
MKDEV(bch_chardev_major, c->minor), c,
|
||||
"bcachefs%u-ctl", c->minor);
|
||||
if (IS_ERR(c->chardev))
|
||||
return PTR_ERR(c->chardev);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void bch2_chardev_exit(void)
|
||||
{
|
||||
if (!IS_ERR_OR_NULL(bch_chardev_class))
|
||||
device_destroy(bch_chardev_class,
|
||||
MKDEV(bch_chardev_major, U8_MAX));
|
||||
if (!IS_ERR_OR_NULL(bch_chardev_class))
|
||||
class_destroy(bch_chardev_class);
|
||||
if (bch_chardev_major > 0)
|
||||
unregister_chrdev(bch_chardev_major, "bcachefs");
|
||||
}
|
||||
|
||||
int __init bch2_chardev_init(void)
|
||||
{
|
||||
bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops);
|
||||
if (bch_chardev_major < 0)
|
||||
return bch_chardev_major;
|
||||
|
||||
bch_chardev_class = class_create("bcachefs");
|
||||
if (IS_ERR(bch_chardev_class))
|
||||
return PTR_ERR(bch_chardev_class);
|
||||
|
||||
bch_chardev = device_create(bch_chardev_class, NULL,
|
||||
MKDEV(bch_chardev_major, U8_MAX),
|
||||
NULL, "bcachefs-ctl");
|
||||
if (IS_ERR(bch_chardev))
|
||||
return PTR_ERR(bch_chardev);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif /* NO_BCACHEFS_CHARDEV */
|
31
fs/bcachefs/chardev.h
Normal file
@ -0,0 +1,31 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_CHARDEV_H
|
||||
#define _BCACHEFS_CHARDEV_H
|
||||
|
||||
#ifndef NO_BCACHEFS_FS
|
||||
|
||||
long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *);
|
||||
|
||||
void bch2_fs_chardev_exit(struct bch_fs *);
|
||||
int bch2_fs_chardev_init(struct bch_fs *);
|
||||
|
||||
void bch2_chardev_exit(void);
|
||||
int __init bch2_chardev_init(void);
|
||||
|
||||
#else
|
||||
|
||||
static inline long bch2_fs_ioctl(struct bch_fs *c,
|
||||
unsigned cmd, void __user * arg)
|
||||
{
|
||||
return -ENOSYS;
|
||||
}
|
||||
|
||||
static inline void bch2_fs_chardev_exit(struct bch_fs *c) {}
|
||||
static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; }
|
||||
|
||||
static inline void bch2_chardev_exit(void) {}
|
||||
static inline int __init bch2_chardev_init(void) { return 0; }
|
||||
|
||||
#endif /* NO_BCACHEFS_FS */
|
||||
|
||||
#endif /* _BCACHEFS_CHARDEV_H */
|
753
fs/bcachefs/checksum.c
Normal file
@ -0,0 +1,753 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include "bcachefs.h"
|
||||
#include "checksum.h"
|
||||
#include "super.h"
|
||||
#include "super-io.h"
|
||||
|
||||
#include <linux/crc32c.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/key.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/scatterlist.h>
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/chacha.h>
|
||||
#include <crypto/hash.h>
|
||||
#include <crypto/poly1305.h>
|
||||
#include <crypto/skcipher.h>
|
||||
#include <keys/user-type.h>
|
||||
|
||||
/*
|
||||
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
|
||||
* use permitted, subject to terms of PostgreSQL license; see.)
|
||||
|
||||
* If we have a 64-bit integer type, then a 64-bit CRC looks just like the
|
||||
* usual sort of implementation. (See Ross Williams' excellent introduction
|
||||
* A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from
|
||||
* ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.)
|
||||
* If we have no working 64-bit type, then fake it with two 32-bit registers.
|
||||
*
|
||||
* The present implementation is a normal (not "reflected", in Williams'
|
||||
* terms) 64-bit CRC, using initial all-ones register contents and a final
|
||||
* bit inversion. The chosen polynomial is borrowed from the DLT1 spec
|
||||
* (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM):
|
||||
*
|
||||
* x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
|
||||
* x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
|
||||
* x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
|
||||
* x^7 + x^4 + x + 1
|
||||
*/
|
||||
|
||||
static const u64 crc_table[256] = {
|
||||
0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL,
|
||||
0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL,
|
||||
0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL,
|
||||
0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL,
|
||||
0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL,
|
||||
0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL,
|
||||
0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL,
|
||||
0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL,
|
||||
0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL,
|
||||
0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL,
|
||||
0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL,
|
||||
0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL,
|
||||
0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL,
|
||||
0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL,
|
||||
0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL,
|
||||
0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL,
|
||||
0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL,
|
||||
0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL,
|
||||
0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL,
|
||||
0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL,
|
||||
0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL,
|
||||
0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL,
|
||||
0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL,
|
||||
0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL,
|
||||
0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL,
|
||||
0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL,
|
||||
0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL,
|
||||
0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL,
|
||||
0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL,
|
||||
0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL,
|
||||
0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL,
|
||||
0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL,
|
||||
0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL,
|
||||
0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL,
|
||||
0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL,
|
||||
0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL,
|
||||
0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL,
|
||||
0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL,
|
||||
0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL,
|
||||
0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL,
|
||||
0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL,
|
||||
0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL,
|
||||
0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL,
|
||||
0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL,
|
||||
0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL,
|
||||
0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL,
|
||||
0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL,
|
||||
0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL,
|
||||
0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL,
|
||||
0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL,
|
||||
0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL,
|
||||
0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL,
|
||||
0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL,
|
||||
0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL,
|
||||
0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL,
|
||||
0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL,
|
||||
0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL,
|
||||
0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL,
|
||||
0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL,
|
||||
0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL,
|
||||
0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL,
|
||||
0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL,
|
||||
0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL,
|
||||
0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL,
|
||||
0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL,
|
||||
0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL,
|
||||
0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL,
|
||||
0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL,
|
||||
0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL,
|
||||
0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL,
|
||||
0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL,
|
||||
0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL,
|
||||
0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL,
|
||||
0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL,
|
||||
0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL,
|
||||
0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL,
|
||||
0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL,
|
||||
0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL,
|
||||
0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL,
|
||||
0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL,
|
||||
0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL,
|
||||
0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL,
|
||||
0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL,
|
||||
0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL,
|
||||
0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL,
|
||||
0x9AFCE626CE85B507ULL,
|
||||
};
|
||||
|
||||
u64 bch2_crc64_update(u64 crc, const void *_data, size_t len)
|
||||
{
|
||||
const unsigned char *data = _data;
|
||||
|
||||
while (len--) {
|
||||
int i = ((int) (crc >> 56) ^ *data++) & 0xFF;
|
||||
crc = crc_table[i] ^ (crc << 8);
|
||||
}
|
||||
|
||||
return crc;
|
||||
}
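bch2_crc64_update() is only the table-driven inner loop; the all-ones seed and final bit inversion described in the comment above live in bch2_checksum_init()/bch2_checksum_final() below. A minimal sketch of how the pieces compose for the BCH_CSUM_CRC64_NONZERO variant (illustrative helper, not part of the file):

static u64 crc64_nonzero_example(const void *data, size_t len)
{
	u64 crc = U64_MAX;			/* initial all-ones register */

	crc = bch2_crc64_update(crc, data, len);
	return crc ^ U64_MAX;			/* final bit inversion */
}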
|
||||
|
||||
static u64 bch2_checksum_init(unsigned type)
|
||||
{
|
||||
switch (type) {
|
||||
case BCH_CSUM_NONE:
|
||||
return 0;
|
||||
case BCH_CSUM_CRC32C_NONZERO:
|
||||
return U32_MAX;
|
||||
case BCH_CSUM_CRC64_NONZERO:
|
||||
return U64_MAX;
|
||||
case BCH_CSUM_CRC32C:
|
||||
return 0;
|
||||
case BCH_CSUM_CRC64:
|
||||
return 0;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
static u64 bch2_checksum_final(unsigned type, u64 crc)
|
||||
{
|
||||
switch (type) {
|
||||
case BCH_CSUM_NONE:
|
||||
return 0;
|
||||
case BCH_CSUM_CRC32C_NONZERO:
|
||||
return crc ^ U32_MAX;
|
||||
case BCH_CSUM_CRC64_NONZERO:
|
||||
return crc ^ U64_MAX;
|
||||
case BCH_CSUM_CRC32C:
|
||||
return crc;
|
||||
case BCH_CSUM_CRC64:
|
||||
return crc;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t len)
|
||||
{
|
||||
switch (type) {
|
||||
case BCH_CSUM_NONE:
|
||||
return 0;
|
||||
case BCH_CSUM_CRC32C_NONZERO:
|
||||
case BCH_CSUM_CRC32C:
|
||||
return crc32c(crc, data, len);
|
||||
case BCH_CSUM_CRC64_NONZERO:
|
||||
case BCH_CSUM_CRC64:
|
||||
return bch2_crc64_update(crc, data, len);
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm,
|
||||
struct nonce nonce,
|
||||
struct scatterlist *sg, size_t len)
|
||||
{
|
||||
SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
|
||||
int ret;
|
||||
|
||||
skcipher_request_set_sync_tfm(req, tfm);
|
||||
skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
|
||||
|
||||
ret = crypto_skcipher_encrypt(req);
|
||||
BUG_ON(ret);
|
||||
}
|
||||
|
||||
static inline void do_encrypt(struct crypto_sync_skcipher *tfm,
|
||||
struct nonce nonce,
|
||||
void *buf, size_t len)
|
||||
{
|
||||
struct scatterlist sg;
|
||||
|
||||
sg_init_one(&sg, buf, len);
|
||||
do_encrypt_sg(tfm, nonce, &sg, len);
|
||||
}
|
||||
|
||||
int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
|
||||
void *buf, size_t len)
|
||||
{
|
||||
struct crypto_sync_skcipher *chacha20 =
|
||||
crypto_alloc_sync_skcipher("chacha20", 0, 0);
|
||||
int ret;
|
||||
|
||||
if (IS_ERR(chacha20)) {
|
||||
pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20));
|
||||
return PTR_ERR(chacha20);
|
||||
}
|
||||
|
||||
ret = crypto_skcipher_setkey(&chacha20->base,
|
||||
(void *) key, sizeof(*key));
|
||||
if (ret) {
|
||||
pr_err("crypto_skcipher_setkey() error: %i", ret);
|
||||
goto err;
|
||||
}
|
||||
|
||||
do_encrypt(chacha20, nonce, buf, len);
|
||||
err:
|
||||
crypto_free_sync_skcipher(chacha20);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc,
|
||||
struct nonce nonce)
|
||||
{
|
||||
u8 key[POLY1305_KEY_SIZE];
|
||||
|
||||
nonce.d[3] ^= BCH_NONCE_POLY;
|
||||
|
||||
memset(key, 0, sizeof(key));
|
||||
do_encrypt(c->chacha20, nonce, key, sizeof(key));
|
||||
|
||||
desc->tfm = c->poly1305;
|
||||
crypto_shash_init(desc);
|
||||
crypto_shash_update(desc, key, sizeof(key));
|
||||
}
|
||||
|
||||
struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
|
||||
struct nonce nonce, const void *data, size_t len)
|
||||
{
|
||||
switch (type) {
|
||||
case BCH_CSUM_NONE:
|
||||
case BCH_CSUM_CRC32C_NONZERO:
|
||||
case BCH_CSUM_CRC64_NONZERO:
|
||||
case BCH_CSUM_CRC32C:
|
||||
case BCH_CSUM_CRC64: {
|
||||
u64 crc = bch2_checksum_init(type);
|
||||
|
||||
crc = bch2_checksum_update(type, crc, data, len);
|
||||
crc = bch2_checksum_final(type, crc);
|
||||
|
||||
return (struct bch_csum) { .lo = cpu_to_le64(crc) };
|
||||
}
|
||||
|
||||
case BCH_CSUM_CHACHA20_POLY1305_80:
|
||||
case BCH_CSUM_CHACHA20_POLY1305_128: {
|
||||
SHASH_DESC_ON_STACK(desc, c->poly1305);
|
||||
u8 digest[POLY1305_DIGEST_SIZE];
|
||||
struct bch_csum ret = { 0 };
|
||||
|
||||
gen_poly_key(c, desc, nonce);
|
||||
|
||||
crypto_shash_update(desc, data, len);
|
||||
crypto_shash_final(desc, digest);
|
||||
|
||||
memcpy(&ret, digest, bch_crc_bytes[type]);
|
||||
return ret;
|
||||
}
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
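A typical consumer computes a checksum with bch2_checksum() and compares it against the stored value with bch2_crc_cmp() from checksum.h (which, note, returns true when the checksums differ). A minimal sketch of that pattern, with hypothetical local variable names:

	struct bch_csum got = bch2_checksum(c, csum_type, nonce, data, len);

	if (bch2_crc_cmp(got, stored_csum))
		return -EIO;	/* checksum mismatch */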
|
||||
|
||||
void bch2_encrypt(struct bch_fs *c, unsigned type,
|
||||
struct nonce nonce, void *data, size_t len)
|
||||
{
|
||||
if (!bch2_csum_type_is_encryption(type))
|
||||
return;
|
||||
|
||||
do_encrypt(c->chacha20, nonce, data, len);
|
||||
}
|
||||
|
||||
static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
|
||||
struct nonce nonce, struct bio *bio,
|
||||
struct bvec_iter *iter)
|
||||
{
|
||||
struct bio_vec bv;
|
||||
|
||||
switch (type) {
|
||||
case BCH_CSUM_NONE:
|
||||
return (struct bch_csum) { 0 };
|
||||
case BCH_CSUM_CRC32C_NONZERO:
|
||||
case BCH_CSUM_CRC64_NONZERO:
|
||||
case BCH_CSUM_CRC32C:
|
||||
case BCH_CSUM_CRC64: {
|
||||
u64 crc = bch2_checksum_init(type);
|
||||
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
__bio_for_each_segment(bv, bio, *iter, *iter) {
|
||||
void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
|
||||
crc = bch2_checksum_update(type,
|
||||
crc, p, bv.bv_len);
|
||||
kunmap_atomic(p);
|
||||
}
|
||||
#else
|
||||
__bio_for_each_contig_segment(bv, bio, *iter, *iter)
|
||||
crc = bch2_checksum_update(type, crc,
|
||||
page_address(bv.bv_page) + bv.bv_offset,
|
||||
bv.bv_len);
|
||||
#endif
|
||||
crc = bch2_checksum_final(type, crc);
|
||||
return (struct bch_csum) { .lo = cpu_to_le64(crc) };
|
||||
}
|
||||
|
||||
case BCH_CSUM_CHACHA20_POLY1305_80:
|
||||
case BCH_CSUM_CHACHA20_POLY1305_128: {
|
||||
SHASH_DESC_ON_STACK(desc, c->poly1305);
|
||||
u8 digest[POLY1305_DIGEST_SIZE];
|
||||
struct bch_csum ret = { 0 };
|
||||
|
||||
gen_poly_key(c, desc, nonce);
|
||||
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
__bio_for_each_segment(bv, bio, *iter, *iter) {
|
||||
void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
|
||||
|
||||
crypto_shash_update(desc, p, bv.bv_len);
|
||||
kunmap_atomic(p);
|
||||
}
|
||||
#else
|
||||
__bio_for_each_contig_segment(bv, bio, *iter, *iter)
|
||||
crypto_shash_update(desc,
|
||||
page_address(bv.bv_page) + bv.bv_offset,
|
||||
bv.bv_len);
|
||||
#endif
|
||||
crypto_shash_final(desc, digest);
|
||||
|
||||
memcpy(&ret, digest, bch_crc_bytes[type]);
|
||||
return ret;
|
||||
}
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
|
||||
struct nonce nonce, struct bio *bio)
|
||||
{
|
||||
struct bvec_iter iter = bio->bi_iter;
|
||||
|
||||
return __bch2_checksum_bio(c, type, nonce, bio, &iter);
|
||||
}
|
||||
|
||||
void bch2_encrypt_bio(struct bch_fs *c, unsigned type,
|
||||
struct nonce nonce, struct bio *bio)
|
||||
{
|
||||
struct bio_vec bv;
|
||||
struct bvec_iter iter;
|
||||
struct scatterlist sgl[16], *sg = sgl;
|
||||
size_t bytes = 0;
|
||||
|
||||
if (!bch2_csum_type_is_encryption(type))
|
||||
return;
|
||||
|
||||
sg_init_table(sgl, ARRAY_SIZE(sgl));
|
||||
|
||||
bio_for_each_segment(bv, bio, iter) {
|
||||
if (sg == sgl + ARRAY_SIZE(sgl)) {
|
||||
sg_mark_end(sg - 1);
|
||||
do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
|
||||
|
||||
nonce = nonce_add(nonce, bytes);
|
||||
bytes = 0;
|
||||
|
||||
sg_init_table(sgl, ARRAY_SIZE(sgl));
|
||||
sg = sgl;
|
||||
}
|
||||
|
||||
sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset);
|
||||
bytes += bv.bv_len;
|
||||
}
|
||||
|
||||
sg_mark_end(sg - 1);
|
||||
do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
|
||||
}
|
||||
|
||||
static inline bool bch2_checksum_mergeable(unsigned type)
|
||||
{
|
||||
|
||||
switch (type) {
|
||||
case BCH_CSUM_NONE:
|
||||
case BCH_CSUM_CRC32C:
|
||||
case BCH_CSUM_CRC64:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static struct bch_csum bch2_checksum_merge(unsigned type,
|
||||
struct bch_csum a,
|
||||
struct bch_csum b, size_t b_len)
|
||||
{
|
||||
BUG_ON(!bch2_checksum_mergeable(type));
|
||||
|
||||
while (b_len) {
|
||||
unsigned b = min_t(unsigned, b_len, PAGE_SIZE);
|
||||
|
||||
a.lo = bch2_checksum_update(type, a.lo,
|
||||
page_address(ZERO_PAGE(0)), b);
|
||||
b_len -= b;
|
||||
}
|
||||
|
||||
a.lo ^= b.lo;
|
||||
a.hi ^= b.hi;
|
||||
return a;
|
||||
}
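bch2_checksum_merge() relies on the linearity of the plain (zero-seeded, no final inversion) CRC variants: the checksum of a concatenation can be derived by extending the first checksum over that many zero bytes and xoring in the second checksum, which is exactly what the loop above does a page at a time. A hedged sketch of the property it implements (hypothetical buffers a and b):

	struct bch_csum csum_a = bch2_checksum(c, BCH_CSUM_CRC64, nonce, a, a_len);
	struct bch_csum csum_b = bch2_checksum(c, BCH_CSUM_CRC64, nonce, b, b_len);
	/* equals bch2_checksum() over the concatenation of a and b: */
	struct bch_csum whole = bch2_checksum_merge(BCH_CSUM_CRC64, csum_a,
						    csum_b, b_len);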
|
||||
|
||||
int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
|
||||
struct bversion version,
|
||||
struct bch_extent_crc_unpacked crc_old,
|
||||
struct bch_extent_crc_unpacked *crc_a,
|
||||
struct bch_extent_crc_unpacked *crc_b,
|
||||
unsigned len_a, unsigned len_b,
|
||||
unsigned new_csum_type)
|
||||
{
|
||||
struct bvec_iter iter = bio->bi_iter;
|
||||
struct nonce nonce = extent_nonce(version, crc_old);
|
||||
struct bch_csum merged = { 0 };
|
||||
struct crc_split {
|
||||
struct bch_extent_crc_unpacked *crc;
|
||||
unsigned len;
|
||||
unsigned csum_type;
|
||||
struct bch_csum csum;
|
||||
} splits[3] = {
|
||||
{ crc_a, len_a, new_csum_type },
|
||||
{ crc_b, len_b, new_csum_type },
|
||||
{ NULL, bio_sectors(bio) - len_a - len_b, new_csum_type },
|
||||
}, *i;
|
||||
bool mergeable = crc_old.csum_type == new_csum_type &&
|
||||
bch2_checksum_mergeable(new_csum_type);
|
||||
unsigned crc_nonce = crc_old.nonce;
|
||||
|
||||
BUG_ON(len_a + len_b > bio_sectors(bio));
|
||||
BUG_ON(crc_old.uncompressed_size != bio_sectors(bio));
|
||||
BUG_ON(crc_old.compression_type);
|
||||
BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) !=
|
||||
bch2_csum_type_is_encryption(new_csum_type));
|
||||
|
||||
for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
|
||||
iter.bi_size = i->len << 9;
|
||||
if (mergeable || i->crc)
|
||||
i->csum = __bch2_checksum_bio(c, i->csum_type,
|
||||
nonce, bio, &iter);
|
||||
else
|
||||
bio_advance_iter(bio, &iter, i->len << 9);
|
||||
nonce = nonce_add(nonce, i->len << 9);
|
||||
}
|
||||
|
||||
if (mergeable)
|
||||
for (i = splits; i < splits + ARRAY_SIZE(splits); i++)
|
||||
merged = bch2_checksum_merge(new_csum_type, merged,
|
||||
i->csum, i->len << 9);
|
||||
else
|
||||
merged = bch2_checksum_bio(c, crc_old.csum_type,
|
||||
extent_nonce(version, crc_old), bio);
|
||||
|
||||
if (bch2_crc_cmp(merged, crc_old.csum))
|
||||
return -EIO;
|
||||
|
||||
for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
|
||||
if (i->crc)
|
||||
*i->crc = (struct bch_extent_crc_unpacked) {
|
||||
.csum_type = i->csum_type,
|
||||
.compressed_size = i->len,
|
||||
.uncompressed_size = i->len,
|
||||
.offset = 0,
|
||||
.live_size = i->len,
|
||||
.nonce = crc_nonce,
|
||||
.csum = i->csum,
|
||||
};
|
||||
|
||||
if (bch2_csum_type_is_encryption(new_csum_type))
|
||||
crc_nonce += i->len;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef __KERNEL__
|
||||
int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
|
||||
{
|
||||
char key_description[60];
|
||||
struct key *keyring_key;
|
||||
const struct user_key_payload *ukp;
|
||||
int ret;
|
||||
|
||||
snprintf(key_description, sizeof(key_description),
|
||||
"bcachefs:%pUb", &sb->user_uuid);
|
||||
|
||||
keyring_key = request_key(&key_type_logon, key_description, NULL);
|
||||
if (IS_ERR(keyring_key))
|
||||
return PTR_ERR(keyring_key);
|
||||
|
||||
down_read(&keyring_key->sem);
|
||||
ukp = dereference_key_locked(keyring_key);
|
||||
if (ukp->datalen == sizeof(*key)) {
|
||||
memcpy(key, ukp->data, ukp->datalen);
|
||||
ret = 0;
|
||||
} else {
|
||||
ret = -EINVAL;
|
||||
}
|
||||
up_read(&keyring_key->sem);
|
||||
key_put(keyring_key);
|
||||
|
||||
return ret;
|
||||
}
|
||||
#else
|
||||
#include <keyutils.h>
|
||||
#include <uuid/uuid.h>
|
||||
|
||||
int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
|
||||
{
|
||||
key_serial_t key_id;
|
||||
char key_description[60];
|
||||
char uuid[40];
|
||||
|
||||
uuid_unparse_lower(sb->user_uuid.b, uuid);
|
||||
sprintf(key_description, "bcachefs:%s", uuid);
|
||||
|
||||
key_id = request_key("user", key_description, NULL,
|
||||
KEY_SPEC_USER_KEYRING);
|
||||
if (key_id < 0)
|
||||
return -errno;
|
||||
|
||||
if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key))
|
||||
return -1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
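The userspace (#else) branch above expects the unlock key to already be present in the user keyring under a "bcachefs:<uuid>" description. A hedged sketch of the counterpart that loads it there with libkeyutils; the key-derivation step is out of scope and the helper name is made up:

#include <keyutils.h>
#include <stdio.h>

static key_serial_t example_load_key(const char *uuid_str,
				     const struct bch_key *key)
{
	char desc[60];

	snprintf(desc, sizeof(desc), "bcachefs:%s", uuid_str);
	return add_key("user", desc, key, sizeof(*key),
		       KEY_SPEC_USER_KEYRING);
}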
|
||||
|
||||
int bch2_decrypt_sb_key(struct bch_fs *c,
|
||||
struct bch_sb_field_crypt *crypt,
|
||||
struct bch_key *key)
|
||||
{
|
||||
struct bch_encrypted_key sb_key = crypt->key;
|
||||
struct bch_key user_key;
|
||||
int ret = 0;
|
||||
|
||||
/* is key encrypted? */
|
||||
if (!bch2_key_is_encrypted(&sb_key))
|
||||
goto out;
|
||||
|
||||
ret = bch2_request_key(c->disk_sb.sb, &user_key);
|
||||
if (ret) {
|
||||
bch_err(c, "error requesting encryption key: %i", ret);
|
||||
goto err;
|
||||
}
|
||||
|
||||
/* decrypt real key: */
|
||||
ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
|
||||
&sb_key, sizeof(sb_key));
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
if (bch2_key_is_encrypted(&sb_key)) {
|
||||
bch_err(c, "incorrect encryption key");
|
||||
ret = -EINVAL;
|
||||
goto err;
|
||||
}
|
||||
out:
|
||||
*key = sb_key.key;
|
||||
err:
|
||||
memzero_explicit(&sb_key, sizeof(sb_key));
|
||||
memzero_explicit(&user_key, sizeof(user_key));
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int bch2_alloc_ciphers(struct bch_fs *c)
|
||||
{
|
||||
if (!c->chacha20)
|
||||
c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
|
||||
if (IS_ERR(c->chacha20)) {
|
||||
bch_err(c, "error requesting chacha20 module: %li",
|
||||
PTR_ERR(c->chacha20));
|
||||
return PTR_ERR(c->chacha20);
|
||||
}
|
||||
|
||||
if (!c->poly1305)
|
||||
c->poly1305 = crypto_alloc_shash("poly1305", 0, 0);
|
||||
if (IS_ERR(c->poly1305)) {
|
||||
bch_err(c, "error requesting poly1305 module: %li",
|
||||
PTR_ERR(c->poly1305));
|
||||
return PTR_ERR(c->poly1305);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bch2_disable_encryption(struct bch_fs *c)
|
||||
{
|
||||
struct bch_sb_field_crypt *crypt;
|
||||
struct bch_key key;
|
||||
int ret = -EINVAL;
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
|
||||
crypt = bch2_sb_get_crypt(c->disk_sb.sb);
|
||||
if (!crypt)
|
||||
goto out;
|
||||
|
||||
/* is key encrypted? */
|
||||
ret = 0;
|
||||
if (bch2_key_is_encrypted(&crypt->key))
|
||||
goto out;
|
||||
|
||||
ret = bch2_decrypt_sb_key(c, crypt, &key);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
crypt->key.magic = BCH_KEY_MAGIC;
|
||||
crypt->key.key = key;
|
||||
|
||||
SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0);
|
||||
bch2_write_super(c);
|
||||
out:
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int bch2_enable_encryption(struct bch_fs *c, bool keyed)
|
||||
{
|
||||
struct bch_encrypted_key key;
|
||||
struct bch_key user_key;
|
||||
struct bch_sb_field_crypt *crypt;
|
||||
int ret = -EINVAL;
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
|
||||
/* Do we already have an encryption key? */
|
||||
if (bch2_sb_get_crypt(c->disk_sb.sb))
|
||||
goto err;
|
||||
|
||||
ret = bch2_alloc_ciphers(c);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
key.magic = BCH_KEY_MAGIC;
|
||||
get_random_bytes(&key.key, sizeof(key.key));
|
||||
|
||||
if (keyed) {
|
||||
ret = bch2_request_key(c->disk_sb.sb, &user_key);
|
||||
if (ret) {
|
||||
bch_err(c, "error requesting encryption key: %i", ret);
|
||||
goto err;
|
||||
}
|
||||
|
||||
ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
|
||||
&key, sizeof(key));
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
|
||||
ret = crypto_skcipher_setkey(&c->chacha20->base,
|
||||
(void *) &key.key, sizeof(key.key));
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64));
|
||||
if (!crypt) {
|
||||
ret = -ENOMEM; /* XXX this technically could be -ENOSPC */
|
||||
goto err;
|
||||
}
|
||||
|
||||
crypt->key = key;
|
||||
|
||||
/* write superblock */
|
||||
SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1);
|
||||
bch2_write_super(c);
|
||||
err:
|
||||
mutex_unlock(&c->sb_lock);
|
||||
memzero_explicit(&user_key, sizeof(user_key));
|
||||
memzero_explicit(&key, sizeof(key));
|
||||
return ret;
|
||||
}
|
||||
|
||||
void bch2_fs_encryption_exit(struct bch_fs *c)
|
||||
{
|
||||
if (!IS_ERR_OR_NULL(c->poly1305))
|
||||
crypto_free_shash(c->poly1305);
|
||||
if (!IS_ERR_OR_NULL(c->chacha20))
|
||||
crypto_free_sync_skcipher(c->chacha20);
|
||||
if (!IS_ERR_OR_NULL(c->sha256))
|
||||
crypto_free_shash(c->sha256);
|
||||
}
|
||||
|
||||
int bch2_fs_encryption_init(struct bch_fs *c)
|
||||
{
|
||||
struct bch_sb_field_crypt *crypt;
|
||||
struct bch_key key;
|
||||
int ret = 0;
|
||||
|
||||
pr_verbose_init(c->opts, "");
|
||||
|
||||
c->sha256 = crypto_alloc_shash("sha256", 0, 0);
|
||||
if (IS_ERR(c->sha256)) {
|
||||
bch_err(c, "error requesting sha256 module");
|
||||
ret = PTR_ERR(c->sha256);
|
||||
goto out;
|
||||
}
|
||||
|
||||
crypt = bch2_sb_get_crypt(c->disk_sb.sb);
|
||||
if (!crypt)
|
||||
goto out;
|
||||
|
||||
ret = bch2_alloc_ciphers(c);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
ret = bch2_decrypt_sb_key(c, crypt, &key);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
ret = crypto_skcipher_setkey(&c->chacha20->base,
|
||||
(void *) &key.key, sizeof(key.key));
|
||||
if (ret)
|
||||
goto out;
|
||||
out:
|
||||
memzero_explicit(&key, sizeof(key));
|
||||
pr_verbose_init(c->opts, "ret %i", ret);
|
||||
return ret;
|
||||
}
|
184
fs/bcachefs/checksum.h
Normal file
@ -0,0 +1,184 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_CHECKSUM_H
|
||||
#define _BCACHEFS_CHECKSUM_H
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "extents_types.h"
|
||||
#include "super-io.h"
|
||||
|
||||
#include <crypto/chacha.h>
|
||||
|
||||
u64 bch2_crc64_update(u64, const void *, size_t);
|
||||
|
||||
#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28)
|
||||
#define BCH_NONCE_BTREE cpu_to_le32(2 << 28)
|
||||
#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28)
|
||||
#define BCH_NONCE_PRIO cpu_to_le32(4 << 28)
|
||||
#define BCH_NONCE_POLY cpu_to_le32(1 << 31)
|
||||
|
||||
struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce,
|
||||
const void *, size_t);
|
||||
|
||||
/*
|
||||
* This is used for various on disk data structures - bch_sb, prio_set, bset,
|
||||
* jset: The checksum is _always_ the first field of these structs
|
||||
*/
|
||||
#define csum_vstruct(_c, _type, _nonce, _i) \
|
||||
({ \
|
||||
const void *start = ((const void *) (_i)) + sizeof((_i)->csum); \
|
||||
const void *end = vstruct_end(_i); \
|
||||
\
|
||||
bch2_checksum(_c, _type, _nonce, start, end - start); \
|
||||
})
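Because the checksum is always the first field of these structs, csum_vstruct() covers everything from just past that field up to vstruct_end(). A hedged example of the intended call shape, modelled on how the journal code uses it (the JSET_CSUM_TYPE() and journal_nonce() helpers are assumed here, not defined in this header):

	j->csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);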
|
||||
|
||||
int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
|
||||
int bch2_request_key(struct bch_sb *, struct bch_key *);
|
||||
|
||||
void bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
|
||||
void *data, size_t);
|
||||
|
||||
struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned,
|
||||
struct nonce, struct bio *);
|
||||
|
||||
int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion,
|
||||
struct bch_extent_crc_unpacked,
|
||||
struct bch_extent_crc_unpacked *,
|
||||
struct bch_extent_crc_unpacked *,
|
||||
unsigned, unsigned, unsigned);
|
||||
|
||||
void bch2_encrypt_bio(struct bch_fs *, unsigned,
|
||||
struct nonce, struct bio *);
|
||||
|
||||
int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
|
||||
struct bch_key *);
|
||||
|
||||
int bch2_disable_encryption(struct bch_fs *);
|
||||
int bch2_enable_encryption(struct bch_fs *, bool);
|
||||
|
||||
void bch2_fs_encryption_exit(struct bch_fs *);
|
||||
int bch2_fs_encryption_init(struct bch_fs *);
|
||||
|
||||
static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
|
||||
bool data)
|
||||
{
|
||||
switch (type) {
|
||||
case BCH_CSUM_OPT_NONE:
|
||||
return BCH_CSUM_NONE;
|
||||
case BCH_CSUM_OPT_CRC32C:
|
||||
return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO;
|
||||
case BCH_CSUM_OPT_CRC64:
|
||||
return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
|
||||
unsigned opt)
|
||||
{
|
||||
if (c->sb.encryption_type)
|
||||
return c->opts.wide_macs
|
||||
? BCH_CSUM_CHACHA20_POLY1305_128
|
||||
: BCH_CSUM_CHACHA20_POLY1305_80;
|
||||
|
||||
return bch2_csum_opt_to_type(opt, true);
|
||||
}
|
||||
|
||||
static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
|
||||
{
|
||||
if (c->sb.encryption_type)
|
||||
return BCH_CSUM_CHACHA20_POLY1305_128;
|
||||
|
||||
return bch2_csum_opt_to_type(c->opts.metadata_checksum, false);
|
||||
}
|
||||
|
||||
static const unsigned bch2_compression_opt_to_type[] = {
|
||||
#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_##t,
|
||||
BCH_COMPRESSION_TYPES()
|
||||
#undef x
|
||||
};
|
||||
|
||||
static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
|
||||
unsigned type)
|
||||
{
|
||||
if (type >= BCH_CSUM_NR)
|
||||
return false;
|
||||
|
||||
if (bch2_csum_type_is_encryption(type) && !c->chacha20)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/* returns true if not equal */
|
||||
static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
|
||||
{
|
||||
/*
|
||||
* XXX: need some way of preventing the compiler from optimizing this
|
||||
* into a form that isn't constant time..
|
||||
*/
|
||||
return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0;
|
||||
}
|
||||
|
||||
/* for skipping ahead and encrypting/decrypting at an offset: */
|
||||
static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
|
||||
{
|
||||
EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
|
||||
|
||||
le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE);
|
||||
return nonce;
|
||||
}
|
||||
|
||||
static inline struct nonce null_nonce(void)
|
||||
{
|
||||
struct nonce ret;
|
||||
|
||||
memset(&ret, 0, sizeof(ret));
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline struct nonce extent_nonce(struct bversion version,
|
||||
struct bch_extent_crc_unpacked crc)
|
||||
{
|
||||
unsigned size = crc.compression_type ? crc.uncompressed_size : 0;
|
||||
struct nonce nonce = (struct nonce) {{
|
||||
[0] = cpu_to_le32(size << 22),
|
||||
[1] = cpu_to_le32(version.lo),
|
||||
[2] = cpu_to_le32(version.lo >> 32),
|
||||
[3] = cpu_to_le32(version.hi|
|
||||
(crc.compression_type << 24))^BCH_NONCE_EXTENT,
|
||||
}};
|
||||
|
||||
return nonce_add(nonce, crc.nonce << 9);
|
||||
}
|
||||
|
||||
static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key)
|
||||
{
|
||||
return le64_to_cpu(key->magic) != BCH_KEY_MAGIC;
|
||||
}
|
||||
|
||||
static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb)
|
||||
{
|
||||
__le64 magic = __bch2_sb_magic(sb);
|
||||
|
||||
return (struct nonce) {{
|
||||
[0] = 0,
|
||||
[1] = 0,
|
||||
[2] = ((__le32 *) &magic)[0],
|
||||
[3] = ((__le32 *) &magic)[1],
|
||||
}};
|
||||
}
|
||||
|
||||
static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c)
|
||||
{
|
||||
__le64 magic = bch2_sb_magic(c);
|
||||
|
||||
return (struct nonce) {{
|
||||
[0] = 0,
|
||||
[1] = 0,
|
||||
[2] = ((__le32 *) &magic)[0],
|
||||
[3] = ((__le32 *) &magic)[1],
|
||||
}};
|
||||
}
|
||||
|
||||
#endif /* _BCACHEFS_CHECKSUM_H */
|
180
fs/bcachefs/clock.c
Normal file
@ -0,0 +1,180 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include "bcachefs.h"
|
||||
#include "clock.h"
|
||||
|
||||
#include <linux/freezer.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/preempt.h>
|
||||
|
||||
static inline long io_timer_cmp(io_timer_heap *h,
|
||||
struct io_timer *l,
|
||||
struct io_timer *r)
|
||||
{
|
||||
return l->expire - r->expire;
|
||||
}
|
||||
|
||||
void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
spin_lock(&clock->timer_lock);
|
||||
for (i = 0; i < clock->timers.used; i++)
|
||||
if (clock->timers.data[i] == timer)
|
||||
goto out;
|
||||
|
||||
BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp));
|
||||
out:
|
||||
spin_unlock(&clock->timer_lock);
|
||||
}
|
||||
|
||||
void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
spin_lock(&clock->timer_lock);
|
||||
|
||||
for (i = 0; i < clock->timers.used; i++)
|
||||
if (clock->timers.data[i] == timer) {
|
||||
heap_del(&clock->timers, i, io_timer_cmp);
|
||||
break;
|
||||
}
|
||||
|
||||
spin_unlock(&clock->timer_lock);
|
||||
}
|
||||
|
||||
struct io_clock_wait {
|
||||
struct io_timer io_timer;
|
||||
struct timer_list cpu_timer;
|
||||
struct task_struct *task;
|
||||
int expired;
|
||||
};
|
||||
|
||||
static void io_clock_wait_fn(struct io_timer *timer)
|
||||
{
|
||||
struct io_clock_wait *wait = container_of(timer,
|
||||
struct io_clock_wait, io_timer);
|
||||
|
||||
wait->expired = 1;
|
||||
wake_up_process(wait->task);
|
||||
}
|
||||
|
||||
static void io_clock_cpu_timeout(struct timer_list *timer)
|
||||
{
|
||||
struct io_clock_wait *wait = container_of(timer,
|
||||
struct io_clock_wait, cpu_timer);
|
||||
|
||||
wait->expired = 1;
|
||||
wake_up_process(wait->task);
|
||||
}
|
||||
|
||||
void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until)
|
||||
{
|
||||
struct io_clock_wait wait;
|
||||
|
||||
/* XXX: calculate sleep time rigorously */
|
||||
wait.io_timer.expire = until;
|
||||
wait.io_timer.fn = io_clock_wait_fn;
|
||||
wait.task = current;
|
||||
wait.expired = 0;
|
||||
bch2_io_timer_add(clock, &wait.io_timer);
|
||||
|
||||
schedule();
|
||||
|
||||
bch2_io_timer_del(clock, &wait.io_timer);
|
||||
}
|
||||
|
||||
void bch2_kthread_io_clock_wait(struct io_clock *clock,
|
||||
unsigned long io_until,
|
||||
unsigned long cpu_timeout)
|
||||
{
|
||||
bool kthread = (current->flags & PF_KTHREAD) != 0;
|
||||
struct io_clock_wait wait;
|
||||
|
||||
wait.io_timer.expire = io_until;
|
||||
wait.io_timer.fn = io_clock_wait_fn;
|
||||
wait.task = current;
|
||||
wait.expired = 0;
|
||||
bch2_io_timer_add(clock, &wait.io_timer);
|
||||
|
||||
timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0);
|
||||
|
||||
if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)
|
||||
mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
|
||||
|
||||
while (1) {
|
||||
set_current_state(TASK_INTERRUPTIBLE);
|
||||
if (kthread && kthread_should_stop())
|
||||
break;
|
||||
|
||||
if (wait.expired)
|
||||
break;
|
||||
|
||||
schedule();
|
||||
try_to_freeze();
|
||||
}
|
||||
|
||||
__set_current_state(TASK_RUNNING);
|
||||
del_timer_sync(&wait.cpu_timer);
|
||||
destroy_timer_on_stack(&wait.cpu_timer);
|
||||
bch2_io_timer_del(clock, &wait.io_timer);
|
||||
}
|
||||
|
||||
static struct io_timer *get_expired_timer(struct io_clock *clock,
|
||||
unsigned long now)
|
||||
{
|
||||
struct io_timer *ret = NULL;
|
||||
|
||||
spin_lock(&clock->timer_lock);
|
||||
|
||||
if (clock->timers.used &&
|
||||
time_after_eq(now, clock->timers.data[0]->expire))
|
||||
heap_pop(&clock->timers, ret, io_timer_cmp);
|
||||
|
||||
spin_unlock(&clock->timer_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void bch2_increment_clock(struct bch_fs *c, unsigned sectors, int rw)
|
||||
{
|
||||
struct io_clock *clock = &c->io_clock[rw];
|
||||
struct io_timer *timer;
|
||||
unsigned long now;
|
||||
|
||||
/* Buffer up one megabyte worth of IO in the percpu counter */
|
||||
preempt_disable();
|
||||
|
||||
if (likely(this_cpu_add_return(*clock->pcpu_buf, sectors) <
|
||||
IO_CLOCK_PCPU_SECTORS)) {
|
||||
preempt_enable();
|
||||
return;
|
||||
}
|
||||
|
||||
sectors = this_cpu_xchg(*clock->pcpu_buf, 0);
|
||||
preempt_enable();
|
||||
now = atomic_long_add_return(sectors, &clock->now);
|
||||
|
||||
while ((timer = get_expired_timer(clock, now)))
|
||||
timer->fn(timer);
|
||||
}
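Putting the pieces together: a background task arms an io_timer at now + N sectors, and bch2_increment_clock() fires it once that much IO has been accounted through the percpu buffer. A minimal hypothetical user of this API (the helper names are made up for illustration):

static void example_timer_fn(struct io_timer *timer)
{
	/* kick whatever background work was waiting on IO progress */
}

static void example_schedule_after_io(struct bch_fs *c, unsigned long sectors)
{
	static struct io_timer timer = { .fn = example_timer_fn };

	timer.expire = atomic_long_read(&c->io_clock[READ].now) + sectors;
	bch2_io_timer_add(&c->io_clock[READ], &timer);
}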
|
||||
|
||||
void bch2_io_clock_exit(struct io_clock *clock)
|
||||
{
|
||||
free_heap(&clock->timers);
|
||||
free_percpu(clock->pcpu_buf);
|
||||
}
|
||||
|
||||
int bch2_io_clock_init(struct io_clock *clock)
|
||||
{
|
||||
atomic_long_set(&clock->now, 0);
|
||||
spin_lock_init(&clock->timer_lock);
|
||||
|
||||
clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf);
|
||||
if (!clock->pcpu_buf)
|
||||
return -ENOMEM;
|
||||
|
||||
if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL))
|
||||
return -ENOMEM;
|
||||
|
||||
return 0;
|
||||
}
|
25
fs/bcachefs/clock.h
Normal file
@ -0,0 +1,25 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_CLOCK_H
|
||||
#define _BCACHEFS_CLOCK_H
|
||||
|
||||
void bch2_io_timer_add(struct io_clock *, struct io_timer *);
|
||||
void bch2_io_timer_del(struct io_clock *, struct io_timer *);
|
||||
void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long,
|
||||
unsigned long);
|
||||
void bch2_increment_clock(struct bch_fs *, unsigned, int);
|
||||
|
||||
void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long);
|
||||
|
||||
#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\
|
||||
({ \
|
||||
long __ret = timeout; \
|
||||
might_sleep(); \
|
||||
if (!___wait_cond_timeout(condition)) \
|
||||
__ret = __wait_event_timeout(wq, condition, timeout); \
|
||||
__ret; \
|
||||
})
|
||||
|
||||
void bch2_io_clock_exit(struct io_clock *);
|
||||
int bch2_io_clock_init(struct io_clock *);
|
||||
|
||||
#endif /* _BCACHEFS_CLOCK_H */
|
36
fs/bcachefs/clock_types.h
Normal file
@ -0,0 +1,36 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_CLOCK_TYPES_H
|
||||
#define _BCACHEFS_CLOCK_TYPES_H
|
||||
|
||||
#include "util.h"
|
||||
|
||||
#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3)
|
||||
|
||||
/*
|
||||
* Clocks/timers in units of sectors of IO:
|
||||
*
|
||||
* Note - they use percpu batching, so they're only approximate.
|
||||
*/
|
||||
|
||||
struct io_timer;
|
||||
typedef void (*io_timer_fn)(struct io_timer *);
|
||||
|
||||
struct io_timer {
|
||||
io_timer_fn fn;
|
||||
unsigned long expire;
|
||||
};
|
||||
|
||||
/* Amount to buffer up on a percpu counter */
|
||||
#define IO_CLOCK_PCPU_SECTORS 128
|
||||
|
||||
typedef HEAP(struct io_timer *) io_timer_heap;
|
||||
|
||||
struct io_clock {
|
||||
atomic_long_t now;
|
||||
u16 __percpu *pcpu_buf;
|
||||
|
||||
spinlock_t timer_lock;
|
||||
io_timer_heap timers;
|
||||
};
|
||||
|
||||
#endif /* _BCACHEFS_CLOCK_TYPES_H */
|
621
fs/bcachefs/compress.c
Normal file
@ -0,0 +1,621 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include "bcachefs.h"
|
||||
#include "checksum.h"
|
||||
#include "compress.h"
|
||||
#include "extents.h"
|
||||
#include "io.h"
|
||||
#include "super-io.h"
|
||||
|
||||
#include <linux/lz4.h>
|
||||
#include <linux/zlib.h>
|
||||
#include <linux/zstd.h>
|
||||
|
||||
/* Bounce buffer: */
|
||||
struct bbuf {
|
||||
void *b;
|
||||
enum {
|
||||
BB_NONE,
|
||||
BB_VMAP,
|
||||
BB_KMALLOC,
|
||||
BB_VMALLOC,
|
||||
BB_MEMPOOL,
|
||||
} type;
|
||||
int rw;
|
||||
};
|
||||
|
||||
static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw)
|
||||
{
|
||||
void *b;
|
||||
|
||||
BUG_ON(size > c->sb.encoded_extent_max << 9);
|
||||
|
||||
b = kmalloc(size, GFP_NOIO|__GFP_NOWARN);
|
||||
if (b)
|
||||
return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw };
|
||||
|
||||
b = mempool_alloc(&c->compression_bounce[rw], GFP_NOWAIT);
|
||||
b = b ? page_address(b) : NULL;
|
||||
if (b)
|
||||
return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw };
|
||||
|
||||
b = vmalloc(size);
|
||||
if (b)
|
||||
return (struct bbuf) { .b = b, .type = BB_VMALLOC, .rw = rw };
|
||||
|
||||
b = mempool_alloc(&c->compression_bounce[rw], GFP_NOIO);
|
||||
b = b ? page_address(b) : NULL;
|
||||
if (b)
|
||||
return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw };
|
||||
|
||||
BUG();
|
||||
}
|
||||
|
||||
static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
|
||||
struct bvec_iter start, int rw)
|
||||
{
|
||||
struct bbuf ret;
|
||||
struct bio_vec bv;
|
||||
struct bvec_iter iter;
|
||||
unsigned nr_pages = 0;
|
||||
struct page *stack_pages[16];
|
||||
struct page **pages = NULL;
|
||||
bool first = true;
|
||||
unsigned prev_end = PAGE_SIZE;
|
||||
void *data;
|
||||
|
||||
BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max);
|
||||
|
||||
#ifndef CONFIG_HIGHMEM
|
||||
__bio_for_each_contig_segment(bv, bio, iter, start) {
|
||||
if (bv.bv_len == start.bi_size)
|
||||
return (struct bbuf) {
|
||||
.b = page_address(bv.bv_page) + bv.bv_offset,
|
||||
.type = BB_NONE, .rw = rw
|
||||
};
|
||||
}
|
||||
#endif
|
||||
__bio_for_each_segment(bv, bio, iter, start) {
|
||||
if ((!first && bv.bv_offset) ||
|
||||
prev_end != PAGE_SIZE)
|
||||
goto bounce;
|
||||
|
||||
prev_end = bv.bv_offset + bv.bv_len;
|
||||
nr_pages++;
|
||||
}
|
||||
|
||||
BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages);
|
||||
|
||||
pages = nr_pages > ARRAY_SIZE(stack_pages)
|
||||
? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOIO)
|
||||
: stack_pages;
|
||||
if (!pages)
|
||||
goto bounce;
|
||||
|
||||
nr_pages = 0;
|
||||
__bio_for_each_segment(bv, bio, iter, start)
|
||||
pages[nr_pages++] = bv.bv_page;
|
||||
|
||||
data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
|
||||
if (pages != stack_pages)
|
||||
kfree(pages);
|
||||
|
||||
if (data)
|
||||
return (struct bbuf) {
|
||||
.b = data + bio_iter_offset(bio, start),
|
||||
.type = BB_VMAP, .rw = rw
|
||||
};
|
||||
bounce:
|
||||
ret = __bounce_alloc(c, start.bi_size, rw);
|
||||
|
||||
if (rw == READ)
|
||||
memcpy_from_bio(ret.b, bio, start);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw)
|
||||
{
|
||||
return __bio_map_or_bounce(c, bio, bio->bi_iter, rw);
|
||||
}
|
||||
|
||||
static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf)
|
||||
{
|
||||
switch (buf.type) {
|
||||
case BB_NONE:
|
||||
break;
|
||||
case BB_VMAP:
|
||||
vunmap((void *) ((unsigned long) buf.b & PAGE_MASK));
|
||||
break;
|
||||
case BB_KMALLOC:
|
||||
kfree(buf.b);
|
||||
break;
|
||||
case BB_VMALLOC:
|
||||
vfree(buf.b);
|
||||
break;
|
||||
case BB_MEMPOOL:
|
||||
mempool_free(virt_to_page(buf.b),
|
||||
&c->compression_bounce[buf.rw]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void zlib_set_workspace(z_stream *strm, void *workspace)
|
||||
{
|
||||
#ifdef __KERNEL__
|
||||
strm->workspace = workspace;
|
||||
#endif
|
||||
}
|
||||
|
||||
static int __bio_uncompress(struct bch_fs *c, struct bio *src,
|
||||
void *dst_data, struct bch_extent_crc_unpacked crc)
|
||||
{
|
||||
struct bbuf src_data = { NULL };
|
||||
size_t src_len = src->bi_iter.bi_size;
|
||||
size_t dst_len = crc.uncompressed_size << 9;
|
||||
void *workspace;
|
||||
int ret;
|
||||
|
||||
src_data = bio_map_or_bounce(c, src, READ);
|
||||
|
||||
switch (crc.compression_type) {
|
||||
case BCH_COMPRESSION_LZ4_OLD:
|
||||
case BCH_COMPRESSION_LZ4:
|
||||
ret = LZ4_decompress_safe_partial(src_data.b, dst_data,
|
||||
src_len, dst_len, dst_len);
|
||||
if (ret != dst_len)
|
||||
goto err;
|
||||
break;
|
||||
case BCH_COMPRESSION_GZIP: {
|
||||
z_stream strm = {
|
||||
.next_in = src_data.b,
|
||||
.avail_in = src_len,
|
||||
.next_out = dst_data,
|
||||
.avail_out = dst_len,
|
||||
};
|
||||
|
||||
workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
|
||||
|
||||
zlib_set_workspace(&strm, workspace);
|
||||
zlib_inflateInit2(&strm, -MAX_WBITS);
|
||||
ret = zlib_inflate(&strm, Z_FINISH);
|
||||
|
||||
mempool_free(workspace, &c->decompress_workspace);
|
||||
|
||||
if (ret != Z_STREAM_END)
|
||||
goto err;
|
||||
break;
|
||||
}
|
||||
case BCH_COMPRESSION_ZSTD: {
|
||||
ZSTD_DCtx *ctx;
|
||||
size_t len;
|
||||
|
||||
workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
|
||||
ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound());
|
||||
|
||||
		src_len = le32_to_cpup(src_data.b);

		len = zstd_decompress_dctx(ctx,
				dst_data, dst_len,
				src_data.b + 4, src_len);

		mempool_free(workspace, &c->decompress_workspace);

		if (len != dst_len)
			goto err;
		break;
	}
	default:
		BUG();
	}
	ret = 0;
out:
	bio_unmap_or_unbounce(c, src_data);
	return ret;
err:
	ret = -EIO;
	goto out;
}

int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
				struct bch_extent_crc_unpacked *crc)
{
	struct bbuf data = { NULL };
	size_t dst_len = crc->uncompressed_size << 9;

	/* bio must own its pages: */
	BUG_ON(!bio->bi_vcnt);
	BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs);

	if (crc->uncompressed_size > c->sb.encoded_extent_max ||
	    crc->compressed_size > c->sb.encoded_extent_max) {
		bch_err(c, "error rewriting existing data: extent too big");
		return -EIO;
	}

	data = __bounce_alloc(c, dst_len, WRITE);

	if (__bio_uncompress(c, bio, data.b, *crc)) {
		bch_err(c, "error rewriting existing data: decompression error");
		bio_unmap_or_unbounce(c, data);
		return -EIO;
	}

	/*
	 * might have to free existing pages and retry allocation from mempool -
	 * do this _after_ decompressing:
	 */
	bch2_bio_alloc_more_pages_pool(c, bio, crc->live_size << 9);

	memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9));

	crc->csum_type = 0;
	crc->compression_type = 0;
	crc->compressed_size = crc->live_size;
	crc->uncompressed_size = crc->live_size;
	crc->offset = 0;
	crc->csum = (struct bch_csum) { 0, 0 };

	bio_unmap_or_unbounce(c, data);
	return 0;
}

int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
			struct bio *dst, struct bvec_iter dst_iter,
			struct bch_extent_crc_unpacked crc)
{
	struct bbuf dst_data = { NULL };
	size_t dst_len = crc.uncompressed_size << 9;
	int ret = -ENOMEM;

	if (crc.uncompressed_size > c->sb.encoded_extent_max ||
	    crc.compressed_size > c->sb.encoded_extent_max)
		return -EIO;

	dst_data = dst_len == dst_iter.bi_size
		? __bio_map_or_bounce(c, dst, dst_iter, WRITE)
		: __bounce_alloc(c, dst_len, WRITE);

	ret = __bio_uncompress(c, src, dst_data.b, crc);
	if (ret)
		goto err;

	if (dst_data.type != BB_NONE)
		memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9));
err:
	bio_unmap_or_unbounce(c, dst_data);
	return ret;
}

static int attempt_compress(struct bch_fs *c,
			    void *workspace,
			    void *dst, size_t dst_len,
			    void *src, size_t src_len,
			    unsigned compression_type)
{
	switch (compression_type) {
	case BCH_COMPRESSION_LZ4: {
		int len = src_len;
		int ret = LZ4_compress_destSize(
				src, dst,
				&len, dst_len,
				workspace);

		if (len < src_len)
			return -len;

		return ret;
	}
	case BCH_COMPRESSION_GZIP: {
		z_stream strm = {
			.next_in = src,
			.avail_in = src_len,
			.next_out = dst,
			.avail_out = dst_len,
		};

		zlib_set_workspace(&strm, workspace);
		zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
				  Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
				  Z_DEFAULT_STRATEGY);

		if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END)
			return 0;

		if (zlib_deflateEnd(&strm) != Z_OK)
			return 0;

		return strm.total_out;
	}
	case BCH_COMPRESSION_ZSTD: {
		ZSTD_CCtx *ctx = zstd_init_cctx(workspace,
			zstd_cctx_workspace_bound(&c->zstd_params.cParams));

		size_t len = zstd_compress_cctx(ctx,
				dst + 4, dst_len - 4,
				src, src_len,
				&c->zstd_params);
		if (zstd_is_error(len))
			return 0;

		*((__le32 *) dst) = cpu_to_le32(len);
		return len + 4;
	}
	default:
		BUG();
	}
}

static unsigned __bio_compress(struct bch_fs *c,
			       struct bio *dst, size_t *dst_len,
			       struct bio *src, size_t *src_len,
			       unsigned compression_type)
{
	struct bbuf src_data = { NULL }, dst_data = { NULL };
	void *workspace;
	unsigned pad;
	int ret = 0;

	BUG_ON(compression_type >= BCH_COMPRESSION_NR);
	BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type]));

	/* If it's only one block, don't bother trying to compress: */
	if (bio_sectors(src) <= c->opts.block_size)
		return 0;

	dst_data = bio_map_or_bounce(c, dst, WRITE);
	src_data = bio_map_or_bounce(c, src, READ);

	workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOIO);

	*src_len = src->bi_iter.bi_size;
	*dst_len = dst->bi_iter.bi_size;

	/*
	 * XXX: this algorithm sucks when the compression code doesn't tell us
	 * how much would fit, like LZ4 does:
	 */
	while (1) {
		if (*src_len <= block_bytes(c)) {
			ret = -1;
			break;
		}

		ret = attempt_compress(c, workspace,
				       dst_data.b, *dst_len,
				       src_data.b, *src_len,
				       compression_type);
		if (ret > 0) {
			*dst_len = ret;
			ret = 0;
			break;
		}

		/* Didn't fit: should we retry with a smaller amount? */
		if (*src_len <= *dst_len) {
			ret = -1;
			break;
		}

		/*
		 * If ret is negative, it's a hint as to how much data would fit
		 */
		BUG_ON(-ret >= *src_len);

		if (ret < 0)
			*src_len = -ret;
		else
			*src_len -= (*src_len - *dst_len) / 2;
		*src_len = round_down(*src_len, block_bytes(c));
	}

	mempool_free(workspace, &c->compress_workspace[compression_type]);

	if (ret)
		goto err;

	/* Didn't get smaller: */
	if (round_up(*dst_len, block_bytes(c)) >= *src_len)
		goto err;

	pad = round_up(*dst_len, block_bytes(c)) - *dst_len;

	memset(dst_data.b + *dst_len, 0, pad);
	*dst_len += pad;

	if (dst_data.type != BB_NONE)
		memcpy_to_bio(dst, dst->bi_iter, dst_data.b);

	BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size);
	BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size);
	BUG_ON(*dst_len & (block_bytes(c) - 1));
	BUG_ON(*src_len & (block_bytes(c) - 1));
out:
	bio_unmap_or_unbounce(c, src_data);
	bio_unmap_or_unbounce(c, dst_data);
	return compression_type;
err:
	compression_type = 0;
	goto out;
}

unsigned bch2_bio_compress(struct bch_fs *c,
			   struct bio *dst, size_t *dst_len,
			   struct bio *src, size_t *src_len,
			   unsigned compression_type)
{
	unsigned orig_dst = dst->bi_iter.bi_size;
	unsigned orig_src = src->bi_iter.bi_size;

	/* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */
	src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size,
				     c->sb.encoded_extent_max << 9);
	/* Don't generate a bigger output than input: */
	dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);

	if (compression_type == BCH_COMPRESSION_LZ4_OLD)
		compression_type = BCH_COMPRESSION_LZ4;

	compression_type =
		__bio_compress(c, dst, dst_len, src, src_len, compression_type);

	dst->bi_iter.bi_size = orig_dst;
	src->bi_iter.bi_size = orig_src;
	return compression_type;
}

static int __bch2_fs_compress_init(struct bch_fs *, u64);

#define BCH_FEATURE_NONE 0

static const unsigned bch2_compression_opt_to_feature[] = {
#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t,
	BCH_COMPRESSION_TYPES()
#undef x
};

#undef BCH_FEATURE_NONE

static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
{
	int ret = 0;

	if ((c->sb.features & f) == f)
		return 0;

	mutex_lock(&c->sb_lock);

	if ((c->sb.features & f) == f) {
		mutex_unlock(&c->sb_lock);
		return 0;
	}

	ret = __bch2_fs_compress_init(c, c->sb.features|f);
	if (ret) {
		mutex_unlock(&c->sb_lock);
		return ret;
	}

	c->disk_sb.sb->features[0] |= cpu_to_le64(f);
	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	return 0;
}

int bch2_check_set_has_compressed_data(struct bch_fs *c,
				       unsigned compression_type)
{
	BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature));

	return compression_type
		? __bch2_check_set_has_compressed_data(c,
				1ULL << bch2_compression_opt_to_feature[compression_type])
		: 0;
}

void bch2_fs_compress_exit(struct bch_fs *c)
{
	unsigned i;

	mempool_exit(&c->decompress_workspace);
	for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++)
		mempool_exit(&c->compress_workspace[i]);
	mempool_exit(&c->compression_bounce[WRITE]);
	mempool_exit(&c->compression_bounce[READ]);
}

static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
{
	size_t max_extent = c->sb.encoded_extent_max << 9;
	size_t order = get_order(max_extent);
	size_t decompress_workspace_size = 0;
	bool decompress_workspace_needed;
	ZSTD_parameters params = zstd_get_params(0, max_extent);
	struct {
		unsigned feature;
		unsigned type;
		size_t compress_workspace;
		size_t decompress_workspace;
	} compression_types[] = {
		{ BCH_FEATURE_LZ4, BCH_COMPRESSION_LZ4, LZ4_MEM_COMPRESS, 0 },
		{ BCH_FEATURE_GZIP, BCH_COMPRESSION_GZIP,
			zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
			zlib_inflate_workspacesize(), },
		{ BCH_FEATURE_ZSTD, BCH_COMPRESSION_ZSTD,
			zstd_cctx_workspace_bound(&params.cParams),
			zstd_dctx_workspace_bound() },
	}, *i;
	int ret = 0;

	pr_verbose_init(c->opts, "");

	c->zstd_params = params;

	for (i = compression_types;
	     i < compression_types + ARRAY_SIZE(compression_types);
	     i++)
		if (features & (1 << i->feature))
			goto have_compressed;

	goto out;
have_compressed:

	if (!mempool_initialized(&c->compression_bounce[READ])) {
		ret = mempool_init_page_pool(&c->compression_bounce[READ],
					     1, order);
		if (ret)
			goto out;
	}

	if (!mempool_initialized(&c->compression_bounce[WRITE])) {
		ret = mempool_init_page_pool(&c->compression_bounce[WRITE],
					     1, order);
		if (ret)
			goto out;
	}

	for (i = compression_types;
	     i < compression_types + ARRAY_SIZE(compression_types);
	     i++) {
		decompress_workspace_size =
			max(decompress_workspace_size, i->decompress_workspace);

		if (!(features & (1 << i->feature)))
			continue;

		if (i->decompress_workspace)
			decompress_workspace_needed = true;

		if (mempool_initialized(&c->compress_workspace[i->type]))
			continue;

		ret = mempool_init_kvpmalloc_pool(
			&c->compress_workspace[i->type],
			1, i->compress_workspace);
		if (ret)
			goto out;
	}

	ret = mempool_init_kmalloc_pool(
		&c->decompress_workspace,
		1, decompress_workspace_size);
	if (ret)
		goto out;
out:
	pr_verbose_init(c->opts, "ret %i", ret);
	return ret;
}

int bch2_fs_compress_init(struct bch_fs *c)
{
	u64 f = c->sb.features;

	if (c->opts.compression)
		f |= 1ULL << bch2_compression_opt_to_feature[c->opts.compression];

	if (c->opts.background_compression)
		f |= 1ULL << bch2_compression_opt_to_feature[c->opts.background_compression];

	return __bch2_fs_compress_init(c, f);

}

18
fs/bcachefs/compress.h
Normal file
@@ -0,0 +1,18 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_COMPRESS_H
#define _BCACHEFS_COMPRESS_H

#include "extents_types.h"

int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
				struct bch_extent_crc_unpacked *);
int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
			struct bvec_iter, struct bch_extent_crc_unpacked);
unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
			   struct bio *, size_t *, unsigned);

int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
void bch2_fs_compress_exit(struct bch_fs *);
int bch2_fs_compress_init(struct bch_fs *);

#endif /* _BCACHEFS_COMPRESS_H */

425
fs/bcachefs/debug.c
Normal file
@@ -0,0 +1,425 @@
// SPDX-License-Identifier: GPL-2.0
/*
 * Assorted bcachefs debug code
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "buckets.h"
#include "debug.h"
#include "error.h"
#include "extents.h"
#include "fsck.h"
#include "inode.h"
#include "io.h"
#include "super.h"

#include <linux/console.h>
#include <linux/debugfs.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/seq_file.h>

static struct dentry *bch_debug;

#ifdef CONFIG_BCACHEFS_DEBUG

void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
{
	struct btree *v = c->verify_data;
	struct btree_node *n_ondisk, *n_sorted, *n_inmemory;
	struct bset *sorted, *inmemory;
	struct extent_pick_ptr pick;
	struct bch_dev *ca;
	struct bio *bio;

	if (c->opts.nochanges)
		return;

	btree_node_io_lock(b);
	mutex_lock(&c->verify_lock);

	n_ondisk = c->verify_ondisk;
	n_sorted = c->verify_data->data;
	n_inmemory = b->data;

	bkey_copy(&v->key, &b->key);
	v->written = 0;
	v->level = b->level;
	v->btree_id = b->btree_id;
	bch2_btree_keys_init(v, &c->expensive_debug_checks);

	if (bch2_btree_pick_ptr(c, b, NULL, &pick) <= 0)
		return;

	ca = bch_dev_bkey_exists(c, pick.ptr.dev);
	if (!bch2_dev_get_ioref(ca, READ))
		return;

	bio = bio_alloc_bioset(ca->disk_sb.bdev,
			       buf_pages(n_sorted, btree_bytes(c)),
			       REQ_OP_READ|REQ_META,
			       GFP_NOIO,
			       &c->btree_bio);
	bio->bi_iter.bi_sector = pick.ptr.offset;
	bio->bi_iter.bi_size = btree_bytes(c);
	bch2_bio_map(bio, n_sorted);

	submit_bio_wait(bio);

	bio_put(bio);
	percpu_ref_put(&ca->io_ref);

	memcpy(n_ondisk, n_sorted, btree_bytes(c));

	if (bch2_btree_node_read_done(c, v, false))
		goto out;

	n_sorted = c->verify_data->data;
	sorted = &n_sorted->keys;
	inmemory = &n_inmemory->keys;

	if (inmemory->u64s != sorted->u64s ||
	    memcmp(inmemory->start,
		   sorted->start,
		   vstruct_end(inmemory) - (void *) inmemory->start)) {
		unsigned offset = 0, sectors;
		struct bset *i;
		unsigned j;

		console_lock();

		printk(KERN_ERR "*** in memory:\n");
		bch2_dump_bset(b, inmemory, 0);

		printk(KERN_ERR "*** read back in:\n");
		bch2_dump_bset(v, sorted, 0);

		while (offset < b->written) {
			if (!offset) {
				i = &n_ondisk->keys;
				sectors = vstruct_blocks(n_ondisk, c->block_bits) <<
					c->block_bits;
			} else {
				struct btree_node_entry *bne =
					(void *) n_ondisk + (offset << 9);
				i = &bne->keys;

				sectors = vstruct_blocks(bne, c->block_bits) <<
					c->block_bits;
			}

			printk(KERN_ERR "*** on disk block %u:\n", offset);
			bch2_dump_bset(b, i, offset);

			offset += sectors;
		}

		printk(KERN_ERR "*** block %u/%u not written\n",
		       offset >> c->block_bits, btree_blocks(c));

		for (j = 0; j < le16_to_cpu(inmemory->u64s); j++)
			if (inmemory->_data[j] != sorted->_data[j])
				break;

		printk(KERN_ERR "b->written %u\n", b->written);

		console_unlock();
		panic("verify failed at %u\n", j);
	}
out:
	mutex_unlock(&c->verify_lock);
	btree_node_io_unlock(b);
}

#endif

#ifdef CONFIG_DEBUG_FS

/* XXX: bch_fs refcounting */

struct dump_iter {
	struct bpos from;
	struct bch_fs *c;
	enum btree_id id;

	char buf[PAGE_SIZE];
	size_t bytes; /* what's currently in buf */

	char __user *ubuf; /* destination user buffer */
	size_t size; /* size of requested read */
	ssize_t ret; /* bytes read so far */
};

static int flush_buf(struct dump_iter *i)
{
	if (i->bytes) {
		size_t bytes = min(i->bytes, i->size);
		int err = copy_to_user(i->ubuf, i->buf, bytes);

		if (err)
			return err;

		i->ret += bytes;
		i->ubuf += bytes;
		i->size -= bytes;
		i->bytes -= bytes;
		memmove(i->buf, i->buf + bytes, i->bytes);
	}

	return 0;
}

static int bch2_dump_open(struct inode *inode, struct file *file)
{
	struct btree_debug *bd = inode->i_private;
	struct dump_iter *i;

	i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
	if (!i)
		return -ENOMEM;

	file->private_data = i;
	i->from = POS_MIN;
	i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]);
	i->id = bd->id;

	return 0;
}

static int bch2_dump_release(struct inode *inode, struct file *file)
{
	kfree(file->private_data);
	return 0;
}

static ssize_t bch2_read_btree(struct file *file, char __user *buf,
			       size_t size, loff_t *ppos)
{
	struct dump_iter *i = file->private_data;
	struct btree_iter iter;
	struct bkey_s_c k;
	int err;

	i->ubuf = buf;
	i->size = size;
	i->ret = 0;

	err = flush_buf(i);
	if (err)
		return err;

	if (!i->size)
		return i->ret;

	bch2_btree_iter_init(&iter, i->c, i->id, i->from, BTREE_ITER_PREFETCH);
	k = bch2_btree_iter_peek(&iter);

	while (k.k && !(err = btree_iter_err(k))) {
		bch2_bkey_val_to_text(i->c, bkey_type(0, i->id),
				      i->buf, sizeof(i->buf), k);
		i->bytes = strlen(i->buf);
		BUG_ON(i->bytes >= PAGE_SIZE);
		i->buf[i->bytes] = '\n';
		i->bytes++;

		k = bch2_btree_iter_next(&iter);
		i->from = iter.pos;

		err = flush_buf(i);
		if (err)
			break;

		if (!i->size)
			break;
	}
	bch2_btree_iter_unlock(&iter);

	return err < 0 ? err : i->ret;
}

static const struct file_operations btree_debug_ops = {
	.owner = THIS_MODULE,
	.open = bch2_dump_open,
	.release = bch2_dump_release,
	.read = bch2_read_btree,
};

static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
				       size_t size, loff_t *ppos)
{
	struct dump_iter *i = file->private_data;
	struct btree_iter iter;
	struct btree *b;
	int err;

	i->ubuf = buf;
	i->size = size;
	i->ret = 0;

	err = flush_buf(i);
	if (err)
		return err;

	if (!i->size || !bkey_cmp(POS_MAX, i->from))
		return i->ret;

	for_each_btree_node(&iter, i->c, i->id, i->from, 0, b) {
		i->bytes = bch2_print_btree_node(i->c, b, i->buf,
						 sizeof(i->buf));
		err = flush_buf(i);
		if (err)
			break;

		/*
		 * can't easily correctly restart a btree node traversal across
		 * all nodes, meh
		 */
		i->from = bkey_cmp(POS_MAX, b->key.k.p)
			? bkey_successor(b->key.k.p)
			: b->key.k.p;

		if (!i->size)
			break;
	}
	bch2_btree_iter_unlock(&iter);

	return err < 0 ? err : i->ret;
}

static const struct file_operations btree_format_debug_ops = {
	.owner = THIS_MODULE,
	.open = bch2_dump_open,
	.release = bch2_dump_release,
	.read = bch2_read_btree_formats,
};

static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
				       size_t size, loff_t *ppos)
{
	struct dump_iter *i = file->private_data;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct btree *prev_node = NULL;
	int err;

	i->ubuf = buf;
	i->size = size;
	i->ret = 0;

	err = flush_buf(i);
	if (err)
		return err;

	if (!i->size)
		return i->ret;

	bch2_btree_iter_init(&iter, i->c, i->id, i->from, BTREE_ITER_PREFETCH);

	while ((k = bch2_btree_iter_peek(&iter)).k &&
	       !(err = btree_iter_err(k))) {
		struct btree_iter_level *l = &iter.l[0];
		struct bkey_packed *_k =
			bch2_btree_node_iter_peek(&l->iter, l->b);

		if (l->b != prev_node) {
			i->bytes = bch2_print_btree_node(i->c, l->b, i->buf,
							 sizeof(i->buf));
			err = flush_buf(i);
			if (err)
				break;
		}
		prev_node = l->b;

		i->bytes = bch2_bkey_print_bfloat(l->b, _k, i->buf,
						  sizeof(i->buf));

		err = flush_buf(i);
		if (err)
			break;

		bch2_btree_iter_next(&iter);
		i->from = iter.pos;

		err = flush_buf(i);
		if (err)
			break;

		if (!i->size)
			break;
	}
	bch2_btree_iter_unlock(&iter);

	return err < 0 ? err : i->ret;
}

static const struct file_operations bfloat_failed_debug_ops = {
	.owner = THIS_MODULE,
	.open = bch2_dump_open,
	.release = bch2_dump_release,
	.read = bch2_read_bfloat_failed,
};

void bch2_fs_debug_exit(struct bch_fs *c)
{
	if (!IS_ERR_OR_NULL(c->debug))
		debugfs_remove_recursive(c->debug);
}

void bch2_fs_debug_init(struct bch_fs *c)
{
	struct btree_debug *bd;
	char name[100];

	if (IS_ERR_OR_NULL(bch_debug))
		return;

	snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
	c->debug = debugfs_create_dir(name, bch_debug);
	if (IS_ERR_OR_NULL(c->debug))
		return;

	for (bd = c->btree_debug;
	     bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
	     bd++) {
		bd->id = bd - c->btree_debug;
		bd->btree = debugfs_create_file(bch2_btree_ids[bd->id],
						0400, c->debug, bd,
						&btree_debug_ops);

		snprintf(name, sizeof(name), "%s-formats",
			 bch2_btree_ids[bd->id]);

		bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd,
						       &btree_format_debug_ops);

		snprintf(name, sizeof(name), "%s-bfloat-failed",
			 bch2_btree_ids[bd->id]);

		bd->failed = debugfs_create_file(name, 0400, c->debug, bd,
						 &bfloat_failed_debug_ops);
	}
}

#endif

void bch2_debug_exit(void)
{
	if (!IS_ERR_OR_NULL(bch_debug))
		debugfs_remove_recursive(bch_debug);
}

int __init bch2_debug_init(void)
{
	int ret = 0;

	bch_debug = debugfs_create_dir("bcachefs", NULL);
	return ret;
}

63
fs/bcachefs/debug.h
Normal file
@@ -0,0 +1,63 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_DEBUG_H
#define _BCACHEFS_DEBUG_H

#include "bcachefs.h"

struct bio;
struct btree;
struct bch_fs;

#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM

#define BCH_DEBUG_PARAM(name, description) \
	static inline bool name(struct bch_fs *c) \
	{ return bch2_##name || c->name; }
BCH_DEBUG_PARAMS_ALWAYS()
#undef BCH_DEBUG_PARAM

#ifdef CONFIG_BCACHEFS_DEBUG

#define BCH_DEBUG_PARAM(name, description) \
	static inline bool name(struct bch_fs *c) \
	{ return bch2_##name || c->name; }
BCH_DEBUG_PARAMS_DEBUG()
#undef BCH_DEBUG_PARAM

void __bch2_btree_verify(struct bch_fs *, struct btree *);

#define bypass_torture_test(d) ((d)->bypass_torture_test)

#else /* DEBUG */

#define BCH_DEBUG_PARAM(name, description) \
	static inline bool name(struct bch_fs *c) { return false; }
BCH_DEBUG_PARAMS_DEBUG()
#undef BCH_DEBUG_PARAM

static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {}

#define bypass_torture_test(d) 0

#endif

static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b)
{
	if (verify_btree_ondisk(c))
		__bch2_btree_verify(c, b);
}

#ifdef CONFIG_DEBUG_FS
void bch2_fs_debug_exit(struct bch_fs *);
void bch2_fs_debug_init(struct bch_fs *);
#else
static inline void bch2_fs_debug_exit(struct bch_fs *c) {}
static inline void bch2_fs_debug_init(struct bch_fs *c) {}
#endif

void bch2_debug_exit(void);
int bch2_debug_init(void);

#endif /* _BCACHEFS_DEBUG_H */

426
fs/bcachefs/dirent.c
Normal file
@@ -0,0 +1,426 @@
// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_update.h"
#include "extents.h"
#include "dirent.h"
#include "fs.h"
#include "keylist.h"
#include "str_hash.h"

#include <linux/dcache.h>

unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
{
	unsigned len = bkey_val_bytes(d.k) -
		offsetof(struct bch_dirent, d_name);

	while (len && !d.v->d_name[len - 1])
		--len;

	return len;
}

static unsigned dirent_val_u64s(unsigned len)
{
	return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len,
			    sizeof(u64));
}

static u64 bch2_dirent_hash(const struct bch_hash_info *info,
			    const struct qstr *name)
{
	struct bch_str_hash_ctx ctx;

	bch2_str_hash_init(&ctx, info);
	bch2_str_hash_update(&ctx, info, name->name, name->len);

	/* [0,2) reserved for dots */
	return max_t(u64, bch2_str_hash_end(&ctx, info), 2);
}

static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
{
	return bch2_dirent_hash(info, key);
}

static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
{
	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
	struct qstr name = QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));

	return bch2_dirent_hash(info, &name);
}

static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
{
	struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
	int len = bch2_dirent_name_bytes(l);
	const struct qstr *r = _r;

	return len - r->len ?: memcmp(l.v->d_name, r->name, len);
}

static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
{
	struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
	struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r);
	int l_len = bch2_dirent_name_bytes(l);
	int r_len = bch2_dirent_name_bytes(r);

	return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len);
}

const struct bch_hash_desc bch2_dirent_hash_desc = {
	.btree_id = BTREE_ID_DIRENTS,
	.key_type = BCH_DIRENT,
	.whiteout_type = BCH_DIRENT_WHITEOUT,
	.hash_key = dirent_hash_key,
	.hash_bkey = dirent_hash_bkey,
	.cmp_key = dirent_cmp_key,
	.cmp_bkey = dirent_cmp_bkey,
};

const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_s_c_dirent d;
	unsigned len;

	switch (k.k->type) {
	case BCH_DIRENT:
		if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent))
			return "value too small";

		d = bkey_s_c_to_dirent(k);
		len = bch2_dirent_name_bytes(d);

		if (!len)
			return "empty name";

		/*
		 * older versions of bcachefs were buggy and creating dirent
		 * keys that were bigger than necessary:
		 */
		if (bkey_val_u64s(k.k) > dirent_val_u64s(len + 7))
			return "value too big";

		if (len > BCH_NAME_MAX)
			return "dirent name too big";

		if (memchr(d.v->d_name, '/', len))
			return "dirent name has invalid characters";

		return NULL;
	case BCH_DIRENT_WHITEOUT:
		return bkey_val_bytes(k.k) != 0
			? "value size should be zero"
			: NULL;

	default:
		return "invalid type";
	}
}

void bch2_dirent_to_text(struct bch_fs *c, char *buf,
			 size_t size, struct bkey_s_c k)
{
	struct bkey_s_c_dirent d;
	size_t n = 0;

	switch (k.k->type) {
	case BCH_DIRENT:
		d = bkey_s_c_to_dirent(k);

		n += bch_scnmemcpy(buf + n, size - n, d.v->d_name,
				   bch2_dirent_name_bytes(d));
		n += scnprintf(buf + n, size - n, " -> %llu", d.v->d_inum);
		break;
	case BCH_DIRENT_WHITEOUT:
		scnprintf(buf, size, "whiteout");
		break;
	}
}

static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
				u8 type, const struct qstr *name, u64 dst)
{
	struct bkey_i_dirent *dirent;
	unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len);

	if (name->len > BCH_NAME_MAX)
		return ERR_PTR(-ENAMETOOLONG);

	BUG_ON(u64s > U8_MAX);

	dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
	if (IS_ERR(dirent))
		return dirent;

	bkey_dirent_init(&dirent->k_i);
	dirent->k.u64s = u64s;
	dirent->v.d_inum = cpu_to_le64(dst);
	dirent->v.d_type = type;

	memcpy(dirent->v.d_name, name->name, name->len);
	memset(dirent->v.d_name + name->len, 0,
	       bkey_val_bytes(&dirent->k) -
	       offsetof(struct bch_dirent, d_name) -
	       name->len);

	EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len);

	return dirent;
}

int __bch2_dirent_create(struct btree_trans *trans,
			 u64 dir_inum, const struct bch_hash_info *hash_info,
			 u8 type, const struct qstr *name, u64 dst_inum,
			 int flags)
{
	struct bkey_i_dirent *dirent;
	int ret;

	dirent = dirent_create_key(trans, type, name, dst_inum);
	ret = PTR_ERR_OR_ZERO(dirent);
	if (ret)
		return ret;

	return __bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
			       dir_inum, &dirent->k_i, flags);
}

int bch2_dirent_create(struct bch_fs *c, u64 dir_inum,
		       const struct bch_hash_info *hash_info,
		       u8 type, const struct qstr *name, u64 dst_inum,
		       u64 *journal_seq, int flags)
{
	return bch2_trans_do(c, journal_seq, flags,
		__bch2_dirent_create(&trans, dir_inum, hash_info,
				     type, name, dst_inum, flags));
}

static void dirent_copy_target(struct bkey_i_dirent *dst,
			       struct bkey_s_c_dirent src)
{
	dst->v.d_inum = src.v->d_inum;
	dst->v.d_type = src.v->d_type;
}

static struct bpos bch2_dirent_pos(struct bch_inode_info *inode,
				   const struct qstr *name)
{
	return POS(inode->v.i_ino, bch2_dirent_hash(&inode->ei_str_hash, name));
}

int bch2_dirent_rename(struct btree_trans *trans,
		struct bch_inode_info *src_dir, const struct qstr *src_name,
		struct bch_inode_info *dst_dir, const struct qstr *dst_name,
		enum bch_rename_mode mode)
{
	struct btree_iter *src_iter, *dst_iter;
	struct bkey_s_c old_src, old_dst;
	struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
	struct bpos dst_pos = bch2_dirent_pos(dst_dir, dst_name);
	int ret;

	/*
	 * Lookup dst:
	 *
	 * Note that in BCH_RENAME mode, we're _not_ checking if
	 * the target already exists - we're relying on the VFS
	 * to do that check for us for correctness:
	 */
	dst_iter = mode == BCH_RENAME
		? bch2_hash_hole(trans, bch2_dirent_hash_desc,
				 &dst_dir->ei_str_hash,
				 dst_dir->v.i_ino, dst_name)
		: bch2_hash_lookup(trans, bch2_dirent_hash_desc,
				   &dst_dir->ei_str_hash,
				   dst_dir->v.i_ino, dst_name,
				   BTREE_ITER_INTENT);
	if (IS_ERR(dst_iter))
		return PTR_ERR(dst_iter);
	old_dst = bch2_btree_iter_peek_slot(dst_iter);

	/* Lookup src: */
	src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc,
				    &src_dir->ei_str_hash,
				    src_dir->v.i_ino, src_name,
				    BTREE_ITER_INTENT);
	if (IS_ERR(src_iter))
		return PTR_ERR(src_iter);
	old_src = bch2_btree_iter_peek_slot(src_iter);

	/* Create new dst key: */
	new_dst = dirent_create_key(trans, 0, dst_name, 0);
	if (IS_ERR(new_dst))
		return PTR_ERR(new_dst);

	dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
	new_dst->k.p = dst_iter->pos;

	/* Create new src key: */
	if (mode == BCH_RENAME_EXCHANGE) {
		new_src = dirent_create_key(trans, 0, src_name, 0);
		if (IS_ERR(new_src))
			return PTR_ERR(new_src);

		dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
		new_src->k.p = src_iter->pos;
	} else {
		new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
		if (IS_ERR(new_src))
			return PTR_ERR(new_src);
		bkey_init(&new_src->k);
		new_src->k.p = src_iter->pos;

		if (bkey_cmp(dst_pos, src_iter->pos) <= 0 &&
		    bkey_cmp(src_iter->pos, dst_iter->pos) < 0) {
			/*
			 * We have a hash collision for the new dst key,
			 * and new_src - the key we're deleting - is between
			 * new_dst's hashed slot and the slot we're going to be
			 * inserting it into - oops. This will break the hash
			 * table if we don't deal with it:
			 */
			if (mode == BCH_RENAME) {
				/*
				 * If we're not overwriting, we can just insert
				 * new_dst at the src position:
				 */
				new_dst->k.p = src_iter->pos;
				bch2_trans_update(trans, src_iter, &new_dst->k_i, 0);
				return 0;
			} else {
				/* If we're overwriting, we can't insert new_dst
				 * at a different slot because it has to
				 * overwrite old_dst - just make sure to use a
				 * whiteout when deleting src:
				 */
				new_src->k.type = BCH_DIRENT_WHITEOUT;
			}
		} else {
			/* Check if we need a whiteout to delete src: */
			ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc,
						       &src_dir->ei_str_hash,
						       src_iter);
			if (ret < 0)
				return ret;

			if (ret)
				new_src->k.type = BCH_DIRENT_WHITEOUT;
		}
	}

	bch2_trans_update(trans, src_iter, &new_src->k_i, 0);
	bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0);
	return 0;
}

int __bch2_dirent_delete(struct btree_trans *trans, u64 dir_inum,
			 const struct bch_hash_info *hash_info,
			 const struct qstr *name)
{
	return bch2_hash_delete(trans, bch2_dirent_hash_desc, hash_info,
				dir_inum, name);
}

int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum,
		       const struct bch_hash_info *hash_info,
		       const struct qstr *name,
		       u64 *journal_seq)
{
	return bch2_trans_do(c, journal_seq,
			     BTREE_INSERT_ATOMIC|
			     BTREE_INSERT_NOFAIL,
		__bch2_dirent_delete(&trans, dir_inum, hash_info, name));
}

u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum,
		       const struct bch_hash_info *hash_info,
		       const struct qstr *name)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	u64 inum = 0;

	bch2_trans_init(&trans, c);

	iter = bch2_hash_lookup(&trans, bch2_dirent_hash_desc,
				hash_info, dir_inum, name, 0);
	if (IS_ERR(iter)) {
		BUG_ON(PTR_ERR(iter) == -EINTR);
		goto out;
	}

	k = bch2_btree_iter_peek_slot(iter);
	inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
out:
	bch2_trans_exit(&trans);
	return inum;
}

int bch2_empty_dir(struct bch_fs *c, u64 dir_inum)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(dir_inum, 0), 0, k) {
		if (k.k->p.inode > dir_inum)
			break;

		if (k.k->type == BCH_DIRENT) {
			ret = -ENOTEMPTY;
			break;
		}
	}
	bch2_btree_iter_unlock(&iter);

	return ret;
}

int bch2_readdir(struct bch_fs *c, struct file *file,
		 struct dir_context *ctx)
{
	struct bch_inode_info *inode = file_bch_inode(file);
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_s_c_dirent dirent;
	unsigned len;

	if (!dir_emit_dots(file, ctx))
		return 0;

	for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
			   POS(inode->v.i_ino, ctx->pos), 0, k) {
		if (k.k->type != BCH_DIRENT)
			continue;

		dirent = bkey_s_c_to_dirent(k);

		if (bkey_cmp(k.k->p, POS(inode->v.i_ino, ctx->pos)) < 0)
			continue;

		if (k.k->p.inode > inode->v.i_ino)
			break;

		len = bch2_dirent_name_bytes(dirent);

		/*
		 * XXX: dir_emit() can fault and block, while we're holding
		 * locks
		 */
		if (!dir_emit(ctx, dirent.v->d_name, len,
			      le64_to_cpu(dirent.v->d_inum),
			      dirent.v->d_type))
			break;

		ctx->pos = k.k->p.offset + 1;
	}
	bch2_btree_iter_unlock(&iter);

	return 0;
}

55
fs/bcachefs/dirent.h
Normal file
@@ -0,0 +1,55 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_DIRENT_H
#define _BCACHEFS_DIRENT_H

#include "str_hash.h"

extern const struct bch_hash_desc bch2_dirent_hash_desc;

const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_dirent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);

#define bch2_bkey_dirent_ops (struct bkey_ops) { \
	.key_invalid = bch2_dirent_invalid, \
	.val_to_text = bch2_dirent_to_text, \
}

struct qstr;
struct file;
struct dir_context;
struct bch_fs;
struct bch_hash_info;
struct bch_inode_info;

unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent);

int __bch2_dirent_create(struct btree_trans *, u64,
			 const struct bch_hash_info *, u8,
			 const struct qstr *, u64, int);
int bch2_dirent_create(struct bch_fs *c, u64, const struct bch_hash_info *,
		       u8, const struct qstr *, u64, u64 *, int);

int __bch2_dirent_delete(struct btree_trans *, u64,
			 const struct bch_hash_info *,
			 const struct qstr *);
int bch2_dirent_delete(struct bch_fs *, u64, const struct bch_hash_info *,
		       const struct qstr *, u64 *);

enum bch_rename_mode {
	BCH_RENAME,
	BCH_RENAME_OVERWRITE,
	BCH_RENAME_EXCHANGE,
};

int bch2_dirent_rename(struct btree_trans *,
		       struct bch_inode_info *, const struct qstr *,
		       struct bch_inode_info *, const struct qstr *,
		       enum bch_rename_mode);

u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *,
		       const struct qstr *);

int bch2_empty_dir(struct bch_fs *, u64);
int bch2_readdir(struct bch_fs *, struct file *, struct dir_context *);

#endif /* _BCACHEFS_DIRENT_H */

494
fs/bcachefs/disk_groups.c
Normal file
@@ -0,0 +1,494 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "disk_groups.h"
#include "super-io.h"

#include <linux/sort.h>

static int group_cmp(const void *_l, const void *_r)
{
	const struct bch_disk_group *l = _l;
	const struct bch_disk_group *r = _r;

	return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) -
		(BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?:
		((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) -
		 (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?:
		strncmp(l->label, r->label, sizeof(l->label));
}

static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb,
						struct bch_sb_field *f)
{
	struct bch_sb_field_disk_groups *groups =
		field_to_type(f, disk_groups);
	struct bch_disk_group *g, *sorted = NULL;
	struct bch_sb_field_members *mi;
	struct bch_member *m;
	unsigned i, nr_groups, len;
	const char *err = NULL;

	mi = bch2_sb_get_members(sb);
	groups = bch2_sb_get_disk_groups(sb);
	nr_groups = disk_groups_nr(groups);

	for (m = mi->members;
	     m < mi->members + sb->nr_devices;
	     m++) {
		unsigned g;

		if (!BCH_MEMBER_GROUP(m))
			continue;

		g = BCH_MEMBER_GROUP(m) - 1;

		if (g >= nr_groups ||
		    BCH_GROUP_DELETED(&groups->entries[g]))
			return "disk has invalid group";
	}

	if (!nr_groups)
		return NULL;

	for (g = groups->entries;
	     g < groups->entries + nr_groups;
	     g++) {
		if (BCH_GROUP_DELETED(g))
			continue;

		len = strnlen(g->label, sizeof(g->label));
		if (!len) {
			err = "group with empty label";
			goto err;
		}
	}

	sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL);
	if (!sorted)
		return "cannot allocate memory";

	memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted));
	sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL);

	for (i = 0; i + 1 < nr_groups; i++)
		if (!BCH_GROUP_DELETED(sorted + i) &&
		    !group_cmp(sorted + i, sorted + i + 1)) {
			err = "duplicate groups";
			goto err;
		}

	err = NULL;
err:
	kfree(sorted);
	return err;
}

static size_t bch2_sb_disk_groups_to_text(char *buf, size_t size,
					  struct bch_sb *sb,
					  struct bch_sb_field *f)
{
	char *out = buf, *end = buf + size;
	struct bch_sb_field_disk_groups *groups =
		field_to_type(f, disk_groups);
	struct bch_disk_group *g;
	unsigned nr_groups = disk_groups_nr(groups);

	for (g = groups->entries;
	     g < groups->entries + nr_groups;
	     g++) {
		if (g != groups->entries)
			out += scnprintf(out, end - out, " ");

		if (BCH_GROUP_DELETED(g))
			out += scnprintf(out, end - out, "[deleted]");
		else
			out += scnprintf(out, end - out,
					 "[parent %llu name %s]",
					 BCH_GROUP_PARENT(g),
					 g->label);
	}

	return out - buf;
}

const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = {
	.validate = bch2_sb_disk_groups_validate,
	.to_text = bch2_sb_disk_groups_to_text
};

int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
{
	struct bch_sb_field_members *mi;
	struct bch_sb_field_disk_groups *groups;
	struct bch_disk_groups_cpu *cpu_g, *old_g;
	unsigned i, g, nr_groups;

	lockdep_assert_held(&c->sb_lock);

	mi = bch2_sb_get_members(c->disk_sb.sb);
	groups = bch2_sb_get_disk_groups(c->disk_sb.sb);
	nr_groups = disk_groups_nr(groups);

	if (!groups)
		return 0;

	cpu_g = kzalloc(sizeof(*cpu_g) +
			sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL);
	if (!cpu_g)
		return -ENOMEM;

	cpu_g->nr = nr_groups;

	for (i = 0; i < nr_groups; i++) {
		struct bch_disk_group *src = &groups->entries[i];
		struct bch_disk_group_cpu *dst = &cpu_g->entries[i];

		dst->deleted = BCH_GROUP_DELETED(src);
		dst->parent = BCH_GROUP_PARENT(src);
	}

	for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
		struct bch_member *m = mi->members + i;
		struct bch_disk_group_cpu *dst =
			&cpu_g->entries[BCH_MEMBER_GROUP(m)];

		if (!bch2_member_exists(m))
			continue;

		g = BCH_MEMBER_GROUP(m);
		while (g) {
			dst = &cpu_g->entries[g - 1];
			__set_bit(i, dst->devs.d);
			g = dst->parent;
		}
	}

	old_g = rcu_dereference_protected(c->disk_groups,
				lockdep_is_held(&c->sb_lock));
	rcu_assign_pointer(c->disk_groups, cpu_g);
	if (old_g)
		kfree_rcu(old_g, rcu);

	return 0;
}

const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
{
	struct target t = target_decode(target);

	switch (t.type) {
	case TARGET_NULL:
		return NULL;
	case TARGET_DEV: {
		struct bch_dev *ca = t.dev < c->sb.nr_devices
			? rcu_dereference(c->devs[t.dev])
			: NULL;
		return ca ? &ca->self : NULL;
	}
	case TARGET_GROUP: {
		struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);

		return t.group < g->nr && !g->entries[t.group].deleted
			? &g->entries[t.group].devs
			: NULL;
	}
	default:
		BUG();
	}
}

bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
{
	struct target t = target_decode(target);

	switch (t.type) {
	case TARGET_NULL:
		return false;
	case TARGET_DEV:
		return dev == t.dev;
	case TARGET_GROUP: {
		struct bch_disk_groups_cpu *g;
		const struct bch_devs_mask *m;
		bool ret;

		rcu_read_lock();
		g = rcu_dereference(c->disk_groups);
		m = t.group < g->nr && !g->entries[t.group].deleted
			? &g->entries[t.group].devs
			: NULL;

		ret = m ? test_bit(dev, m->d) : false;
		rcu_read_unlock();

		return ret;
	}
	default:
		BUG();
	}
}

static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups,
				  unsigned parent,
				  const char *name, unsigned namelen)
{
	unsigned i, nr_groups = disk_groups_nr(groups);

	if (!namelen || namelen > BCH_SB_LABEL_SIZE)
		return -EINVAL;

	for (i = 0; i < nr_groups; i++) {
		struct bch_disk_group *g = groups->entries + i;

		if (BCH_GROUP_DELETED(g))
			continue;

		if (!BCH_GROUP_DELETED(g) &&
		    BCH_GROUP_PARENT(g) == parent &&
		    strnlen(g->label, sizeof(g->label)) == namelen &&
		    !memcmp(name, g->label, namelen))
			return i;
	}

	return -1;
}

static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent,
				 const char *name, unsigned namelen)
{
	struct bch_sb_field_disk_groups *groups =
		bch2_sb_get_disk_groups(sb->sb);
	unsigned i, nr_groups = disk_groups_nr(groups);
	struct bch_disk_group *g;

	if (!namelen || namelen > BCH_SB_LABEL_SIZE)
		return -EINVAL;

	for (i = 0;
	     i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]);
	     i++)
		;

	if (i == nr_groups) {
		unsigned u64s =
			(sizeof(struct bch_sb_field_disk_groups) +
			 sizeof(struct bch_disk_group) * (nr_groups + 1)) /
			sizeof(u64);

		groups = bch2_sb_resize_disk_groups(sb, u64s);
		if (!groups)
			return -ENOSPC;

		nr_groups = disk_groups_nr(groups);
	}

	BUG_ON(i >= nr_groups);

	g = &groups->entries[i];

	memcpy(g->label, name, namelen);
	if (namelen < sizeof(g->label))
		g->label[namelen] = '\0';
	SET_BCH_GROUP_DELETED(g, 0);
	SET_BCH_GROUP_PARENT(g, parent);
	SET_BCH_GROUP_DATA_ALLOWED(g, ~0);

	return i;
}

int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name)
{
	struct bch_sb_field_disk_groups *groups =
		bch2_sb_get_disk_groups(sb->sb);
	int v = -1;

	do {
		const char *next = strchrnul(name, '.');
		unsigned len = next - name;

		if (*next == '.')
			next++;

		v = __bch2_disk_group_find(groups, v + 1, name, len);
		name = next;
	} while (*name && v >= 0);

	return v;
}

int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
{
	struct bch_sb_field_disk_groups *groups;
	unsigned parent = 0;
	int v = -1;

	do {
		const char *next = strchrnul(name, '.');
		unsigned len = next - name;

		if (*next == '.')
			next++;

		groups = bch2_sb_get_disk_groups(sb->sb);

		v = __bch2_disk_group_find(groups, parent, name, len);
		if (v < 0)
			v = __bch2_disk_group_add(sb, parent, name, len);
		if (v < 0)
			return v;

		parent = v + 1;
		name = next;
	} while (*name && v >= 0);

	return v;
}

int bch2_disk_path_print(struct bch_sb_handle *sb,
			 char *buf, size_t len, unsigned v)
{
	char *out = buf, *end = out + len;
	struct bch_sb_field_disk_groups *groups =
		bch2_sb_get_disk_groups(sb->sb);
	struct bch_disk_group *g;
	unsigned nr = 0;
	u16 path[32];

	while (1) {
		if (nr == ARRAY_SIZE(path))
			goto inval;

		if (v >= disk_groups_nr(groups))
			goto inval;

		g = groups->entries + v;

		if (BCH_GROUP_DELETED(g))
			goto inval;

		path[nr++] = v;

		if (!BCH_GROUP_PARENT(g))
			break;

		v = BCH_GROUP_PARENT(g) - 1;
	}

	while (nr) {
		unsigned b = 0;

		v = path[--nr];
		g = groups->entries + v;

		if (end != out)
			b = min_t(size_t, end - out,
				  strnlen(g->label, sizeof(g->label)));
		memcpy(out, g->label, b);
		if (b < end - out)
			out[b] = '\0';
		out += b;

		if (nr)
			out += scnprintf(out, end - out, ".");
	}

	return out - buf;
inval:
	return scnprintf(buf, len, "invalid group %u", v);
}

int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
{
	struct bch_member *mi;
	int v = -1;

	mutex_lock(&c->sb_lock);

	if (!strlen(name) || !strcmp(name, "none"))
		goto write_sb;

	v = bch2_disk_path_find_or_create(&c->disk_sb, name);
	if (v < 0) {
		mutex_unlock(&c->sb_lock);
		return v;
	}

write_sb:
	mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
	SET_BCH_MEMBER_GROUP(mi, v + 1);

	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	return 0;
}

int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v)
{
	struct bch_dev *ca;
	int g;

	if (!strlen(buf) || !strcmp(buf, "none")) {
		*v = 0;
		return 0;
	}

	/* Is it a device? */
	ca = bch2_dev_lookup(c, buf);
	if (!IS_ERR(ca)) {
		*v = dev_to_target(ca->dev_idx);
		percpu_ref_put(&ca->ref);
		return 0;
	}

	mutex_lock(&c->sb_lock);
	g = bch2_disk_path_find(&c->disk_sb, buf);
	mutex_unlock(&c->sb_lock);

	if (g >= 0) {
		*v = group_to_target(g);
		return 0;
	}

	return -EINVAL;
}

int bch2_opt_target_print(struct bch_fs *c, char *buf, size_t len, u64 v)
{
	struct target t = target_decode(v);
	int ret;

	switch (t.type) {
	case TARGET_NULL:
		return scnprintf(buf, len, "none");
	case TARGET_DEV: {
		struct bch_dev *ca;

		rcu_read_lock();
		ca = t.dev < c->sb.nr_devices
			? rcu_dereference(c->devs[t.dev])
			: NULL;

		if (ca && percpu_ref_tryget(&ca->io_ref)) {
			ret = scnprintf(buf, len, "/dev/%pg",
					ca->disk_sb.bdev);
			percpu_ref_put(&ca->io_ref);
		} else if (ca) {
			ret = scnprintf(buf, len, "offline device %u", t.dev);
		} else {
			ret = scnprintf(buf, len, "invalid device %u", t.dev);
		}

		rcu_read_unlock();
		break;
	}
	case TARGET_GROUP:
		mutex_lock(&c->sb_lock);
		ret = bch2_disk_path_print(&c->disk_sb, buf, len, t.group);
		mutex_unlock(&c->sb_lock);
		break;
	default:
		BUG();
	}

	return ret;
}

74
fs/bcachefs/disk_groups.h
Normal file
@@ -0,0 +1,74 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_DISK_GROUPS_H
#define _BCACHEFS_DISK_GROUPS_H

extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups;

static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
{
	return groups
		? (vstruct_end(&groups->field) -
		   (void *) &groups->entries[0]) / sizeof(struct bch_disk_group)
		: 0;
}

struct target {
	enum {
		TARGET_NULL,
		TARGET_DEV,
		TARGET_GROUP,
	} type;
	union {
		unsigned dev;
		unsigned group;
	};
};

#define TARGET_DEV_START 1
#define TARGET_GROUP_START (256 + TARGET_DEV_START)

static inline u16 dev_to_target(unsigned dev)
{
	return TARGET_DEV_START + dev;
}

static inline u16 group_to_target(unsigned group)
{
	return TARGET_GROUP_START + group;
}

static inline struct target target_decode(unsigned target)
{
	if (target >= TARGET_GROUP_START)
		return (struct target) {
			.type = TARGET_GROUP,
			.group = target - TARGET_GROUP_START
		};

	if (target >= TARGET_DEV_START)
		return (struct target) {
			.type = TARGET_DEV,
			.group = target - TARGET_DEV_START
		};

	return (struct target) { .type = TARGET_NULL };
}

const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned);

int bch2_disk_path_find(struct bch_sb_handle *, const char *);
int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
int bch2_disk_path_print(struct bch_sb_handle *, char *, size_t, unsigned);

int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *);
int bch2_opt_target_print(struct bch_fs *, char *, size_t, u64);

int bch2_sb_disk_groups_to_cpu(struct bch_fs *);

int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);

const char *bch2_sb_validate_disk_groups(struct bch_sb *,
					 struct bch_sb_field *);

#endif /* _BCACHEFS_DISK_GROUPS_H */

159
fs/bcachefs/error.c
Normal file
@@ -0,0 +1,159 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "error.h"
#include "io.h"
#include "super.h"

bool bch2_inconsistent_error(struct bch_fs *c)
{
	set_bit(BCH_FS_ERROR, &c->flags);

	switch (c->opts.errors) {
	case BCH_ON_ERROR_CONTINUE:
		return false;
	case BCH_ON_ERROR_RO:
		if (bch2_fs_emergency_read_only(c))
			bch_err(c, "emergency read only");
		return true;
	case BCH_ON_ERROR_PANIC:
		panic(bch2_fmt(c, "panic after error"));
		return true;
	default:
		BUG();
	}
}

void bch2_fatal_error(struct bch_fs *c)
{
	if (bch2_fs_emergency_read_only(c))
		bch_err(c, "emergency read only");
}

void bch2_io_error_work(struct work_struct *work)
{
	struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
	struct bch_fs *c = ca->fs;
	bool dev;

	mutex_lock(&c->state_lock);
	dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_RO,
				     BCH_FORCE_IF_DEGRADED);
	if (dev
	    ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_RO,
				   BCH_FORCE_IF_DEGRADED)
	    : bch2_fs_emergency_read_only(c))
		bch_err(ca,
			"too many IO errors, setting %s RO",
			dev ? "device" : "filesystem");
	mutex_unlock(&c->state_lock);
}

void bch2_io_error(struct bch_dev *ca)
{
	//queue_work(system_long_wq, &ca->io_error_work);
}

#ifdef __KERNEL__
#define ask_yn() false
#else
#include "tools-util.h"
#endif

enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags,
				const char *fmt, ...)
{
	struct fsck_err_state *s;
	va_list args;
	bool fix = false, print = true, suppressing = false;
	char _buf[sizeof(s->buf)], *buf = _buf;

	mutex_lock(&c->fsck_error_lock);

	if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
		goto print;

	list_for_each_entry(s, &c->fsck_errors, list)
		if (s->fmt == fmt)
			goto found;

	s = kzalloc(sizeof(*s), GFP_KERNEL);
	if (!s) {
		if (!c->fsck_alloc_err)
			bch_err(c, "kmalloc err, cannot ratelimit fsck errs");
		c->fsck_alloc_err = true;
		buf = _buf;
		goto print;
	}

	INIT_LIST_HEAD(&s->list);
	s->fmt = fmt;
found:
	list_move(&s->list, &c->fsck_errors);
	s->nr++;
	suppressing = s->nr == 10;
	print = s->nr <= 10;
	buf = s->buf;
print:
	va_start(args, fmt);
	vscnprintf(buf, sizeof(_buf), fmt, args);
	va_end(args);

	if (c->opts.fix_errors == FSCK_OPT_EXIT) {
		bch_err(c, "%s, exiting", buf);
		mutex_unlock(&c->fsck_error_lock);
		return FSCK_ERR_EXIT;
	}

	if (flags & FSCK_CAN_FIX) {
		if (c->opts.fix_errors == FSCK_OPT_ASK) {
			printk(KERN_ERR "%s: fix?", buf);
			fix = ask_yn();
		} else if (c->opts.fix_errors == FSCK_OPT_YES ||
			   (c->opts.nochanges &&
			    !(flags & FSCK_CAN_IGNORE))) {
			if (print)
				bch_err(c, "%s, fixing", buf);
			fix = true;
		} else {
			if (print)
				bch_err(c, "%s, not fixing", buf);
			fix = false;
		}
	} else if (flags & FSCK_NEED_FSCK) {
		if (print)
			bch_err(c, "%s (run fsck to correct)", buf);
	} else {
		if (print)
			bch_err(c, "%s (repair unimplemented)", buf);
	}

	if (suppressing)
		bch_err(c, "Ratelimiting new instances of previous error");

	mutex_unlock(&c->fsck_error_lock);

	if (fix)
		set_bit(BCH_FS_FSCK_FIXED_ERRORS, &c->flags);

	return fix ? FSCK_ERR_FIX
		: flags & FSCK_CAN_IGNORE ? FSCK_ERR_IGNORE
		: FSCK_ERR_EXIT;
}

void bch2_flush_fsck_errs(struct bch_fs *c)
{
	struct fsck_err_state *s, *n;

	mutex_lock(&c->fsck_error_lock);
	set_bit(BCH_FS_FSCK_DONE, &c->flags);

	list_for_each_entry_safe(s, n, &c->fsck_errors, list) {
		if (s->nr > 10)
			bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf);

		list_del(&s->list);
		kfree(s);
	}

	mutex_unlock(&c->fsck_error_lock);
}
229
fs/bcachefs/error.h
Normal file
@ -0,0 +1,229 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_ERROR_H
#define _BCACHEFS_ERROR_H

#include <linux/list.h>
#include <linux/printk.h>

struct bch_dev;
struct bch_fs;
struct work_struct;

/*
 * XXX: separate out errors that indicate on disk data is inconsistent, and flag
 * superblock as such
 */

/* Error messages: */

/*
 * Very fatal logic/inconsistency errors: these indicate that we've majorly
 * screwed up at runtime, i.e. it's not likely that it was just caused by the
 * data on disk being inconsistent. These BUG():
 *
 * XXX: audit and convert to inconsistent() checks
 */

#define bch2_fs_bug(c, ...)                                     \
do {                                                            \
        bch_err(c, __VA_ARGS__);                                \
        BUG();                                                  \
} while (0)

#define bch2_fs_bug_on(cond, c, ...)                            \
do {                                                            \
        if (cond)                                               \
                bch2_fs_bug(c, __VA_ARGS__);                    \
} while (0)

/*
 * Inconsistency errors: The on disk data is inconsistent. If these occur during
 * initial recovery, they don't indicate a bug in the running code - we walk all
 * the metadata before modifying anything. If they occur at runtime, they
 * indicate either a bug in the running code or (less likely) data is being
 * silently corrupted under us.
 *
 * XXX: audit all inconsistent errors and make sure they're all recoverable, in
 * BCH_ON_ERROR_CONTINUE mode
 */

bool bch2_inconsistent_error(struct bch_fs *);

#define bch2_fs_inconsistent(c, ...)                            \
({                                                              \
        bch_err(c, __VA_ARGS__);                                \
        bch2_inconsistent_error(c);                             \
})

#define bch2_fs_inconsistent_on(cond, c, ...)                   \
({                                                              \
        int _ret = !!(cond);                                    \
                                                                \
        if (_ret)                                               \
                bch2_fs_inconsistent(c, __VA_ARGS__);           \
        _ret;                                                   \
})

/*
 * Later we might want to mark only the particular device inconsistent, not the
 * entire filesystem:
 */

#define bch2_dev_inconsistent(ca, ...)                          \
do {                                                            \
        bch_err(ca, __VA_ARGS__);                               \
        bch2_inconsistent_error((ca)->fs);                      \
} while (0)

#define bch2_dev_inconsistent_on(cond, ca, ...)                 \
({                                                              \
        int _ret = !!(cond);                                    \
                                                                \
        if (_ret)                                               \
                bch2_dev_inconsistent(ca, __VA_ARGS__);         \
        _ret;                                                   \
})

/*
 * Fsck errors: inconsistency errors we detect at mount time, and should ideally
 * be able to repair:
 */

enum {
        BCH_FSCK_OK                     = 0,
        BCH_FSCK_ERRORS_NOT_FIXED       = 1,
        BCH_FSCK_REPAIR_UNIMPLEMENTED   = 2,
        BCH_FSCK_REPAIR_IMPOSSIBLE      = 3,
        BCH_FSCK_UNKNOWN_VERSION        = 4,
};

enum fsck_err_opts {
        FSCK_OPT_EXIT,
        FSCK_OPT_YES,
        FSCK_OPT_NO,
        FSCK_OPT_ASK,
};

enum fsck_err_ret {
        FSCK_ERR_IGNORE = 0,
        FSCK_ERR_FIX    = 1,
        FSCK_ERR_EXIT   = 2,
};

struct fsck_err_state {
        struct list_head        list;
        const char              *fmt;
        u64                     nr;
        char                    buf[512];
};

#define FSCK_CAN_FIX            (1 << 0)
#define FSCK_CAN_IGNORE         (1 << 1)
#define FSCK_NEED_FSCK          (1 << 2)

enum fsck_err_ret bch2_fsck_err(struct bch_fs *,
                                unsigned, const char *, ...);
void bch2_flush_fsck_errs(struct bch_fs *);

#define __fsck_err(c, _flags, msg, ...)                         \
({                                                              \
        int _fix = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);\
                                                                \
        if (_fix == FSCK_ERR_EXIT) {                            \
                bch_err(c, "Unable to continue, halting");      \
                ret = BCH_FSCK_ERRORS_NOT_FIXED;                \
                goto fsck_err;                                  \
        }                                                       \
                                                                \
        _fix;                                                   \
})

/* These macros return true if error should be fixed: */

/* XXX: mark in superblock that filesystem contains errors, if we ignore: */

#define __fsck_err_on(cond, c, _flags, ...)                     \
        ((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false)

#define need_fsck_err_on(cond, c, ...)                          \
        __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)

#define need_fsck_err(c, ...)                                   \
        __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)

#define mustfix_fsck_err(c, ...)                                \
        __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__)

#define mustfix_fsck_err_on(cond, c, ...)                       \
        __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__)

#define fsck_err(c, ...)                                        \
        __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)

#define fsck_err_on(cond, c, ...)                               \
        __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)

/*
 * Fatal errors: these don't indicate a bug, but we can't continue running in RW
 * mode - pretty much just due to metadata IO errors:
 */

void bch2_fatal_error(struct bch_fs *);

#define bch2_fs_fatal_error(c, ...)                             \
do {                                                            \
        bch_err(c, __VA_ARGS__);                                \
        bch2_fatal_error(c);                                    \
} while (0)

#define bch2_fs_fatal_err_on(cond, c, ...)                      \
({                                                              \
        int _ret = !!(cond);                                    \
                                                                \
        if (_ret)                                               \
                bch2_fs_fatal_error(c, __VA_ARGS__);            \
        _ret;                                                   \
})

/*
 * IO errors: either recoverable metadata IO (because we have replicas), or data
 * IO - we need to log it and print out a message, but we don't (necessarily)
 * want to shut down the fs:
 */

void bch2_io_error_work(struct work_struct *);

/* Does the error handling without logging a message */
void bch2_io_error(struct bch_dev *);

/* Logs message and handles the error: */
#define bch2_dev_io_error(ca, fmt, ...)                         \
do {                                                            \
        printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs,          \
                "IO error on %s for " fmt),                     \
                (ca)->name, ##__VA_ARGS__);                     \
        bch2_io_error(ca);                                      \
} while (0)

#define bch2_dev_io_err_on(cond, ca, ...)                       \
({                                                              \
        bool _ret = (cond);                                     \
                                                                \
        if (_ret)                                               \
                bch2_dev_io_error(ca, __VA_ARGS__);             \
        _ret;                                                   \
})

/* kill? */

#define __bcache_io_error(c, fmt, ...)                          \
        printk_ratelimited(KERN_ERR bch2_fmt(c,                 \
                        "IO error: " fmt), ##__VA_ARGS__)

#define bcache_io_error(c, bio, fmt, ...)                       \
do {                                                            \
        __bcache_io_error(c, fmt, ##__VA_ARGS__);               \
        (bio)->bi_status = BLK_STS_IOERR;                       \
} while (0)

#endif /* _BCACHEFS_ERROR_H */

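The __fsck_err() wrappers above only work in a caller that declares an int ret and a fsck_err: label, since an unrecoverable error is turned into a goto. A minimal sketch of such a caller, not part of this commit; the function name and the checked condition are illustrative assumptions only:

/* illustrative sketch, not part of the commit */
static int check_something(struct bch_fs *c, u64 nlink, u64 expected)
{
        int ret = 0;

        if (fsck_err_on(nlink != expected, c,
                        "wrong link count (got %llu, should be %llu)",
                        nlink, expected)) {
                /* caller repairs the inconsistency here */
        }
fsck_err:
        return ret;
}
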
2395
fs/bcachefs/extents.c
Normal file
File diff suppressed because it is too large

539
fs/bcachefs/extents.h
Normal file
@ -0,0 +1,539 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_EXTENTS_H
#define _BCACHEFS_EXTENTS_H

#include "bcachefs.h"
#include "bkey.h"
#include "extents_types.h"

struct bch_fs;
struct journal_res;
struct btree_node_iter;
struct btree_node_iter_large;
struct btree_insert;
struct btree_insert_entry;
struct extent_insert_hook;
struct bch_devs_mask;
union bch_extent_crc;

const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_btree_ptr_debugcheck(struct bch_fs *, struct btree *,
                               struct bkey_s_c);
void bch2_btree_ptr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *);

#define bch2_bkey_btree_ops (struct bkey_ops) {                 \
        .key_invalid    = bch2_btree_ptr_invalid,               \
        .key_debugcheck = bch2_btree_ptr_debugcheck,            \
        .val_to_text    = bch2_btree_ptr_to_text,               \
        .swab           = bch2_ptr_swab,                        \
}

const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_extent_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
void bch2_extent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
bool bch2_ptr_normalize(struct bch_fs *, struct btree *, struct bkey_s);
enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *,
                                    struct bkey_i *, struct bkey_i *);

#define bch2_bkey_extent_ops (struct bkey_ops) {                \
        .key_invalid    = bch2_extent_invalid,                  \
        .key_debugcheck = bch2_extent_debugcheck,               \
        .val_to_text    = bch2_extent_to_text,                  \
        .swab           = bch2_ptr_swab,                        \
        .key_normalize  = bch2_ptr_normalize,                   \
        .key_merge      = bch2_extent_merge,                    \
        .is_extents     = true,                                 \
}

struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *,
                                        struct btree *,
                                        struct btree_node_iter_large *);
struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
                                        struct bset *,
                                        struct btree *,
                                        struct btree_node_iter_large *);

int bch2_btree_pick_ptr(struct bch_fs *, const struct btree *,
                        struct bch_devs_mask *avoid,
                        struct extent_pick_ptr *);

int bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c,
                         struct bch_devs_mask *,
                         struct extent_pick_ptr *);

enum btree_insert_ret
bch2_insert_fixup_extent(struct btree_insert *,
                         struct btree_insert_entry *);

bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent,
                                      unsigned, unsigned);

const struct bch_extent_ptr *
bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
bool bch2_extent_drop_device(struct bkey_s_extent, unsigned);
const struct bch_extent_ptr *
bch2_extent_has_group(struct bch_fs *, struct bkey_s_c_extent, unsigned);
const struct bch_extent_ptr *
bch2_extent_has_target(struct bch_fs *, struct bkey_s_c_extent, unsigned);

unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
unsigned bch2_extent_is_compressed(struct bkey_s_c);

unsigned bch2_extent_ptr_durability(struct bch_fs *,
                                    const struct bch_extent_ptr *);
unsigned bch2_extent_durability(struct bch_fs *, struct bkey_s_c_extent);

bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
                             struct bch_extent_ptr, u64);

static inline bool bkey_extent_is_data(const struct bkey *k)
{
        switch (k->type) {
        case BCH_EXTENT:
        case BCH_EXTENT_CACHED:
                return true;
        default:
                return false;
        }
}

static inline bool bkey_extent_is_allocation(const struct bkey *k)
{
        switch (k->type) {
        case BCH_EXTENT:
        case BCH_EXTENT_CACHED:
        case BCH_RESERVATION:
                return true;
        default:
                return false;
        }
}

static inline bool bch2_extent_is_fully_allocated(struct bkey_s_c k)
{
        return bkey_extent_is_allocation(k.k) &&
                !bch2_extent_is_compressed(k);
}

static inline bool bkey_extent_is_cached(const struct bkey *k)
{
        return k->type == BCH_EXTENT_CACHED;
}

static inline void bkey_extent_set_cached(struct bkey *k, bool cached)
{
        EBUG_ON(k->type != BCH_EXTENT &&
                k->type != BCH_EXTENT_CACHED);

        k->type = cached ? BCH_EXTENT_CACHED : BCH_EXTENT;
}

static inline unsigned
__extent_entry_type(const union bch_extent_entry *e)
{
        return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX;
}

static inline enum bch_extent_entry_type
extent_entry_type(const union bch_extent_entry *e)
{
        int ret = __ffs(e->type);

        EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX);

        return ret;
}

static inline size_t extent_entry_bytes(const union bch_extent_entry *entry)
{
        switch (extent_entry_type(entry)) {
        case BCH_EXTENT_ENTRY_crc32:
                return sizeof(struct bch_extent_crc32);
        case BCH_EXTENT_ENTRY_crc64:
                return sizeof(struct bch_extent_crc64);
        case BCH_EXTENT_ENTRY_crc128:
                return sizeof(struct bch_extent_crc128);
        case BCH_EXTENT_ENTRY_ptr:
                return sizeof(struct bch_extent_ptr);
        default:
                BUG();
        }
}

static inline size_t extent_entry_u64s(const union bch_extent_entry *entry)
{
        return extent_entry_bytes(entry) / sizeof(u64);
}

static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
{
        return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
}

static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
{
        return !extent_entry_is_ptr(e);
}

union bch_extent_crc {
        u8                      type;
        struct bch_extent_crc32 crc32;
        struct bch_extent_crc64 crc64;
        struct bch_extent_crc128 crc128;
};

/* downcast, preserves const */
#define to_entry(_entry)                                                \
({                                                                      \
        BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) &&       \
                     !type_is(_entry, struct bch_extent_ptr *));        \
                                                                        \
        __builtin_choose_expr(                                          \
                (type_is_exact(_entry, const union bch_extent_crc *) ||        \
                 type_is_exact(_entry, const struct bch_extent_ptr *)),        \
                (const union bch_extent_entry *) (_entry),              \
                (union bch_extent_entry *) (_entry));                   \
})

#define __entry_to_crc(_entry)                                          \
        __builtin_choose_expr(                                          \
                type_is_exact(_entry, const union bch_extent_entry *),  \
                (const union bch_extent_crc *) (_entry),                \
                (union bch_extent_crc *) (_entry))

#define entry_to_crc(_entry)                                            \
({                                                                      \
        EBUG_ON((_entry) && !extent_entry_is_crc(_entry));              \
                                                                        \
        __entry_to_crc(_entry);                                         \
})

#define entry_to_ptr(_entry)                                            \
({                                                                      \
        EBUG_ON((_entry) && !extent_entry_is_ptr(_entry));              \
                                                                        \
        __builtin_choose_expr(                                          \
                type_is_exact(_entry, const union bch_extent_entry *),  \
                (const struct bch_extent_ptr *) (_entry),               \
                (struct bch_extent_ptr *) (_entry));                    \
})

/* checksum entries: */

enum bch_extent_crc_type {
        BCH_EXTENT_CRC_NONE,
        BCH_EXTENT_CRC32,
        BCH_EXTENT_CRC64,
        BCH_EXTENT_CRC128,
};

static inline enum bch_extent_crc_type
__extent_crc_type(const union bch_extent_crc *crc)
{
        if (!crc)
                return BCH_EXTENT_CRC_NONE;

        switch (extent_entry_type(to_entry(crc))) {
        case BCH_EXTENT_ENTRY_crc32:
                return BCH_EXTENT_CRC32;
        case BCH_EXTENT_ENTRY_crc64:
                return BCH_EXTENT_CRC64;
        case BCH_EXTENT_ENTRY_crc128:
                return BCH_EXTENT_CRC128;
        default:
                BUG();
        }
}

#define extent_crc_type(_crc)                                           \
({                                                                      \
        BUILD_BUG_ON(!type_is(_crc, struct bch_extent_crc32 *) &&       \
                     !type_is(_crc, struct bch_extent_crc64 *) &&       \
                     !type_is(_crc, struct bch_extent_crc128 *) &&      \
                     !type_is(_crc, union bch_extent_crc *));           \
                                                                        \
          type_is(_crc, struct bch_extent_crc32 *)  ? BCH_EXTENT_CRC32  \
        : type_is(_crc, struct bch_extent_crc64 *)  ? BCH_EXTENT_CRC64  \
        : type_is(_crc, struct bch_extent_crc128 *) ? BCH_EXTENT_CRC128 \
        : __extent_crc_type((union bch_extent_crc *) _crc);             \
})

static inline struct bch_extent_crc_unpacked
bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
{
#define common_fields(_crc)                                             \
                .csum_type              = _crc.csum_type,               \
                .compression_type       = _crc.compression_type,        \
                .compressed_size        = _crc._compressed_size + 1,    \
                .uncompressed_size      = _crc._uncompressed_size + 1,  \
                .offset                 = _crc.offset,                  \
                .live_size              = k->size

        switch (extent_crc_type(crc)) {
        case BCH_EXTENT_CRC_NONE:
                return (struct bch_extent_crc_unpacked) {
                        .compressed_size        = k->size,
                        .uncompressed_size      = k->size,
                        .live_size              = k->size,
                };
        case BCH_EXTENT_CRC32: {
                struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
                        common_fields(crc->crc32),
                };

                *((__le32 *) &ret.csum.lo) = crc->crc32.csum;

                memcpy(&ret.csum.lo, &crc->crc32.csum,
                       sizeof(crc->crc32.csum));

                return ret;
        }
        case BCH_EXTENT_CRC64: {
                struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
                        common_fields(crc->crc64),
                        .nonce          = crc->crc64.nonce,
                        .csum.lo        = (__force __le64) crc->crc64.csum_lo,
                };

                *((__le16 *) &ret.csum.hi) = crc->crc64.csum_hi;

                return ret;
        }
        case BCH_EXTENT_CRC128: {
                struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
                        common_fields(crc->crc128),
                        .nonce          = crc->crc128.nonce,
                        .csum           = crc->crc128.csum,
                };

                return ret;
        }
        default:
                BUG();
        }
#undef common_fields
}

/* Extent entry iteration: */

#define extent_entry_next(_entry)                                       \
        ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))

#define extent_entry_last(_e)                                           \
        vstruct_idx((_e).v, bkey_val_u64s((_e).k))

/* Iterate over all entries: */

#define extent_for_each_entry_from(_e, _entry, _start)                  \
        for ((_entry) = _start;                                         \
             (_entry) < extent_entry_last(_e);                          \
             (_entry) = extent_entry_next(_entry))

#define extent_for_each_entry(_e, _entry)                               \
        extent_for_each_entry_from(_e, _entry, (_e).v->start)

/* Iterate over crcs only: */

#define __extent_crc_next(_e, _p)                                       \
({                                                                      \
        typeof(&(_e).v->start[0]) _entry = _p;                          \
                                                                        \
        while ((_entry) < extent_entry_last(_e) &&                      \
               !extent_entry_is_crc(_entry))                            \
                (_entry) = extent_entry_next(_entry);                   \
                                                                        \
        entry_to_crc(_entry < extent_entry_last(_e) ? _entry : NULL);   \
})

#define __extent_for_each_crc(_e, _crc)                                 \
        for ((_crc) = __extent_crc_next(_e, (_e).v->start);             \
             (_crc);                                                    \
             (_crc) = __extent_crc_next(_e, extent_entry_next(to_entry(_crc))))

#define extent_crc_next(_e, _crc, _iter)                                \
({                                                                      \
        extent_for_each_entry_from(_e, _iter, _iter)                    \
                if (extent_entry_is_crc(_iter)) {                       \
                        (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_iter));\
                        break;                                          \
                }                                                       \
                                                                        \
        (_iter) < extent_entry_last(_e);                                \
})

#define extent_for_each_crc(_e, _crc, _iter)                            \
        for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL),             \
             (_iter) = (_e).v->start;                                   \
             extent_crc_next(_e, _crc, _iter);                          \
             (_iter) = extent_entry_next(_iter))

/* Iterate over pointers, with crcs: */

#define extent_ptr_crc_next(_e, _ptr, _crc)                             \
({                                                                      \
        __label__ out;                                                  \
        typeof(&(_e).v->start[0]) _entry;                               \
                                                                        \
        extent_for_each_entry_from(_e, _entry, to_entry(_ptr))          \
                if (extent_entry_is_crc(_entry)) {                      \
                        (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_entry));\
                } else {                                                \
                        _ptr = entry_to_ptr(_entry);                    \
                        goto out;                                       \
                }                                                       \
                                                                        \
        _ptr = NULL;                                                    \
out:                                                                    \
        _ptr;                                                           \
})

#define extent_for_each_ptr_crc(_e, _ptr, _crc)                         \
        for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL),             \
             (_ptr) = &(_e).v->start->ptr;                              \
             ((_ptr) = extent_ptr_crc_next(_e, _ptr, _crc));            \
             (_ptr)++)

/* Iterate over pointers only, and from a given position: */

#define extent_ptr_next(_e, _ptr)                                       \
({                                                                      \
        struct bch_extent_crc_unpacked _crc;                            \
                                                                        \
        extent_ptr_crc_next(_e, _ptr, _crc);                            \
})

#define extent_for_each_ptr(_e, _ptr)                                   \
        for ((_ptr) = &(_e).v->start->ptr;                              \
             ((_ptr) = extent_ptr_next(_e, _ptr));                      \
             (_ptr)++)

#define extent_ptr_prev(_e, _ptr)                                       \
({                                                                      \
        typeof(&(_e).v->start->ptr) _p;                                 \
        typeof(&(_e).v->start->ptr) _prev = NULL;                       \
                                                                        \
        extent_for_each_ptr(_e, _p) {                                   \
                if (_p == (_ptr))                                       \
                        break;                                          \
                _prev = _p;                                             \
        }                                                               \
                                                                        \
        _prev;                                                          \
})

/*
 * Use this when you'll be dropping pointers as you iterate. Quadratic,
 * unfortunately:
 */
#define extent_for_each_ptr_backwards(_e, _ptr)                         \
        for ((_ptr) = extent_ptr_prev(_e, NULL);                        \
             (_ptr);                                                    \
             (_ptr) = extent_ptr_prev(_e, _ptr))

void bch2_extent_crc_append(struct bkey_i_extent *,
                            struct bch_extent_crc_unpacked);

static inline void __extent_entry_push(struct bkey_i_extent *e)
{
        union bch_extent_entry *entry = extent_entry_last(extent_i_to_s(e));

        EBUG_ON(bkey_val_u64s(&e->k) + extent_entry_u64s(entry) >
                BKEY_EXTENT_VAL_U64s_MAX);

        e->k.u64s += extent_entry_u64s(entry);
}

static inline void extent_ptr_append(struct bkey_i_extent *e,
                                     struct bch_extent_ptr ptr)
{
        ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
        extent_entry_last(extent_i_to_s(e))->ptr = ptr;
        __extent_entry_push(e);
}

static inline struct bch_devs_list bch2_extent_devs(struct bkey_s_c_extent e)
{
        struct bch_devs_list ret = (struct bch_devs_list) { 0 };
        const struct bch_extent_ptr *ptr;

        extent_for_each_ptr(e, ptr)
                ret.devs[ret.nr++] = ptr->dev;

        return ret;
}

static inline struct bch_devs_list bch2_extent_dirty_devs(struct bkey_s_c_extent e)
{
        struct bch_devs_list ret = (struct bch_devs_list) { 0 };
        const struct bch_extent_ptr *ptr;

        extent_for_each_ptr(e, ptr)
                if (!ptr->cached)
                        ret.devs[ret.nr++] = ptr->dev;

        return ret;
}

static inline struct bch_devs_list bch2_extent_cached_devs(struct bkey_s_c_extent e)
{
        struct bch_devs_list ret = (struct bch_devs_list) { 0 };
        const struct bch_extent_ptr *ptr;

        extent_for_each_ptr(e, ptr)
                if (ptr->cached)
                        ret.devs[ret.nr++] = ptr->dev;

        return ret;
}

static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
{
        switch (k.k->type) {
        case BCH_EXTENT:
        case BCH_EXTENT_CACHED:
                return bch2_extent_devs(bkey_s_c_to_extent(k));
        default:
                return (struct bch_devs_list) { .nr = 0 };
        }
}

static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
{
        switch (k.k->type) {
        case BCH_EXTENT:
        case BCH_EXTENT_CACHED:
                return bch2_extent_dirty_devs(bkey_s_c_to_extent(k));
        default:
                return (struct bch_devs_list) { .nr = 0 };
        }
}

static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
{
        switch (k.k->type) {
        case BCH_EXTENT:
        case BCH_EXTENT_CACHED:
                return bch2_extent_cached_devs(bkey_s_c_to_extent(k));
        default:
                return (struct bch_devs_list) { .nr = 0 };
        }
}

bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent,
                                 struct bch_extent_crc_unpacked);
bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked);
void bch2_extent_drop_redundant_crcs(struct bkey_s_extent);

void __bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *);
void bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *);

bool bch2_cut_front(struct bpos, struct bkey_i *);
bool bch2_cut_back(struct bpos, struct bkey *);
void bch2_key_resize(struct bkey *, unsigned);

int bch2_check_range_allocated(struct bch_fs *, struct bpos, u64);

#endif /* _BCACHEFS_EXTENTS_H */

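A minimal sketch, not part of this commit, of how the pointer/crc iterators declared above are meant to be used from a read path; the helper name and the csum_type test are illustrative assumptions:

/* illustrative sketch, not part of the commit */
static unsigned count_checksummed_ptrs(struct bkey_s_c_extent e)
{
        const struct bch_extent_ptr *ptr;
        struct bch_extent_crc_unpacked crc;
        unsigned nr = 0;

        /* walks each pointer with the crc entry that currently applies to it */
        extent_for_each_ptr_crc(e, ptr, crc)
                if (crc.csum_type)
                        nr++;

        return nr;
}
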
27
fs/bcachefs/extents_types.h
Normal file
@ -0,0 +1,27 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_EXTENTS_TYPES_H
#define _BCACHEFS_EXTENTS_TYPES_H

#include "bcachefs_format.h"

struct bch_extent_crc_unpacked {
        u8                      csum_type;
        u8                      compression_type;

        u16                     compressed_size;
        u16                     uncompressed_size;

        u16                     offset;
        u16                     live_size;

        u16                     nonce;

        struct bch_csum         csum;
};

struct extent_pick_ptr {
        struct bch_extent_ptr           ptr;
        struct bch_extent_crc_unpacked  crc;
};

#endif /* _BCACHEFS_EXTENTS_TYPES_H */

283
fs/bcachefs/eytzinger.h
Normal file
@ -0,0 +1,283 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _EYTZINGER_H
#define _EYTZINGER_H

#include <linux/bitops.h>
#include <linux/log2.h>

#include "util.h"

/*
 * Traversal for trees in eytzinger layout - a full binary tree laid out in an
 * array
 */

/*
 * One based indexing version:
 *
 * With one based indexing each level of the tree starts at a power of two -
 * good for cacheline alignment:
 *
 * Size parameter is treated as if we were using 0 based indexing, however:
 * valid nodes, and inorder indices, are in the range [1..size) - that is, there
 * are actually size - 1 elements
 */

static inline unsigned eytzinger1_child(unsigned i, unsigned child)
{
        EBUG_ON(child > 1);

        return (i << 1) + child;
}

static inline unsigned eytzinger1_left_child(unsigned i)
{
        return eytzinger1_child(i, 0);
}

static inline unsigned eytzinger1_right_child(unsigned i)
{
        return eytzinger1_child(i, 1);
}

static inline unsigned eytzinger1_first(unsigned size)
{
        return rounddown_pow_of_two(size - 1);
}

static inline unsigned eytzinger1_last(unsigned size)
{
        return rounddown_pow_of_two(size) - 1;
}

/*
 * eytzinger1_next() and eytzinger1_prev() have the nice properties that
 *
 * eytzinger1_next(0) == eytzinger1_first())
 * eytzinger1_prev(0) == eytzinger1_last())
 *
 * eytzinger1_prev(eytzinger1_first()) == 0
 * eytzinger1_next(eytzinger1_last()) == 0
 */

static inline unsigned eytzinger1_next(unsigned i, unsigned size)
{
        EBUG_ON(i >= size);

        if (eytzinger1_right_child(i) < size) {
                i = eytzinger1_right_child(i);

                i <<= __fls(size) - __fls(i);
                i >>= i >= size;
        } else {
                i >>= ffz(i) + 1;
        }

        return i;
}

static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
{
        EBUG_ON(i >= size);

        if (eytzinger1_left_child(i) < size) {
                i = eytzinger1_left_child(i) + 1;

                i <<= __fls(size) - __fls(i);
                i -= 1;
                i >>= i >= size;
        } else {
                i >>= __ffs(i) + 1;
        }

        return i;
}

static inline unsigned eytzinger1_extra(unsigned size)
{
        return (size - rounddown_pow_of_two(size - 1)) << 1;
}

static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
                                               unsigned extra)
{
        unsigned b = __fls(i);
        unsigned shift = __fls(size - 1) - b;
        int s;

        EBUG_ON(!i || i >= size);

        i  ^= 1U << b;
        i <<= 1;
        i  |= 1;
        i <<= shift;

        /*
         * sign bit trick:
         *
         * if (i > extra)
         *      i -= (i - extra) >> 1;
         */
        s = extra - i;
        i += (s >> 1) & (s >> 31);

        return i;
}

static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
                                               unsigned extra)
{
        unsigned shift;
        int s;

        EBUG_ON(!i || i >= size);

        /*
         * sign bit trick:
         *
         * if (i > extra)
         *      i += i - extra;
         */
        s = extra - i;
        i -= s & (s >> 31);

        shift = __ffs(i);

        i >>= shift + 1;
        i  |= 1U << (__fls(size - 1) - shift);

        return i;
}

static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size)
{
        return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size));
}

static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
{
        return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size));
}

#define eytzinger1_for_each(_i, _size)                  \
        for ((_i) = eytzinger1_first((_size));          \
             (_i) != 0;                                 \
             (_i) = eytzinger1_next((_i), (_size)))

/* Zero based indexing version: */

static inline unsigned eytzinger0_child(unsigned i, unsigned child)
{
        EBUG_ON(child > 1);

        return (i << 1) + 1 + child;
}

static inline unsigned eytzinger0_left_child(unsigned i)
{
        return eytzinger0_child(i, 0);
}

static inline unsigned eytzinger0_right_child(unsigned i)
{
        return eytzinger0_child(i, 1);
}

static inline unsigned eytzinger0_first(unsigned size)
{
        return eytzinger1_first(size + 1) - 1;
}

static inline unsigned eytzinger0_last(unsigned size)
{
        return eytzinger1_last(size + 1) - 1;
}

static inline unsigned eytzinger0_next(unsigned i, unsigned size)
{
        return eytzinger1_next(i + 1, size + 1) - 1;
}

static inline unsigned eytzinger0_prev(unsigned i, unsigned size)
{
        return eytzinger1_prev(i + 1, size + 1) - 1;
}

static inline unsigned eytzinger0_extra(unsigned size)
{
        return eytzinger1_extra(size + 1);
}

static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size,
                                               unsigned extra)
{
        return __eytzinger1_to_inorder(i + 1, size + 1, extra) - 1;
}

static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size,
                                               unsigned extra)
{
        return __inorder_to_eytzinger1(i + 1, size + 1, extra) - 1;
}

static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size)
{
        return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size));
}

static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
{
        return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size));
}

#define eytzinger0_for_each(_i, _size)                  \
        for ((_i) = eytzinger0_first((_size));          \
             (_i) != -1;                                \
             (_i) = eytzinger0_next((_i), (_size)))

typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);

/* return greatest node <= @search, or -1 if not found */
static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
                                         eytzinger_cmp_fn cmp, const void *search)
{
        unsigned i, n = 0;

        if (!nr)
                return -1;

        do {
                i = n;
                n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
        } while (n < nr);

        if (n & 1) {
                /* @i was greater than @search, return previous node: */

                if (i == eytzinger0_first(nr))
                        return -1;

                return eytzinger0_prev(i, nr);
        } else {
                return i;
        }
}

static inline size_t eytzinger0_find(void *base, size_t nr, size_t size,
                                     eytzinger_cmp_fn cmp, const void *search)
{
        size_t i = 0;
        int res;

        while (i < nr &&
               (res = cmp(search, base + i * size, size)))
                i = eytzinger0_child(i, res > 0);

        return i;
}

void eytzinger0_sort(void *, size_t, size_t,
                     int (*cmp_func)(const void *, const void *, size_t),
                     void (*swap_func)(void *, void *, size_t));

#endif /* _EYTZINGER_H */

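A minimal sketch, not part of this commit, of searching an eytzinger-ordered array with the zero-based helpers above; the element type and comparison callback are assumptions for the example:

/* illustrative sketch, not part of the commit */
static int cmp_u64(const void *l, const void *r, size_t size)
{
        return *(const u64 *) l < *(const u64 *) r ? -1
                : *(const u64 *) l > *(const u64 *) r ? 1 : 0;
}

static ssize_t lookup_le(u64 *tree, size_t nr, u64 search)
{
        /* tree[] must already be in eytzinger order, e.g. via eytzinger0_sort() */
        return eytzinger0_find_le(tree, nr, sizeof(u64), cmp_u64, &search);
}
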
125
fs/bcachefs/fifo.h
Normal file
@ -0,0 +1,125 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FIFO_H
#define _BCACHEFS_FIFO_H

#include "util.h"

#define FIFO(type)                                      \
struct {                                                \
        size_t front, back, size, mask;                 \
        type *data;                                     \
}

#define DECLARE_FIFO(type, name)        FIFO(type) name

#define fifo_buf_size(fifo)                                             \
        (roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]))

#define init_fifo(fifo, _size, _gfp)                                    \
({                                                                      \
        (fifo)->front   = (fifo)->back = 0;                             \
        (fifo)->size    = (_size);                                      \
        (fifo)->mask    = (fifo)->size                                  \
                ? roundup_pow_of_two((fifo)->size) - 1                  \
                : 0;                                                    \
        (fifo)->data    = kvpmalloc(fifo_buf_size(fifo), (_gfp));       \
})

#define free_fifo(fifo)                                                 \
do {                                                                    \
        kvpfree((fifo)->data, fifo_buf_size(fifo));                     \
        (fifo)->data = NULL;                                            \
} while (0)

#define fifo_swap(l, r)                                                 \
do {                                                                    \
        swap((l)->front, (r)->front);                                   \
        swap((l)->back, (r)->back);                                     \
        swap((l)->size, (r)->size);                                     \
        swap((l)->mask, (r)->mask);                                     \
        swap((l)->data, (r)->data);                                     \
} while (0)

#define fifo_move(dest, src)                                            \
do {                                                                    \
        typeof(*((dest)->data)) _t;                                     \
        while (!fifo_full(dest) &&                                      \
               fifo_pop(src, _t))                                       \
                fifo_push(dest, _t);                                    \
} while (0)

#define fifo_used(fifo)         (((fifo)->back - (fifo)->front))
#define fifo_free(fifo)         ((fifo)->size - fifo_used(fifo))

#define fifo_empty(fifo)        ((fifo)->front == (fifo)->back)
#define fifo_full(fifo)         (fifo_used(fifo) == (fifo)->size)

#define fifo_peek_front(fifo)   ((fifo)->data[(fifo)->front & (fifo)->mask])
#define fifo_peek_back(fifo)    ((fifo)->data[((fifo)->back - 1) & (fifo)->mask])

#define fifo_entry_idx_abs(fifo, p)                                     \
        ((((p) >= &fifo_peek_front(fifo)                                \
           ? (fifo)->front : (fifo)->back) & ~(fifo)->mask) +           \
           (((p) - (fifo)->data)))

#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask)
#define fifo_idx_entry(fifo, i) (fifo)->data[((fifo)->front + (i)) & (fifo)->mask]

#define fifo_push_back_ref(f)                                           \
        (fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask])

#define fifo_push_front_ref(f)                                          \
        (fifo_full((f)) ? NULL : &(f)->data[--(f)->front & (f)->mask])

#define fifo_push_back(fifo, new)                                       \
({                                                                      \
        typeof((fifo)->data) _r = fifo_push_back_ref(fifo);             \
        if (_r)                                                         \
                *_r = (new);                                            \
        _r != NULL;                                                     \
})

#define fifo_push_front(fifo, new)                                      \
({                                                                      \
        typeof((fifo)->data) _r = fifo_push_front_ref(fifo);            \
        if (_r)                                                         \
                *_r = (new);                                            \
        _r != NULL;                                                     \
})

#define fifo_pop_front(fifo, i)                                         \
({                                                                      \
        bool _r = !fifo_empty((fifo));                                  \
        if (_r)                                                         \
                (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask];     \
        _r;                                                             \
})

#define fifo_pop_back(fifo, i)                                          \
({                                                                      \
        bool _r = !fifo_empty((fifo));                                  \
        if (_r)                                                         \
                (i) = (fifo)->data[--(fifo)->back & (fifo)->mask];      \
        _r;                                                             \
})

#define fifo_push_ref(fifo)     fifo_push_back_ref(fifo)
#define fifo_push(fifo, i)      fifo_push_back(fifo, (i))
#define fifo_pop(fifo, i)       fifo_pop_front(fifo, (i))
#define fifo_peek(fifo)         fifo_peek_front(fifo)

#define fifo_for_each_entry(_entry, _fifo, _iter)                       \
        for (((void) (&(_iter) == &(_fifo)->front)),                    \
             _iter = (_fifo)->front;                                    \
             ((_iter != (_fifo)->back) &&                               \
              (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \
             _iter++)

#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter)                     \
        for (((void) (&(_iter) == &(_fifo)->front)),                    \
             _iter = (_fifo)->front;                                    \
             ((_iter != (_fifo)->back) &&                               \
              (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true));  \
             _iter++)

#endif /* _BCACHEFS_FIFO_H */

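A minimal sketch, not part of this commit, showing the intended FIFO lifecycle with the macros above; the variable names are illustrative and allocation-failure handling is elided:

/* illustrative sketch, not part of the commit */
static void fifo_example(void)
{
        DECLARE_FIFO(u64, journal_seqs);
        u64 seq;

        init_fifo(&journal_seqs, 8, GFP_KERNEL);

        fifo_push(&journal_seqs, 42);

        while (fifo_pop(&journal_seqs, seq))
                ; /* consume seq */

        free_fifo(&journal_seqs);
}
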
2862
fs/bcachefs/fs-io.c
Normal file
File diff suppressed because it is too large

47
fs/bcachefs/fs-io.h
Normal file
@ -0,0 +1,47 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FS_IO_H
#define _BCACHEFS_FS_IO_H

#ifndef NO_BCACHEFS_FS

#include "buckets.h"
#include "io_types.h"

#include <linux/uio.h>

bool bch2_dirty_folio(struct address_space *, struct folio *);

int bch2_writepage(struct page *, struct writeback_control *);
int bch2_read_folio(struct file *, struct folio *);

int bch2_writepages(struct address_space *, struct writeback_control *);
void bch2_readahead(struct readahead_control *);

int bch2_write_begin(struct file *, struct address_space *, loff_t,
                     unsigned, struct page **, void **);
int bch2_write_end(struct file *, struct address_space *, loff_t,
                   unsigned, unsigned, struct page *, void *);

ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *);
ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);

int bch2_fsync(struct file *, loff_t, loff_t, int);

int bch2_truncate(struct bch_inode_info *, struct iattr *);
long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);

loff_t bch2_llseek(struct file *, loff_t, int);

vm_fault_t bch2_page_fault(struct vm_fault *);
vm_fault_t bch2_page_mkwrite(struct vm_fault *);
void bch2_invalidate_folio(struct folio *, size_t, size_t);
bool bch2_release_folio(struct folio *, gfp_t);

void bch2_fs_fsio_exit(struct bch_fs *);
int bch2_fs_fsio_init(struct bch_fs *);
#else
static inline void bch2_fs_fsio_exit(struct bch_fs *c) {}
static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; }
#endif

#endif /* _BCACHEFS_FS_IO_H */

312
fs/bcachefs/fs-ioctl.c
Normal file
@ -0,0 +1,312 @@
// SPDX-License-Identifier: GPL-2.0
#ifndef NO_BCACHEFS_FS

#include "bcachefs.h"
#include "chardev.h"
#include "fs.h"
#include "fs-ioctl.h"
#include "quota.h"

#include <linux/compat.h>
#include <linux/mount.h>

#define FS_IOC_GOINGDOWN        _IOR('X', 125, __u32)

/* Inode flags: */

/* bcachefs inode flags -> vfs inode flags: */
static const unsigned bch_flags_to_vfs[] = {
        [__BCH_INODE_SYNC]      = S_SYNC,
        [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE,
        [__BCH_INODE_APPEND]    = S_APPEND,
        [__BCH_INODE_NOATIME]   = S_NOATIME,
};

/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
static const unsigned bch_flags_to_uflags[] = {
        [__BCH_INODE_SYNC]      = FS_SYNC_FL,
        [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL,
        [__BCH_INODE_APPEND]    = FS_APPEND_FL,
        [__BCH_INODE_NODUMP]    = FS_NODUMP_FL,
        [__BCH_INODE_NOATIME]   = FS_NOATIME_FL,
};

/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
static const unsigned bch_flags_to_xflags[] = {
        [__BCH_INODE_SYNC]      = FS_XFLAG_SYNC,
        [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE,
        [__BCH_INODE_APPEND]    = FS_XFLAG_APPEND,
        [__BCH_INODE_NODUMP]    = FS_XFLAG_NODUMP,
        [__BCH_INODE_NOATIME]   = FS_XFLAG_NOATIME,
        //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT;
};

#define set_flags(_map, _in, _out)                                      \
do {                                                                    \
        unsigned _i;                                                    \
                                                                        \
        for (_i = 0; _i < ARRAY_SIZE(_map); _i++)                       \
                if ((_in) & (1 << _i))                                  \
                        (_out) |= _map[_i];                             \
                else                                                    \
                        (_out) &= ~_map[_i];                            \
} while (0)

#define map_flags(_map, _in)                                            \
({                                                                      \
        unsigned _out = 0;                                              \
                                                                        \
        set_flags(_map, _in, _out);                                     \
        _out;                                                           \
})

#define map_flags_rev(_map, _in)                                        \
({                                                                      \
        unsigned _i, _out = 0;                                          \
                                                                        \
        for (_i = 0; _i < ARRAY_SIZE(_map); _i++)                       \
                if ((_in) & _map[_i]) {                                 \
                        (_out) |= 1 << _i;                              \
                        (_in) &= ~_map[_i];                             \
                }                                                       \
        (_out);                                                         \
})

#define map_defined(_map)                                               \
({                                                                      \
        unsigned _in = ~0;                                              \
                                                                        \
        map_flags_rev(_map, _in);                                       \
})

/* Set VFS inode flags from bcachefs inode: */
void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
{
        set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
}

struct flags_set {
        unsigned        mask;
        unsigned        flags;

        unsigned        projid;
};

static int bch2_inode_flags_set(struct bch_inode_info *inode,
                                struct bch_inode_unpacked *bi,
                                void *p)
{
        /*
         * We're relying on btree locking here for exclusion with other ioctl
         * calls - use the flags in the btree (@bi), not inode->i_flags:
         */
        struct flags_set *s = p;
        unsigned newflags = s->flags;
        unsigned oldflags = bi->bi_flags & s->mask;

        if (((newflags ^ oldflags) & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) &&
            !capable(CAP_LINUX_IMMUTABLE))
                return -EPERM;

        if (!S_ISREG(inode->v.i_mode) &&
            !S_ISDIR(inode->v.i_mode) &&
            (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags)
                return -EINVAL;

        bi->bi_flags &= ~s->mask;
        bi->bi_flags |= newflags;
        inode_set_ctime_current(&inode->v);
        return 0;
}

static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg)
{
        unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags);

        return put_user(flags, arg);
}

static int bch2_ioc_setflags(struct bch_fs *c,
                             struct file *file,
                             struct bch_inode_info *inode,
                             void __user *arg)
{
        struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) };
        unsigned uflags;
        int ret;

        if (get_user(uflags, (int __user *) arg))
                return -EFAULT;

        s.flags = map_flags_rev(bch_flags_to_uflags, uflags);
        if (uflags)
                return -EOPNOTSUPP;

        ret = mnt_want_write_file(file);
        if (ret)
                return ret;

        inode_lock(&inode->v);
        if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
                ret = -EACCES;
                goto setflags_out;
        }

        mutex_lock(&inode->ei_update_lock);
        ret = __bch2_write_inode(c, inode, bch2_inode_flags_set, &s, 0);

        if (!ret)
                bch2_inode_flags_to_vfs(inode);
        mutex_unlock(&inode->ei_update_lock);

setflags_out:
        inode_unlock(&inode->v);
        mnt_drop_write_file(file);
        return ret;
}

static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
                               struct fsxattr __user *arg)
{
        struct fsxattr fa = { 0 };

        fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
        fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ];

        return copy_to_user(arg, &fa, sizeof(fa));
}

static int bch2_set_projid(struct bch_fs *c,
                           struct bch_inode_info *inode,
                           u32 projid)
{
        struct bch_qid qid = inode->ei_qid;
        int ret;

        if (projid == inode->ei_qid.q[QTYP_PRJ])
                return 0;

        qid.q[QTYP_PRJ] = projid;

        ret = bch2_quota_transfer(c, 1 << QTYP_PRJ, qid, inode->ei_qid,
                                  inode->v.i_blocks +
                                  inode->ei_quota_reserved);
        if (ret)
                return ret;

        inode->ei_qid.q[QTYP_PRJ] = projid;
        return 0;
}

static int fssetxattr_inode_update_fn(struct bch_inode_info *inode,
                                      struct bch_inode_unpacked *bi,
                                      void *p)
{
        struct flags_set *s = p;

        bi->bi_project = s->projid;

        return bch2_inode_flags_set(inode, bi, p);
}

static int bch2_ioc_fssetxattr(struct bch_fs *c,
                               struct file *file,
                               struct bch_inode_info *inode,
                               struct fsxattr __user *arg)
{
        struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) };
        struct fsxattr fa;
        int ret;

        if (copy_from_user(&fa, arg, sizeof(fa)))
                return -EFAULT;

        s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags);
        if (fa.fsx_xflags)
                return -EOPNOTSUPP;

        s.projid = fa.fsx_projid;

        ret = mnt_want_write_file(file);
        if (ret)
                return ret;

        inode_lock(&inode->v);
        if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
                ret = -EACCES;
                goto err;
        }

        mutex_lock(&inode->ei_update_lock);
        ret = bch2_set_projid(c, inode, fa.fsx_projid);
        if (ret)
                goto err_unlock;

        ret = __bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, 0);
        if (!ret)
                bch2_inode_flags_to_vfs(inode);
err_unlock:
        mutex_unlock(&inode->ei_update_lock);
err:
        inode_unlock(&inode->v);
        mnt_drop_write_file(file);
        return ret;
}

long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
        struct bch_inode_info *inode = file_bch_inode(file);
        struct super_block *sb = inode->v.i_sb;
        struct bch_fs *c = sb->s_fs_info;

        switch (cmd) {
        case FS_IOC_GETFLAGS:
                return bch2_ioc_getflags(inode, (int __user *) arg);

        case FS_IOC_SETFLAGS:
                return bch2_ioc_setflags(c, file, inode, (int __user *) arg);

        case FS_IOC_FSGETXATTR:
                return bch2_ioc_fsgetxattr(inode, (void __user *) arg);
        case FS_IOC_FSSETXATTR:
                return bch2_ioc_fssetxattr(c, file, inode, (void __user *) arg);

        case FS_IOC_GETVERSION:
                return -ENOTTY;
        case FS_IOC_SETVERSION:
                return -ENOTTY;

        case FS_IOC_GOINGDOWN:
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;

                down_write(&sb->s_umount);
                sb->s_flags |= SB_RDONLY;
                bch2_fs_emergency_read_only(c);
                up_write(&sb->s_umount);
                return 0;

        default:
                return bch2_fs_ioctl(c, cmd, (void __user *) arg);
        }
}

#ifdef CONFIG_COMPAT
long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
        /* These are just misnamed, they actually get/put from/to user an int */
        switch (cmd) {
        case FS_IOC_GETFLAGS:
                cmd = FS_IOC_GETFLAGS;
                break;
        case FS_IOC32_SETFLAGS:
                cmd = FS_IOC_SETFLAGS;
                break;
        default:
                return -ENOIOCTLCMD;
        }
        return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
}
#endif

#endif /* NO_BCACHEFS_FS */

10
fs/bcachefs/fs-ioctl.h
Normal file
@ -0,0 +1,10 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FS_IOCTL_H
#define _BCACHEFS_FS_IOCTL_H

void bch2_inode_flags_to_vfs(struct bch_inode_info *);

long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long);
long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long);

#endif /* _BCACHEFS_FS_IOCTL_H */

1773
fs/bcachefs/fs.c
Normal file
File diff suppressed because it is too large

99
fs/bcachefs/fs.h
Normal file
@ -0,0 +1,99 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FS_H
#define _BCACHEFS_FS_H

#include "opts.h"
#include "str_hash.h"
#include "quota_types.h"

#include <linux/seqlock.h>
#include <linux/stat.h>

/*
 * Two-state lock - can be taken for add or block - both states are shared,
 * like read side of rwsem, but conflict with other state:
 */
struct pagecache_lock {
        atomic_long_t           v;
        wait_queue_head_t       wait;
};

static inline void pagecache_lock_init(struct pagecache_lock *lock)
{
        atomic_long_set(&lock->v, 0);
        init_waitqueue_head(&lock->wait);
}

void bch2_pagecache_add_put(struct pagecache_lock *);
void bch2_pagecache_add_get(struct pagecache_lock *);
void bch2_pagecache_block_put(struct pagecache_lock *);
void bch2_pagecache_block_get(struct pagecache_lock *);

struct bch_inode_info {
        struct inode            v;

        struct mutex            ei_update_lock;
        u64                     ei_journal_seq;
        u64                     ei_quota_reserved;
        unsigned long           ei_last_dirtied;
        struct pagecache_lock   ei_pagecache_lock;

        struct mutex            ei_quota_lock;
        struct bch_qid          ei_qid;

        struct bch_hash_info    ei_str_hash;

        /* copy of inode in btree: */
        struct bch_inode_unpacked ei_inode;
};

#define to_bch_ei(_inode)                                       \
        container_of_or_null(_inode, struct bch_inode_info, v)

static inline struct bch_inode_info *file_bch_inode(struct file *file)
{
        return to_bch_ei(file_inode(file));
}

static inline u8 mode_to_type(umode_t mode)
{
        return (mode >> 12) & 15;
}

static inline unsigned nlink_bias(umode_t mode)
{
        return S_ISDIR(mode) ? 2 : 1;
}

struct bch_inode_unpacked;

#ifndef NO_BCACHEFS_FS

/* returns 0 if we want to do the update, or error is passed up */
typedef int (*inode_set_fn)(struct bch_inode_info *,
                            struct bch_inode_unpacked *, void *);

void bch2_inode_update_after_write(struct bch_fs *,
                                   struct bch_inode_info *,
                                   struct bch_inode_unpacked *,
                                   unsigned);
int __must_check bch2_write_inode_trans(struct btree_trans *,
                                struct bch_inode_info *,
                                struct bch_inode_unpacked *,
                                inode_set_fn, void *);
int __must_check __bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
                                    inode_set_fn, void *, unsigned);
int __must_check bch2_write_inode(struct bch_fs *,
                                  struct bch_inode_info *);

void bch2_vfs_exit(void);
int bch2_vfs_init(void);

#else

static inline void bch2_vfs_exit(void) {}
static inline int bch2_vfs_init(void) { return 0; }

#endif /* NO_BCACHEFS_FS */

#endif /* _BCACHEFS_FS_H */

1306
fs/bcachefs/fsck.c
Normal file
File diff suppressed because it is too large

8
fs/bcachefs/fsck.h
Normal file
@ -0,0 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FSCK_H
#define _BCACHEFS_FSCK_H

s64 bch2_count_inode_sectors(struct bch_fs *, u64);
int bch2_fsck(struct bch_fs *);

#endif /* _BCACHEFS_FSCK_H */

517
fs/bcachefs/inode.c
Normal file
@ -0,0 +1,517 @@
// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_update.h"
#include "error.h"
#include "extents.h"
#include "inode.h"
#include "io.h"
#include "keylist.h"

#include <linux/random.h>

#include <asm/unaligned.h>

#define FIELD_BYTES()                                           \

static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
static const u8 bits_table[8] = {
        1  * 8 - 1,
        2  * 8 - 2,
        3  * 8 - 3,
        4  * 8 - 4,
        6  * 8 - 5,
        8  * 8 - 6,
        10 * 8 - 7,
        13 * 8 - 8,
};

static int inode_encode_field(u8 *out, u8 *end, u64 hi, u64 lo)
{
        __be64 in[2] = { cpu_to_be64(hi), cpu_to_be64(lo), };
        unsigned shift, bytes, bits = likely(!hi)
                ? fls64(lo)
                : fls64(hi) + 64;

        for (shift = 1; shift <= 8; shift++)
                if (bits < bits_table[shift - 1])
                        goto got_shift;

        BUG();
got_shift:
        bytes = byte_table[shift - 1];

        BUG_ON(out + bytes > end);

        memcpy(out, (u8 *) in + 16 - bytes, bytes);
        *out |= (1 << 8) >> shift;

        return bytes;
}

static int inode_decode_field(const u8 *in, const u8 *end,
                              u64 out[2], unsigned *out_bits)
{
        __be64 be[2] = { 0, 0 };
        unsigned bytes, shift;
        u8 *p;

        if (in >= end)
                return -1;

        if (!*in)
                return -1;

        /*
         * position of highest set bit indicates number of bytes:
         * shift = number of bits to remove in high byte:
         */
        shift   = 8 - __fls(*in); /* 1 <= shift <= 8 */
        bytes   = byte_table[shift - 1];

        if (in + bytes > end)
                return -1;

        p = (u8 *) be + 16 - bytes;
        memcpy(p, in, bytes);
        *p ^= (1 << 8) >> shift;

        out[0] = be64_to_cpu(be[0]);
        out[1] = be64_to_cpu(be[1]);
        *out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]);

        return bytes;
}
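
/*
 * Worked example of the variable length encoding above (added for clarity,
 * not part of the original commit): encoding the value 1000 (hi = 0,
 * lo = 1000), fls64(1000) = 10 bits, and the smallest bits_table[] entry
 * that can hold 10 bits is 14 (shift = 2), so byte_table[] selects a 2 byte
 * field. The two low big-endian bytes of 1000 are 0x03 0xe8, and
 * (1 << 8) >> 2 = 0x40 is or'd into the first byte as the length marker,
 * giving 0x43 0xe8 on disk. inode_decode_field() reads __fls(0x43) = 6,
 * so shift = 2 and bytes = 2, strips the 0x40 marker and recovers 1000.
 */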
|
||||
void bch2_inode_pack(struct bkey_inode_buf *packed,
|
||||
const struct bch_inode_unpacked *inode)
|
||||
{
|
||||
u8 *out = packed->inode.v.fields;
|
||||
u8 *end = (void *) &packed[1];
|
||||
u8 *last_nonzero_field = out;
|
||||
unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
|
||||
|
||||
bkey_inode_init(&packed->inode.k_i);
|
||||
packed->inode.k.p.inode = inode->bi_inum;
|
||||
packed->inode.v.bi_hash_seed = inode->bi_hash_seed;
|
||||
packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags);
|
||||
packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode);
|
||||
|
||||
#define BCH_INODE_FIELD(_name, _bits) \
|
||||
out += inode_encode_field(out, end, 0, inode->_name); \
|
||||
nr_fields++; \
|
||||
\
|
||||
if (inode->_name) { \
|
||||
last_nonzero_field = out; \
|
||||
last_nonzero_fieldnr = nr_fields; \
|
||||
}
|
||||
|
||||
BCH_INODE_FIELDS()
|
||||
#undef BCH_INODE_FIELD
|
||||
|
||||
out = last_nonzero_field;
|
||||
nr_fields = last_nonzero_fieldnr;
|
||||
|
||||
set_bkey_val_bytes(&packed->inode.k, out - (u8 *) &packed->inode.v);
|
||||
memset(out, 0,
|
||||
(u8 *) &packed->inode.v +
|
||||
bkey_val_bytes(&packed->inode.k) - out);
|
||||
|
||||
SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields);
|
||||
|
||||
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
|
||||
struct bch_inode_unpacked unpacked;
|
||||
|
||||
int ret = bch2_inode_unpack(inode_i_to_s_c(&packed->inode),
|
||||
&unpacked);
|
||||
BUG_ON(ret);
|
||||
BUG_ON(unpacked.bi_inum != inode->bi_inum);
|
||||
BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed);
|
||||
BUG_ON(unpacked.bi_mode != inode->bi_mode);
|
||||
|
||||
#define BCH_INODE_FIELD(_name, _bits) BUG_ON(unpacked._name != inode->_name);
|
||||
BCH_INODE_FIELDS()
|
||||
#undef BCH_INODE_FIELD
|
||||
}
|
||||
}
|
||||
|
||||
int bch2_inode_unpack(struct bkey_s_c_inode inode,
|
||||
struct bch_inode_unpacked *unpacked)
|
||||
{
|
||||
const u8 *in = inode.v->fields;
|
||||
const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k);
|
||||
u64 field[2];
|
||||
unsigned fieldnr = 0, field_bits;
|
||||
int ret;
|
||||
|
||||
unpacked->bi_inum = inode.k->p.inode;
|
||||
unpacked->bi_hash_seed = inode.v->bi_hash_seed;
|
||||
unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
|
||||
unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
|
||||
|
||||
#define BCH_INODE_FIELD(_name, _bits) \
|
||||
if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \
|
||||
unsigned offset = offsetof(struct bch_inode_unpacked, _name);\
|
||||
memset((void *) unpacked + offset, 0, \
|
||||
sizeof(*unpacked) - offset); \
|
||||
return 0; \
|
||||
} \
|
||||
\
|
||||
ret = inode_decode_field(in, end, field, &field_bits); \
|
||||
if (ret < 0) \
|
||||
return ret; \
|
||||
\
|
||||
if (field_bits > sizeof(unpacked->_name) * 8) \
|
||||
return -1; \
|
||||
\
|
||||
unpacked->_name = field[1]; \
|
||||
in += ret;
|
||||
|
||||
BCH_INODE_FIELDS()
|
||||
#undef BCH_INODE_FIELD
|
||||
|
||||
/* XXX: signal if there were more fields than expected? */
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
|
||||
{
|
||||
if (k.k->p.offset)
|
||||
return "nonzero offset";
|
||||
|
||||
switch (k.k->type) {
|
||||
case BCH_INODE_FS: {
|
||||
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
|
||||
struct bch_inode_unpacked unpacked;
|
||||
|
||||
if (bkey_val_bytes(k.k) < sizeof(struct bch_inode))
|
||||
return "incorrect value size";
|
||||
|
||||
if (k.k->p.inode < BLOCKDEV_INODE_MAX)
|
||||
return "fs inode in blockdev range";
|
||||
|
||||
if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR)
|
||||
return "invalid str hash type";
|
||||
|
||||
if (bch2_inode_unpack(inode, &unpacked))
|
||||
return "invalid variable length fields";
|
||||
|
||||
if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
|
||||
return "invalid data checksum type";
|
||||
|
||||
if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
|
||||
return "invalid data checksum type";
|
||||
|
||||
if ((unpacked.bi_flags & BCH_INODE_UNLINKED) &&
|
||||
unpacked.bi_nlink != 0)
|
||||
return "flagged as unlinked but bi_nlink != 0";
|
||||
|
||||
return NULL;
|
||||
}
|
||||
case BCH_INODE_BLOCKDEV:
|
||||
if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_blockdev))
|
||||
return "incorrect value size";
|
||||
|
||||
if (k.k->p.inode >= BLOCKDEV_INODE_MAX)
|
||||
return "blockdev inode in fs range";
|
||||
|
||||
return NULL;
|
||||
case BCH_INODE_GENERATION:
|
||||
if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation))
|
||||
return "incorrect value size";
|
||||
|
||||
return NULL;
|
||||
default:
|
||||
return "invalid type";
|
||||
}
|
||||
}
|
||||
|
||||
void bch2_inode_to_text(struct bch_fs *c, char *buf,
|
||||
size_t size, struct bkey_s_c k)
|
||||
{
|
||||
char *out = buf, *end = out + size;
|
||||
struct bkey_s_c_inode inode;
|
||||
struct bch_inode_unpacked unpacked;
|
||||
|
||||
switch (k.k->type) {
|
||||
case BCH_INODE_FS:
|
||||
inode = bkey_s_c_to_inode(k);
|
||||
if (bch2_inode_unpack(inode, &unpacked)) {
|
||||
out += scnprintf(out, end - out, "(unpack error)");
|
||||
break;
|
||||
}
|
||||
|
||||
#define BCH_INODE_FIELD(_name, _bits) \
|
||||
out += scnprintf(out, end - out, #_name ": %llu ", (u64) unpacked._name);
|
||||
BCH_INODE_FIELDS()
|
||||
#undef BCH_INODE_FIELD
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
|
||||
uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
|
||||
struct bch_inode_unpacked *parent)
|
||||
{
|
||||
s64 now = bch2_current_time(c);
|
||||
|
||||
memset(inode_u, 0, sizeof(*inode_u));
|
||||
|
||||
/* ick */
|
||||
inode_u->bi_flags |= c->opts.str_hash << INODE_STR_HASH_OFFSET;
|
||||
get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed));
|
||||
|
||||
inode_u->bi_mode = mode;
|
||||
inode_u->bi_uid = uid;
|
||||
inode_u->bi_gid = gid;
|
||||
inode_u->bi_dev = rdev;
|
||||
inode_u->bi_atime = now;
|
||||
inode_u->bi_mtime = now;
|
||||
inode_u->bi_ctime = now;
|
||||
inode_u->bi_otime = now;
|
||||
|
||||
if (parent) {
|
||||
#define BCH_INODE_FIELD(_name) inode_u->_name = parent->_name;
|
||||
BCH_INODE_FIELDS_INHERIT()
|
||||
#undef BCH_INODE_FIELD
|
||||
}
|
||||
}
|
||||
|
||||
static inline u32 bkey_generation(struct bkey_s_c k)
|
||||
{
|
||||
switch (k.k->type) {
|
||||
case BCH_INODE_BLOCKDEV:
|
||||
case BCH_INODE_FS:
|
||||
BUG();
|
||||
case BCH_INODE_GENERATION:
|
||||
return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
}
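/*
 * When an inode is deleted, its slot in the inodes btree is replaced with a
 * BCH_INODE_GENERATION key (see bch2_inode_rm() below) so that if the inode
 * number is later reused, the new inode starts from a higher bi_generation -
 * letting stale references to the old inode (e.g. exported file handles) be
 * detected.
 */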
|
||||
|
||||
int __bch2_inode_create(struct btree_trans *trans,
|
||||
struct bch_inode_unpacked *inode_u,
|
||||
u64 min, u64 max, u64 *hint)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct bkey_inode_buf *inode_p;
|
||||
struct btree_iter *iter;
|
||||
u64 start;
|
||||
int ret;
|
||||
|
||||
if (!max)
|
||||
max = ULLONG_MAX;
|
||||
|
||||
if (c->opts.inodes_32bit)
|
||||
max = min_t(u64, max, U32_MAX);
|
||||
|
||||
start = READ_ONCE(*hint);
|
||||
|
||||
if (start >= max || start < min)
|
||||
start = min;
|
||||
|
||||
inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
|
||||
if (IS_ERR(inode_p))
|
||||
return PTR_ERR(inode_p);
|
||||
|
||||
iter = bch2_trans_get_iter(trans,
|
||||
BTREE_ID_INODES, POS(start, 0),
|
||||
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
|
||||
if (IS_ERR(iter))
|
||||
return PTR_ERR(iter);
|
||||
again:
|
||||
while (1) {
|
||||
struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
|
||||
|
||||
ret = btree_iter_err(k);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
switch (k.k->type) {
|
||||
case BCH_INODE_BLOCKDEV:
|
||||
case BCH_INODE_FS:
|
||||
/* slot used */
|
||||
if (iter->pos.inode >= max)
|
||||
goto out;
|
||||
|
||||
bch2_btree_iter_next_slot(iter);
|
||||
break;
|
||||
|
||||
default:
|
||||
*hint = k.k->p.inode;
|
||||
inode_u->bi_inum = k.k->p.inode;
|
||||
inode_u->bi_generation = bkey_generation(k);
|
||||
|
||||
bch2_inode_pack(inode_p, inode_u);
|
||||
bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
out:
|
||||
if (start != min) {
|
||||
/* Retry from start */
|
||||
start = min;
|
||||
bch2_btree_iter_set_pos(iter, POS(start, 0));
|
||||
goto again;
|
||||
}
|
||||
|
||||
return -ENOSPC;
|
||||
}
|
||||
|
||||
int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
|
||||
u64 min, u64 max, u64 *hint)
|
||||
{
|
||||
return bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC,
|
||||
__bch2_inode_create(&trans, inode_u, min, max, hint));
|
||||
}
|
||||
|
||||
int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size,
|
||||
struct extent_insert_hook *hook, u64 *journal_seq)
|
||||
{
|
||||
return bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
|
||||
POS(inode_nr, new_size),
|
||||
POS(inode_nr + 1, 0),
|
||||
ZERO_VERSION, NULL, hook,
|
||||
journal_seq);
|
||||
}
|
||||
|
||||
int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
|
||||
{
|
||||
struct btree_iter iter;
|
||||
struct bkey_i_inode_generation delete;
|
||||
int ret;
|
||||
|
||||
ret = bch2_inode_truncate(c, inode_nr, 0, NULL, NULL);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS,
|
||||
POS(inode_nr, 0),
|
||||
POS(inode_nr + 1, 0),
|
||||
ZERO_VERSION, NULL, NULL, NULL);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
/*
|
||||
* If this was a directory, there shouldn't be any real dirents left -
|
||||
* but there could be whiteouts (from hash collisions) that we should
|
||||
* delete:
|
||||
*
|
||||
* XXX: the dirent code could ideally delete whiteouts when they're no
|
||||
* longer needed
|
||||
*/
|
||||
ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
|
||||
POS(inode_nr, 0),
|
||||
POS(inode_nr + 1, 0),
|
||||
ZERO_VERSION, NULL, NULL, NULL);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inode_nr, 0),
|
||||
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
|
||||
do {
|
||||
struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
|
||||
u32 bi_generation = 0;
|
||||
|
||||
ret = btree_iter_err(k);
|
||||
if (ret) {
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
return ret;
|
||||
}
|
||||
|
||||
bch2_fs_inconsistent_on(k.k->type != BCH_INODE_FS, c,
|
||||
"inode %llu not found when deleting",
|
||||
inode_nr);
|
||||
|
||||
switch (k.k->type) {
|
||||
case BCH_INODE_FS: {
|
||||
struct bch_inode_unpacked inode_u;
|
||||
|
||||
if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u))
|
||||
bi_generation = inode_u.bi_generation + 1;
|
||||
break;
|
||||
}
|
||||
case BCH_INODE_GENERATION: {
|
||||
struct bkey_s_c_inode_generation g =
|
||||
bkey_s_c_to_inode_generation(k);
|
||||
bi_generation = le32_to_cpu(g.v->bi_generation);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!bi_generation) {
|
||||
bkey_init(&delete.k);
|
||||
delete.k.p.inode = inode_nr;
|
||||
} else {
|
||||
bkey_inode_generation_init(&delete.k_i);
|
||||
delete.k.p.inode = inode_nr;
|
||||
delete.v.bi_generation = cpu_to_le32(bi_generation);
|
||||
}
|
||||
|
||||
ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
|
||||
BTREE_INSERT_ATOMIC|
|
||||
BTREE_INSERT_NOFAIL,
|
||||
BTREE_INSERT_ENTRY(&iter, &delete.k_i));
|
||||
} while (ret == -EINTR);
|
||||
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,
|
||||
struct bch_inode_unpacked *inode)
|
||||
{
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
int ret = -ENOENT;
|
||||
|
||||
for_each_btree_key(&iter, c, BTREE_ID_INODES,
|
||||
POS(inode_nr, 0),
|
||||
BTREE_ITER_SLOTS, k) {
|
||||
switch (k.k->type) {
|
||||
case BCH_INODE_FS:
|
||||
ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
|
||||
break;
|
||||
default:
|
||||
/* hole, not found */
|
||||
break;
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
}
|
||||
|
||||
return bch2_btree_iter_unlock(&iter) ?: ret;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
void bch2_inode_pack_test(void)
|
||||
{
|
||||
struct bch_inode_unpacked *u, test_inodes[] = {
|
||||
{
|
||||
.bi_atime = U64_MAX,
|
||||
.bi_ctime = U64_MAX,
|
||||
.bi_mtime = U64_MAX,
|
||||
.bi_otime = U64_MAX,
|
||||
.bi_size = U64_MAX,
|
||||
.bi_sectors = U64_MAX,
|
||||
.bi_uid = U32_MAX,
|
||||
.bi_gid = U32_MAX,
|
||||
.bi_nlink = U32_MAX,
|
||||
.bi_generation = U32_MAX,
|
||||
.bi_dev = U32_MAX,
|
||||
},
|
||||
};
|
||||
|
||||
for (u = test_inodes;
|
||||
u < test_inodes + ARRAY_SIZE(test_inodes);
|
||||
u++) {
|
||||
struct bkey_inode_buf p;
|
||||
|
||||
bch2_inode_pack(&p, u);
|
||||
}
|
||||
}
|
||||
#endif
|
101
fs/bcachefs/inode.h
Normal file
101
fs/bcachefs/inode.h
Normal file
@ -0,0 +1,101 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_INODE_H
|
||||
#define _BCACHEFS_INODE_H
|
||||
|
||||
#include "opts.h"
|
||||
|
||||
#include <linux/math64.h>
|
||||
|
||||
const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c);
|
||||
void bch2_inode_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
|
||||
|
||||
#define bch2_bkey_inode_ops (struct bkey_ops) { \
|
||||
.key_invalid = bch2_inode_invalid, \
|
||||
.val_to_text = bch2_inode_to_text, \
|
||||
}
|
||||
|
||||
struct bch_inode_unpacked {
|
||||
u64 bi_inum;
|
||||
__le64 bi_hash_seed;
|
||||
u32 bi_flags;
|
||||
u16 bi_mode;
|
||||
|
||||
#define BCH_INODE_FIELD(_name, _bits) u##_bits _name;
|
||||
BCH_INODE_FIELDS()
|
||||
#undef BCH_INODE_FIELD
|
||||
};
|
||||
|
||||
struct bkey_inode_buf {
|
||||
struct bkey_i_inode inode;
|
||||
|
||||
#define BCH_INODE_FIELD(_name, _bits) + 8 + _bits / 8
|
||||
u8 _pad[0 + BCH_INODE_FIELDS()];
|
||||
#undef BCH_INODE_FIELD
|
||||
} __attribute__((packed, aligned(8)));
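/*
 * _pad is sized for the worst case: each variable length field can take up
 * to 13 bytes encoded (see byte_table[] in inode.c), and 8 + _bits / 8 is a
 * conservative upper bound on that per field.
 */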
|
||||
|
||||
void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
|
||||
int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
|
||||
|
||||
void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
|
||||
uid_t, gid_t, umode_t, dev_t,
|
||||
struct bch_inode_unpacked *);
|
||||
|
||||
int __bch2_inode_create(struct btree_trans *,
|
||||
struct bch_inode_unpacked *,
|
||||
u64, u64, u64 *);
|
||||
int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *,
|
||||
u64, u64, u64 *);
|
||||
|
||||
int bch2_inode_truncate(struct bch_fs *, u64, u64,
|
||||
struct extent_insert_hook *, u64 *);
|
||||
int bch2_inode_rm(struct bch_fs *, u64);
|
||||
|
||||
int bch2_inode_find_by_inum(struct bch_fs *, u64,
|
||||
struct bch_inode_unpacked *);
|
||||
|
||||
static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode)
|
||||
{
|
||||
struct bch_io_opts ret = { 0 };
|
||||
|
||||
#define BCH_INODE_OPT(_name, _bits) \
|
||||
if (inode->bi_##_name) \
|
||||
opt_set(ret, _name, inode->bi_##_name - 1);
|
||||
BCH_INODE_OPTS()
|
||||
#undef BCH_INODE_OPT
|
||||
return ret;
|
||||
}
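/*
 * Per inode IO options are stored biased by one: 0 means "not set, use the
 * filesystem wide default", so bch2_inode_opt_set() stores v + 1,
 * bch2_inode_opt_clear() stores 0, and bch2_inode_opts_get() above undoes
 * the bias.
 */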
|
||||
|
||||
static inline void __bch2_inode_opt_set(struct bch_inode_unpacked *inode,
|
||||
enum bch_opt_id id, u64 v)
|
||||
{
|
||||
switch (id) {
|
||||
#define BCH_INODE_OPT(_name, ...) \
|
||||
case Opt_##_name: \
|
||||
inode->bi_##_name = v; \
|
||||
break;
|
||||
BCH_INODE_OPTS()
|
||||
#undef BCH_INODE_OPT
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode,
|
||||
enum bch_opt_id id, u64 v)
|
||||
{
|
||||
return __bch2_inode_opt_set(inode, id, v + 1);
|
||||
}
|
||||
|
||||
static inline void bch2_inode_opt_clear(struct bch_inode_unpacked *inode,
|
||||
enum bch_opt_id id)
|
||||
{
|
||||
return __bch2_inode_opt_set(inode, id, 0);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
void bch2_inode_pack_test(void);
|
||||
#else
|
||||
static inline void bch2_inode_pack_test(void) {}
|
||||
#endif
|
||||
|
||||
#endif /* _BCACHEFS_INODE_H */
|
1875
fs/bcachefs/io.c
Normal file
1875
fs/bcachefs/io.c
Normal file
File diff suppressed because it is too large
144
fs/bcachefs/io.h
Normal file
144
fs/bcachefs/io.h
Normal file
@ -0,0 +1,144 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_IO_H
|
||||
#define _BCACHEFS_IO_H
|
||||
|
||||
#include "alloc.h"
|
||||
#include "checksum.h"
|
||||
#include "io_types.h"
|
||||
|
||||
#define to_wbio(_bio) \
|
||||
container_of((_bio), struct bch_write_bio, bio)
|
||||
|
||||
#define to_rbio(_bio) \
|
||||
container_of((_bio), struct bch_read_bio, bio)
|
||||
|
||||
void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
|
||||
void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
|
||||
void bch2_bio_alloc_more_pages_pool(struct bch_fs *, struct bio *, size_t);
|
||||
|
||||
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
|
||||
void bch2_latency_acct(struct bch_dev *, u64, int);
|
||||
#else
|
||||
static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
|
||||
#endif
|
||||
|
||||
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
|
||||
enum bch_data_type, const struct bkey_i *);
|
||||
|
||||
#define BLK_STS_REMOVED ((__force blk_status_t)128)
|
||||
|
||||
enum bch_write_flags {
|
||||
BCH_WRITE_ALLOC_NOWAIT = (1 << 0),
|
||||
BCH_WRITE_CACHED = (1 << 1),
|
||||
BCH_WRITE_FLUSH = (1 << 2),
|
||||
BCH_WRITE_DATA_ENCODED = (1 << 3),
|
||||
BCH_WRITE_PAGES_STABLE = (1 << 4),
|
||||
BCH_WRITE_PAGES_OWNED = (1 << 5),
|
||||
BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6),
|
||||
BCH_WRITE_NOPUT_RESERVATION = (1 << 7),
|
||||
BCH_WRITE_NOMARK_REPLICAS = (1 << 8),
|
||||
|
||||
/* Internal: */
|
||||
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9),
|
||||
};
|
||||
|
||||
static inline u64 *op_journal_seq(struct bch_write_op *op)
|
||||
{
|
||||
return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR)
|
||||
? op->journal_seq_p : &op->journal_seq;
|
||||
}
|
||||
|
||||
static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq)
|
||||
{
|
||||
op->journal_seq_p = journal_seq;
|
||||
op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
|
||||
}
|
||||
|
||||
static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
|
||||
{
|
||||
return op->alloc_reserve == RESERVE_MOVINGGC
|
||||
? op->c->copygc_wq
|
||||
: op->c->wq;
|
||||
}
|
||||
|
||||
int bch2_write_index_default(struct bch_write_op *);
|
||||
|
||||
static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
|
||||
struct bch_io_opts opts)
|
||||
{
|
||||
op->c = c;
|
||||
op->io_wq = index_update_wq(op);
|
||||
op->flags = 0;
|
||||
op->written = 0;
|
||||
op->error = 0;
|
||||
op->csum_type = bch2_data_checksum_type(c, opts.data_checksum);
|
||||
op->compression_type = bch2_compression_opt_to_type[opts.compression];
|
||||
op->nr_replicas = 0;
|
||||
op->nr_replicas_required = c->opts.data_replicas_required;
|
||||
op->alloc_reserve = RESERVE_NONE;
|
||||
op->open_buckets_nr = 0;
|
||||
op->devs_have.nr = 0;
|
||||
op->target = 0;
|
||||
op->opts = opts;
|
||||
op->pos = POS_MAX;
|
||||
op->version = ZERO_VERSION;
|
||||
op->write_point = (struct write_point_specifier) { 0 };
|
||||
op->res = (struct disk_reservation) { 0 };
|
||||
op->journal_seq = 0;
|
||||
op->index_update_fn = bch2_write_index_default;
|
||||
}
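/*
 * Rough sketch of how a write is submitted (the exact setup of the data bio
 * in op->wbio varies by caller):
 *
 *	struct bch_write_op op;
 *
 *	bch2_write_op_init(&op, c, io_opts);
 *	op.pos		= POS(inum, sector);
 *	op.nr_replicas	= c->opts.data_replicas;
 *	closure_call(&op.cl, bch2_write, index_update_wq(&op), &parent_cl);
 *
 * bch2_write() consumes the embedded closure; completion of both the data
 * write and the index update is signalled through the parent closure.
 */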
|
||||
|
||||
void bch2_write(struct closure *);
|
||||
|
||||
static inline struct bch_write_bio *wbio_init(struct bio *bio)
|
||||
{
|
||||
struct bch_write_bio *wbio = to_wbio(bio);
|
||||
|
||||
memset(&wbio->wbio, 0, sizeof(wbio->wbio));
|
||||
return wbio;
|
||||
}
|
||||
|
||||
struct bch_devs_mask;
|
||||
struct cache_promote_op;
|
||||
struct extent_pick_ptr;
|
||||
|
||||
int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
|
||||
struct bkey_s_c, struct bch_devs_mask *, unsigned);
|
||||
void bch2_read(struct bch_fs *, struct bch_read_bio *, u64);
|
||||
|
||||
enum bch_read_flags {
|
||||
BCH_READ_RETRY_IF_STALE = 1 << 0,
|
||||
BCH_READ_MAY_PROMOTE = 1 << 1,
|
||||
BCH_READ_USER_MAPPED = 1 << 2,
|
||||
BCH_READ_NODECODE = 1 << 3,
|
||||
BCH_READ_LAST_FRAGMENT = 1 << 4,
|
||||
|
||||
/* internal: */
|
||||
BCH_READ_MUST_BOUNCE = 1 << 5,
|
||||
BCH_READ_MUST_CLONE = 1 << 6,
|
||||
BCH_READ_IN_RETRY = 1 << 7,
|
||||
};
|
||||
|
||||
static inline void bch2_read_extent(struct bch_fs *c,
|
||||
struct bch_read_bio *rbio,
|
||||
struct bkey_s_c k,
|
||||
unsigned flags)
|
||||
{
|
||||
__bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, NULL, flags);
|
||||
}
|
||||
|
||||
static inline struct bch_read_bio *rbio_init(struct bio *bio,
|
||||
struct bch_io_opts opts)
|
||||
{
|
||||
struct bch_read_bio *rbio = to_rbio(bio);
|
||||
|
||||
rbio->_state = 0;
|
||||
rbio->promote = NULL;
|
||||
rbio->opts = opts;
|
||||
return rbio;
|
||||
}
|
||||
|
||||
void bch2_fs_io_exit(struct bch_fs *);
|
||||
int bch2_fs_io_init(struct bch_fs *);
|
||||
|
||||
#endif /* _BCACHEFS_IO_H */
|
148
fs/bcachefs/io_types.h
Normal file
148
fs/bcachefs/io_types.h
Normal file
@ -0,0 +1,148 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_IO_TYPES_H
|
||||
#define _BCACHEFS_IO_TYPES_H
|
||||
|
||||
#include "alloc_types.h"
|
||||
#include "btree_types.h"
|
||||
#include "buckets_types.h"
|
||||
#include "extents_types.h"
|
||||
#include "keylist_types.h"
|
||||
#include "opts.h"
|
||||
#include "super_types.h"
|
||||
|
||||
#include <linux/llist.h>
|
||||
#include <linux/workqueue.h>
|
||||
|
||||
struct bch_read_bio {
|
||||
struct bch_fs *c;
|
||||
u64 start_time;
|
||||
u64 submit_time;
|
||||
|
||||
/*
|
||||
* Reads will often have to be split, and if the extent being read from
|
||||
* was checksummed or compressed we'll also have to allocate bounce
|
||||
* buffers and copy the data back into the original bio.
|
||||
*
|
||||
* If we didn't have to split, we have to save and restore the original
|
||||
* bi_end_io - @split below indicates which:
|
||||
*/
|
||||
union {
|
||||
struct bch_read_bio *parent;
|
||||
bio_end_io_t *end_io;
|
||||
};
|
||||
|
||||
/*
|
||||
* Saved copy of bio->bi_iter, from submission time - allows us to
|
||||
* resubmit on IO error, and also to copy data back to the original bio
|
||||
* when we're bouncing:
|
||||
*/
|
||||
struct bvec_iter bvec_iter;
|
||||
|
||||
u16 flags;
|
||||
union {
|
||||
struct {
|
||||
u16 bounce:1,
|
||||
split:1,
|
||||
kmalloc:1,
|
||||
have_ioref:1,
|
||||
narrow_crcs:1,
|
||||
hole:1,
|
||||
retry:2,
|
||||
context:2;
|
||||
};
|
||||
u16 _state;
|
||||
};
|
||||
|
||||
struct bch_devs_list devs_have;
|
||||
|
||||
struct extent_pick_ptr pick;
|
||||
/* start pos of data we read (may not be pos of data we want) */
|
||||
struct bpos pos;
|
||||
struct bversion version;
|
||||
|
||||
struct promote_op *promote;
|
||||
|
||||
struct bch_io_opts opts;
|
||||
|
||||
struct work_struct work;
|
||||
|
||||
struct bio bio;
|
||||
};
|
||||
|
||||
struct bch_write_bio {
|
||||
struct_group(wbio,
|
||||
struct bch_fs *c;
|
||||
struct bch_write_bio *parent;
|
||||
|
||||
u64 submit_time;
|
||||
|
||||
struct bch_devs_list failed;
|
||||
u8 order;
|
||||
u8 dev;
|
||||
|
||||
unsigned split:1,
|
||||
bounce:1,
|
||||
put_bio:1,
|
||||
have_ioref:1,
|
||||
used_mempool:1;
|
||||
);
|
||||
|
||||
struct bio bio;
|
||||
};
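/*
 * The fields grouped in "wbio" above are exactly what wbio_init() (io.h)
 * zeroes when reusing a bio for a write - the embedded struct bio itself is
 * left untouched for the caller to initialize.
 */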
|
||||
|
||||
struct bch_write_op {
|
||||
struct closure cl;
|
||||
struct bch_fs *c;
|
||||
struct workqueue_struct *io_wq;
|
||||
u64 start_time;
|
||||
|
||||
unsigned written; /* sectors */
|
||||
u16 flags;
|
||||
s16 error; /* dio write path expects it to hold -ERESTARTSYS... */
|
||||
|
||||
unsigned csum_type:4;
|
||||
unsigned compression_type:4;
|
||||
unsigned nr_replicas:4;
|
||||
unsigned nr_replicas_required:4;
|
||||
unsigned alloc_reserve:4;
|
||||
|
||||
u8 open_buckets_nr;
|
||||
struct bch_devs_list devs_have;
|
||||
u16 target;
|
||||
u16 nonce;
|
||||
|
||||
struct bch_io_opts opts;
|
||||
|
||||
struct bpos pos;
|
||||
struct bversion version;
|
||||
|
||||
/* For BCH_WRITE_DATA_ENCODED: */
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
|
||||
struct write_point_specifier write_point;
|
||||
|
||||
struct disk_reservation res;
|
||||
|
||||
u8 open_buckets[16];
|
||||
|
||||
/*
|
||||
* If caller wants to flush but hasn't passed us a journal_seq ptr, we
|
||||
* still need to stash the journal_seq somewhere:
|
||||
*/
|
||||
union {
|
||||
u64 *journal_seq_p;
|
||||
u64 journal_seq;
|
||||
};
|
||||
|
||||
int (*index_update_fn)(struct bch_write_op *);
|
||||
|
||||
struct bch_devs_mask failed;
|
||||
|
||||
struct keylist insert_keys;
|
||||
u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2];
|
||||
|
||||
/* Must be last: */
|
||||
struct bch_write_bio wbio;
|
||||
};
|
||||
|
||||
#endif /* _BCACHEFS_IO_TYPES_H */
|
1140
fs/bcachefs/journal.c
Normal file
1140
fs/bcachefs/journal.c
Normal file
File diff suppressed because it is too large
383
fs/bcachefs/journal.h
Normal file
383
fs/bcachefs/journal.h
Normal file
@ -0,0 +1,383 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_JOURNAL_H
|
||||
#define _BCACHEFS_JOURNAL_H
|
||||
|
||||
/*
|
||||
* THE JOURNAL:
|
||||
*
|
||||
* The primary purpose of the journal is to log updates (insertions) to the
|
||||
* b-tree, to avoid having to do synchronous updates to the b-tree on disk.
|
||||
*
|
||||
* Without the journal, the b-tree is always internally consistent on
|
||||
* disk - and in fact, in the earliest incarnations bcache didn't have a journal
|
||||
* but did handle unclean shutdowns by doing all index updates synchronously
|
||||
* (with coalescing).
|
||||
*
|
||||
* Updates to interior nodes still happen synchronously and without the journal
|
||||
* (for simplicity) - this may change eventually but updates to interior nodes
|
||||
* are rare enough it's not a huge priority.
|
||||
*
|
||||
* This means the journal is relatively separate from the b-tree; it consists of
|
||||
* just a list of keys and journal replay consists of just redoing those
|
||||
* insertions in same order that they appear in the journal.
|
||||
*
|
||||
* PERSISTENCE:
|
||||
*
|
||||
* For synchronous updates (where we're waiting on the index update to hit
|
||||
* disk), the journal entry will be written out immediately (or as soon as
|
||||
* possible, if the write for the previous journal entry was still in flight).
|
||||
*
|
||||
* Synchronous updates are specified by passing a closure (@flush_cl) to
|
||||
* bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter
|
||||
* down to the journalling code. That closure will wait on the journal
|
||||
* write to complete (via closure_wait()).
|
||||
*
|
||||
* If the index update wasn't synchronous, the journal entry will be
|
||||
* written out after 10 ms have elapsed, by default (the delay_ms field
|
||||
* in struct journal).
|
||||
*
|
||||
* JOURNAL ENTRIES:
|
||||
*
|
||||
* A journal entry is variable size (struct jset), it's got a fixed length
|
||||
* header and then a variable number of struct jset_entry entries.
|
||||
*
|
||||
* Journal entries are identified by monotonically increasing 64 bit sequence
|
||||
* numbers - jset->seq; other places in the code refer to this sequence number.
|
||||
*
|
||||
* A jset_entry entry contains one or more bkeys (which is what gets inserted
|
||||
* into the b-tree). We need a container to indicate which b-tree the key is
|
||||
* for; also, the roots of the various b-trees are stored in jset_entry entries
|
||||
* (one for each b-tree) - this lets us add new b-tree types without changing
|
||||
* the on disk format.
|
||||
*
|
||||
* We also keep some things in the journal header that are logically part of the
|
||||
* superblock - all the things that are frequently updated. This is for future
|
||||
* bcache on raw flash support; the superblock (which will become another
|
||||
* journal) can't be moved or wear leveled, so it contains just enough
|
||||
* information to find the main journal, and the superblock only has to be
|
||||
* rewritten when we want to move/wear level the main journal.
|
||||
*
|
||||
* JOURNAL LAYOUT ON DISK:
|
||||
*
|
||||
* The journal is written to a ringbuffer of buckets (which is kept in the
|
||||
* superblock); the individual buckets are not necessarily contiguous on disk
|
||||
* which means that journal entries are not allowed to span buckets, but also
|
||||
* that we can resize the journal at runtime if desired (unimplemented).
|
||||
*
|
||||
* The journal buckets exist in the same pool as all the other buckets that are
|
||||
* managed by the allocator and garbage collection - garbage collection marks
|
||||
* the journal buckets as metadata buckets.
|
||||
*
|
||||
* OPEN/DIRTY JOURNAL ENTRIES:
|
||||
*
|
||||
* Open/dirty journal entries are journal entries that contain b-tree updates
|
||||
* that have not yet been written out to the b-tree on disk. We have to track
|
||||
* which journal entries are dirty, and we also have to avoid wrapping around
|
||||
* the journal and overwriting old but still dirty journal entries with new
|
||||
* journal entries.
|
||||
*
|
||||
* On disk, this is represented with the "last_seq" field of struct jset;
|
||||
* last_seq is the first sequence number that journal replay has to replay.
|
||||
*
|
||||
* To avoid overwriting dirty journal entries on disk, we keep a mapping (in
|
||||
* journal_device->seq): for each journal bucket, the highest sequence number of
|
||||
* any journal entry it contains. Then, by comparing that against last_seq we
|
||||
* can determine whether that journal bucket contains dirty journal entries or
|
||||
* not.
|
||||
*
|
||||
* To track which journal entries are dirty, we maintain a fifo of refcounts
|
||||
* (where each entry corresponds to a specific sequence number) - when a ref
|
||||
* goes to 0, that journal entry is no longer dirty.
|
||||
*
|
||||
* Journalling of index updates is done at the same time as the b-tree itself is
|
||||
* being modified (see btree_insert_key()); when we add the key to the journal
|
||||
* the pending b-tree write takes a ref on the journal entry the key was added
|
||||
* to. If a pending b-tree write would need to take refs on multiple dirty
|
||||
* journal entries, it only keeps the ref on the oldest one (since a newer
|
||||
* journal entry will still be replayed if an older entry was dirty).
|
||||
*
|
||||
* JOURNAL FILLING UP:
|
||||
*
|
||||
* There are two ways the journal could fill up; either we could run out of
|
||||
* space to write to, or we could have too many open journal entries and run out
|
||||
* of room in the fifo of refcounts. Since those refcounts are decremented
|
||||
* without any locking we can't safely resize that fifo, so we handle it the
|
||||
* same way.
|
||||
*
|
||||
* If the journal fills up, we start flushing dirty btree nodes until we can
|
||||
* allocate space for a journal write again - preferentially flushing btree
|
||||
* nodes that are pinning the oldest journal entries first.
|
||||
*/
|
||||
|
||||
#include <linux/hash.h>
|
||||
|
||||
#include "journal_types.h"
|
||||
|
||||
struct bch_fs;
|
||||
|
||||
static inline void journal_wake(struct journal *j)
|
||||
{
|
||||
wake_up(&j->wait);
|
||||
closure_wake_up(&j->async_wait);
|
||||
}
|
||||
|
||||
static inline struct journal_buf *journal_cur_buf(struct journal *j)
|
||||
{
|
||||
return j->buf + j->reservations.idx;
|
||||
}
|
||||
|
||||
static inline struct journal_buf *journal_prev_buf(struct journal *j)
|
||||
{
|
||||
return j->buf + !j->reservations.idx;
|
||||
}
|
||||
|
||||
/* Sequence number of oldest dirty journal entry */
|
||||
|
||||
static inline u64 journal_last_seq(struct journal *j)
|
||||
{
|
||||
return j->pin.front;
|
||||
}
|
||||
|
||||
static inline u64 journal_cur_seq(struct journal *j)
|
||||
{
|
||||
BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
|
||||
|
||||
return j->pin.back - 1;
|
||||
}
|
||||
|
||||
u64 bch2_inode_journal_seq(struct journal *, u64);
|
||||
|
||||
static inline int journal_state_count(union journal_res_state s, int idx)
|
||||
{
|
||||
return idx == 0 ? s.buf0_count : s.buf1_count;
|
||||
}
|
||||
|
||||
static inline void journal_state_inc(union journal_res_state *s)
|
||||
{
|
||||
s->buf0_count += s->idx == 0;
|
||||
s->buf1_count += s->idx == 1;
|
||||
}
|
||||
|
||||
static inline void bch2_journal_set_has_inode(struct journal *j,
|
||||
struct journal_res *res,
|
||||
u64 inum)
|
||||
{
|
||||
struct journal_buf *buf = &j->buf[res->idx];
|
||||
unsigned long bit = hash_64(inum, ilog2(sizeof(buf->has_inode) * 8));
|
||||
|
||||
/* avoid atomic op if possible */
|
||||
if (unlikely(!test_bit(bit, buf->has_inode)))
|
||||
set_bit(bit, buf->has_inode);
|
||||
}
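/*
 * buf->has_inode is a small hash based filter of the inodes that have keys
 * in this journal buffer: false positives (hash collisions) are harmless,
 * false negatives are not, which is why bits are only ever set here.
 * bch2_inode_journal_seq() uses it to avoid flushing journal entries that
 * can't contain a given inode's updates (e.g. for fsync).
 */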
|
||||
|
||||
/*
|
||||
* Amount of space that will be taken up by some keys in the journal (i.e.
|
||||
* including the jset_entry header)
|
||||
*/
|
||||
static inline unsigned jset_u64s(unsigned u64s)
|
||||
{
|
||||
return u64s + sizeof(struct jset_entry) / sizeof(u64);
|
||||
}
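/*
 * struct jset_entry is 8 bytes (one u64) of header, so e.g. journalling a
 * key of 3 u64s reserves jset_u64s(3) == 4 u64s in the journal buffer.
 */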
|
||||
|
||||
static inline struct jset_entry *
|
||||
bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
|
||||
{
|
||||
struct jset *jset = buf->data;
|
||||
struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s));
|
||||
|
||||
memset(entry, 0, sizeof(*entry));
|
||||
entry->u64s = cpu_to_le16(u64s);
|
||||
|
||||
le32_add_cpu(&jset->u64s, jset_u64s(u64s));
|
||||
|
||||
return entry;
|
||||
}
|
||||
|
||||
static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res,
|
||||
unsigned type, enum btree_id id,
|
||||
unsigned level,
|
||||
const void *data, unsigned u64s)
|
||||
{
|
||||
struct journal_buf *buf = &j->buf[res->idx];
|
||||
struct jset_entry *entry = vstruct_idx(buf->data, res->offset);
|
||||
unsigned actual = jset_u64s(u64s);
|
||||
|
||||
EBUG_ON(!res->ref);
|
||||
EBUG_ON(actual > res->u64s);
|
||||
|
||||
res->offset += actual;
|
||||
res->u64s -= actual;
|
||||
|
||||
entry->u64s = cpu_to_le16(u64s);
|
||||
entry->btree_id = id;
|
||||
entry->level = level;
|
||||
entry->type = type;
|
||||
entry->pad[0] = 0;
|
||||
entry->pad[1] = 0;
|
||||
entry->pad[2] = 0;
|
||||
memcpy_u64s(entry->_data, data, u64s);
|
||||
}
|
||||
|
||||
static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res,
|
||||
enum btree_id id, const struct bkey_i *k)
|
||||
{
|
||||
bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys,
|
||||
id, 0, k, k->k.u64s);
|
||||
}
|
||||
|
||||
void bch2_journal_buf_put_slowpath(struct journal *, bool);
|
||||
|
||||
static inline void bch2_journal_buf_put(struct journal *j, unsigned idx,
|
||||
bool need_write_just_set)
|
||||
{
|
||||
union journal_res_state s;
|
||||
|
||||
s.v = atomic64_sub_return(((union journal_res_state) {
|
||||
.buf0_count = idx == 0,
|
||||
.buf1_count = idx == 1,
|
||||
}).v, &j->reservations.counter);
|
||||
|
||||
EBUG_ON(s.idx != idx && !s.prev_buf_unwritten);
|
||||
|
||||
/*
|
||||
* Do not initiate a journal write if the journal is in an error state
|
||||
* (previous journal entry write may have failed)
|
||||
*/
|
||||
if (s.idx != idx &&
|
||||
!journal_state_count(s, idx) &&
|
||||
s.cur_entry_offset != JOURNAL_ENTRY_ERROR_VAL)
|
||||
bch2_journal_buf_put_slowpath(j, need_write_just_set);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function releases the journal write structure so other threads can
|
||||
* then proceed to add their keys as well.
|
||||
*/
|
||||
static inline void bch2_journal_res_put(struct journal *j,
|
||||
struct journal_res *res)
|
||||
{
|
||||
if (!res->ref)
|
||||
return;
|
||||
|
||||
lock_release(&j->res_map, _RET_IP_);
|
||||
|
||||
while (res->u64s)
|
||||
bch2_journal_add_entry(j, res,
|
||||
BCH_JSET_ENTRY_btree_keys,
|
||||
0, 0, NULL, 0);
|
||||
|
||||
bch2_journal_buf_put(j, res->idx, false);
|
||||
|
||||
res->ref = 0;
|
||||
}
|
||||
|
||||
int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
|
||||
unsigned, unsigned);
|
||||
|
||||
static inline int journal_res_get_fast(struct journal *j,
|
||||
struct journal_res *res,
|
||||
unsigned u64s_min,
|
||||
unsigned u64s_max)
|
||||
{
|
||||
union journal_res_state old, new;
|
||||
u64 v = atomic64_read(&j->reservations.counter);
|
||||
|
||||
do {
|
||||
old.v = new.v = v;
|
||||
|
||||
/*
|
||||
* Check if there is still room in the current journal
|
||||
* entry:
|
||||
*/
|
||||
if (old.cur_entry_offset + u64s_min > j->cur_entry_u64s)
|
||||
return 0;
|
||||
|
||||
res->offset = old.cur_entry_offset;
|
||||
res->u64s = min(u64s_max, j->cur_entry_u64s -
|
||||
old.cur_entry_offset);
|
||||
|
||||
journal_state_inc(&new);
|
||||
new.cur_entry_offset += res->u64s;
|
||||
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
|
||||
old.v, new.v)) != old.v);
|
||||
|
||||
res->ref = true;
|
||||
res->idx = new.idx;
|
||||
res->seq = le64_to_cpu(j->buf[res->idx].data->seq);
|
||||
return 1;
|
||||
}
|
||||
|
||||
static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res,
|
||||
unsigned u64s_min, unsigned u64s_max)
|
||||
{
|
||||
int ret;
|
||||
|
||||
EBUG_ON(res->ref);
|
||||
EBUG_ON(u64s_max < u64s_min);
|
||||
EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
|
||||
|
||||
if (journal_res_get_fast(j, res, u64s_min, u64s_max))
|
||||
goto out;
|
||||
|
||||
ret = bch2_journal_res_get_slowpath(j, res, u64s_min, u64s_max);
|
||||
if (ret)
|
||||
return ret;
|
||||
out:
|
||||
lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_);
|
||||
EBUG_ON(!res->ref);
|
||||
return 0;
|
||||
}
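/*
 * Typical use of a journal reservation, simplified:
 *
 *	struct journal_res res = { 0 };
 *
 *	ret = bch2_journal_res_get(j, &res, jset_u64s(k->k.u64s),
 *				   jset_u64s(k->k.u64s));
 *	if (ret)
 *		return ret;
 *
 *	bch2_journal_add_keys(j, &res, btree_id, k);
 *	bch2_journal_res_put(j, &res);
 *
 * The reservation holds the current journal entry open until it is released;
 * bch2_journal_res_put() pads any unused reserved space with empty
 * btree_keys entries before dropping the ref.
 */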
|
||||
|
||||
u64 bch2_journal_last_unwritten_seq(struct journal *);
|
||||
int bch2_journal_open_seq_async(struct journal *, u64, struct closure *);
|
||||
|
||||
void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *);
|
||||
void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
|
||||
void bch2_journal_flush_async(struct journal *, struct closure *);
|
||||
void bch2_journal_meta_async(struct journal *, struct closure *);
|
||||
|
||||
int bch2_journal_flush_seq(struct journal *, u64);
|
||||
int bch2_journal_flush(struct journal *);
|
||||
int bch2_journal_meta(struct journal *);
|
||||
|
||||
void bch2_journal_halt(struct journal *);
|
||||
|
||||
static inline int bch2_journal_error(struct journal *j)
|
||||
{
|
||||
return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL
|
||||
? -EIO : 0;
|
||||
}
|
||||
|
||||
struct bch_dev;
|
||||
|
||||
static inline bool journal_flushes_device(struct bch_dev *ca)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
int bch2_journal_mark(struct bch_fs *, struct list_head *);
|
||||
void bch2_journal_entries_free(struct list_head *);
|
||||
int bch2_journal_replay(struct bch_fs *, struct list_head *);
|
||||
|
||||
static inline void bch2_journal_set_replay_done(struct journal *j)
|
||||
{
|
||||
BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
|
||||
set_bit(JOURNAL_REPLAY_DONE, &j->flags);
|
||||
}
|
||||
|
||||
ssize_t bch2_journal_print_debug(struct journal *, char *);
|
||||
ssize_t bch2_journal_print_pins(struct journal *, char *);
|
||||
|
||||
int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
|
||||
unsigned nr);
|
||||
int bch2_dev_journal_alloc(struct bch_dev *);
|
||||
|
||||
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
|
||||
void bch2_fs_journal_stop(struct journal *);
|
||||
void bch2_fs_journal_start(struct journal *);
|
||||
void bch2_dev_journal_exit(struct bch_dev *);
|
||||
int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
|
||||
void bch2_fs_journal_exit(struct journal *);
|
||||
int bch2_fs_journal_init(struct journal *);
|
||||
|
||||
#endif /* _BCACHEFS_JOURNAL_H */
|
1392
fs/bcachefs/journal_io.c
Normal file
1392
fs/bcachefs/journal_io.c
Normal file
File diff suppressed because it is too large
44
fs/bcachefs/journal_io.h
Normal file
44
fs/bcachefs/journal_io.h
Normal file
@ -0,0 +1,44 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_JOURNAL_IO_H
#define _BCACHEFS_JOURNAL_IO_H

/*
 * Only used for holding the journal entries we read in bch2_journal_read()
 * during recovery
 */
struct journal_replay {
	struct list_head	list;
	struct bch_devs_list	devs;
	/* must be last: */
	struct jset		j;
};

static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
					struct jset_entry *entry, unsigned type)
{
	while (entry < vstruct_last(jset)) {
		if (entry->type == type)
			return entry;

		entry = vstruct_next(entry);
	}

	return NULL;
}

#define for_each_jset_entry_type(entry, jset, type)			\
	for (entry = (jset)->start;					\
	     (entry = __jset_entry_type_next(jset, entry, type));	\
	     entry = vstruct_next(entry))

#define for_each_jset_key(k, _n, entry, jset)				\
	for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)	\
		vstruct_for_each_safe(entry, k, _n)

int bch2_journal_set_seq(struct bch_fs *c, u64, u64);
int bch2_journal_read(struct bch_fs *, struct list_head *);

int bch2_journal_entry_sectors(struct journal *);
void bch2_journal_write(struct closure *);

#endif /* _BCACHEFS_JOURNAL_IO_H */
402
fs/bcachefs/journal_reclaim.c
Normal file
402
fs/bcachefs/journal_reclaim.c
Normal file
@ -0,0 +1,402 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "journal.h"
|
||||
#include "journal_reclaim.h"
|
||||
#include "replicas.h"
|
||||
#include "super.h"
|
||||
|
||||
/*
|
||||
* Journal entry pinning - machinery for holding a reference on a given journal
|
||||
* entry, holding it open to ensure it gets replayed during recovery:
|
||||
*/
|
||||
|
||||
static inline u64 journal_pin_seq(struct journal *j,
|
||||
struct journal_entry_pin_list *pin_list)
|
||||
{
|
||||
return fifo_entry_idx_abs(&j->pin, pin_list);
|
||||
}
|
||||
|
||||
u64 bch2_journal_pin_seq(struct journal *j, struct journal_entry_pin *pin)
|
||||
{
|
||||
u64 ret = 0;
|
||||
|
||||
spin_lock(&j->lock);
|
||||
if (journal_pin_active(pin))
|
||||
ret = journal_pin_seq(j, pin->pin_list);
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void __journal_pin_add(struct journal *j,
|
||||
struct journal_entry_pin_list *pin_list,
|
||||
struct journal_entry_pin *pin,
|
||||
journal_pin_flush_fn flush_fn)
|
||||
{
|
||||
BUG_ON(journal_pin_active(pin));
|
||||
BUG_ON(!atomic_read(&pin_list->count));
|
||||
|
||||
atomic_inc(&pin_list->count);
|
||||
pin->pin_list = pin_list;
|
||||
pin->flush = flush_fn;
|
||||
|
||||
if (flush_fn)
|
||||
list_add(&pin->list, &pin_list->list);
|
||||
else
|
||||
INIT_LIST_HEAD(&pin->list);
|
||||
|
||||
/*
|
||||
* If the journal is currently full, we might want to call flush_fn
|
||||
* immediately:
|
||||
*/
|
||||
journal_wake(j);
|
||||
}
|
||||
|
||||
void bch2_journal_pin_add(struct journal *j, u64 seq,
|
||||
struct journal_entry_pin *pin,
|
||||
journal_pin_flush_fn flush_fn)
|
||||
{
|
||||
spin_lock(&j->lock);
|
||||
__journal_pin_add(j, journal_seq_pin(j, seq), pin, flush_fn);
|
||||
spin_unlock(&j->lock);
|
||||
}
|
||||
|
||||
static inline void __journal_pin_drop(struct journal *j,
|
||||
struct journal_entry_pin *pin)
|
||||
{
|
||||
struct journal_entry_pin_list *pin_list = pin->pin_list;
|
||||
|
||||
if (!journal_pin_active(pin))
|
||||
return;
|
||||
|
||||
pin->pin_list = NULL;
|
||||
list_del_init(&pin->list);
|
||||
|
||||
/*
|
||||
* Unpinning a journal entry may make journal_next_bucket() succeed, if
|
||||
* writing a new last_seq will now make another bucket available:
|
||||
*/
|
||||
if (atomic_dec_and_test(&pin_list->count) &&
|
||||
pin_list == &fifo_peek_front(&j->pin))
|
||||
bch2_journal_reclaim_fast(j);
|
||||
}
|
||||
|
||||
void bch2_journal_pin_drop(struct journal *j,
|
||||
struct journal_entry_pin *pin)
|
||||
{
|
||||
spin_lock(&j->lock);
|
||||
__journal_pin_drop(j, pin);
|
||||
spin_unlock(&j->lock);
|
||||
}
|
||||
|
||||
void bch2_journal_pin_add_if_older(struct journal *j,
|
||||
struct journal_entry_pin *src_pin,
|
||||
struct journal_entry_pin *pin,
|
||||
journal_pin_flush_fn flush_fn)
|
||||
{
|
||||
spin_lock(&j->lock);
|
||||
|
||||
if (journal_pin_active(src_pin) &&
|
||||
(!journal_pin_active(pin) ||
|
||||
journal_pin_seq(j, src_pin->pin_list) <
|
||||
journal_pin_seq(j, pin->pin_list))) {
|
||||
__journal_pin_drop(j, pin);
|
||||
__journal_pin_add(j, src_pin->pin_list, pin, flush_fn);
|
||||
}
|
||||
|
||||
spin_unlock(&j->lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Journal reclaim: flush references to open journal entries to reclaim space in
|
||||
* the journal
|
||||
*
|
||||
* May be done by the journal code in the background as needed to free up space
|
||||
* for more journal entries, or as part of doing a clean shutdown, or to migrate
|
||||
* data off of a specific device:
|
||||
*/
|
||||
|
||||
/**
|
||||
* bch2_journal_reclaim_fast - do the fast part of journal reclaim
|
||||
*
|
||||
* Called from IO submission context, does not block. Cleans up after btree
|
||||
* write completions by advancing the journal pin and each cache's last_idx,
|
||||
* kicking off discards and background reclaim as necessary.
|
||||
*/
|
||||
void bch2_journal_reclaim_fast(struct journal *j)
|
||||
{
|
||||
struct journal_entry_pin_list temp;
|
||||
bool popped = false;
|
||||
|
||||
lockdep_assert_held(&j->lock);
|
||||
|
||||
/*
|
||||
* Unpin journal entries whose reference counts reached zero, meaning
|
||||
* all btree nodes got written out
|
||||
*/
|
||||
while (!atomic_read(&fifo_peek_front(&j->pin).count)) {
|
||||
BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
|
||||
BUG_ON(!fifo_pop(&j->pin, temp));
|
||||
popped = true;
|
||||
}
|
||||
|
||||
if (popped)
|
||||
journal_wake(j);
|
||||
}
|
||||
|
||||
static struct journal_entry_pin *
|
||||
__journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
|
||||
{
|
||||
struct journal_entry_pin_list *pin_list;
|
||||
struct journal_entry_pin *ret;
|
||||
u64 iter;
|
||||
|
||||
/* no need to iterate over empty fifo entries: */
|
||||
bch2_journal_reclaim_fast(j);
|
||||
|
||||
fifo_for_each_entry_ptr(pin_list, &j->pin, iter) {
|
||||
if (iter > seq_to_flush)
|
||||
break;
|
||||
|
||||
ret = list_first_entry_or_null(&pin_list->list,
|
||||
struct journal_entry_pin, list);
|
||||
if (ret) {
|
||||
/* must be list_del_init(), see bch2_journal_pin_drop() */
|
||||
list_move(&ret->list, &pin_list->flushed);
|
||||
*seq = iter;
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct journal_entry_pin *
|
||||
journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
|
||||
{
|
||||
struct journal_entry_pin *ret;
|
||||
|
||||
spin_lock(&j->lock);
|
||||
ret = __journal_get_next_pin(j, seq_to_flush, seq);
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
|
||||
{
|
||||
bool ret;
|
||||
|
||||
spin_lock(&j->lock);
|
||||
ret = ja->nr &&
|
||||
(ja->last_idx != ja->cur_idx &&
|
||||
ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk);
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* bch2_journal_reclaim_work - free up journal buckets
|
||||
*
|
||||
* Background journal reclaim writes out btree nodes. It should be run
|
||||
* early enough so that we never completely run out of journal buckets.
|
||||
*
|
||||
* High watermarks for triggering background reclaim:
|
||||
* - FIFO has fewer than 512 entries left
|
||||
* - fewer than 25% journal buckets free
|
||||
*
|
||||
* Background reclaim runs until low watermarks are reached:
|
||||
* - FIFO has more than 1024 entries left
|
||||
* - more than 50% journal buckets free
|
||||
*
|
||||
* As long as a reclaim can complete in the time it takes to fill up
|
||||
* 512 journal entries or 25% of all journal buckets, then
|
||||
* journal_next_bucket() should not stall.
|
||||
*/
|
||||
void bch2_journal_reclaim_work(struct work_struct *work)
|
||||
{
|
||||
struct bch_fs *c = container_of(to_delayed_work(work),
|
||||
struct bch_fs, journal.reclaim_work);
|
||||
struct journal *j = &c->journal;
|
||||
struct bch_dev *ca;
|
||||
struct journal_entry_pin *pin;
|
||||
u64 seq, seq_to_flush = 0;
|
||||
unsigned iter, bucket_to_flush;
|
||||
unsigned long next_flush;
|
||||
bool reclaim_lock_held = false, need_flush;
|
||||
|
||||
/*
|
||||
* Advance last_idx to point to the oldest journal entry containing
|
||||
* btree node updates that have not yet been written out
|
||||
*/
|
||||
for_each_rw_member(ca, c, iter) {
|
||||
struct journal_device *ja = &ca->journal;
|
||||
|
||||
if (!ja->nr)
|
||||
continue;
|
||||
|
||||
while (should_discard_bucket(j, ja)) {
|
||||
if (!reclaim_lock_held) {
|
||||
/*
|
||||
* ugh:
|
||||
* might be called from __journal_res_get()
|
||||
* under wait_event() - have to go back to
|
||||
* TASK_RUNNING before doing something that
|
||||
* would block, but only if we're doing work:
|
||||
*/
|
||||
__set_current_state(TASK_RUNNING);
|
||||
|
||||
mutex_lock(&j->reclaim_lock);
|
||||
reclaim_lock_held = true;
|
||||
/* recheck under reclaim_lock: */
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ca->mi.discard &&
|
||||
bdev_max_discard_sectors(ca->disk_sb.bdev))
|
||||
blkdev_issue_discard(ca->disk_sb.bdev,
|
||||
bucket_to_sector(ca,
|
||||
ja->buckets[ja->last_idx]),
|
||||
ca->mi.bucket_size, GFP_NOIO);
|
||||
|
||||
spin_lock(&j->lock);
|
||||
ja->last_idx = (ja->last_idx + 1) % ja->nr;
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
journal_wake(j);
|
||||
}
|
||||
|
||||
/*
|
||||
* Write out enough btree nodes to free up 50% journal
|
||||
* buckets
|
||||
*/
|
||||
spin_lock(&j->lock);
|
||||
bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
|
||||
seq_to_flush = max_t(u64, seq_to_flush,
|
||||
ja->bucket_seq[bucket_to_flush]);
|
||||
spin_unlock(&j->lock);
|
||||
}
|
||||
|
||||
if (reclaim_lock_held)
|
||||
mutex_unlock(&j->reclaim_lock);
|
||||
|
||||
/* Also flush if the pin fifo is more than half full */
|
||||
spin_lock(&j->lock);
|
||||
seq_to_flush = max_t(s64, seq_to_flush,
|
||||
(s64) journal_cur_seq(j) -
|
||||
(j->pin.size >> 1));
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
/*
|
||||
* If it's been longer than j->reclaim_delay_ms since we last flushed,
|
||||
* make sure to flush at least one journal pin:
|
||||
*/
|
||||
next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
|
||||
need_flush = time_after(jiffies, next_flush);
|
||||
|
||||
while ((pin = journal_get_next_pin(j, need_flush
|
||||
? U64_MAX
|
||||
: seq_to_flush, &seq))) {
|
||||
__set_current_state(TASK_RUNNING);
|
||||
pin->flush(j, pin, seq);
|
||||
need_flush = false;
|
||||
|
||||
j->last_flushed = jiffies;
|
||||
}
|
||||
|
||||
if (!test_bit(BCH_FS_RO, &c->flags))
|
||||
queue_delayed_work(system_freezable_wq, &j->reclaim_work,
|
||||
msecs_to_jiffies(j->reclaim_delay_ms));
|
||||
}
|
||||
|
||||
static int journal_flush_done(struct journal *j, u64 seq_to_flush,
|
||||
struct journal_entry_pin **pin,
|
||||
u64 *pin_seq)
|
||||
{
|
||||
int ret;
|
||||
|
||||
*pin = NULL;
|
||||
|
||||
ret = bch2_journal_error(j);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
spin_lock(&j->lock);
|
||||
/*
|
||||
* If journal replay hasn't completed, the unreplayed journal entries
|
||||
* hold refs on their corresponding sequence numbers
|
||||
*/
|
||||
ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL ||
|
||||
!test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
|
||||
journal_last_seq(j) > seq_to_flush ||
|
||||
(fifo_used(&j->pin) == 1 &&
|
||||
atomic_read(&fifo_peek_front(&j->pin).count) == 1);
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
|
||||
{
|
||||
struct journal_entry_pin *pin;
|
||||
u64 pin_seq;
|
||||
|
||||
if (!test_bit(JOURNAL_STARTED, &j->flags))
|
||||
return;
|
||||
|
||||
while (1) {
|
||||
wait_event(j->wait, journal_flush_done(j, seq_to_flush,
|
||||
&pin, &pin_seq));
|
||||
if (!pin)
|
||||
break;
|
||||
|
||||
pin->flush(j, pin, pin_seq);
|
||||
}
|
||||
}
|
||||
|
||||
int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
|
||||
{
|
||||
struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
||||
struct journal_entry_pin_list *p;
|
||||
struct bch_devs_list devs;
|
||||
u64 iter, seq = 0;
|
||||
int ret = 0;
|
||||
|
||||
spin_lock(&j->lock);
|
||||
fifo_for_each_entry_ptr(p, &j->pin, iter)
|
||||
if (dev_idx >= 0
|
||||
? bch2_dev_list_has_dev(p->devs, dev_idx)
|
||||
: p->devs.nr < c->opts.metadata_replicas)
|
||||
seq = iter;
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
bch2_journal_flush_pins(j, seq);
|
||||
|
||||
ret = bch2_journal_error(j);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
mutex_lock(&c->replicas_gc_lock);
|
||||
bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);
|
||||
|
||||
seq = 0;
|
||||
|
||||
spin_lock(&j->lock);
|
||||
while (!ret && seq < j->pin.back) {
|
||||
seq = max(seq, journal_last_seq(j));
|
||||
devs = journal_seq_pin(j, seq)->devs;
|
||||
seq++;
|
||||
|
||||
spin_unlock(&j->lock);
|
||||
ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs);
|
||||
spin_lock(&j->lock);
|
||||
}
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
ret = bch2_replicas_gc_end(c, ret);
|
||||
mutex_unlock(&c->replicas_gc_lock);
|
||||
|
||||
return ret;
|
||||
}
|
42
fs/bcachefs/journal_reclaim.h
Normal file
42
fs/bcachefs/journal_reclaim.h
Normal file
@ -0,0 +1,42 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_JOURNAL_RECLAIM_H
#define _BCACHEFS_JOURNAL_RECLAIM_H

#define JOURNAL_PIN	(32 * 1024)

static inline bool journal_pin_active(struct journal_entry_pin *pin)
{
	return pin->pin_list != NULL;
}

static inline struct journal_entry_pin_list *
journal_seq_pin(struct journal *j, u64 seq)
{
	BUG_ON(seq < j->pin.front || seq >= j->pin.back);

	return &j->pin.data[seq & j->pin.mask];
}

u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *);

void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *,
			  journal_pin_flush_fn);
void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
void bch2_journal_pin_add_if_older(struct journal *,
				   struct journal_entry_pin *,
				   struct journal_entry_pin *,
				   journal_pin_flush_fn);

void bch2_journal_reclaim_fast(struct journal *);
void bch2_journal_reclaim_work(struct work_struct *);

void bch2_journal_flush_pins(struct journal *, u64);

static inline void bch2_journal_flush_all_pins(struct journal *j)
{
	bch2_journal_flush_pins(j, U64_MAX);
}

int bch2_journal_flush_device_pins(struct journal *, int);

#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */
360
fs/bcachefs/journal_seq_blacklist.c
Normal file
360
fs/bcachefs/journal_seq_blacklist.c
Normal file
@ -0,0 +1,360 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "btree_update.h"
|
||||
#include "btree_update_interior.h"
|
||||
#include "error.h"
|
||||
#include "journal.h"
|
||||
#include "journal_io.h"
|
||||
#include "journal_reclaim.h"
|
||||
#include "journal_seq_blacklist.h"
|
||||
|
||||
/*
|
||||
* journal_seq_blacklist machinery:
|
||||
*
|
||||
* To guarantee order of btree updates after a crash, we need to detect when a
|
||||
* btree node entry (bset) is newer than the newest journal entry that was
|
||||
* successfully written, and ignore it - effectively ignoring any btree updates
|
||||
* that didn't make it into the journal.
|
||||
*
|
||||
* If we didn't do this, we might have two btree nodes, a and b, both with
|
||||
* updates that weren't written to the journal yet: if b was updated after a,
|
||||
* but b was flushed and not a - oops; on recovery we'll find that the updates
|
||||
* to b happened, but not the updates to a that happened before it.
|
||||
*
|
||||
* Ignoring bsets that are newer than the newest journal entry is always safe,
|
||||
* because everything they contain will also have been journalled - and must
|
||||
* still be present in the journal on disk until a journal entry has been
|
||||
* written _after_ that bset was written.
|
||||
*
|
||||
* To accomplish this, bsets record the newest journal sequence number they
|
||||
* contain updates for; then, on startup, the btree code queries the journal
|
||||
* code to ask "Is this sequence number newer than the newest journal entry? If
|
||||
* so, ignore it."
|
||||
*
|
||||
* When this happens, we must blacklist that journal sequence number: the
|
||||
* journal must not write any entries with that sequence number, and it must
|
||||
* record that it was blacklisted so that a) on recovery we don't think we have
|
||||
* missing journal entries and b) so that the btree code continues to ignore
|
||||
* that bset, until that btree node is rewritten.
|
||||
*
|
||||
* Blacklisted journal sequence numbers are themselves recorded as entries in
|
||||
* the journal.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Called when journal needs to evict a blacklist entry to reclaim space: find
|
||||
* any btree nodes that refer to the blacklist journal sequence numbers, and
|
||||
* rewrite them:
|
||||
*/
|
||||
static void journal_seq_blacklist_flush(struct journal *j,
|
||||
struct journal_entry_pin *pin, u64 seq)
|
||||
{
|
||||
struct bch_fs *c =
|
||||
container_of(j, struct bch_fs, journal);
|
||||
struct journal_seq_blacklist *bl =
|
||||
container_of(pin, struct journal_seq_blacklist, pin);
|
||||
struct blacklisted_node n;
|
||||
struct closure cl;
|
||||
unsigned i;
|
||||
int ret;
|
||||
|
||||
closure_init_stack(&cl);
|
||||
|
||||
for (i = 0;; i++) {
|
||||
struct btree_iter iter;
|
||||
struct btree *b;
|
||||
|
||||
mutex_lock(&j->blacklist_lock);
|
||||
if (i >= bl->nr_entries) {
|
||||
mutex_unlock(&j->blacklist_lock);
|
||||
break;
|
||||
}
|
||||
n = bl->entries[i];
|
||||
mutex_unlock(&j->blacklist_lock);
|
||||
|
||||
__bch2_btree_iter_init(&iter, c, n.btree_id, n.pos,
|
||||
0, 0, BTREE_ITER_NODES);
|
||||
|
||||
b = bch2_btree_iter_peek_node(&iter);
|
||||
|
||||
/* The node might have already been rewritten: */
|
||||
|
||||
if (b->data->keys.seq == n.seq) {
|
||||
ret = bch2_btree_node_rewrite(c, &iter, n.seq, 0);
|
||||
if (ret) {
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
bch2_fs_fatal_error(c,
|
||||
"error %i rewriting btree node with blacklisted journal seq",
|
||||
ret);
|
||||
bch2_journal_halt(j);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
}
|
||||
|
||||
for (i = 0;; i++) {
|
||||
struct btree_update *as;
|
||||
struct pending_btree_node_free *d;
|
||||
|
||||
mutex_lock(&j->blacklist_lock);
|
||||
if (i >= bl->nr_entries) {
|
||||
mutex_unlock(&j->blacklist_lock);
|
||||
break;
|
||||
}
|
||||
n = bl->entries[i];
|
||||
mutex_unlock(&j->blacklist_lock);
|
||||
redo_wait:
|
||||
mutex_lock(&c->btree_interior_update_lock);
|
||||
|
||||
/*
|
||||
* Is the node on the list of pending interior node updates -
|
||||
* being freed? If so, wait for that to finish:
|
||||
*/
|
||||
for_each_pending_btree_node_free(c, as, d)
|
||||
if (n.seq == d->seq &&
|
||||
n.btree_id == d->btree_id &&
|
||||
!d->level &&
|
||||
!bkey_cmp(n.pos, d->key.k.p)) {
|
||||
closure_wait(&as->wait, &cl);
|
||||
mutex_unlock(&c->btree_interior_update_lock);
|
||||
closure_sync(&cl);
|
||||
goto redo_wait;
|
||||
}
|
||||
|
||||
mutex_unlock(&c->btree_interior_update_lock);
|
||||
}
|
||||
|
||||
mutex_lock(&j->blacklist_lock);
|
||||
|
||||
bch2_journal_pin_drop(j, &bl->pin);
|
||||
list_del(&bl->list);
|
||||
kfree(bl->entries);
|
||||
kfree(bl);
|
||||
|
||||
mutex_unlock(&j->blacklist_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Determine if a particular sequence number is blacklisted - if so, return
|
||||
* blacklist entry:
|
||||
*/
|
||||
struct journal_seq_blacklist *
|
||||
bch2_journal_seq_blacklist_find(struct journal *j, u64 seq)
|
||||
{
|
||||
struct journal_seq_blacklist *bl;
|
||||
|
||||
lockdep_assert_held(&j->blacklist_lock);
|
||||
|
||||
list_for_each_entry(bl, &j->seq_blacklist, list)
|
||||
if (seq >= bl->start && seq <= bl->end)
|
||||
return bl;
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate a new, in memory blacklist entry:
|
||||
*/
|
||||
static struct journal_seq_blacklist *
|
||||
bch2_journal_seq_blacklisted_new(struct journal *j, u64 start, u64 end)
|
||||
{
|
||||
struct journal_seq_blacklist *bl;
|
||||
|
||||
lockdep_assert_held(&j->blacklist_lock);
|
||||
|
||||
/*
|
||||
* When we start the journal, bch2_journal_start() will skip over @seq:
|
||||
*/
|
||||
|
||||
bl = kzalloc(sizeof(*bl), GFP_KERNEL);
|
||||
if (!bl)
|
||||
return NULL;
|
||||
|
||||
bl->start = start;
|
||||
bl->end = end;
|
||||
|
||||
list_add_tail(&bl->list, &j->seq_blacklist);
|
||||
return bl;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns true if @seq is newer than the most recent journal entry that got
|
||||
* written, and data corresponding to @seq should be ignored - also marks @seq
|
||||
* as blacklisted so that on future restarts the corresponding data will still
|
||||
* be ignored:
|
||||
*/
|
||||
int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b)
|
||||
{
|
||||
struct journal *j = &c->journal;
|
||||
struct journal_seq_blacklist *bl = NULL;
|
||||
struct blacklisted_node *n;
|
||||
u64 journal_seq;
|
||||
int ret = 0;
|
||||
|
||||
if (!seq)
|
||||
return 0;
|
||||
|
||||
spin_lock(&j->lock);
|
||||
journal_seq = journal_cur_seq(j);
|
||||
spin_unlock(&j->lock);
|
||||
|
||||
/* Interior updates aren't journalled: */
|
||||
BUG_ON(b->level);
|
||||
BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags));
|
||||
|
||||
/*
|
||||
* Decrease this back to j->seq + 2 when we next rev the on disk format:
|
||||
* increasing it temporarily to work around bug in old kernels
|
||||
*/
|
||||
fsck_err_on(seq > journal_seq + 4, c,
|
||||
"bset journal seq too far in the future: %llu > %llu",
|
||||
seq, journal_seq);
|
||||
|
||||
if (seq <= journal_seq &&
|
||||
list_empty_careful(&j->seq_blacklist))
|
||||
return 0;
|
||||
|
||||
mutex_lock(&j->blacklist_lock);
|
||||
|
||||
if (seq <= journal_seq) {
|
||||
bl = bch2_journal_seq_blacklist_find(j, seq);
|
||||
if (!bl)
|
||||
goto out;
|
||||
} else {
|
||||
bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting",
|
||||
b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq);
|
||||
|
||||
if (!j->new_blacklist) {
|
||||
j->new_blacklist = bch2_journal_seq_blacklisted_new(j,
|
||||
journal_seq + 1,
|
||||
journal_seq + 1);
|
||||
if (!j->new_blacklist) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
bl = j->new_blacklist;
|
||||
bl->end = max(bl->end, seq);
|
||||
}
|
||||
|
||||
for (n = bl->entries; n < bl->entries + bl->nr_entries; n++)
|
||||
if (b->data->keys.seq == n->seq &&
|
||||
b->btree_id == n->btree_id &&
|
||||
!bkey_cmp(b->key.k.p, n->pos))
|
||||
goto found_entry;
|
||||
|
||||
if (!bl->nr_entries ||
|
||||
is_power_of_2(bl->nr_entries)) {
|
||||
n = krealloc(bl->entries,
|
||||
max_t(size_t, bl->nr_entries * 2, 8) * sizeof(*n),
|
||||
GFP_KERNEL);
|
||||
if (!n) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
bl->entries = n;
|
||||
}
|
||||
|
||||
bl->entries[bl->nr_entries++] = (struct blacklisted_node) {
|
||||
.seq = b->data->keys.seq,
|
||||
.btree_id = b->btree_id,
|
||||
.pos = b->key.k.p,
|
||||
};
|
||||
found_entry:
|
||||
ret = 1;
|
||||
out:
|
||||
fsck_err:
|
||||
mutex_unlock(&j->blacklist_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int __bch2_journal_seq_blacklist_read(struct journal *j,
|
||||
struct journal_replay *i,
|
||||
u64 start, u64 end)
|
||||
{
|
||||
struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
||||
struct journal_seq_blacklist *bl;
|
||||
|
||||
bch_verbose(c, "blacklisting existing journal seq %llu-%llu",
|
||||
start, end);
|
||||
|
||||
bl = bch2_journal_seq_blacklisted_new(j, start, end);
|
||||
if (!bl)
|
||||
return -ENOMEM;
|
||||
|
||||
bch2_journal_pin_add(j, le64_to_cpu(i->j.seq), &bl->pin,
|
||||
journal_seq_blacklist_flush);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* After reading the journal, find existing journal seq blacklist entries and
|
||||
* read them into memory:
|
||||
*/
|
||||
int bch2_journal_seq_blacklist_read(struct journal *j,
|
||||
struct journal_replay *i)
|
||||
{
|
||||
struct jset_entry *entry;
|
||||
int ret = 0;
|
||||
|
||||
vstruct_for_each(&i->j, entry) {
|
||||
switch (entry->type) {
|
||||
case BCH_JSET_ENTRY_blacklist: {
|
||||
struct jset_entry_blacklist *bl_entry =
|
||||
container_of(entry, struct jset_entry_blacklist, entry);
|
||||
|
||||
ret = __bch2_journal_seq_blacklist_read(j, i,
|
||||
le64_to_cpu(bl_entry->seq),
|
||||
le64_to_cpu(bl_entry->seq));
|
||||
break;
|
||||
}
|
||||
case BCH_JSET_ENTRY_blacklist_v2: {
|
||||
struct jset_entry_blacklist_v2 *bl_entry =
|
||||
container_of(entry, struct jset_entry_blacklist_v2, entry);
|
||||
|
||||
ret = __bch2_journal_seq_blacklist_read(j, i,
|
||||
le64_to_cpu(bl_entry->start),
|
||||
le64_to_cpu(bl_entry->end));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* After reading the journal and walking the btree, we might have new journal
|
||||
* sequence numbers to blacklist - add entries to the next journal entry to be
|
||||
* written:
|
||||
*/
|
||||
void bch2_journal_seq_blacklist_write(struct journal *j)
|
||||
{
|
||||
struct journal_seq_blacklist *bl = j->new_blacklist;
|
||||
struct jset_entry_blacklist_v2 *bl_entry;
|
||||
struct jset_entry *entry;
|
||||
|
||||
if (!bl)
|
||||
return;
|
||||
|
||||
entry = bch2_journal_add_entry_noreservation(journal_cur_buf(j),
|
||||
(sizeof(*bl_entry) - sizeof(*entry)) / sizeof(u64));
|
||||
|
||||
bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
|
||||
bl_entry->entry.type = BCH_JSET_ENTRY_blacklist_v2;
|
||||
bl_entry->start = cpu_to_le64(bl->start);
|
||||
bl_entry->end = cpu_to_le64(bl->end);
|
||||
|
||||
bch2_journal_pin_add(j,
|
||||
journal_cur_seq(j),
|
||||
&bl->pin,
|
||||
journal_seq_blacklist_flush);
|
||||
|
||||
j->new_blacklist = NULL;
|
||||
}
|
14
fs/bcachefs/journal_seq_blacklist.h
Normal file
@ -0,0 +1,14 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H

struct journal_replay;

struct journal_seq_blacklist *
bch2_journal_seq_blacklist_find(struct journal *, u64);
int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *);
int bch2_journal_seq_blacklist_read(struct journal *,
				    struct journal_replay *);
void bch2_journal_seq_blacklist_write(struct journal *);

#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */
242
fs/bcachefs/journal_types.h
Normal file
@ -0,0 +1,242 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_JOURNAL_TYPES_H
#define _BCACHEFS_JOURNAL_TYPES_H

#include <linux/cache.h>
#include <linux/workqueue.h>

#include "alloc_types.h"
#include "super_types.h"
#include "fifo.h"

struct journal_res;

/*
 * We put two of these in struct journal; we use them for writes to the
 * journal that are being staged or in flight.
 */
struct journal_buf {
	struct jset		*data;

	BKEY_PADDED(key);

	struct closure_waitlist	wait;

	unsigned		size;
	unsigned		disk_sectors;
	/* bloom filter: */
	unsigned long		has_inode[1024 / sizeof(unsigned long)];
};

/*
 * Something that makes a journal entry dirty - i.e. a btree node that has to be
 * flushed:
 */

struct journal_entry_pin_list {
	struct list_head	list;
	struct list_head	flushed;
	atomic_t		count;
	struct bch_devs_list	devs;
};

struct journal;
struct journal_entry_pin;
typedef void (*journal_pin_flush_fn)(struct journal *j,
				struct journal_entry_pin *, u64);

struct journal_entry_pin {
	struct list_head	list;
	journal_pin_flush_fn	flush;
	struct journal_entry_pin_list *pin_list;
};

/* corresponds to a btree node with a blacklisted bset: */
struct blacklisted_node {
	__le64			seq;
	enum btree_id		btree_id;
	struct bpos		pos;
};

struct journal_seq_blacklist {
	struct list_head	list;
	u64			start;
	u64			end;

	struct journal_entry_pin pin;

	struct blacklisted_node	*entries;
	size_t			nr_entries;
};

struct journal_res {
	bool			ref;
	u8			idx;
	u16			u64s;
	u32			offset;
	u64			seq;
};

union journal_res_state {
	struct {
		atomic64_t	counter;
	};

	struct {
		u64		v;
	};

	struct {
		u64		cur_entry_offset:20,
				idx:1,
				prev_buf_unwritten:1,
				buf0_count:21,
				buf1_count:21;
	};
};

/* bytes: */
#define JOURNAL_ENTRY_SIZE_MIN		(64U << 10) /* 64k */
#define JOURNAL_ENTRY_SIZE_MAX		(4U << 20) /* 4M */

/*
 * We stash some journal state as sentinel values in cur_entry_offset:
 * note - cur_entry_offset is in units of u64s
 */
#define JOURNAL_ENTRY_OFFSET_MAX	((1U << 20) - 1)

#define JOURNAL_ENTRY_CLOSED_VAL	(JOURNAL_ENTRY_OFFSET_MAX - 1)
#define JOURNAL_ENTRY_ERROR_VAL		(JOURNAL_ENTRY_OFFSET_MAX)

/*
 * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP,
 * either because something's waiting on the write to complete or because it's
 * been dirty too long and the timer's expired.
 */

enum {
	JOURNAL_REPLAY_DONE,
	JOURNAL_STARTED,
	JOURNAL_NEED_WRITE,
	JOURNAL_NOT_EMPTY,
};

/* Embedded in struct bch_fs */
struct journal {
	/* Fastpath stuff up front: */

	unsigned long		flags;

	union journal_res_state reservations;
	unsigned		cur_entry_u64s;
	unsigned		prev_buf_sectors;
	unsigned		cur_buf_sectors;
	unsigned		buf_size_want;

	/*
	 * Two journal entries -- one is currently open for new entries, the
	 * other is possibly being written out.
	 */
	struct journal_buf	buf[2];

	spinlock_t		lock;

	/* Used when waiting because the journal was full */
	wait_queue_head_t	wait;
	struct closure_waitlist	async_wait;

	struct closure		io;
	struct delayed_work	write_work;

	/* Sequence number of most recent journal entry (last entry in @pin) */
	atomic64_t		seq;

	/* last_seq from the most recent journal entry written */
	u64			last_seq_ondisk;

	/*
	 * FIFO of journal entries whose btree updates have not yet been
	 * written out.
	 *
	 * Each entry is a reference count. The position in the FIFO is the
	 * entry's sequence number relative to @seq.
	 *
	 * The journal entry itself holds a reference count, put when the
	 * journal entry is written out. Each btree node modified by the journal
	 * entry also holds a reference count, put when the btree node is
	 * written.
	 *
	 * When a reference count reaches zero, the journal entry is no longer
	 * needed. When all journal entries in the oldest journal bucket are no
	 * longer needed, the bucket can be discarded and reused.
	 */
	struct {
		u64		front, back, size, mask;
		struct journal_entry_pin_list *data;
	} pin;
	u64			replay_journal_seq;

	struct mutex		blacklist_lock;
	struct list_head	seq_blacklist;
	struct journal_seq_blacklist *new_blacklist;

	BKEY_PADDED(key);
	struct write_point	wp;
	spinlock_t		err_lock;

	struct delayed_work	reclaim_work;
	unsigned long		last_flushed;

	/* protects advancing ja->last_idx: */
	struct mutex		reclaim_lock;
	unsigned		write_delay_ms;
	unsigned		reclaim_delay_ms;

	u64			res_get_blocked_start;
	u64			need_write_time;
	u64			write_start_time;

	struct bch2_time_stats	*write_time;
	struct bch2_time_stats	*delay_time;
	struct bch2_time_stats	*blocked_time;
	struct bch2_time_stats	*flush_seq_time;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	struct lockdep_map	res_map;
#endif
};

/*
 * Embedded in struct bch_dev. First three fields refer to the array of journal
 * buckets, in bch_sb.
 */
struct journal_device {
	/*
	 * For each journal bucket, contains the max sequence number of the
	 * journal writes it contains - so we know when a bucket can be reused.
	 */
	u64			*bucket_seq;

	unsigned		sectors_free;

	/* Journal bucket we're currently writing to */
	unsigned		cur_idx;

	/* Last journal bucket that still contains an open journal entry */

	/*
	 * j->lock and j->reclaim_lock must both be held to modify, j->lock
	 * sufficient to read:
	 */
	unsigned		last_idx;
	unsigned		nr;
	u64			*buckets;

	/* Bio for journal reads/writes to this device */
	struct bio		*bio;

	/* for bch_journal_read_device */
	struct closure		read;
};

#endif /* _BCACHEFS_JOURNAL_TYPES_H */
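
The interesting piece of journal_types.h is union journal_res_state: five small counters share one 64-bit word with an atomic64_t, so the whole reservation state can be read, modified and published with a single 64-bit atomic operation. A reduced userspace sketch of the same packing follows; names are simplified, and the bitfield layout, like the kernel's, relies on GCC/Clang behaviour rather than anything the C standard guarantees.

#include <stdint.h>
#include <stdio.h>

/* Several counters packed into one u64 so they can be snapshotted as a unit. */
union res_state {
	uint64_t	v;
	struct {
		uint64_t	cur_entry_offset:20,
				idx:1,
				prev_buf_unwritten:1,
				buf0_count:21,
				buf1_count:21;
	};
};

int main(void)
{
	union res_state s = { .v = 0 };

	s.cur_entry_offset = 100;
	s.buf0_count = 3;

	/* The whole state round-trips through a single 64-bit value: */
	union res_state copy = { .v = s.v };

	printf("offset=%u buf0=%u\n",
	       (unsigned) copy.cur_entry_offset,
	       (unsigned) copy.buf0_count);
	return 0;
}
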
67
fs/bcachefs/keylist.c
Normal file
@ -0,0 +1,67 @@
// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "keylist.h"

int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s,
			size_t nr_inline_u64s, size_t new_u64s)
{
	size_t oldsize = bch_keylist_u64s(l);
	size_t newsize = oldsize + new_u64s;
	u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p;
	u64 *new_keys;

	newsize = roundup_pow_of_two(newsize);

	if (newsize <= nr_inline_u64s ||
	    (old_buf && roundup_pow_of_two(oldsize) == newsize))
		return 0;

	new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOIO);
	if (!new_keys)
		return -ENOMEM;

	if (!old_buf)
		memcpy_u64s(new_keys, inline_u64s, oldsize);

	l->keys_p = new_keys;
	l->top_p = new_keys + oldsize;

	return 0;
}

void bch2_keylist_add_in_order(struct keylist *l, struct bkey_i *insert)
{
	struct bkey_i *where;

	for_each_keylist_key(l, where)
		if (bkey_cmp(insert->k.p, where->k.p) < 0)
			break;

	memmove_u64s_up((u64 *) where + insert->k.u64s,
			where,
			((u64 *) l->top) - ((u64 *) where));

	l->top_p += insert->k.u64s;
	bkey_copy(where, insert);
}

void bch2_keylist_pop_front(struct keylist *l)
{
	l->top_p -= bch2_keylist_front(l)->k.u64s;

	memmove_u64s_down(l->keys,
			  bkey_next(l->keys),
			  bch_keylist_u64s(l));
}

#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_verify_keylist_sorted(struct keylist *l)
{
	struct bkey_i *k;

	for_each_keylist_key(l, k)
		BUG_ON(bkey_next(k) != l->top &&
		       bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0);
}
#endif
76
fs/bcachefs/keylist.h
Normal file
@ -0,0 +1,76 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_KEYLIST_H
#define _BCACHEFS_KEYLIST_H

#include "keylist_types.h"

int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t);
void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *);
void bch2_keylist_pop_front(struct keylist *);

static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys)
{
	l->top_p = l->keys_p = inline_keys;
}

static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys)
{
	if (l->keys_p != inline_keys)
		kfree(l->keys_p);
	bch2_keylist_init(l, inline_keys);
}

static inline void bch2_keylist_push(struct keylist *l)
{
	l->top = bkey_next(l->top);
}

static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k)
{
	bkey_copy(l->top, k);
	bch2_keylist_push(l);
}

static inline bool bch2_keylist_empty(struct keylist *l)
{
	return l->top == l->keys;
}

static inline size_t bch_keylist_u64s(struct keylist *l)
{
	return l->top_p - l->keys_p;
}

static inline size_t bch2_keylist_bytes(struct keylist *l)
{
	return bch_keylist_u64s(l) * sizeof(u64);
}

static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
{
	return l->keys;
}

#define for_each_keylist_key(_keylist, _k)			\
	for (_k = (_keylist)->keys;				\
	     _k != (_keylist)->top;				\
	     _k = bkey_next(_k))

static inline u64 keylist_sectors(struct keylist *keys)
{
	struct bkey_i *k;
	u64 ret = 0;

	for_each_keylist_key(keys, k)
		ret += k->k.size;

	return ret;
}

#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_verify_keylist_sorted(struct keylist *);
#else
static inline void bch2_verify_keylist_sorted(struct keylist *l) {}
#endif

#endif /* _BCACHEFS_KEYLIST_H */
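
The keylist API above follows a common pattern: the caller provides a small inline buffer, bch2_keylist_init() points the list at it, and bch2_keylist_realloc() only moves the contents to the heap once they outgrow the inline space, so bch2_keylist_free() must only kfree() when the storage actually left the inline buffer. A simplified standalone sketch of that pattern follows; it stores plain u64s rather than variable-size bkeys and grows by exact amounts instead of rounding to powers of two, so it illustrates the idea rather than the bcachefs implementation.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Growable u64 array that starts in caller-provided inline storage. */
struct u64_list {
	uint64_t	*start, *top;		/* like keys_p / top_p */
};

static void list_init(struct u64_list *l, uint64_t *inline_buf)
{
	l->start = l->top = inline_buf;
}

static int list_make_room(struct u64_list *l, uint64_t *inline_buf,
			  size_t nr_inline, size_t new_u64s)
{
	size_t used = l->top - l->start;
	size_t want = used + new_u64s;
	uint64_t *heap_buf = l->start == inline_buf ? NULL : l->start;
	uint64_t *n;

	if (want <= nr_inline)			/* still fits inline */
		return 0;

	n = realloc(heap_buf, want * sizeof(uint64_t));
	if (!n)
		return -1;
	if (!heap_buf)				/* first spill: copy inline contents */
		memcpy(n, inline_buf, used * sizeof(uint64_t));

	l->start = n;
	l->top = n + used;
	return 0;
}

static void list_free(struct u64_list *l, uint64_t *inline_buf)
{
	if (l->start != inline_buf)		/* only free what left the inline buffer */
		free(l->start);
	list_init(l, inline_buf);
}

int main(void)
{
	uint64_t inline_buf[4];
	struct u64_list l;

	list_init(&l, inline_buf);
	for (uint64_t i = 0; i < 10; i++) {
		if (list_make_room(&l, inline_buf, 4, 1))
			return 1;
		*l.top++ = i;
	}
	list_free(&l, inline_buf);
	return 0;
}
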
16
fs/bcachefs/keylist_types.h
Normal file
@ -0,0 +1,16 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_KEYLIST_TYPES_H
#define _BCACHEFS_KEYLIST_TYPES_H

struct keylist {
	union {
		struct bkey_i		*keys;
		u64			*keys_p;
	};
	union {
		struct bkey_i		*top;
		u64			*top_p;
	};
};

#endif /* _BCACHEFS_KEYLIST_TYPES_H */
178
fs/bcachefs/migrate.c
Normal file
@ -0,0 +1,178 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Code for moving data off a device.
|
||||
*/
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "btree_update.h"
|
||||
#include "buckets.h"
|
||||
#include "extents.h"
|
||||
#include "io.h"
|
||||
#include "journal.h"
|
||||
#include "keylist.h"
|
||||
#include "migrate.h"
|
||||
#include "move.h"
|
||||
#include "replicas.h"
|
||||
#include "super-io.h"
|
||||
|
||||
static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
|
||||
unsigned dev_idx, int flags, bool metadata)
|
||||
{
|
||||
unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
|
||||
unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
|
||||
unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED;
|
||||
unsigned nr_good;
|
||||
|
||||
bch2_extent_drop_device(e, dev_idx);
|
||||
|
||||
nr_good = bch2_extent_durability(c, e.c);
|
||||
if ((!nr_good && !(flags & lost)) ||
|
||||
(nr_good < replicas && !(flags & degraded)))
|
||||
return -EINVAL;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
|
||||
{
|
||||
struct bkey_s_c k;
|
||||
struct bkey_s_extent e;
|
||||
BKEY_PADDED(key) tmp;
|
||||
struct btree_iter iter;
|
||||
int ret = 0;
|
||||
|
||||
mutex_lock(&c->replicas_gc_lock);
|
||||
bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED));
|
||||
|
||||
bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
|
||||
POS_MIN, BTREE_ITER_PREFETCH);
|
||||
|
||||
while ((k = bch2_btree_iter_peek(&iter)).k &&
|
||||
!(ret = btree_iter_err(k))) {
|
||||
if (!bkey_extent_is_data(k.k) ||
|
||||
!bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) {
|
||||
ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k);
|
||||
if (ret)
|
||||
break;
|
||||
bch2_btree_iter_next(&iter);
|
||||
continue;
|
||||
}
|
||||
|
||||
bkey_reassemble(&tmp.key, k);
|
||||
e = bkey_i_to_s_extent(&tmp.key);
|
||||
|
||||
ret = drop_dev_ptrs(c, e, dev_idx, flags, false);
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
/*
|
||||
* If the new extent no longer has any pointers, bch2_extent_normalize()
|
||||
* will do the appropriate thing with it (turning it into a
|
||||
* KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
|
||||
*/
|
||||
bch2_extent_normalize(c, e.s);
|
||||
|
||||
ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER,
|
||||
bkey_i_to_s_c(&tmp.key));
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
iter.pos = bkey_start_pos(&tmp.key.k);
|
||||
|
||||
ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
|
||||
BTREE_INSERT_ATOMIC|
|
||||
BTREE_INSERT_NOFAIL,
|
||||
BTREE_INSERT_ENTRY(&iter, &tmp.key));
|
||||
|
||||
/*
|
||||
* don't want to leave ret == -EINTR, since if we raced and
|
||||
* something else overwrote the key we could spuriously return
|
||||
* -EINTR below:
|
||||
*/
|
||||
if (ret == -EINTR)
|
||||
ret = 0;
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
|
||||
bch2_replicas_gc_end(c, ret);
|
||||
mutex_unlock(&c->replicas_gc_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
|
||||
{
|
||||
struct btree_iter iter;
|
||||
struct closure cl;
|
||||
struct btree *b;
|
||||
unsigned id;
|
||||
int ret;
|
||||
|
||||
/* don't handle this yet: */
|
||||
if (flags & BCH_FORCE_IF_METADATA_LOST)
|
||||
return -EINVAL;
|
||||
|
||||
closure_init_stack(&cl);
|
||||
|
||||
mutex_lock(&c->replicas_gc_lock);
|
||||
bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
|
||||
|
||||
for (id = 0; id < BTREE_ID_NR; id++) {
|
||||
for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
|
||||
__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
|
||||
struct bkey_i_extent *new_key;
|
||||
retry:
|
||||
if (!bch2_extent_has_device(bkey_i_to_s_c_extent(&b->key),
|
||||
dev_idx)) {
|
||||
/*
|
||||
* we might have found a btree node key we
|
||||
* needed to update, and then tried to update it
|
||||
* but got -EINTR after upgrading the iter, but
|
||||
* then raced and the node is now gone:
|
||||
*/
|
||||
bch2_btree_iter_downgrade(&iter);
|
||||
|
||||
ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
|
||||
bkey_i_to_s_c(&b->key));
|
||||
if (ret)
|
||||
goto err;
|
||||
} else {
|
||||
bkey_copy(&tmp.k, &b->key);
|
||||
new_key = bkey_i_to_extent(&tmp.k);
|
||||
|
||||
ret = drop_dev_ptrs(c, extent_i_to_s(new_key),
|
||||
dev_idx, flags, true);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
ret = bch2_btree_node_update_key(c, &iter, b, new_key);
|
||||
if (ret == -EINTR) {
|
||||
b = bch2_btree_iter_peek_node(&iter);
|
||||
goto retry;
|
||||
}
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
ret = bch2_replicas_gc_end(c, ret);
|
||||
mutex_unlock(&c->replicas_gc_lock);
|
||||
|
||||
return ret;
|
||||
err:
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
goto out;
|
||||
}
|
||||
|
||||
int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
|
||||
{
|
||||
return bch2_dev_usrdata_drop(c, dev_idx, flags) ?:
|
||||
bch2_dev_metadata_drop(c, dev_idx, flags);
|
||||
}
|
7
fs/bcachefs/migrate.h
Normal file
@ -0,0 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_MIGRATE_H
#define _BCACHEFS_MIGRATE_H

int bch2_dev_data_drop(struct bch_fs *, unsigned, int);

#endif /* _BCACHEFS_MIGRATE_H */
761
fs/bcachefs/move.c
Normal file
@ -0,0 +1,761 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "btree_gc.h"
|
||||
#include "btree_update.h"
|
||||
#include "buckets.h"
|
||||
#include "inode.h"
|
||||
#include "io.h"
|
||||
#include "journal_reclaim.h"
|
||||
#include "keylist.h"
|
||||
#include "move.h"
|
||||
#include "replicas.h"
|
||||
#include "super-io.h"
|
||||
#include "trace.h"
|
||||
|
||||
#include <linux/ioprio.h>
|
||||
#include <linux/kthread.h>
|
||||
|
||||
#define SECTORS_IN_FLIGHT_PER_DEVICE 2048
|
||||
|
||||
struct moving_io {
|
||||
struct list_head list;
|
||||
struct closure cl;
|
||||
bool read_completed;
|
||||
|
||||
unsigned read_sectors;
|
||||
unsigned write_sectors;
|
||||
|
||||
struct bch_read_bio rbio;
|
||||
|
||||
struct migrate_write write;
|
||||
/* Must be last since it is variable size */
|
||||
struct bio_vec bi_inline_vecs[0];
|
||||
};
|
||||
|
||||
struct moving_context {
|
||||
/* Closure for waiting on all reads and writes to complete */
|
||||
struct closure cl;
|
||||
|
||||
struct bch_move_stats *stats;
|
||||
|
||||
struct list_head reads;
|
||||
|
||||
/* in flight sectors: */
|
||||
atomic_t read_sectors;
|
||||
atomic_t write_sectors;
|
||||
|
||||
wait_queue_head_t wait;
|
||||
};
|
||||
|
||||
static int bch2_migrate_index_update(struct bch_write_op *op)
|
||||
{
|
||||
struct bch_fs *c = op->c;
|
||||
struct migrate_write *m =
|
||||
container_of(op, struct migrate_write, op);
|
||||
struct keylist *keys = &op->insert_keys;
|
||||
struct btree_iter iter;
|
||||
int ret = 0;
|
||||
|
||||
bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
|
||||
bkey_start_pos(&bch2_keylist_front(keys)->k),
|
||||
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
|
||||
|
||||
while (1) {
|
||||
struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
|
||||
struct bkey_i_extent *insert, *new =
|
||||
bkey_i_to_extent(bch2_keylist_front(keys));
|
||||
BKEY_PADDED(k) _new, _insert;
|
||||
struct bch_extent_ptr *ptr;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
bool did_work = false;
|
||||
int nr;
|
||||
|
||||
if (btree_iter_err(k)) {
|
||||
ret = bch2_btree_iter_unlock(&iter);
|
||||
break;
|
||||
}
|
||||
|
||||
if (bversion_cmp(k.k->version, new->k.version) ||
|
||||
!bkey_extent_is_data(k.k) ||
|
||||
!bch2_extent_matches_ptr(c, bkey_s_c_to_extent(k),
|
||||
m->ptr, m->offset))
|
||||
goto nomatch;
|
||||
|
||||
if (m->data_cmd == DATA_REWRITE &&
|
||||
!bch2_extent_has_device(bkey_s_c_to_extent(k),
|
||||
m->data_opts.rewrite_dev))
|
||||
goto nomatch;
|
||||
|
||||
bkey_reassemble(&_insert.k, k);
|
||||
insert = bkey_i_to_extent(&_insert.k);
|
||||
|
||||
bkey_copy(&_new.k, bch2_keylist_front(keys));
|
||||
new = bkey_i_to_extent(&_new.k);
|
||||
|
||||
bch2_cut_front(iter.pos, &insert->k_i);
|
||||
bch2_cut_back(new->k.p, &insert->k);
|
||||
bch2_cut_back(insert->k.p, &new->k);
|
||||
|
||||
if (m->data_cmd == DATA_REWRITE) {
|
||||
ptr = (struct bch_extent_ptr *)
|
||||
bch2_extent_has_device(extent_i_to_s_c(insert),
|
||||
m->data_opts.rewrite_dev);
|
||||
bch2_extent_drop_ptr(extent_i_to_s(insert), ptr);
|
||||
}
|
||||
|
||||
extent_for_each_ptr_crc(extent_i_to_s(new), ptr, crc) {
|
||||
if (bch2_extent_has_device(extent_i_to_s_c(insert), ptr->dev)) {
|
||||
/*
|
||||
* raced with another move op? extent already
|
||||
* has a pointer to the device we just wrote
|
||||
* data to
|
||||
*/
|
||||
continue;
|
||||
}
|
||||
|
||||
bch2_extent_crc_append(insert, crc);
|
||||
extent_ptr_append(insert, *ptr);
|
||||
did_work = true;
|
||||
}
|
||||
|
||||
if (!did_work)
|
||||
goto nomatch;
|
||||
|
||||
bch2_extent_narrow_crcs(insert,
|
||||
(struct bch_extent_crc_unpacked) { 0 });
|
||||
bch2_extent_normalize(c, extent_i_to_s(insert).s);
|
||||
bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert),
|
||||
op->opts.background_target,
|
||||
op->opts.data_replicas);
|
||||
|
||||
/*
|
||||
* It's possible we race, and for whatever reason the extent now
|
||||
* has fewer replicas than when we last looked at it - meaning
|
||||
* we need to get a disk reservation here:
|
||||
*/
|
||||
nr = bch2_extent_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i)) -
|
||||
(bch2_extent_nr_dirty_ptrs(k) + m->nr_ptrs_reserved);
|
||||
if (nr > 0) {
|
||||
/*
|
||||
* can't call bch2_disk_reservation_add() with btree
|
||||
* locks held, at least not without a song and dance
|
||||
*/
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
|
||||
ret = bch2_disk_reservation_add(c, &op->res,
|
||||
keylist_sectors(keys) * nr, 0);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
m->nr_ptrs_reserved += nr;
|
||||
goto next;
|
||||
}
|
||||
|
||||
ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER,
|
||||
extent_i_to_s_c(insert).s_c);
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
ret = bch2_btree_insert_at(c, &op->res,
|
||||
NULL, op_journal_seq(op),
|
||||
BTREE_INSERT_ATOMIC|
|
||||
BTREE_INSERT_NOFAIL|
|
||||
BTREE_INSERT_USE_RESERVE|
|
||||
m->data_opts.btree_insert_flags,
|
||||
BTREE_INSERT_ENTRY(&iter, &insert->k_i));
|
||||
if (!ret)
|
||||
atomic_long_inc(&c->extent_migrate_done);
|
||||
if (ret == -EINTR)
|
||||
ret = 0;
|
||||
if (ret)
|
||||
break;
|
||||
next:
|
||||
while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) {
|
||||
bch2_keylist_pop_front(keys);
|
||||
if (bch2_keylist_empty(keys))
|
||||
goto out;
|
||||
}
|
||||
|
||||
bch2_cut_front(iter.pos, bch2_keylist_front(keys));
|
||||
continue;
|
||||
nomatch:
|
||||
if (m->ctxt)
|
||||
atomic64_add(k.k->p.offset - iter.pos.offset,
|
||||
&m->ctxt->stats->sectors_raced);
|
||||
atomic_long_inc(&c->extent_migrate_raced);
|
||||
trace_move_race(&new->k);
|
||||
bch2_btree_iter_next_slot(&iter);
|
||||
goto next;
|
||||
}
|
||||
out:
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
return ret;
|
||||
}
|
||||
|
||||
void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio)
|
||||
{
|
||||
/* write bio must own pages: */
|
||||
BUG_ON(!m->op.wbio.bio.bi_vcnt);
|
||||
|
||||
m->ptr = rbio->pick.ptr;
|
||||
m->offset = rbio->pos.offset - rbio->pick.crc.offset;
|
||||
m->op.devs_have = rbio->devs_have;
|
||||
m->op.pos = rbio->pos;
|
||||
m->op.version = rbio->version;
|
||||
m->op.crc = rbio->pick.crc;
|
||||
m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
|
||||
|
||||
if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) {
|
||||
m->op.nonce = m->op.crc.nonce + m->op.crc.offset;
|
||||
m->op.csum_type = m->op.crc.csum_type;
|
||||
}
|
||||
|
||||
if (m->data_cmd == DATA_REWRITE)
|
||||
bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev);
|
||||
}
|
||||
|
||||
int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
|
||||
struct write_point_specifier wp,
|
||||
struct bch_io_opts io_opts,
|
||||
enum data_cmd data_cmd,
|
||||
struct data_opts data_opts,
|
||||
struct bkey_s_c k)
|
||||
{
|
||||
int ret;
|
||||
|
||||
m->data_cmd = data_cmd;
|
||||
m->data_opts = data_opts;
|
||||
m->nr_ptrs_reserved = 0;
|
||||
|
||||
bch2_write_op_init(&m->op, c, io_opts);
|
||||
m->op.compression_type =
|
||||
bch2_compression_opt_to_type[io_opts.background_compression ?:
|
||||
io_opts.compression];
|
||||
m->op.target = data_opts.target,
|
||||
m->op.write_point = wp;
|
||||
|
||||
if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE)
|
||||
m->op.alloc_reserve = RESERVE_MOVINGGC;
|
||||
|
||||
m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS|
|
||||
BCH_WRITE_PAGES_STABLE|
|
||||
BCH_WRITE_PAGES_OWNED|
|
||||
BCH_WRITE_DATA_ENCODED|
|
||||
BCH_WRITE_NOMARK_REPLICAS;
|
||||
|
||||
m->op.nr_replicas = 1;
|
||||
m->op.nr_replicas_required = 1;
|
||||
m->op.index_update_fn = bch2_migrate_index_update;
|
||||
|
||||
switch (data_cmd) {
|
||||
case DATA_ADD_REPLICAS: {
|
||||
int nr = (int) io_opts.data_replicas -
|
||||
bch2_extent_nr_dirty_ptrs(k);
|
||||
|
||||
if (nr > 0) {
|
||||
m->op.nr_replicas = m->nr_ptrs_reserved = nr;
|
||||
|
||||
ret = bch2_disk_reservation_get(c, &m->op.res,
|
||||
k.k->size, m->op.nr_replicas, 0);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case DATA_REWRITE:
|
||||
break;
|
||||
case DATA_PROMOTE:
|
||||
m->op.flags |= BCH_WRITE_ALLOC_NOWAIT;
|
||||
m->op.flags |= BCH_WRITE_CACHED;
|
||||
break;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void move_free(struct closure *cl)
|
||||
{
|
||||
struct moving_io *io = container_of(cl, struct moving_io, cl);
|
||||
struct moving_context *ctxt = io->write.ctxt;
|
||||
struct bvec_iter_all iter;
|
||||
struct bio_vec *bv;
|
||||
|
||||
bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);
|
||||
|
||||
bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter)
|
||||
if (bv->bv_page)
|
||||
__free_page(bv->bv_page);
|
||||
|
||||
wake_up(&ctxt->wait);
|
||||
|
||||
kfree(io);
|
||||
}
|
||||
|
||||
static void move_write_done(struct closure *cl)
|
||||
{
|
||||
struct moving_io *io = container_of(cl, struct moving_io, cl);
|
||||
|
||||
atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
|
||||
closure_return_with_destructor(cl, move_free);
|
||||
}
|
||||
|
||||
static void move_write(struct closure *cl)
|
||||
{
|
||||
struct moving_io *io = container_of(cl, struct moving_io, cl);
|
||||
|
||||
if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
|
||||
closure_return_with_destructor(cl, move_free);
|
||||
return;
|
||||
}
|
||||
|
||||
bch2_migrate_read_done(&io->write, &io->rbio);
|
||||
|
||||
atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
|
||||
closure_call(&io->write.op.cl, bch2_write, NULL, cl);
|
||||
continue_at(cl, move_write_done, NULL);
|
||||
}
|
||||
|
||||
static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
|
||||
{
|
||||
struct moving_io *io =
|
||||
list_first_entry_or_null(&ctxt->reads, struct moving_io, list);
|
||||
|
||||
return io && io->read_completed ? io : NULL;
|
||||
}
|
||||
|
||||
static void move_read_endio(struct bio *bio)
|
||||
{
|
||||
struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
|
||||
struct moving_context *ctxt = io->write.ctxt;
|
||||
|
||||
atomic_sub(io->read_sectors, &ctxt->read_sectors);
|
||||
io->read_completed = true;
|
||||
|
||||
if (next_pending_write(ctxt))
|
||||
wake_up(&ctxt->wait);
|
||||
|
||||
closure_put(&ctxt->cl);
|
||||
}
|
||||
|
||||
static void do_pending_writes(struct moving_context *ctxt)
|
||||
{
|
||||
struct moving_io *io;
|
||||
|
||||
while ((io = next_pending_write(ctxt))) {
|
||||
list_del(&io->list);
|
||||
closure_call(&io->cl, move_write, NULL, &ctxt->cl);
|
||||
}
|
||||
}
|
||||
|
||||
#define move_ctxt_wait_event(_ctxt, _cond) \
|
||||
do { \
|
||||
do_pending_writes(_ctxt); \
|
||||
\
|
||||
if (_cond) \
|
||||
break; \
|
||||
__wait_event((_ctxt)->wait, \
|
||||
next_pending_write(_ctxt) || (_cond)); \
|
||||
} while (1)
|
||||
|
||||
static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
|
||||
{
|
||||
unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
|
||||
|
||||
move_ctxt_wait_event(ctxt,
|
||||
!atomic_read(&ctxt->write_sectors) ||
|
||||
atomic_read(&ctxt->write_sectors) != sectors_pending);
|
||||
}
|
||||
|
||||
static int bch2_move_extent(struct bch_fs *c,
|
||||
struct moving_context *ctxt,
|
||||
struct write_point_specifier wp,
|
||||
struct bch_io_opts io_opts,
|
||||
struct bkey_s_c_extent e,
|
||||
enum data_cmd data_cmd,
|
||||
struct data_opts data_opts)
|
||||
{
|
||||
struct moving_io *io;
|
||||
const struct bch_extent_ptr *ptr;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
unsigned sectors = e.k->size, pages;
|
||||
int ret = -ENOMEM;
|
||||
|
||||
move_ctxt_wait_event(ctxt,
|
||||
atomic_read(&ctxt->write_sectors) <
|
||||
SECTORS_IN_FLIGHT_PER_DEVICE);
|
||||
|
||||
move_ctxt_wait_event(ctxt,
|
||||
atomic_read(&ctxt->read_sectors) <
|
||||
SECTORS_IN_FLIGHT_PER_DEVICE);
|
||||
|
||||
/* write path might have to decompress data: */
|
||||
extent_for_each_ptr_crc(e, ptr, crc)
|
||||
sectors = max_t(unsigned, sectors, crc.uncompressed_size);
|
||||
|
||||
pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
|
||||
io = kzalloc(sizeof(struct moving_io) +
|
||||
sizeof(struct bio_vec) * pages, GFP_KERNEL);
|
||||
if (!io)
|
||||
goto err;
|
||||
|
||||
io->write.ctxt = ctxt;
|
||||
io->read_sectors = e.k->size;
|
||||
io->write_sectors = e.k->size;
|
||||
|
||||
bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
|
||||
bio_set_prio(&io->write.op.wbio.bio,
|
||||
IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
|
||||
|
||||
if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
|
||||
GFP_KERNEL))
|
||||
goto err_free;
|
||||
|
||||
io->rbio.opts = io_opts;
|
||||
bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
|
||||
io->rbio.bio.bi_vcnt = pages;
|
||||
bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
|
||||
io->rbio.bio.bi_iter.bi_size = sectors << 9;
|
||||
|
||||
io->rbio.bio.bi_opf = REQ_OP_READ;
|
||||
io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(e.k);
|
||||
io->rbio.bio.bi_end_io = move_read_endio;
|
||||
|
||||
ret = bch2_migrate_write_init(c, &io->write, wp, io_opts,
|
||||
data_cmd, data_opts, e.s_c);
|
||||
if (ret)
|
||||
goto err_free_pages;
|
||||
|
||||
atomic64_inc(&ctxt->stats->keys_moved);
|
||||
atomic64_add(e.k->size, &ctxt->stats->sectors_moved);
|
||||
|
||||
trace_move_extent(e.k);
|
||||
|
||||
atomic_add(io->read_sectors, &ctxt->read_sectors);
|
||||
list_add_tail(&io->list, &ctxt->reads);
|
||||
|
||||
/*
|
||||
* dropped by move_read_endio() - guards against use after free of
|
||||
* ctxt when doing wakeup
|
||||
*/
|
||||
closure_get(&ctxt->cl);
|
||||
bch2_read_extent(c, &io->rbio, e.s_c,
|
||||
BCH_READ_NODECODE|
|
||||
BCH_READ_LAST_FRAGMENT);
|
||||
return 0;
|
||||
err_free_pages:
|
||||
bio_free_pages(&io->write.op.wbio.bio);
|
||||
err_free:
|
||||
kfree(io);
|
||||
err:
|
||||
trace_move_alloc_fail(e.k);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int bch2_move_data(struct bch_fs *c,
|
||||
struct bch_ratelimit *rate,
|
||||
struct write_point_specifier wp,
|
||||
struct bpos start,
|
||||
struct bpos end,
|
||||
move_pred_fn pred, void *arg,
|
||||
struct bch_move_stats *stats)
|
||||
{
|
||||
bool kthread = (current->flags & PF_KTHREAD) != 0;
|
||||
struct moving_context ctxt = { .stats = stats };
|
||||
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
|
||||
BKEY_PADDED(k) tmp;
|
||||
struct bkey_s_c k;
|
||||
struct bkey_s_c_extent e;
|
||||
struct data_opts data_opts;
|
||||
enum data_cmd data_cmd;
|
||||
u64 cur_inum = U64_MAX;
|
||||
int ret = 0, ret2;
|
||||
|
||||
closure_init_stack(&ctxt.cl);
|
||||
INIT_LIST_HEAD(&ctxt.reads);
|
||||
init_waitqueue_head(&ctxt.wait);
|
||||
|
||||
stats->data_type = BCH_DATA_USER;
|
||||
bch2_btree_iter_init(&stats->iter, c, BTREE_ID_EXTENTS, start,
|
||||
BTREE_ITER_PREFETCH);
|
||||
|
||||
if (rate)
|
||||
bch2_ratelimit_reset(rate);
|
||||
|
||||
while (!kthread || !(ret = kthread_should_stop())) {
|
||||
if (rate &&
|
||||
bch2_ratelimit_delay(rate) &&
|
||||
(bch2_btree_iter_unlock(&stats->iter),
|
||||
(ret = bch2_ratelimit_wait_freezable_stoppable(rate))))
|
||||
break;
|
||||
peek:
|
||||
k = bch2_btree_iter_peek(&stats->iter);
|
||||
if (!k.k)
|
||||
break;
|
||||
ret = btree_iter_err(k);
|
||||
if (ret)
|
||||
break;
|
||||
if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
|
||||
break;
|
||||
|
||||
if (!bkey_extent_is_data(k.k))
|
||||
goto next_nondata;
|
||||
|
||||
e = bkey_s_c_to_extent(k);
|
||||
|
||||
if (cur_inum != k.k->p.inode) {
|
||||
struct bch_inode_unpacked inode;
|
||||
|
||||
/* don't hold btree locks while looking up inode: */
|
||||
bch2_btree_iter_unlock(&stats->iter);
|
||||
|
||||
io_opts = bch2_opts_to_inode_opts(c->opts);
|
||||
if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode))
|
||||
bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode));
|
||||
cur_inum = k.k->p.inode;
|
||||
goto peek;
|
||||
}
|
||||
|
||||
switch ((data_cmd = pred(c, arg, BKEY_TYPE_EXTENTS, e,
|
||||
&io_opts, &data_opts))) {
|
||||
case DATA_SKIP:
|
||||
goto next;
|
||||
case DATA_SCRUB:
|
||||
BUG();
|
||||
case DATA_ADD_REPLICAS:
|
||||
case DATA_REWRITE:
|
||||
case DATA_PROMOTE:
|
||||
break;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
|
||||
/* unlock before doing IO: */
|
||||
bkey_reassemble(&tmp.k, k);
|
||||
k = bkey_i_to_s_c(&tmp.k);
|
||||
bch2_btree_iter_unlock(&stats->iter);
|
||||
|
||||
ret2 = bch2_move_extent(c, &ctxt, wp, io_opts,
|
||||
bkey_s_c_to_extent(k),
|
||||
data_cmd, data_opts);
|
||||
if (ret2) {
|
||||
if (ret2 == -ENOMEM) {
|
||||
/* memory allocation failure, wait for some IO to finish */
|
||||
bch2_move_ctxt_wait_for_io(&ctxt);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* XXX signal failure */
|
||||
goto next;
|
||||
}
|
||||
|
||||
if (rate)
|
||||
bch2_ratelimit_increment(rate, k.k->size);
|
||||
next:
|
||||
atomic64_add(k.k->size * bch2_extent_nr_dirty_ptrs(k),
|
||||
&stats->sectors_seen);
|
||||
next_nondata:
|
||||
bch2_btree_iter_next(&stats->iter);
|
||||
bch2_btree_iter_cond_resched(&stats->iter);
|
||||
}
|
||||
|
||||
bch2_btree_iter_unlock(&stats->iter);
|
||||
|
||||
move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
|
||||
closure_sync(&ctxt.cl);
|
||||
|
||||
EBUG_ON(atomic_read(&ctxt.write_sectors));
|
||||
|
||||
trace_move_data(c,
|
||||
atomic64_read(&stats->sectors_moved),
|
||||
atomic64_read(&stats->keys_moved));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int bch2_gc_data_replicas(struct bch_fs *c)
|
||||
{
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
int ret;
|
||||
|
||||
mutex_lock(&c->replicas_gc_lock);
|
||||
bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED));
|
||||
|
||||
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
|
||||
BTREE_ITER_PREFETCH, k) {
|
||||
ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k);
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
ret = bch2_btree_iter_unlock(&iter) ?: ret;
|
||||
|
||||
bch2_replicas_gc_end(c, ret);
|
||||
mutex_unlock(&c->replicas_gc_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int bch2_gc_btree_replicas(struct bch_fs *c)
|
||||
{
|
||||
struct btree_iter iter;
|
||||
struct btree *b;
|
||||
unsigned id;
|
||||
int ret = 0;
|
||||
|
||||
mutex_lock(&c->replicas_gc_lock);
|
||||
bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
|
||||
|
||||
for (id = 0; id < BTREE_ID_NR; id++) {
|
||||
for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
|
||||
ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
|
||||
bkey_i_to_s_c(&b->key));
|
||||
|
||||
bch2_btree_iter_cond_resched(&iter);
|
||||
}
|
||||
|
||||
ret = bch2_btree_iter_unlock(&iter) ?: ret;
|
||||
}
|
||||
|
||||
bch2_replicas_gc_end(c, ret);
|
||||
mutex_unlock(&c->replicas_gc_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int bch2_move_btree(struct bch_fs *c,
|
||||
move_pred_fn pred,
|
||||
void *arg,
|
||||
struct bch_move_stats *stats)
|
||||
{
|
||||
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
|
||||
struct btree *b;
|
||||
unsigned id;
|
||||
struct data_opts data_opts;
|
||||
enum data_cmd cmd;
|
||||
int ret = 0;
|
||||
|
||||
stats->data_type = BCH_DATA_BTREE;
|
||||
|
||||
for (id = 0; id < BTREE_ID_NR; id++) {
|
||||
for_each_btree_node(&stats->iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
|
||||
switch ((cmd = pred(c, arg, BKEY_TYPE_BTREE,
|
||||
bkey_i_to_s_c_extent(&b->key),
|
||||
&io_opts,
|
||||
&data_opts))) {
|
||||
case DATA_SKIP:
|
||||
goto next;
|
||||
case DATA_SCRUB:
|
||||
BUG();
|
||||
case DATA_ADD_REPLICAS:
|
||||
case DATA_REWRITE:
|
||||
break;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
|
||||
ret = bch2_btree_node_rewrite(c, &stats->iter,
|
||||
b->data->keys.seq, 0) ?: ret;
|
||||
next:
|
||||
bch2_btree_iter_cond_resched(&stats->iter);
|
||||
}
|
||||
|
||||
ret = bch2_btree_iter_unlock(&stats->iter) ?: ret;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
#if 0
|
||||
static enum data_cmd scrub_pred(struct bch_fs *c, void *arg,
|
||||
enum bkey_type type,
|
||||
struct bkey_s_c_extent e,
|
||||
struct bch_io_opts *io_opts,
|
||||
struct data_opts *data_opts)
|
||||
{
|
||||
return DATA_SCRUB;
|
||||
}
|
||||
#endif
|
||||
|
||||
static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
|
||||
enum bkey_type type,
|
||||
struct bkey_s_c_extent e,
|
||||
struct bch_io_opts *io_opts,
|
||||
struct data_opts *data_opts)
|
||||
{
|
||||
unsigned nr_good = bch2_extent_durability(c, e);
|
||||
unsigned replicas = type == BKEY_TYPE_BTREE
|
||||
? c->opts.metadata_replicas
|
||||
: io_opts->data_replicas;
|
||||
|
||||
if (!nr_good || nr_good >= replicas)
|
||||
return DATA_SKIP;
|
||||
|
||||
data_opts->target = 0;
|
||||
data_opts->btree_insert_flags = 0;
|
||||
return DATA_ADD_REPLICAS;
|
||||
}
|
||||
|
||||
static enum data_cmd migrate_pred(struct bch_fs *c, void *arg,
|
||||
enum bkey_type type,
|
||||
struct bkey_s_c_extent e,
|
||||
struct bch_io_opts *io_opts,
|
||||
struct data_opts *data_opts)
|
||||
{
|
||||
struct bch_ioctl_data *op = arg;
|
||||
|
||||
if (!bch2_extent_has_device(e, op->migrate.dev))
|
||||
return DATA_SKIP;
|
||||
|
||||
data_opts->target = 0;
|
||||
data_opts->btree_insert_flags = 0;
|
||||
data_opts->rewrite_dev = op->migrate.dev;
|
||||
return DATA_REWRITE;
|
||||
}
|
||||
|
||||
int bch2_data_job(struct bch_fs *c,
|
||||
struct bch_move_stats *stats,
|
||||
struct bch_ioctl_data op)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
switch (op.op) {
|
||||
case BCH_DATA_OP_REREPLICATE:
|
||||
stats->data_type = BCH_DATA_JOURNAL;
|
||||
ret = bch2_journal_flush_device_pins(&c->journal, -1);
|
||||
|
||||
ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
|
||||
ret = bch2_gc_btree_replicas(c) ?: ret;
|
||||
|
||||
ret = bch2_move_data(c, NULL,
|
||||
writepoint_hashed((unsigned long) current),
|
||||
op.start,
|
||||
op.end,
|
||||
rereplicate_pred, c, stats) ?: ret;
|
||||
ret = bch2_gc_data_replicas(c) ?: ret;
|
||||
break;
|
||||
case BCH_DATA_OP_MIGRATE:
|
||||
if (op.migrate.dev >= c->sb.nr_devices)
|
||||
return -EINVAL;
|
||||
|
||||
stats->data_type = BCH_DATA_JOURNAL;
|
||||
ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
|
||||
|
||||
ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret;
|
||||
ret = bch2_gc_btree_replicas(c) ?: ret;
|
||||
|
||||
ret = bch2_move_data(c, NULL,
|
||||
writepoint_hashed((unsigned long) current),
|
||||
op.start,
|
||||
op.end,
|
||||
migrate_pred, &op, stats) ?: ret;
|
||||
ret = bch2_gc_data_replicas(c) ?: ret;
|
||||
break;
|
||||
default:
|
||||
ret = -EINVAL;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
63
fs/bcachefs/move.h
Normal file
@ -0,0 +1,63 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_MOVE_H
#define _BCACHEFS_MOVE_H

#include "btree_iter.h"
#include "buckets.h"
#include "io_types.h"
#include "move_types.h"

struct bch_read_bio;
struct moving_context;

enum data_cmd {
	DATA_SKIP,
	DATA_SCRUB,
	DATA_ADD_REPLICAS,
	DATA_REWRITE,
	DATA_PROMOTE,
};

struct data_opts {
	u16		target;
	unsigned	rewrite_dev;
	int		btree_insert_flags;
};

struct migrate_write {
	enum data_cmd	data_cmd;
	struct data_opts data_opts;

	unsigned	nr_ptrs_reserved;

	struct moving_context *ctxt;

	/* what we read: */
	struct bch_extent_ptr ptr;
	u64		offset;

	struct bch_write_op op;
};

void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *);
int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
			    struct write_point_specifier,
			    struct bch_io_opts,
			    enum data_cmd, struct data_opts,
			    struct bkey_s_c);

typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
				enum bkey_type, struct bkey_s_c_extent,
				struct bch_io_opts *, struct data_opts *);

int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
		   struct write_point_specifier,
		   struct bpos, struct bpos,
		   move_pred_fn, void *,
		   struct bch_move_stats *);

int bch2_data_job(struct bch_fs *,
		  struct bch_move_stats *,
		  struct bch_ioctl_data);

#endif /* _BCACHEFS_MOVE_H */
15
fs/bcachefs/move_types.h
Normal file
@ -0,0 +1,15 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_MOVE_TYPES_H
#define _BCACHEFS_MOVE_TYPES_H

struct bch_move_stats {
	enum bch_data_type	data_type;
	struct btree_iter	iter;

	atomic64_t		keys_moved;
	atomic64_t		sectors_moved;
	atomic64_t		sectors_seen;
	atomic64_t		sectors_raced;
};

#endif /* _BCACHEFS_MOVE_TYPES_H */
309
fs/bcachefs/movinggc.c
Normal file
@ -0,0 +1,309 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Moving/copying garbage collector
|
||||
*
|
||||
* Copyright 2012 Google, Inc.
|
||||
*/
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "btree_iter.h"
|
||||
#include "btree_update.h"
|
||||
#include "buckets.h"
|
||||
#include "clock.h"
|
||||
#include "disk_groups.h"
|
||||
#include "extents.h"
|
||||
#include "eytzinger.h"
|
||||
#include "io.h"
|
||||
#include "keylist.h"
|
||||
#include "move.h"
|
||||
#include "movinggc.h"
|
||||
#include "super-io.h"
|
||||
#include "trace.h"
|
||||
|
||||
#include <linux/freezer.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/math64.h>
|
||||
#include <linux/sched/task.h>
|
||||
#include <linux/sort.h>
|
||||
#include <linux/wait.h>
|
||||
|
||||
/*
|
||||
* We can't use the entire copygc reserve in one iteration of copygc: we may
|
||||
* need the buckets we're freeing up to go back into the copygc reserve to make
|
||||
* forward progress, but if the copygc reserve is full they'll be available for
|
||||
* any allocation - and it's possible that in a given iteration, we free up most
|
||||
* of the buckets we're going to free before we allocate most of the buckets
|
||||
* we're going to allocate.
|
||||
*
|
||||
* If we only use half of the reserve per iteration, then in steady state we'll
|
||||
* always have room in the reserve for the buckets we're going to need in the
|
||||
* next iteration:
|
||||
*/
|
||||
#define COPYGC_BUCKETS_PER_ITER(ca) \
|
||||
((ca)->free[RESERVE_MOVINGGC].size / 2)
|
||||
|
||||
/*
|
||||
* Max sectors to move per iteration: Have to take into account internal
|
||||
* fragmentation from the multiple write points for each generation:
|
||||
*/
|
||||
#define COPYGC_SECTORS_PER_ITER(ca) \
|
||||
((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca))
|
||||
|
||||
static inline int sectors_used_cmp(copygc_heap *heap,
|
||||
struct copygc_heap_entry l,
|
||||
struct copygc_heap_entry r)
|
||||
{
|
||||
return (l.sectors > r.sectors) - (l.sectors < r.sectors);
|
||||
}
|
||||
|
||||
static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
|
||||
{
|
||||
const struct copygc_heap_entry *l = _l;
|
||||
const struct copygc_heap_entry *r = _r;
|
||||
|
||||
return (l->offset > r->offset) - (l->offset < r->offset);
|
||||
}
|
||||
|
||||
static bool __copygc_pred(struct bch_dev *ca,
|
||||
struct bkey_s_c_extent e)
|
||||
{
|
||||
copygc_heap *h = &ca->copygc_heap;
|
||||
const struct bch_extent_ptr *ptr =
|
||||
bch2_extent_has_device(e, ca->dev_idx);
|
||||
|
||||
if (ptr) {
|
||||
struct copygc_heap_entry search = { .offset = ptr->offset };
|
||||
|
||||
ssize_t i = eytzinger0_find_le(h->data, h->used,
|
||||
sizeof(h->data[0]),
|
||||
bucket_offset_cmp, &search);
|
||||
|
||||
return (i >= 0 &&
|
||||
ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
|
||||
ptr->gen == h->data[i].gen);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
|
||||
enum bkey_type type,
|
||||
struct bkey_s_c_extent e,
|
||||
struct bch_io_opts *io_opts,
|
||||
struct data_opts *data_opts)
|
||||
{
|
||||
struct bch_dev *ca = arg;
|
||||
|
||||
if (!__copygc_pred(ca, e))
|
||||
return DATA_SKIP;
|
||||
|
||||
data_opts->target = dev_to_target(ca->dev_idx);
|
||||
data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE;
|
||||
data_opts->rewrite_dev = ca->dev_idx;
|
||||
return DATA_REWRITE;
|
||||
}
|
||||
|
||||
static bool have_copygc_reserve(struct bch_dev *ca)
|
||||
{
|
||||
bool ret;
|
||||
|
||||
spin_lock(&ca->freelist_lock);
|
||||
ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) ||
|
||||
ca->allocator_blocked;
|
||||
spin_unlock(&ca->freelist_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
copygc_heap *h = &ca->copygc_heap;
|
||||
struct copygc_heap_entry e, *i;
|
||||
struct bucket_array *buckets;
|
||||
struct bch_move_stats move_stats;
|
||||
u64 sectors_to_move = 0, sectors_not_moved = 0;
|
||||
u64 buckets_to_move, buckets_not_moved = 0;
|
||||
size_t b;
|
||||
int ret;
|
||||
|
||||
memset(&move_stats, 0, sizeof(move_stats));
|
||||
closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
|
||||
|
||||
/*
|
||||
* Find buckets with lowest sector counts, skipping completely
|
||||
* empty buckets, by building a maxheap sorted by sector count,
|
||||
* and repeatedly replacing the maximum element until all
|
||||
* buckets have been visited.
|
||||
*/
|
||||
h->used = 0;
|
||||
|
||||
/*
|
||||
* We need bucket marks to be up to date - gc can't be recalculating
|
||||
* them:
|
||||
*/
|
||||
down_read(&c->gc_lock);
|
||||
down_read(&ca->bucket_lock);
|
||||
buckets = bucket_array(ca);
|
||||
|
||||
for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
|
||||
struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
|
||||
struct copygc_heap_entry e;
|
||||
|
||||
if (m.owned_by_allocator ||
|
||||
m.data_type != BCH_DATA_USER ||
|
||||
!bucket_sectors_used(m) ||
|
||||
bucket_sectors_used(m) >= ca->mi.bucket_size)
|
||||
continue;
|
||||
|
||||
e = (struct copygc_heap_entry) {
|
||||
.gen = m.gen,
|
||||
.sectors = bucket_sectors_used(m),
|
||||
.offset = bucket_to_sector(ca, b),
|
||||
};
|
||||
heap_add_or_replace(h, e, -sectors_used_cmp);
|
||||
}
|
||||
up_read(&ca->bucket_lock);
|
||||
up_read(&c->gc_lock);
|
||||
|
||||
for (i = h->data; i < h->data + h->used; i++)
|
||||
sectors_to_move += i->sectors;
|
||||
|
||||
while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) {
|
||||
BUG_ON(!heap_pop(h, e, -sectors_used_cmp));
|
||||
sectors_to_move -= e.sectors;
|
||||
}
|
||||
|
||||
buckets_to_move = h->used;
|
||||
|
||||
if (!buckets_to_move)
|
||||
return;
|
||||
|
||||
eytzinger0_sort(h->data, h->used,
|
||||
sizeof(h->data[0]),
|
||||
bucket_offset_cmp, NULL);
|
||||
|
||||
ret = bch2_move_data(c, &ca->copygc_pd.rate,
|
||||
writepoint_ptr(&ca->copygc_write_point),
|
||||
POS_MIN, POS_MAX,
|
||||
copygc_pred, ca,
|
||||
&move_stats);
|
||||
|
||||
down_read(&ca->bucket_lock);
|
||||
buckets = bucket_array(ca);
|
||||
for (i = h->data; i < h->data + h->used; i++) {
|
||||
size_t b = sector_to_bucket(ca, i->offset);
|
||||
struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
|
||||
|
||||
if (i->gen == m.gen && bucket_sectors_used(m)) {
|
||||
sectors_not_moved += bucket_sectors_used(m);
|
||||
buckets_not_moved++;
|
||||
}
|
||||
}
|
||||
up_read(&ca->bucket_lock);
|
||||
|
||||
if (sectors_not_moved && !ret)
|
||||
bch_warn(c, "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved",
|
||||
sectors_not_moved, sectors_to_move,
|
||||
buckets_not_moved, buckets_to_move);
|
||||
|
||||
trace_copygc(ca,
|
||||
atomic64_read(&move_stats.sectors_moved), sectors_not_moved,
|
||||
buckets_to_move, buckets_not_moved);
|
||||
}
|
||||
|
||||
static int bch2_copygc_thread(void *arg)
|
||||
{
|
||||
struct bch_dev *ca = arg;
|
||||
struct bch_fs *c = ca->fs;
|
||||
struct io_clock *clock = &c->io_clock[WRITE];
|
||||
struct bch_dev_usage usage;
|
||||
unsigned long last;
|
||||
u64 available, fragmented, reserve, next;
|
||||
|
||||
set_freezable();
|
||||
|
||||
while (!kthread_should_stop()) {
|
||||
if (kthread_wait_freezable(c->copy_gc_enabled))
|
||||
break;
|
||||
|
||||
last = atomic_long_read(&clock->now);
|
||||
|
||||
reserve = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) *
|
||||
ca->mi.bucket_size *
|
||||
c->opts.gc_reserve_percent, 200);
|
||||
|
||||
usage = bch2_dev_usage_read(c, ca);
|
||||
|
||||
/*
|
||||
* don't start copygc until less than half the gc reserve is
|
||||
* available:
|
||||
*/
|
||||
available = __dev_buckets_available(ca, usage) *
|
||||
ca->mi.bucket_size;
|
||||
if (available > reserve) {
|
||||
next = last + available - reserve;
|
||||
bch2_kthread_io_clock_wait(clock, next,
|
||||
MAX_SCHEDULE_TIMEOUT);
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* don't start copygc until there's more than half the copygc
|
||||
* reserve of fragmented space:
|
||||
*/
|
||||
fragmented = usage.sectors_fragmented;
|
||||
if (fragmented < reserve) {
|
||||
next = last + reserve - fragmented;
|
||||
bch2_kthread_io_clock_wait(clock, next,
|
||||
MAX_SCHEDULE_TIMEOUT);
|
||||
continue;
|
||||
}
|
||||
|
||||
bch2_copygc(c, ca);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void bch2_copygc_stop(struct bch_dev *ca)
|
||||
{
|
||||
ca->copygc_pd.rate.rate = UINT_MAX;
|
||||
bch2_ratelimit_reset(&ca->copygc_pd.rate);
|
||||
|
||||
if (ca->copygc_thread) {
|
||||
kthread_stop(ca->copygc_thread);
|
||||
put_task_struct(ca->copygc_thread);
|
||||
}
|
||||
ca->copygc_thread = NULL;
|
||||
}
|
||||
|
||||
int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
struct task_struct *t;
|
||||
|
||||
BUG_ON(ca->copygc_thread);
|
||||
|
||||
if (c->opts.nochanges)
|
||||
return 0;
|
||||
|
||||
if (bch2_fs_init_fault("copygc_start"))
|
||||
return -ENOMEM;
|
||||
|
||||
t = kthread_create(bch2_copygc_thread, ca,
|
||||
"bch_copygc[%s]", ca->name);
|
||||
if (IS_ERR(t))
|
||||
return PTR_ERR(t);
|
||||
|
||||
get_task_struct(t);
|
||||
|
||||
ca->copygc_thread = t;
|
||||
wake_up_process(ca->copygc_thread);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void bch2_dev_copygc_init(struct bch_dev *ca)
|
||||
{
|
||||
bch2_pd_controller_init(&ca->copygc_pd);
|
||||
ca->copygc_pd.d_term = 0;
|
||||
}
|
9
fs/bcachefs/movinggc.h
Normal file
@ -0,0 +1,9 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_MOVINGGC_H
|
||||
#define _BCACHEFS_MOVINGGC_H
|
||||
|
||||
void bch2_copygc_stop(struct bch_dev *);
|
||||
int bch2_copygc_start(struct bch_fs *, struct bch_dev *);
|
||||
void bch2_dev_copygc_init(struct bch_dev *);
|
||||
|
||||
#endif /* _BCACHEFS_MOVINGGC_H */
|
381
fs/bcachefs/opts.c
Normal file
@ -0,0 +1,381 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include <linux/kernel.h>
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "disk_groups.h"
|
||||
#include "opts.h"
|
||||
#include "super-io.h"
|
||||
#include "util.h"
|
||||
|
||||
const char * const bch2_error_actions[] = {
|
||||
"continue",
|
||||
"remount-ro",
|
||||
"panic",
|
||||
NULL
|
||||
};
|
||||
|
||||
const char * const bch2_csum_types[] = {
|
||||
"none",
|
||||
"crc32c",
|
||||
"crc64",
|
||||
NULL
|
||||
};
|
||||
|
||||
const char * const bch2_compression_types[] = {
|
||||
"none",
|
||||
"lz4",
|
||||
"gzip",
|
||||
"zstd",
|
||||
NULL
|
||||
};
|
||||
|
||||
const char * const bch2_str_hash_types[] = {
|
||||
"crc32c",
|
||||
"crc64",
|
||||
"siphash",
|
||||
NULL
|
||||
};
|
||||
|
||||
const char * const bch2_data_types[] = {
|
||||
"none",
|
||||
"sb",
|
||||
"journal",
|
||||
"btree",
|
||||
"data",
|
||||
"cached",
|
||||
NULL
|
||||
};
|
||||
|
||||
const char * const bch2_cache_replacement_policies[] = {
|
||||
"lru",
|
||||
"fifo",
|
||||
"random",
|
||||
NULL
|
||||
};
|
||||
|
||||
/* Default is -1; we skip past it for struct cached_dev's cache mode */
|
||||
const char * const bch2_cache_modes[] = {
|
||||
"default",
|
||||
"writethrough",
|
||||
"writeback",
|
||||
"writearound",
|
||||
"none",
|
||||
NULL
|
||||
};
|
||||
|
||||
const char * const bch2_dev_state[] = {
|
||||
"readwrite",
|
||||
"readonly",
|
||||
"failed",
|
||||
"spare",
|
||||
NULL
|
||||
};
|
||||
|
||||
void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
|
||||
{
|
||||
#define BCH_OPT(_name, ...) \
|
||||
if (opt_defined(src, _name)) \
|
||||
opt_set(*dst, _name, src._name);
|
||||
|
||||
BCH_OPTS()
|
||||
#undef BCH_OPT
|
||||
}
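/*
 * Illustration of the x-macro pattern used throughout this file: for a
 * hypothetical option named "foo", the BCH_OPTS() expansion above turns
 * into
 *
 *	if (opt_defined(src, foo))
 *		opt_set(*dst, foo, src.foo);
 *
 * and the *_by_id() helpers below get one switch case per option the same
 * way.
 */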
|
||||
|
||||
bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id)
|
||||
{
|
||||
switch (id) {
|
||||
#define BCH_OPT(_name, ...) \
|
||||
case Opt_##_name: \
|
||||
return opt_defined(*opts, _name);
|
||||
BCH_OPTS()
|
||||
#undef BCH_OPT
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id)
|
||||
{
|
||||
switch (id) {
|
||||
#define BCH_OPT(_name, ...) \
|
||||
case Opt_##_name: \
|
||||
return opts->_name;
|
||||
BCH_OPTS()
|
||||
#undef BCH_OPT
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v)
|
||||
{
|
||||
switch (id) {
|
||||
#define BCH_OPT(_name, ...) \
|
||||
case Opt_##_name: \
|
||||
opt_set(*opts, _name, v); \
|
||||
break;
|
||||
BCH_OPTS()
|
||||
#undef BCH_OPT
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Initial options from superblock - here we don't want any options undefined,
|
||||
* any options the superblock doesn't specify are set to 0:
|
||||
*/
|
||||
struct bch_opts bch2_opts_from_sb(struct bch_sb *sb)
|
||||
{
|
||||
struct bch_opts opts = bch2_opts_empty();
|
||||
|
||||
#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \
|
||||
if (_sb_opt != NO_SB_OPT) \
|
||||
opt_set(opts, _name, _sb_opt(sb));
|
||||
BCH_OPTS()
|
||||
#undef BCH_OPT
|
||||
|
||||
return opts;
|
||||
}
|
||||
|
||||
const struct bch_option bch2_opt_table[] = {
|
||||
#define OPT_BOOL() .type = BCH_OPT_BOOL
|
||||
#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, .min = _min, .max = _max
|
||||
#define OPT_STR(_choices) .type = BCH_OPT_STR, .choices = _choices
|
||||
#define OPT_FN(_fn) .type = BCH_OPT_FN, \
|
||||
.parse = _fn##_parse, \
|
||||
.print = _fn##_print
|
||||
|
||||
#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \
|
||||
[Opt_##_name] = { \
|
||||
.attr = { \
|
||||
.name = #_name, \
|
||||
.mode = _mode == OPT_RUNTIME ? 0644 : 0444, \
|
||||
}, \
|
||||
.mode = _mode, \
|
||||
.set_sb = SET_##_sb_opt, \
|
||||
_type \
|
||||
},
|
||||
|
||||
BCH_OPTS()
|
||||
#undef BCH_OPT
|
||||
};
|
||||
|
||||
int bch2_opt_lookup(const char *name)
|
||||
{
|
||||
const struct bch_option *i;
|
||||
|
||||
for (i = bch2_opt_table;
|
||||
i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table);
|
||||
i++)
|
||||
if (!strcmp(name, i->attr.name))
|
||||
return i - bch2_opt_table;
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
struct synonym {
|
||||
const char *s1, *s2;
|
||||
};
|
||||
|
||||
static const struct synonym bch_opt_synonyms[] = {
|
||||
{ "quota", "usrquota" },
|
||||
};
|
||||
|
||||
static int bch2_mount_opt_lookup(const char *name)
|
||||
{
|
||||
const struct synonym *i;
|
||||
|
||||
for (i = bch_opt_synonyms;
|
||||
i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms);
|
||||
i++)
|
||||
if (!strcmp(name, i->s1))
|
||||
name = i->s2;
|
||||
|
||||
return bch2_opt_lookup(name);
|
||||
}
|
||||
|
||||
int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt,
|
||||
const char *val, u64 *res)
|
||||
{
|
||||
ssize_t ret;
|
||||
|
||||
switch (opt->type) {
|
||||
case BCH_OPT_BOOL:
|
||||
ret = kstrtou64(val, 10, res);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
if (*res > 1)
|
||||
return -ERANGE;
|
||||
break;
|
||||
case BCH_OPT_UINT:
|
||||
ret = kstrtou64(val, 10, res);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
if (*res < opt->min || *res >= opt->max)
|
||||
return -ERANGE;
|
||||
break;
|
||||
case BCH_OPT_STR:
|
||||
ret = match_string(opt->choices, -1, val);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
*res = ret;
|
||||
break;
|
||||
case BCH_OPT_FN:
|
||||
if (!c)
|
||||
return -EINVAL;
|
||||
|
||||
return opt->parse(c, val, res);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bch2_opt_to_text(struct bch_fs *c, char *buf, size_t len,
|
||||
const struct bch_option *opt, u64 v,
|
||||
unsigned flags)
|
||||
{
|
||||
char *out = buf, *end = buf + len;
|
||||
|
||||
if (flags & OPT_SHOW_MOUNT_STYLE) {
|
||||
if (opt->type == BCH_OPT_BOOL)
|
||||
return scnprintf(out, end - out, "%s%s",
|
||||
v ? "" : "no",
|
||||
opt->attr.name);
|
||||
|
||||
out += scnprintf(out, end - out, "%s=", opt->attr.name);
|
||||
}
|
||||
|
||||
switch (opt->type) {
|
||||
case BCH_OPT_BOOL:
|
||||
case BCH_OPT_UINT:
|
||||
out += scnprintf(out, end - out, "%lli", v);
|
||||
break;
|
||||
case BCH_OPT_STR:
|
||||
out += (flags & OPT_SHOW_FULL_LIST)
|
||||
? bch2_scnprint_string_list(out, end - out, opt->choices, v)
|
||||
: scnprintf(out, end - out, "%s", opt->choices[v]);
|
||||
break;
|
||||
case BCH_OPT_FN:
|
||||
return opt->print(c, out, end - out, v);
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
|
||||
return out - buf;
|
||||
}
|
||||
|
||||
int bch2_parse_mount_opts(struct bch_opts *opts, char *options)
|
||||
{
|
||||
char *opt, *name, *val;
|
||||
int ret, id;
|
||||
u64 v;
|
||||
|
||||
while ((opt = strsep(&options, ",")) != NULL) {
|
||||
name = strsep(&opt, "=");
|
||||
val = opt;
|
||||
|
||||
if (val) {
|
||||
id = bch2_mount_opt_lookup(name);
|
||||
if (id < 0)
|
||||
goto bad_opt;
|
||||
|
||||
ret = bch2_opt_parse(NULL, &bch2_opt_table[id], val, &v);
|
||||
if (ret < 0)
|
||||
goto bad_val;
|
||||
} else {
|
||||
id = bch2_mount_opt_lookup(name);
|
||||
v = 1;
|
||||
|
||||
if (id < 0 &&
|
||||
!strncmp("no", name, 2)) {
|
||||
id = bch2_mount_opt_lookup(name + 2);
|
||||
v = 0;
|
||||
}
|
||||
|
||||
if (id < 0)
|
||||
goto bad_opt;
|
||||
|
||||
if (bch2_opt_table[id].type != BCH_OPT_BOOL)
|
||||
goto no_val;
|
||||
}
|
||||
|
||||
if (bch2_opt_table[id].mode < OPT_MOUNT)
|
||||
goto bad_opt;
|
||||
|
||||
if (id == Opt_acl &&
|
||||
!IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL))
|
||||
goto bad_opt;
|
||||
|
||||
if ((id == Opt_usrquota ||
|
||||
id == Opt_grpquota) &&
|
||||
!IS_ENABLED(CONFIG_BCACHEFS_QUOTA))
|
||||
goto bad_opt;
|
||||
|
||||
bch2_opt_set_by_id(opts, id, v);
|
||||
}
|
||||
|
||||
return 0;
|
||||
bad_opt:
|
||||
pr_err("Bad mount option %s", name);
|
||||
return -1;
|
||||
bad_val:
|
||||
pr_err("Invalid value %s for mount option %s", val, name);
|
||||
return -1;
|
||||
no_val:
|
||||
pr_err("Mount option %s requires a value", name);
|
||||
return -1;
|
||||
}
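/*
 * Example with a made-up mount string: "usrquota,compression=lz4,noacl"
 * is split on ',' and then on '='. "usrquota" has no value and is a bool,
 * so it becomes 1; "compression=lz4" goes through bch2_opt_parse(), which
 * matches "lz4" against bch2_compression_types; "noacl" fails the direct
 * lookup, so the "no" prefix is stripped and acl is set to 0.
 */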
|
||||
|
||||
/* io opts: */
|
||||
|
||||
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
|
||||
{
|
||||
struct bch_io_opts ret = { 0 };
|
||||
#define BCH_INODE_OPT(_name, _bits) \
|
||||
if (opt_defined(src, _name)) \
|
||||
opt_set(ret, _name, src._name);
|
||||
BCH_INODE_OPTS()
|
||||
#undef BCH_INODE_OPT
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src)
|
||||
{
|
||||
struct bch_opts ret = { 0 };
|
||||
#define BCH_INODE_OPT(_name, _bits) \
|
||||
if (opt_defined(src, _name)) \
|
||||
opt_set(ret, _name, src._name);
|
||||
BCH_INODE_OPTS()
|
||||
#undef BCH_INODE_OPT
|
||||
return ret;
|
||||
}
|
||||
|
||||
void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src)
|
||||
{
|
||||
#define BCH_INODE_OPT(_name, _bits) \
|
||||
if (opt_defined(src, _name)) \
|
||||
opt_set(*dst, _name, src._name);
|
||||
BCH_INODE_OPTS()
|
||||
#undef BCH_INODE_OPT
|
||||
}
|
||||
|
||||
bool bch2_opt_is_inode_opt(enum bch_opt_id id)
|
||||
{
|
||||
static const enum bch_opt_id inode_opt_list[] = {
|
||||
#define BCH_INODE_OPT(_name, _bits) Opt_##_name,
|
||||
BCH_INODE_OPTS()
|
||||
#undef BCH_INODE_OPT
|
||||
};
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++)
|
||||
if (inode_opt_list[i] == id)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
296
fs/bcachefs/opts.h
Normal file
@ -0,0 +1,296 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_OPTS_H
|
||||
#define _BCACHEFS_OPTS_H
|
||||
|
||||
#include <linux/bug.h>
|
||||
#include <linux/log2.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/sysfs.h>
|
||||
#include "bcachefs_format.h"
|
||||
|
||||
extern const char * const bch2_error_actions[];
|
||||
extern const char * const bch2_csum_types[];
|
||||
extern const char * const bch2_compression_types[];
|
||||
extern const char * const bch2_str_hash_types[];
|
||||
extern const char * const bch2_data_types[];
|
||||
extern const char * const bch2_cache_replacement_policies[];
|
||||
extern const char * const bch2_cache_modes[];
|
||||
extern const char * const bch2_dev_state[];
|
||||
|
||||
/*
|
||||
* Mount options; we also store defaults in the superblock.
|
||||
*
|
||||
* Also exposed via sysfs: if an option is writeable, and it's also stored in
|
||||
* the superblock, changing it via sysfs (currently? might change this) also
|
||||
* updates the superblock.
|
||||
*
|
||||
* Each option carries its own "defined" bit (see struct bch_opts below), so we
* can pass the mount options to bch2_fs_alloc() as a whole struct, and then only
* apply the options from that struct that were actually set.
|
||||
*/
|
||||
|
||||
/* dummy option, for options that aren't stored in the superblock */
|
||||
LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0);
|
||||
|
||||
enum opt_mode {
|
||||
OPT_INTERNAL,
|
||||
OPT_FORMAT,
|
||||
OPT_MOUNT,
|
||||
OPT_RUNTIME,
|
||||
};
|
||||
|
||||
enum opt_type {
|
||||
BCH_OPT_BOOL,
|
||||
BCH_OPT_UINT,
|
||||
BCH_OPT_STR,
|
||||
BCH_OPT_FN,
|
||||
};
|
||||
|
||||
/**
|
||||
* BCH_OPT(name, in-memory type, mode, option type, sb_opt, default)
|
||||
*
|
||||
* @name - name of mount option, sysfs attribute, and struct bch_opts
|
||||
* member
|
||||
*
|
||||
* @mode - when opt may be set
|
||||
*
|
||||
* @sb_opt - name of corresponding superblock option
|
||||
*
|
||||
* @type - one of OPT_BOOL, OPT_UINT, OPT_STR, OPT_FN
|
||||
*/
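/*
 * For example, the block_size entry below describes a u16 option that can
 * only be set at format time, takes an integer in [1, 128), is stored in
 * the superblock via BCH_SB_BLOCK_SIZE, and defaults to 8 (presumably in
 * 512-byte sectors, i.e. 4k blocks).
 */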
|
||||
|
||||
/*
|
||||
* XXX: add fields for
|
||||
* - default value
|
||||
* - helptext
|
||||
*/
|
||||
|
||||
#define BCH_OPTS() \
|
||||
BCH_OPT(block_size, u16, OPT_FORMAT, \
|
||||
OPT_UINT(1, 128), \
|
||||
BCH_SB_BLOCK_SIZE, 8) \
|
||||
BCH_OPT(btree_node_size, u16, OPT_FORMAT, \
|
||||
OPT_UINT(1, 128), \
|
||||
BCH_SB_BTREE_NODE_SIZE, 512) \
|
||||
BCH_OPT(errors, u8, OPT_RUNTIME, \
|
||||
OPT_STR(bch2_error_actions), \
|
||||
BCH_SB_ERROR_ACTION, BCH_ON_ERROR_RO) \
|
||||
BCH_OPT(metadata_replicas, u8, OPT_RUNTIME, \
|
||||
OPT_UINT(1, BCH_REPLICAS_MAX), \
|
||||
BCH_SB_META_REPLICAS_WANT, 1) \
|
||||
BCH_OPT(data_replicas, u8, OPT_RUNTIME, \
|
||||
OPT_UINT(1, BCH_REPLICAS_MAX), \
|
||||
BCH_SB_DATA_REPLICAS_WANT, 1) \
|
||||
BCH_OPT(metadata_replicas_required, u8, OPT_MOUNT, \
|
||||
OPT_UINT(1, BCH_REPLICAS_MAX), \
|
||||
BCH_SB_META_REPLICAS_REQ, 1) \
|
||||
BCH_OPT(data_replicas_required, u8, OPT_MOUNT, \
|
||||
OPT_UINT(1, BCH_REPLICAS_MAX), \
|
||||
BCH_SB_DATA_REPLICAS_REQ, 1) \
|
||||
BCH_OPT(metadata_checksum, u8, OPT_RUNTIME, \
|
||||
OPT_STR(bch2_csum_types), \
|
||||
BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_CRC32C) \
|
||||
BCH_OPT(data_checksum, u8, OPT_RUNTIME, \
|
||||
OPT_STR(bch2_csum_types), \
|
||||
BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_CRC32C) \
|
||||
BCH_OPT(compression, u8, OPT_RUNTIME, \
|
||||
OPT_STR(bch2_compression_types), \
|
||||
BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_NONE)\
|
||||
BCH_OPT(background_compression, u8, OPT_RUNTIME, \
|
||||
OPT_STR(bch2_compression_types), \
|
||||
BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_NONE)\
|
||||
BCH_OPT(str_hash, u8, OPT_RUNTIME, \
|
||||
OPT_STR(bch2_str_hash_types), \
|
||||
BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_SIPHASH) \
|
||||
BCH_OPT(foreground_target, u16, OPT_RUNTIME, \
|
||||
OPT_FN(bch2_opt_target), \
|
||||
BCH_SB_FOREGROUND_TARGET, 0) \
|
||||
BCH_OPT(background_target, u16, OPT_RUNTIME, \
|
||||
OPT_FN(bch2_opt_target), \
|
||||
BCH_SB_BACKGROUND_TARGET, 0) \
|
||||
BCH_OPT(promote_target, u16, OPT_RUNTIME, \
|
||||
OPT_FN(bch2_opt_target), \
|
||||
BCH_SB_PROMOTE_TARGET, 0) \
|
||||
BCH_OPT(inodes_32bit, u8, OPT_RUNTIME, \
|
||||
OPT_BOOL(), \
|
||||
BCH_SB_INODE_32BIT, false) \
|
||||
BCH_OPT(gc_reserve_percent, u8, OPT_MOUNT, \
|
||||
OPT_UINT(5, 21), \
|
||||
BCH_SB_GC_RESERVE, 8) \
|
||||
BCH_OPT(root_reserve_percent, u8, OPT_MOUNT, \
|
||||
OPT_UINT(0, 100), \
|
||||
BCH_SB_ROOT_RESERVE, 0) \
|
||||
BCH_OPT(wide_macs, u8, OPT_RUNTIME, \
|
||||
OPT_BOOL(), \
|
||||
BCH_SB_128_BIT_MACS, false) \
|
||||
BCH_OPT(acl, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
BCH_SB_POSIX_ACL, true) \
|
||||
BCH_OPT(usrquota, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
BCH_SB_USRQUOTA, false) \
|
||||
BCH_OPT(grpquota, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
BCH_SB_GRPQUOTA, false) \
|
||||
BCH_OPT(prjquota, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
BCH_SB_PRJQUOTA, false) \
|
||||
BCH_OPT(degraded, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false) \
|
||||
BCH_OPT(discard, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false) \
|
||||
BCH_OPT(verbose_recovery, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false) \
|
||||
BCH_OPT(verbose_init, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false) \
|
||||
BCH_OPT(journal_flush_disabled, u8, OPT_RUNTIME, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false) \
|
||||
BCH_OPT(nofsck, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false) \
|
||||
BCH_OPT(fix_errors, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false) \
|
||||
BCH_OPT(nochanges, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false) \
|
||||
BCH_OPT(noreplay, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false) \
|
||||
BCH_OPT(norecovery, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false) \
|
||||
BCH_OPT(noexcl, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false) \
|
||||
BCH_OPT(sb, u64, OPT_MOUNT, \
|
||||
OPT_UINT(0, S64_MAX), \
|
||||
NO_SB_OPT, BCH_SB_SECTOR) \
|
||||
BCH_OPT(read_only, u8, OPT_INTERNAL, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false) \
|
||||
BCH_OPT(nostart, u8, OPT_INTERNAL, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false) \
|
||||
BCH_OPT(no_data_io, u8, OPT_MOUNT, \
|
||||
OPT_BOOL(), \
|
||||
NO_SB_OPT, false)
|
||||
|
||||
struct bch_opts {
|
||||
#define BCH_OPT(_name, _bits, ...) unsigned _name##_defined:1;
|
||||
BCH_OPTS()
|
||||
#undef BCH_OPT
|
||||
|
||||
#define BCH_OPT(_name, _bits, ...) _bits _name;
|
||||
BCH_OPTS()
|
||||
#undef BCH_OPT
|
||||
};
|
||||
|
||||
static const struct bch_opts bch2_opts_default = {
|
||||
#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \
|
||||
._name##_defined = true, \
|
||||
._name = _default, \
|
||||
|
||||
BCH_OPTS()
|
||||
#undef BCH_OPT
|
||||
};
|
||||
|
||||
#define opt_defined(_opts, _name) ((_opts)._name##_defined)
|
||||
|
||||
#define opt_get(_opts, _name) \
|
||||
(opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name)
|
||||
|
||||
#define opt_set(_opts, _name, _v) \
|
||||
do { \
|
||||
(_opts)._name##_defined = true; \
|
||||
(_opts)._name = _v; \
|
||||
} while (0)
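/*
 * Typical usage, as a sketch: options start out entirely undefined,
 * opt_set() both defines and assigns a field, and opt_get() falls back to
 * bch2_opts_default for anything still undefined:
 *
 *	struct bch_opts opts = bch2_opts_empty();
 *	opt_set(opts, errors, BCH_ON_ERROR_RO);
 *	opt_get(opts, metadata_replicas);	// not set -> default, 1
 */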
|
||||
|
||||
static inline struct bch_opts bch2_opts_empty(void)
|
||||
{
|
||||
return (struct bch_opts) { 0 };
|
||||
}
|
||||
|
||||
void bch2_opts_apply(struct bch_opts *, struct bch_opts);
|
||||
|
||||
enum bch_opt_id {
|
||||
#define BCH_OPT(_name, ...) Opt_##_name,
|
||||
BCH_OPTS()
|
||||
#undef BCH_OPT
|
||||
bch2_opts_nr
|
||||
};
|
||||
|
||||
struct bch_fs;
|
||||
|
||||
struct bch_option {
|
||||
struct attribute attr;
|
||||
void (*set_sb)(struct bch_sb *, u64);
|
||||
enum opt_mode mode;
|
||||
enum opt_type type;
|
||||
|
||||
union {
|
||||
struct {
|
||||
u64 min, max;
|
||||
};
|
||||
struct {
|
||||
const char * const *choices;
|
||||
};
|
||||
struct {
|
||||
int (*parse)(struct bch_fs *, const char *, u64 *);
|
||||
int (*print)(struct bch_fs *, char *, size_t, u64);
|
||||
};
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
extern const struct bch_option bch2_opt_table[];
|
||||
|
||||
bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
|
||||
u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
|
||||
void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
|
||||
|
||||
struct bch_opts bch2_opts_from_sb(struct bch_sb *);
|
||||
|
||||
int bch2_opt_lookup(const char *);
|
||||
int bch2_opt_parse(struct bch_fs *, const struct bch_option *, const char *, u64 *);
|
||||
|
||||
#define OPT_SHOW_FULL_LIST (1 << 0)
|
||||
#define OPT_SHOW_MOUNT_STYLE (1 << 1)
|
||||
|
||||
int bch2_opt_to_text(struct bch_fs *, char *, size_t,
|
||||
const struct bch_option *, u64, unsigned);
|
||||
|
||||
int bch2_parse_mount_opts(struct bch_opts *, char *);
|
||||
|
||||
/* inode opts: */
|
||||
|
||||
#define BCH_INODE_OPTS() \
|
||||
BCH_INODE_OPT(data_checksum, 8) \
|
||||
BCH_INODE_OPT(compression, 8) \
|
||||
BCH_INODE_OPT(background_compression, 8) \
|
||||
BCH_INODE_OPT(data_replicas, 8) \
|
||||
BCH_INODE_OPT(promote_target, 16) \
|
||||
BCH_INODE_OPT(foreground_target, 16) \
|
||||
BCH_INODE_OPT(background_target, 16)
|
||||
|
||||
struct bch_io_opts {
|
||||
#define BCH_INODE_OPT(_name, _bits) unsigned _name##_defined:1;
|
||||
BCH_INODE_OPTS()
|
||||
#undef BCH_INODE_OPT
|
||||
|
||||
#define BCH_INODE_OPT(_name, _bits) u##_bits _name;
|
||||
BCH_INODE_OPTS()
|
||||
#undef BCH_INODE_OPT
|
||||
};
|
||||
|
||||
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
|
||||
struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts);
|
||||
void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts);
|
||||
bool bch2_opt_is_inode_opt(enum bch_opt_id);
|
||||
|
||||
#endif /* _BCACHEFS_OPTS_H */
|
790
fs/bcachefs/quota.c
Normal file
@ -0,0 +1,790 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include "bcachefs.h"
|
||||
#include "btree_update.h"
|
||||
#include "inode.h"
|
||||
#include "quota.h"
|
||||
#include "super-io.h"
|
||||
|
||||
static const char *bch2_sb_validate_quota(struct bch_sb *sb,
|
||||
struct bch_sb_field *f)
|
||||
{
|
||||
struct bch_sb_field_quota *q = field_to_type(f, quota);
|
||||
|
||||
if (vstruct_bytes(&q->field) != sizeof(*q))
|
||||
return "invalid field quota: wrong size";
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
const struct bch_sb_field_ops bch_sb_field_ops_quota = {
|
||||
.validate = bch2_sb_validate_quota,
|
||||
};
|
||||
|
||||
const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k)
|
||||
{
|
||||
struct bkey_s_c_quota dq;
|
||||
|
||||
if (k.k->p.inode >= QTYP_NR)
|
||||
return "invalid quota type";
|
||||
|
||||
switch (k.k->type) {
|
||||
case BCH_QUOTA: {
|
||||
dq = bkey_s_c_to_quota(k);
|
||||
|
||||
if (bkey_val_bytes(k.k) != sizeof(struct bch_quota))
|
||||
return "incorrect value size";
|
||||
|
||||
return NULL;
|
||||
}
|
||||
default:
|
||||
return "invalid type";
|
||||
}
|
||||
}
|
||||
|
||||
static const char * const bch2_quota_counters[] = {
|
||||
"space",
|
||||
"inodes",
|
||||
};
|
||||
|
||||
void bch2_quota_to_text(struct bch_fs *c, char *buf,
|
||||
size_t size, struct bkey_s_c k)
|
||||
{
|
||||
char *out = buf, *end = buf + size;
|
||||
struct bkey_s_c_quota dq;
|
||||
unsigned i;
|
||||
|
||||
switch (k.k->type) {
|
||||
case BCH_QUOTA:
|
||||
dq = bkey_s_c_to_quota(k);
|
||||
|
||||
for (i = 0; i < Q_COUNTERS; i++)
|
||||
out += scnprintf(out, end - out, "%s hardlimit %llu softlimit %llu",
|
||||
bch2_quota_counters[i],
|
||||
le64_to_cpu(dq.v->c[i].hardlimit),
|
||||
le64_to_cpu(dq.v->c[i].softlimit));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_QUOTA
|
||||
|
||||
#include <linux/cred.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/quota.h>
|
||||
|
||||
static inline unsigned __next_qtype(unsigned i, unsigned qtypes)
|
||||
{
|
||||
qtypes >>= i;
|
||||
return qtypes ? i + __ffs(qtypes) : QTYP_NR;
|
||||
}
|
||||
|
||||
#define for_each_set_qtype(_c, _i, _q, _qtypes) \
|
||||
for (_i = 0; \
|
||||
(_i = __next_qtype(_i, _qtypes), \
|
||||
_q = &(_c)->quotas[_i], \
|
||||
_i < QTYP_NR); \
|
||||
_i++)
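/*
 * Illustration: if only the user and project quota bits are set in
 * _qtypes (assuming the usual QTYP_USR < QTYP_GRP < QTYP_PRJ ordering),
 * __next_qtype() skips the unset group bit, so the loop body runs with
 * _i == QTYP_USR and then _i == QTYP_PRJ, _q pointing at the matching
 * c->quotas[] slot each time, and the loop ends once __next_qtype()
 * returns QTYP_NR.
 */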
|
||||
|
||||
static bool ignore_hardlimit(struct bch_memquota_type *q)
|
||||
{
|
||||
if (capable(CAP_SYS_RESOURCE))
|
||||
return true;
|
||||
#if 0
|
||||
struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];
|
||||
|
||||
return capable(CAP_SYS_RESOURCE) &&
|
||||
(info->dqi_format->qf_fmt_id != QFMT_VFS_OLD ||
|
||||
!(info->dqi_flags & DQF_ROOT_SQUASH));
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
enum quota_msg {
|
||||
SOFTWARN, /* Softlimit reached */
|
||||
SOFTLONGWARN, /* Grace time expired */
|
||||
HARDWARN, /* Hardlimit reached */
|
||||
|
||||
HARDBELOW, /* Usage got below inode hardlimit */
|
||||
SOFTBELOW, /* Usage got below inode softlimit */
|
||||
};
|
||||
|
||||
static int quota_nl[][Q_COUNTERS] = {
|
||||
[HARDWARN][Q_SPC] = QUOTA_NL_BHARDWARN,
|
||||
[SOFTLONGWARN][Q_SPC] = QUOTA_NL_BSOFTLONGWARN,
|
||||
[SOFTWARN][Q_SPC] = QUOTA_NL_BSOFTWARN,
|
||||
[HARDBELOW][Q_SPC] = QUOTA_NL_BHARDBELOW,
|
||||
[SOFTBELOW][Q_SPC] = QUOTA_NL_BSOFTBELOW,
|
||||
|
||||
[HARDWARN][Q_INO] = QUOTA_NL_IHARDWARN,
|
||||
[SOFTLONGWARN][Q_INO] = QUOTA_NL_ISOFTLONGWARN,
|
||||
[SOFTWARN][Q_INO] = QUOTA_NL_ISOFTWARN,
|
||||
[HARDBELOW][Q_INO] = QUOTA_NL_IHARDBELOW,
|
||||
[SOFTBELOW][Q_INO] = QUOTA_NL_ISOFTBELOW,
|
||||
};
|
||||
|
||||
struct quota_msgs {
|
||||
u8 nr;
|
||||
struct {
|
||||
u8 qtype;
|
||||
u8 msg;
|
||||
} m[QTYP_NR * Q_COUNTERS];
|
||||
};
|
||||
|
||||
static void prepare_msg(unsigned qtype,
|
||||
enum quota_counters counter,
|
||||
struct quota_msgs *msgs,
|
||||
enum quota_msg msg_type)
|
||||
{
|
||||
BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m));
|
||||
|
||||
msgs->m[msgs->nr].qtype = qtype;
|
||||
msgs->m[msgs->nr].msg = quota_nl[msg_type][counter];
|
||||
msgs->nr++;
|
||||
}
|
||||
|
||||
static void prepare_warning(struct memquota_counter *qc,
|
||||
unsigned qtype,
|
||||
enum quota_counters counter,
|
||||
struct quota_msgs *msgs,
|
||||
enum quota_msg msg_type)
|
||||
{
|
||||
if (qc->warning_issued & (1 << msg_type))
|
||||
return;
|
||||
|
||||
prepare_msg(qtype, counter, msgs, msg_type);
|
||||
}
|
||||
|
||||
static void flush_warnings(struct bch_qid qid,
|
||||
struct super_block *sb,
|
||||
struct quota_msgs *msgs)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < msgs->nr; i++)
|
||||
quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, qid.q[i]),
|
||||
sb->s_dev, msgs->m[i].msg);
|
||||
}
|
||||
|
||||
static int bch2_quota_check_limit(struct bch_fs *c,
|
||||
unsigned qtype,
|
||||
struct bch_memquota *mq,
|
||||
struct quota_msgs *msgs,
|
||||
enum quota_counters counter,
|
||||
s64 v,
|
||||
enum quota_acct_mode mode)
|
||||
{
|
||||
struct bch_memquota_type *q = &c->quotas[qtype];
|
||||
struct memquota_counter *qc = &mq->c[counter];
|
||||
u64 n = qc->v + v;
|
||||
|
||||
BUG_ON((s64) n < 0);
|
||||
|
||||
if (mode == BCH_QUOTA_NOCHECK)
|
||||
return 0;
|
||||
|
||||
if (v <= 0) {
|
||||
if (n < qc->hardlimit &&
|
||||
(qc->warning_issued & (1 << HARDWARN))) {
|
||||
qc->warning_issued &= ~(1 << HARDWARN);
|
||||
prepare_msg(qtype, counter, msgs, HARDBELOW);
|
||||
}
|
||||
|
||||
if (n < qc->softlimit &&
|
||||
(qc->warning_issued & (1 << SOFTWARN))) {
|
||||
qc->warning_issued &= ~(1 << SOFTWARN);
|
||||
prepare_msg(qtype, counter, msgs, SOFTBELOW);
|
||||
}
|
||||
|
||||
qc->warning_issued = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (qc->hardlimit &&
|
||||
qc->hardlimit < n &&
|
||||
!ignore_hardlimit(q)) {
|
||||
if (mode == BCH_QUOTA_PREALLOC)
|
||||
return -EDQUOT;
|
||||
|
||||
prepare_warning(qc, qtype, counter, msgs, HARDWARN);
|
||||
}
|
||||
|
||||
if (qc->softlimit &&
|
||||
qc->softlimit < n &&
|
||||
qc->timer &&
|
||||
ktime_get_real_seconds() >= qc->timer &&
|
||||
!ignore_hardlimit(q)) {
|
||||
if (mode == BCH_QUOTA_PREALLOC)
|
||||
return -EDQUOT;
|
||||
|
||||
prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN);
|
||||
}
|
||||
|
||||
if (qc->softlimit &&
|
||||
qc->softlimit < n &&
|
||||
qc->timer == 0) {
|
||||
if (mode == BCH_QUOTA_PREALLOC)
|
||||
return -EDQUOT;
|
||||
|
||||
prepare_warning(qc, qtype, counter, msgs, SOFTWARN);
|
||||
|
||||
/* XXX is this the right one? */
|
||||
qc->timer = ktime_get_real_seconds() +
|
||||
q->limits[counter].warnlimit;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
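/*
 * Summary of the checks above: releases (v <= 0) only clear
 * previously-issued warnings and note when usage drops back below the
 * limits; for allocations in BCH_QUOTA_PREALLOC mode, exceeding the hard
 * limit, exceeding the soft limit after the grace timer expired, or newly
 * crossing the soft limit all fail with -EDQUOT, while in the other modes
 * the same conditions just queue netlink warnings (and newly crossing the
 * soft limit also starts the grace timer).
 */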
|
||||
|
||||
int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
|
||||
enum quota_counters counter, s64 v,
|
||||
enum quota_acct_mode mode)
|
||||
{
|
||||
unsigned qtypes = enabled_qtypes(c);
|
||||
struct bch_memquota_type *q;
|
||||
struct bch_memquota *mq[QTYP_NR];
|
||||
struct quota_msgs msgs;
|
||||
unsigned i;
|
||||
int ret = 0;
|
||||
|
||||
memset(&msgs, 0, sizeof(msgs));
|
||||
|
||||
for_each_set_qtype(c, i, q, qtypes)
|
||||
mutex_lock_nested(&q->lock, i);
|
||||
|
||||
for_each_set_qtype(c, i, q, qtypes) {
|
||||
mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_NOFS);
|
||||
if (!mq[i]) {
|
||||
ret = -ENOMEM;
|
||||
goto err;
|
||||
}
|
||||
|
||||
ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode);
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
|
||||
for_each_set_qtype(c, i, q, qtypes)
|
||||
mq[i]->c[counter].v += v;
|
||||
err:
|
||||
for_each_set_qtype(c, i, q, qtypes)
|
||||
mutex_unlock(&q->lock);
|
||||
|
||||
flush_warnings(qid, c->vfs_sb, &msgs);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void __bch2_quota_transfer(struct bch_memquota *src_q,
|
||||
struct bch_memquota *dst_q,
|
||||
enum quota_counters counter, s64 v)
|
||||
{
|
||||
BUG_ON(v > src_q->c[counter].v);
|
||||
BUG_ON(v + dst_q->c[counter].v < v);
|
||||
|
||||
src_q->c[counter].v -= v;
|
||||
dst_q->c[counter].v += v;
|
||||
}
|
||||
|
||||
int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
|
||||
struct bch_qid dst,
|
||||
struct bch_qid src, u64 space)
|
||||
{
|
||||
struct bch_memquota_type *q;
|
||||
struct bch_memquota *src_q[3], *dst_q[3];
|
||||
struct quota_msgs msgs;
|
||||
unsigned i;
|
||||
int ret = 0;
|
||||
|
||||
qtypes &= enabled_qtypes(c);
|
||||
|
||||
memset(&msgs, 0, sizeof(msgs));
|
||||
|
||||
for_each_set_qtype(c, i, q, qtypes)
|
||||
mutex_lock_nested(&q->lock, i);
|
||||
|
||||
for_each_set_qtype(c, i, q, qtypes) {
|
||||
src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_NOFS);
|
||||
dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_NOFS);
|
||||
|
||||
if (!src_q[i] || !dst_q[i]) {
|
||||
ret = -ENOMEM;
|
||||
goto err;
|
||||
}
|
||||
|
||||
ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC,
|
||||
dst_q[i]->c[Q_SPC].v + space,
|
||||
BCH_QUOTA_PREALLOC);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO,
|
||||
dst_q[i]->c[Q_INO].v + 1,
|
||||
BCH_QUOTA_PREALLOC);
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
|
||||
for_each_set_qtype(c, i, q, qtypes) {
|
||||
__bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space);
|
||||
__bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1);
|
||||
}
|
||||
|
||||
err:
|
||||
for_each_set_qtype(c, i, q, qtypes)
|
||||
mutex_unlock(&q->lock);
|
||||
|
||||
flush_warnings(dst, c->vfs_sb, &msgs);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k)
|
||||
{
|
||||
struct bkey_s_c_quota dq;
|
||||
struct bch_memquota_type *q;
|
||||
struct bch_memquota *mq;
|
||||
unsigned i;
|
||||
|
||||
BUG_ON(k.k->p.inode >= QTYP_NR);
|
||||
|
||||
switch (k.k->type) {
|
||||
case BCH_QUOTA:
|
||||
dq = bkey_s_c_to_quota(k);
|
||||
q = &c->quotas[k.k->p.inode];
|
||||
|
||||
mutex_lock(&q->lock);
|
||||
mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL);
|
||||
if (!mq) {
|
||||
mutex_unlock(&q->lock);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
for (i = 0; i < Q_COUNTERS; i++) {
|
||||
mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit);
|
||||
mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit);
|
||||
}
|
||||
|
||||
mutex_unlock(&q->lock);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type)
|
||||
{
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
int ret = 0;
|
||||
|
||||
for_each_btree_key(&iter, c, BTREE_ID_QUOTAS, POS(type, 0),
|
||||
BTREE_ITER_PREFETCH, k) {
|
||||
if (k.k->p.inode != type)
|
||||
break;
|
||||
|
||||
ret = __bch2_quota_set(c, k);
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
|
||||
return bch2_btree_iter_unlock(&iter) ?: ret;
|
||||
}
|
||||
|
||||
void bch2_fs_quota_exit(struct bch_fs *c)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(c->quotas); i++)
|
||||
genradix_free(&c->quotas[i].table);
|
||||
}
|
||||
|
||||
void bch2_fs_quota_init(struct bch_fs *c)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(c->quotas); i++)
|
||||
mutex_init(&c->quotas[i].lock);
|
||||
}
|
||||
|
||||
static void bch2_sb_quota_read(struct bch_fs *c)
|
||||
{
|
||||
struct bch_sb_field_quota *sb_quota;
|
||||
unsigned i, j;
|
||||
|
||||
sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
|
||||
if (!sb_quota)
|
||||
return;
|
||||
|
||||
for (i = 0; i < QTYP_NR; i++) {
|
||||
struct bch_memquota_type *q = &c->quotas[i];
|
||||
|
||||
for (j = 0; j < Q_COUNTERS; j++) {
|
||||
q->limits[j].timelimit =
|
||||
le32_to_cpu(sb_quota->q[i].c[j].timelimit);
|
||||
q->limits[j].warnlimit =
|
||||
le32_to_cpu(sb_quota->q[i].c[j].warnlimit);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int bch2_fs_quota_read(struct bch_fs *c)
|
||||
{
|
||||
unsigned i, qtypes = enabled_qtypes(c);
|
||||
struct bch_memquota_type *q;
|
||||
struct btree_iter iter;
|
||||
struct bch_inode_unpacked u;
|
||||
struct bkey_s_c k;
|
||||
int ret = 0;
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
bch2_sb_quota_read(c);
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
for_each_set_qtype(c, i, q, qtypes) {
|
||||
ret = bch2_quota_init_type(c, i);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN,
|
||||
BTREE_ITER_PREFETCH, k) {
|
||||
switch (k.k->type) {
|
||||
case BCH_INODE_FS:
|
||||
ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors,
|
||||
BCH_QUOTA_NOCHECK);
|
||||
bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
|
||||
BCH_QUOTA_NOCHECK);
|
||||
}
|
||||
}
|
||||
return bch2_btree_iter_unlock(&iter) ?: ret;
|
||||
}
|
||||
|
||||
/* Enable/disable/delete quotas for an entire filesystem: */
|
||||
|
||||
static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
|
||||
{
|
||||
struct bch_fs *c = sb->s_fs_info;
|
||||
|
||||
if (sb->s_flags & SB_RDONLY)
|
||||
return -EROFS;
|
||||
|
||||
/* Accounting must be enabled at mount time: */
|
||||
if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT))
|
||||
return -EINVAL;
|
||||
|
||||
/* Can't enable enforcement without accounting: */
|
||||
if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota)
|
||||
return -EINVAL;
|
||||
|
||||
if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota)
|
||||
return -EINVAL;
|
||||
|
||||
if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota)
|
||||
return -EINVAL;
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
if (uflags & FS_QUOTA_UDQ_ENFD)
|
||||
SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true);
|
||||
|
||||
if (uflags & FS_QUOTA_GDQ_ENFD)
|
||||
SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true);
|
||||
|
||||
if (uflags & FS_QUOTA_PDQ_ENFD)
|
||||
SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true);
|
||||
|
||||
bch2_write_super(c);
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int bch2_quota_disable(struct super_block *sb, unsigned uflags)
|
||||
{
|
||||
struct bch_fs *c = sb->s_fs_info;
|
||||
|
||||
if (sb->s_flags & SB_RDONLY)
|
||||
return -EROFS;
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
if (uflags & FS_QUOTA_UDQ_ENFD)
|
||||
SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false);
|
||||
|
||||
if (uflags & FS_QUOTA_GDQ_ENFD)
|
||||
SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false);
|
||||
|
||||
if (uflags & FS_QUOTA_PDQ_ENFD)
|
||||
SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false);
|
||||
|
||||
bch2_write_super(c);
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
|
||||
{
|
||||
struct bch_fs *c = sb->s_fs_info;
|
||||
int ret;
|
||||
|
||||
if (sb->s_flags & SB_RDONLY)
|
||||
return -EROFS;
|
||||
|
||||
if (uflags & FS_USER_QUOTA) {
|
||||
if (c->opts.usrquota)
|
||||
return -EINVAL;
|
||||
|
||||
ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
|
||||
POS(QTYP_USR, 0),
|
||||
POS(QTYP_USR + 1, 0),
|
||||
ZERO_VERSION, NULL, NULL, NULL);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (uflags & FS_GROUP_QUOTA) {
|
||||
if (c->opts.grpquota)
|
||||
return -EINVAL;
|
||||
|
||||
ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
|
||||
POS(QTYP_GRP, 0),
|
||||
POS(QTYP_GRP + 1, 0),
|
||||
ZERO_VERSION, NULL, NULL, NULL);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (uflags & FS_PROJ_QUOTA) {
|
||||
if (c->opts.prjquota)
|
||||
return -EINVAL;
|
||||
|
||||
ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
|
||||
POS(QTYP_PRJ, 0),
|
||||
POS(QTYP_PRJ + 1, 0),
|
||||
ZERO_VERSION, NULL, NULL, NULL);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return quota status information, such as enforcements, quota file inode
|
||||
* numbers etc.
|
||||
*/
|
||||
static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state)
|
||||
{
|
||||
struct bch_fs *c = sb->s_fs_info;
|
||||
unsigned qtypes = enabled_qtypes(c);
|
||||
unsigned i;
|
||||
|
||||
memset(state, 0, sizeof(*state));
|
||||
|
||||
for (i = 0; i < QTYP_NR; i++) {
|
||||
state->s_state[i].flags |= QCI_SYSFILE;
|
||||
|
||||
if (!(qtypes & (1 << i)))
|
||||
continue;
|
||||
|
||||
state->s_state[i].flags |= QCI_ACCT_ENABLED;
|
||||
|
||||
state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit;
|
||||
state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit;
|
||||
|
||||
state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit;
|
||||
state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Adjust quota timers & warnings
|
||||
*/
|
||||
static int bch2_quota_set_info(struct super_block *sb, int type,
|
||||
struct qc_info *info)
|
||||
{
|
||||
struct bch_fs *c = sb->s_fs_info;
|
||||
struct bch_sb_field_quota *sb_quota;
|
||||
struct bch_memquota_type *q;
|
||||
|
||||
if (sb->s_flags & SB_RDONLY)
|
||||
return -EROFS;
|
||||
|
||||
if (type >= QTYP_NR)
|
||||
return -EINVAL;
|
||||
|
||||
if (!((1 << type) & enabled_qtypes(c)))
|
||||
return -ESRCH;
|
||||
|
||||
if (info->i_fieldmask &
|
||||
~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS))
|
||||
return -EINVAL;
|
||||
|
||||
q = &c->quotas[type];
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
|
||||
if (!sb_quota) {
|
||||
sb_quota = bch2_sb_resize_quota(&c->disk_sb,
|
||||
sizeof(*sb_quota) / sizeof(u64));
|
||||
if (!sb_quota)
|
||||
return -ENOSPC;
|
||||
}
|
||||
|
||||
if (info->i_fieldmask & QC_SPC_TIMER)
|
||||
sb_quota->q[type].c[Q_SPC].timelimit =
|
||||
cpu_to_le32(info->i_spc_timelimit);
|
||||
|
||||
if (info->i_fieldmask & QC_SPC_WARNS)
|
||||
sb_quota->q[type].c[Q_SPC].warnlimit =
|
||||
cpu_to_le32(info->i_spc_warnlimit);
|
||||
|
||||
if (info->i_fieldmask & QC_INO_TIMER)
|
||||
sb_quota->q[type].c[Q_INO].timelimit =
|
||||
cpu_to_le32(info->i_ino_timelimit);
|
||||
|
||||
if (info->i_fieldmask & QC_INO_WARNS)
|
||||
sb_quota->q[type].c[Q_INO].warnlimit =
|
||||
cpu_to_le32(info->i_ino_warnlimit);
|
||||
|
||||
bch2_sb_quota_read(c);
|
||||
|
||||
bch2_write_super(c);
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Get/set individual quotas: */
|
||||
|
||||
static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src)
|
||||
{
|
||||
dst->d_space = src->c[Q_SPC].v << 9;
|
||||
dst->d_spc_hardlimit = src->c[Q_SPC].hardlimit << 9;
|
||||
dst->d_spc_softlimit = src->c[Q_SPC].softlimit << 9;
|
||||
dst->d_spc_timer = src->c[Q_SPC].timer;
|
||||
dst->d_spc_warns = src->c[Q_SPC].warns;
|
||||
|
||||
dst->d_ino_count = src->c[Q_INO].v;
|
||||
dst->d_ino_hardlimit = src->c[Q_INO].hardlimit;
|
||||
dst->d_ino_softlimit = src->c[Q_INO].softlimit;
|
||||
dst->d_ino_timer = src->c[Q_INO].timer;
|
||||
dst->d_ino_warns = src->c[Q_INO].warns;
|
||||
}
|
||||
|
||||
static int bch2_get_quota(struct super_block *sb, struct kqid kqid,
|
||||
struct qc_dqblk *qdq)
|
||||
{
|
||||
struct bch_fs *c = sb->s_fs_info;
|
||||
struct bch_memquota_type *q = &c->quotas[kqid.type];
|
||||
qid_t qid = from_kqid(&init_user_ns, kqid);
|
||||
struct bch_memquota *mq;
|
||||
|
||||
memset(qdq, 0, sizeof(*qdq));
|
||||
|
||||
mutex_lock(&q->lock);
|
||||
mq = genradix_ptr(&q->table, qid);
|
||||
if (mq)
|
||||
__bch2_quota_get(qdq, mq);
|
||||
mutex_unlock(&q->lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid,
|
||||
struct qc_dqblk *qdq)
|
||||
{
|
||||
struct bch_fs *c = sb->s_fs_info;
|
||||
struct bch_memquota_type *q = &c->quotas[kqid->type];
|
||||
qid_t qid = from_kqid(&init_user_ns, *kqid);
|
||||
struct genradix_iter iter = genradix_iter_init(&q->table, qid);
|
||||
struct bch_memquota *mq;
|
||||
int ret = 0;
|
||||
|
||||
mutex_lock(&q->lock);
|
||||
|
||||
while ((mq = genradix_iter_peek(&iter, &q->table))) {
|
||||
if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) {
|
||||
__bch2_quota_get(qdq, mq);
|
||||
*kqid = make_kqid(current_user_ns(), kqid->type, iter.pos);
|
||||
goto found;
|
||||
}
|
||||
|
||||
genradix_iter_advance(&iter, &q->table);
|
||||
}
|
||||
|
||||
ret = -ENOENT;
|
||||
found:
|
||||
mutex_unlock(&q->lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int bch2_set_quota(struct super_block *sb, struct kqid qid,
|
||||
struct qc_dqblk *qdq)
|
||||
{
|
||||
struct bch_fs *c = sb->s_fs_info;
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c k;
|
||||
struct bkey_i_quota new_quota;
|
||||
int ret;
|
||||
|
||||
if (sb->s_flags & SB_RDONLY)
|
||||
return -EROFS;
|
||||
|
||||
bkey_quota_init(&new_quota.k_i);
|
||||
new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
|
||||
|
||||
bch2_btree_iter_init(&iter, c, BTREE_ID_QUOTAS, new_quota.k.p,
|
||||
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
|
||||
k = bch2_btree_iter_peek_slot(&iter);
|
||||
|
||||
ret = btree_iter_err(k);
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
|
||||
switch (k.k->type) {
|
||||
case BCH_QUOTA:
|
||||
new_quota.v = *bkey_s_c_to_quota(k).v;
|
||||
break;
|
||||
}
|
||||
|
||||
if (qdq->d_fieldmask & QC_SPC_SOFT)
|
||||
new_quota.v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9);
|
||||
if (qdq->d_fieldmask & QC_SPC_HARD)
|
||||
new_quota.v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9);
|
||||
|
||||
if (qdq->d_fieldmask & QC_INO_SOFT)
|
||||
new_quota.v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit);
|
||||
if (qdq->d_fieldmask & QC_INO_HARD)
|
||||
new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);
|
||||
|
||||
ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
|
||||
BTREE_INSERT_ENTRY(&iter, &new_quota.k_i));
|
||||
bch2_btree_iter_unlock(&iter);
|
||||
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
const struct quotactl_ops bch2_quotactl_operations = {
|
||||
.quota_enable = bch2_quota_enable,
|
||||
.quota_disable = bch2_quota_disable,
|
||||
.rm_xquota = bch2_quota_remove,
|
||||
|
||||
.get_state = bch2_quota_get_state,
|
||||
.set_info = bch2_quota_set_info,
|
||||
|
||||
.get_dqblk = bch2_get_quota,
|
||||
.get_nextdqblk = bch2_get_next_quota,
|
||||
.set_dqblk = bch2_set_quota,
|
||||
};
|
||||
|
||||
#endif /* CONFIG_BCACHEFS_QUOTA */
|
76
fs/bcachefs/quota.h
Normal file
@ -0,0 +1,76 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_QUOTA_H
|
||||
#define _BCACHEFS_QUOTA_H
|
||||
|
||||
#include "inode.h"
|
||||
#include "quota_types.h"
|
||||
|
||||
extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
|
||||
|
||||
const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c);
|
||||
void bch2_quota_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
|
||||
|
||||
#define bch2_bkey_quota_ops (struct bkey_ops) { \
|
||||
.key_invalid = bch2_quota_invalid, \
|
||||
.val_to_text = bch2_quota_to_text, \
|
||||
}
|
||||
|
||||
enum quota_acct_mode {
|
||||
BCH_QUOTA_PREALLOC,
|
||||
BCH_QUOTA_WARN,
|
||||
BCH_QUOTA_NOCHECK,
|
||||
};
|
||||
|
||||
static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u)
|
||||
{
|
||||
return (struct bch_qid) {
|
||||
.q[QTYP_USR] = u->bi_uid,
|
||||
.q[QTYP_GRP] = u->bi_gid,
|
||||
.q[QTYP_PRJ] = u->bi_project,
|
||||
};
|
||||
}
|
||||
|
||||
static inline unsigned enabled_qtypes(struct bch_fs *c)
|
||||
{
|
||||
return ((c->opts.usrquota << QTYP_USR)|
|
||||
(c->opts.grpquota << QTYP_GRP)|
|
||||
(c->opts.prjquota << QTYP_PRJ));
|
||||
}
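/*
 * e.g. a filesystem mounted with usrquota and grpquota but not prjquota
 * gets a mask with exactly the QTYP_USR and QTYP_GRP bits set, which is
 * what bch2_quota_acct() and bch2_quota_transfer() iterate over via
 * for_each_set_qtype().
 */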
|
||||
|
||||
#ifdef CONFIG_BCACHEFS_QUOTA
|
||||
|
||||
int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters,
|
||||
s64, enum quota_acct_mode);
|
||||
|
||||
int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid,
|
||||
struct bch_qid, u64);
|
||||
|
||||
void bch2_fs_quota_exit(struct bch_fs *);
|
||||
void bch2_fs_quota_init(struct bch_fs *);
|
||||
int bch2_fs_quota_read(struct bch_fs *);
|
||||
|
||||
extern const struct quotactl_ops bch2_quotactl_operations;
|
||||
|
||||
#else
|
||||
|
||||
static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
|
||||
enum quota_counters counter, s64 v,
|
||||
enum quota_acct_mode mode)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
|
||||
struct bch_qid dst,
|
||||
struct bch_qid src, u64 space)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void bch2_fs_quota_exit(struct bch_fs *c) {}
|
||||
static inline void bch2_fs_quota_init(struct bch_fs *c) {}
|
||||
static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; }
|
||||
|
||||
#endif
|
||||
|
||||
#endif /* _BCACHEFS_QUOTA_H */
|
37
fs/bcachefs/quota_types.h
Normal file
@ -0,0 +1,37 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _BCACHEFS_QUOTA_TYPES_H
|
||||
#define _BCACHEFS_QUOTA_TYPES_H
|
||||
|
||||
#include <linux/generic-radix-tree.h>
|
||||
|
||||
struct bch_qid {
|
||||
u32 q[QTYP_NR];
|
||||
};
|
||||
|
||||
struct memquota_counter {
|
||||
u64 v;
|
||||
u64 hardlimit;
|
||||
u64 softlimit;
|
||||
s64 timer;
|
||||
int warns;
|
||||
int warning_issued;
|
||||
};
|
||||
|
||||
struct bch_memquota {
|
||||
struct memquota_counter c[Q_COUNTERS];
|
||||
};
|
||||
|
||||
typedef GENRADIX(struct bch_memquota) bch_memquota_table;
|
||||
|
||||
struct quota_limit {
|
||||
u32 timelimit;
|
||||
u32 warnlimit;
|
||||
};
|
||||
|
||||
struct bch_memquota_type {
|
||||
struct quota_limit limits[Q_COUNTERS];
|
||||
bch_memquota_table table;
|
||||
struct mutex lock;
|
||||
};
|
||||
|
||||
#endif /* _BCACHEFS_QUOTA_TYPES_H */
|
342
fs/bcachefs/rebalance.c
Normal file
@ -0,0 +1,342 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "alloc.h"
|
||||
#include "btree_iter.h"
|
||||
#include "buckets.h"
|
||||
#include "clock.h"
|
||||
#include "disk_groups.h"
|
||||
#include "extents.h"
|
||||
#include "io.h"
|
||||
#include "move.h"
|
||||
#include "rebalance.h"
|
||||
#include "super-io.h"
|
||||
#include "trace.h"
|
||||
|
||||
#include <linux/freezer.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/sched/cputime.h>
|
||||
|
||||
static inline bool rebalance_ptr_pred(struct bch_fs *c,
|
||||
const struct bch_extent_ptr *ptr,
|
||||
struct bch_extent_crc_unpacked crc,
|
||||
struct bch_io_opts *io_opts)
|
||||
{
|
||||
if (io_opts->background_target &&
|
||||
!bch2_dev_in_target(c, ptr->dev, io_opts->background_target) &&
|
||||
!ptr->cached)
|
||||
return true;
|
||||
|
||||
if (io_opts->background_compression &&
|
||||
crc.compression_type !=
|
||||
bch2_compression_opt_to_type[io_opts->background_compression])
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
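/*
 * Put differently: a replica wants to be moved by rebalance either
 * because the extent has a background_target and this non-cached pointer
 * isn't on a device in that target, or because background_compression is
 * set and the replica isn't compressed with that algorithm.
 */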
|
||||
|
||||
void bch2_rebalance_add_key(struct bch_fs *c,
|
||||
struct bkey_s_c k,
|
||||
struct bch_io_opts *io_opts)
|
||||
{
|
||||
const struct bch_extent_ptr *ptr;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
struct bkey_s_c_extent e;
|
||||
|
||||
if (!bkey_extent_is_data(k.k))
|
||||
return;
|
||||
|
||||
if (!io_opts->background_target &&
|
||||
!io_opts->background_compression)
|
||||
return;
|
||||
|
||||
e = bkey_s_c_to_extent(k);
|
||||
|
||||
extent_for_each_ptr_crc(e, ptr, crc)
|
||||
if (rebalance_ptr_pred(c, ptr, crc, io_opts)) {
|
||||
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
|
||||
|
||||
if (atomic64_add_return(crc.compressed_size,
|
||||
&ca->rebalance_work) ==
|
||||
crc.compressed_size)
|
||||
rebalance_wakeup(c);
|
||||
}
|
||||
}
|
||||
|
||||
void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
|
||||
{
|
||||
if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
|
||||
sectors)
|
||||
rebalance_wakeup(c);
|
||||
}
|
||||
|
||||
static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
|
||||
enum bkey_type type,
|
||||
struct bkey_s_c_extent e,
|
||||
struct bch_io_opts *io_opts,
|
||||
struct data_opts *data_opts)
|
||||
{
|
||||
const struct bch_extent_ptr *ptr;
|
||||
struct bch_extent_crc_unpacked crc;
|
||||
|
||||
/* Make sure we have room to add a new pointer: */
|
||||
if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
|
||||
BKEY_EXTENT_VAL_U64s_MAX)
|
||||
return DATA_SKIP;
|
||||
|
||||
extent_for_each_ptr_crc(e, ptr, crc)
|
||||
if (rebalance_ptr_pred(c, ptr, crc, io_opts))
|
||||
goto found;
|
||||
|
||||
return DATA_SKIP;
|
||||
found:
|
||||
data_opts->target = io_opts->background_target;
|
||||
data_opts->btree_insert_flags = 0;
|
||||
return DATA_ADD_REPLICAS;
|
||||
}
|
||||
|
||||
struct rebalance_work {
|
||||
int dev_most_full_idx;
|
||||
unsigned dev_most_full_percent;
|
||||
u64 dev_most_full_work;
|
||||
u64 dev_most_full_capacity;
|
||||
u64 total_work;
|
||||
};
|
||||
|
||||
static void rebalance_work_accumulate(struct rebalance_work *w,
|
||||
u64 dev_work, u64 unknown_dev, u64 capacity, int idx)
|
||||
{
|
||||
unsigned percent_full;
|
||||
u64 work = dev_work + unknown_dev;
|
||||
|
||||
if (work < dev_work || work < unknown_dev)
|
||||
work = U64_MAX;
|
||||
work = min(work, capacity);
|
||||
|
||||
percent_full = div_u64(work * 100, capacity);
|
||||
|
||||
if (percent_full >= w->dev_most_full_percent) {
|
||||
w->dev_most_full_idx = idx;
|
||||
w->dev_most_full_percent = percent_full;
|
||||
w->dev_most_full_work = work;
|
||||
w->dev_most_full_capacity = capacity;
|
||||
}
|
||||
|
||||
if (w->total_work + dev_work >= w->total_work &&
|
||||
w->total_work + dev_work >= dev_work)
|
||||
w->total_work += dev_work;
|
||||
}
|
||||
|
||||
static struct rebalance_work rebalance_work(struct bch_fs *c)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
struct rebalance_work ret = { .dev_most_full_idx = -1 };
|
||||
u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev);
|
||||
unsigned i;
|
||||
|
||||
for_each_online_member(ca, c, i)
|
||||
rebalance_work_accumulate(&ret,
|
||||
atomic64_read(&ca->rebalance_work),
|
||||
unknown_dev,
|
||||
bucket_to_sector(ca, ca->mi.nbuckets -
|
||||
ca->mi.first_bucket),
|
||||
i);
|
||||
|
||||
rebalance_work_accumulate(&ret,
|
||||
unknown_dev, 0, c->capacity, -1);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void rebalance_work_reset(struct bch_fs *c)
|
||||
{
|
||||
struct bch_dev *ca;
|
||||
unsigned i;
|
||||
|
||||
for_each_online_member(ca, c, i)
|
||||
atomic64_set(&ca->rebalance_work, 0);
|
||||
|
||||
atomic64_set(&c->rebalance.work_unknown_dev, 0);
|
||||
}
|
||||
|
||||
static unsigned long curr_cputime(void)
|
||||
{
|
||||
u64 utime, stime;
|
||||
|
||||
task_cputime_adjusted(current, &utime, &stime);
|
||||
return nsecs_to_jiffies(utime + stime);
|
||||
}
|
||||
|
||||
static int bch2_rebalance_thread(void *arg)
{
	struct bch_fs *c = arg;
	struct bch_fs_rebalance *r = &c->rebalance;
	struct io_clock *clock = &c->io_clock[WRITE];
	struct rebalance_work w, p;
	unsigned long start, prev_start;
	unsigned long prev_run_time, prev_run_cputime;
	unsigned long cputime, prev_cputime;
	unsigned long io_start;
	long throttle;

	set_freezable();

	io_start = atomic_long_read(&clock->now);
	p = rebalance_work(c);
	prev_start = jiffies;
	prev_cputime = curr_cputime();

	while (!kthread_wait_freezable(r->enabled)) {
		start = jiffies;
		cputime = curr_cputime();

		prev_run_time = start - prev_start;
		prev_run_cputime = cputime - prev_cputime;

		w = rebalance_work(c);
		BUG_ON(!w.dev_most_full_capacity);

		if (!w.total_work) {
			r->state = REBALANCE_WAITING;
			kthread_wait_freezable(rebalance_work(c).total_work);
			continue;
		}

		/*
		 * If there isn't much work to do, throttle cpu usage:
		 */
		throttle = prev_run_cputime * 100 /
			max(1U, w.dev_most_full_percent) -
			prev_run_time;

		if (w.dev_most_full_percent < 20 && throttle > 0) {
			r->state = REBALANCE_THROTTLED;
			r->throttled_until_iotime = io_start +
				div_u64(w.dev_most_full_capacity *
					(20 - w.dev_most_full_percent),
					50);
			r->throttled_until_cputime = start + throttle;

			bch2_kthread_io_clock_wait(clock,
					r->throttled_until_iotime,
					throttle);
			continue;
		}

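		/*
		 * Note: pd.rate.rate appears to be in 512-byte sectors per
		 * second here, so the 1 << 11 (2048) floor below corresponds
		 * to the "1 mb/sec" in the comment.
		 */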
		/* minimum 1 mb/sec: */
		r->pd.rate.rate =
			max_t(u64, 1 << 11,
			      r->pd.rate.rate *
			      max(p.dev_most_full_percent, 1U) /
			      max(w.dev_most_full_percent, 1U));

		io_start = atomic_long_read(&clock->now);
		p = w;
		prev_start = start;
		prev_cputime = cputime;

		r->state = REBALANCE_RUNNING;
		memset(&r->move_stats, 0, sizeof(r->move_stats));
		rebalance_work_reset(c);

		bch2_move_data(c,
			       /* ratelimiting disabled for now */
			       NULL, /* &r->pd.rate, */
			       writepoint_ptr(&c->rebalance_write_point),
			       POS_MIN, POS_MAX,
			       rebalance_pred, NULL,
			       &r->move_stats);
	}

	return 0;
}

ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf)
{
	char *out = buf, *end = out + PAGE_SIZE;
	struct bch_fs_rebalance *r = &c->rebalance;
	struct rebalance_work w = rebalance_work(c);
	char h1[21], h2[21];

	bch2_hprint(h1, w.dev_most_full_work << 9);
	bch2_hprint(h2, w.dev_most_full_capacity << 9);
	out += scnprintf(out, end - out,
			 "fullest_dev (%i):\t%s/%s\n",
			 w.dev_most_full_idx, h1, h2);

	bch2_hprint(h1, w.total_work << 9);
	bch2_hprint(h2, c->capacity << 9);
	out += scnprintf(out, end - out,
			 "total work:\t\t%s/%s\n",
			 h1, h2);

	out += scnprintf(out, end - out,
			 "rate:\t\t\t%u\n",
			 r->pd.rate.rate);

	switch (r->state) {
	case REBALANCE_WAITING:
		out += scnprintf(out, end - out, "waiting\n");
		break;
	case REBALANCE_THROTTLED:
		bch2_hprint(h1,
			    (r->throttled_until_iotime -
			     atomic_long_read(&c->io_clock[WRITE].now)) << 9);
		out += scnprintf(out, end - out,
				 "throttled for %lu sec or %s io\n",
				 (r->throttled_until_cputime - jiffies) / HZ,
				 h1);
		break;
	case REBALANCE_RUNNING:
		out += scnprintf(out, end - out, "running\n");
		out += scnprintf(out, end - out, "pos %llu:%llu\n",
				 r->move_stats.iter.pos.inode,
				 r->move_stats.iter.pos.offset);
		break;
	}

	return out - buf;
}

void bch2_rebalance_stop(struct bch_fs *c)
{
	struct task_struct *p;

	c->rebalance.pd.rate.rate = UINT_MAX;
	bch2_ratelimit_reset(&c->rebalance.pd.rate);

	p = rcu_dereference_protected(c->rebalance.thread, 1);
	c->rebalance.thread = NULL;

	if (p) {
		/* for synchronizing with rebalance_wakeup() */
		synchronize_rcu();

		kthread_stop(p);
		put_task_struct(p);
	}
}

int bch2_rebalance_start(struct bch_fs *c)
{
	struct task_struct *p;

	if (c->opts.nochanges)
		return 0;

	p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance");
	if (IS_ERR(p))
		return PTR_ERR(p);

	get_task_struct(p);
	rcu_assign_pointer(c->rebalance.thread, p);
	wake_up_process(p);
	return 0;
}

void bch2_fs_rebalance_init(struct bch_fs *c)
{
	bch2_pd_controller_init(&c->rebalance.pd);

	atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX);
}

28
fs/bcachefs/rebalance.h
Normal file
28
fs/bcachefs/rebalance.h
Normal file
@ -0,0 +1,28 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_REBALANCE_H
#define _BCACHEFS_REBALANCE_H

#include "rebalance_types.h"

static inline void rebalance_wakeup(struct bch_fs *c)
{
	struct task_struct *p;

	rcu_read_lock();
	p = rcu_dereference(c->rebalance.thread);
	if (p)
		wake_up_process(p);
	rcu_read_unlock();
}

void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c,
			    struct bch_io_opts *);
void bch2_rebalance_add_work(struct bch_fs *, u64);

ssize_t bch2_rebalance_work_show(struct bch_fs *, char *);

void bch2_rebalance_stop(struct bch_fs *);
int bch2_rebalance_start(struct bch_fs *);
void bch2_fs_rebalance_init(struct bch_fs *);

#endif /* _BCACHEFS_REBALANCE_H */

27
fs/bcachefs/rebalance_types.h
Normal file
27
fs/bcachefs/rebalance_types.h
Normal file
@ -0,0 +1,27 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_REBALANCE_TYPES_H
#define _BCACHEFS_REBALANCE_TYPES_H

#include "move_types.h"

enum rebalance_state {
	REBALANCE_WAITING,
	REBALANCE_THROTTLED,
	REBALANCE_RUNNING,
};

struct bch_fs_rebalance {
	struct task_struct __rcu *thread;
	struct bch_pd_controller pd;

	atomic64_t work_unknown_dev;

	enum rebalance_state state;
	unsigned long throttled_until_iotime;
	unsigned long throttled_until_cputime;
	struct bch_move_stats move_stats;

	unsigned enabled:1;
};

#endif /* _BCACHEFS_REBALANCE_TYPES_H */

377
fs/bcachefs/recovery.c
Normal file
377
fs/bcachefs/recovery.c
Normal file
@ -0,0 +1,377 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "alloc.h"
|
||||
#include "btree_gc.h"
|
||||
#include "btree_update.h"
|
||||
#include "btree_update_interior.h"
|
||||
#include "btree_io.h"
|
||||
#include "dirent.h"
|
||||
#include "error.h"
|
||||
#include "fsck.h"
|
||||
#include "journal_io.h"
|
||||
#include "quota.h"
|
||||
#include "recovery.h"
|
||||
#include "super-io.h"
|
||||
|
||||
#include <linux/stat.h>
|
||||
|
||||
#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
|
||||
|
||||
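/*
 * Finds the root of a given btree either in the superblock's clean section
 * (after a clean shutdown) or in the last journal entry, filling in *level;
 * returns NULL if there is no root entry and ERR_PTR(-EINVAL) if the entry
 * is present but empty.
 */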
struct bkey_i *btree_root_find(struct bch_fs *c,
|
||||
struct bch_sb_field_clean *clean,
|
||||
struct jset *j,
|
||||
enum btree_id id, unsigned *level)
|
||||
{
|
||||
struct bkey_i *k;
|
||||
struct jset_entry *entry, *start, *end;
|
||||
|
||||
if (clean) {
|
||||
start = clean->start;
|
||||
end = vstruct_end(&clean->field);
|
||||
} else {
|
||||
start = j->start;
|
||||
end = vstruct_last(j);
|
||||
}
|
||||
|
||||
for (entry = start; entry < end; entry = vstruct_next(entry))
|
||||
if (entry->type == BCH_JSET_ENTRY_btree_root &&
|
||||
entry->btree_id == id)
|
||||
goto found;
|
||||
|
||||
return NULL;
|
||||
found:
|
||||
if (!entry->u64s)
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
k = entry->start;
|
||||
*level = entry->level;
|
||||
return k;
|
||||
}
|
||||
|
||||
static int verify_superblock_clean(struct bch_fs *c,
|
||||
struct bch_sb_field_clean *clean,
|
||||
struct jset *j)
|
||||
{
|
||||
unsigned i;
|
||||
int ret = 0;
|
||||
|
||||
if (!clean || !j)
|
||||
return 0;
|
||||
|
||||
if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
|
||||
"superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
|
||||
le64_to_cpu(clean->journal_seq),
|
||||
le64_to_cpu(j->seq)))
|
||||
bch2_fs_mark_clean(c, false);
|
||||
|
||||
mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
|
||||
"superblock read clock doesn't match journal after clean shutdown");
|
||||
mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
"superblock write clock doesn't match journal after clean shutdown");
|
||||
|
||||
for (i = 0; i < BTREE_ID_NR; i++) {
|
||||
struct bkey_i *k1, *k2;
|
||||
unsigned l1 = 0, l2 = 0;
|
||||
|
||||
k1 = btree_root_find(c, clean, NULL, i, &l1);
|
||||
k2 = btree_root_find(c, NULL, j, i, &l2);
|
||||
|
||||
if (!k1 && !k2)
|
||||
continue;
|
||||
|
||||
mustfix_fsck_err_on(!k1 || !k2 ||
|
||||
IS_ERR(k1) ||
|
||||
IS_ERR(k2) ||
|
||||
k1->k.u64s != k2->k.u64s ||
|
||||
memcmp(k1, k2, bkey_bytes(k1)) ||
|
||||
l1 != l2, c,
|
||||
"superblock btree root doesn't match journal after clean shutdown");
|
||||
}
|
||||
fsck_err:
|
||||
return ret;
|
||||
}
|
||||
|
||||
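/*
 * The journal is treated as empty if its newest entry has last_seq == seq
 * and no entry contains anything other than btree roots or empty btree_keys
 * entries - i.e. there is nothing left to replay.
 */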
static bool journal_empty(struct list_head *journal)
|
||||
{
|
||||
struct journal_replay *i;
|
||||
struct jset_entry *entry;
|
||||
|
||||
if (list_empty(journal))
|
||||
return true;
|
||||
|
||||
i = list_last_entry(journal, struct journal_replay, list);
|
||||
|
||||
if (i->j.last_seq != i->j.seq)
|
||||
return false;
|
||||
|
||||
list_for_each_entry(i, journal, list) {
|
||||
vstruct_for_each(&i->j, entry) {
|
||||
if (entry->type == BCH_JSET_ENTRY_btree_root)
|
||||
continue;
|
||||
|
||||
if (entry->type == BCH_JSET_ENTRY_btree_keys &&
|
||||
!entry->u64s)
|
||||
continue;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int bch2_fs_recovery(struct bch_fs *c)
|
||||
{
|
||||
const char *err = "cannot allocate memory";
|
||||
struct bch_sb_field_clean *clean = NULL, *sb_clean = NULL;
|
||||
LIST_HEAD(journal);
|
||||
struct jset *j = NULL;
|
||||
unsigned i;
|
||||
int ret;
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
if (!bch2_sb_get_replicas(c->disk_sb.sb)) {
|
||||
bch_info(c, "building replicas info");
|
||||
set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
|
||||
}
|
||||
|
||||
if (c->sb.clean)
|
||||
sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
|
||||
if (sb_clean) {
|
||||
clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
|
||||
GFP_KERNEL);
|
||||
if (!clean) {
|
||||
ret = -ENOMEM;
|
||||
mutex_unlock(&c->sb_lock);
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
if (clean)
|
||||
bch_info(c, "recovering from clean shutdown, journal seq %llu",
|
||||
le64_to_cpu(clean->journal_seq));
|
||||
|
||||
if (!clean || !c->opts.nofsck) {
|
||||
ret = bch2_journal_read(c, &journal);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
j = &list_entry(journal.prev, struct journal_replay, list)->j;
|
||||
} else {
|
||||
ret = bch2_journal_set_seq(c,
|
||||
le64_to_cpu(clean->journal_seq),
|
||||
le64_to_cpu(clean->journal_seq));
|
||||
BUG_ON(ret);
|
||||
}
|
||||
|
||||
ret = verify_superblock_clean(c, clean, j);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
fsck_err_on(clean && !journal_empty(&journal), c,
|
||||
"filesystem marked clean but journal not empty");
|
||||
|
||||
if (clean) {
|
||||
c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
|
||||
c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);
|
||||
} else {
|
||||
c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock);
|
||||
c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock);
|
||||
}
|
||||
|
||||
for (i = 0; i < BTREE_ID_NR; i++) {
|
||||
unsigned level;
|
||||
struct bkey_i *k;
|
||||
|
||||
k = btree_root_find(c, clean, j, i, &level);
|
||||
if (!k)
|
||||
continue;
|
||||
|
||||
err = "invalid btree root pointer";
|
||||
if (IS_ERR(k))
|
||||
goto err;
|
||||
|
||||
err = "error reading btree root";
|
||||
if (bch2_btree_root_read(c, i, k, level)) {
|
||||
if (i != BTREE_ID_ALLOC)
|
||||
goto err;
|
||||
|
||||
mustfix_fsck_err(c, "error reading btree root");
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < BTREE_ID_NR; i++)
|
||||
if (!c->btree_roots[i].b)
|
||||
bch2_btree_root_alloc(c, i);
|
||||
|
||||
err = "error reading allocation information";
|
||||
ret = bch2_alloc_read(c, &journal);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
|
||||
|
||||
bch_verbose(c, "starting mark and sweep:");
|
||||
err = "error in recovery";
|
||||
ret = bch2_initial_gc(c, &journal);
|
||||
if (ret)
|
||||
goto err;
|
||||
bch_verbose(c, "mark and sweep done");
|
||||
|
||||
if (c->opts.noreplay)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* Mark dirty before journal replay, fsck:
|
||||
* XXX: after a clean shutdown, this could be done lazily only when fsck
|
||||
* finds an error
|
||||
*/
|
||||
bch2_fs_mark_clean(c, false);
|
||||
|
||||
/*
|
||||
* bch2_fs_journal_start() can't happen sooner, or btree_gc_finish()
|
||||
* will give spurious errors about oldest_gen > bucket_gen -
|
||||
* this is a hack but oh well.
|
||||
*/
|
||||
bch2_fs_journal_start(&c->journal);
|
||||
|
||||
err = "error starting allocator";
|
||||
ret = bch2_fs_allocator_start(c);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
bch_verbose(c, "starting journal replay:");
|
||||
err = "journal replay failed";
|
||||
ret = bch2_journal_replay(c, &journal);
|
||||
if (ret)
|
||||
goto err;
|
||||
bch_verbose(c, "journal replay done");
|
||||
|
||||
if (c->opts.norecovery)
|
||||
goto out;
|
||||
|
||||
err = "error in fsck";
|
||||
ret = bch2_fsck(c);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
if (enabled_qtypes(c)) {
|
||||
bch_verbose(c, "reading quotas:");
|
||||
ret = bch2_fs_quota_read(c);
|
||||
if (ret)
|
||||
goto err;
|
||||
bch_verbose(c, "quotas done");
|
||||
}
|
||||
|
||||
out:
|
||||
bch2_journal_entries_free(&journal);
|
||||
kfree(clean);
|
||||
return ret;
|
||||
err:
|
||||
fsck_err:
|
||||
BUG_ON(!ret);
|
||||
goto out;
|
||||
}
|
||||
|
||||
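/*
 * Creates a brand new filesystem: initial gc, journal buckets and btree
 * roots for every btree, then the root and lost+found inodes and the
 * lost+found dirent, before writing the first journal entry and marking the
 * superblock initialized and no longer clean.
 */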
int bch2_fs_initialize(struct bch_fs *c)
|
||||
{
|
||||
struct bch_inode_unpacked root_inode, lostfound_inode;
|
||||
struct bkey_inode_buf packed_inode;
|
||||
struct bch_hash_info root_hash_info;
|
||||
struct qstr lostfound = QSTR("lost+found");
|
||||
const char *err = "cannot allocate memory";
|
||||
struct bch_dev *ca;
|
||||
LIST_HEAD(journal);
|
||||
unsigned i;
|
||||
int ret;
|
||||
|
||||
bch_notice(c, "initializing new filesystem");
|
||||
|
||||
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
|
||||
|
||||
ret = bch2_initial_gc(c, &journal);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
err = "unable to allocate journal buckets";
|
||||
for_each_online_member(ca, c, i)
|
||||
if (bch2_dev_journal_alloc(ca)) {
|
||||
percpu_ref_put(&ca->io_ref);
|
||||
goto err;
|
||||
}
|
||||
|
||||
for (i = 0; i < BTREE_ID_NR; i++)
|
||||
bch2_btree_root_alloc(c, i);
|
||||
|
||||
/*
|
||||
* journal_res_get() will crash if called before this has
|
||||
* set up the journal.pin FIFO and journal.cur pointer:
|
||||
*/
|
||||
bch2_fs_journal_start(&c->journal);
|
||||
bch2_journal_set_replay_done(&c->journal);
|
||||
|
||||
err = "error starting allocator";
|
||||
ret = bch2_fs_allocator_start(c);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
bch2_inode_init(c, &root_inode, 0, 0,
|
||||
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
|
||||
root_inode.bi_inum = BCACHEFS_ROOT_INO;
|
||||
root_inode.bi_nlink++; /* lost+found */
|
||||
bch2_inode_pack(&packed_inode, &root_inode);
|
||||
|
||||
err = "error creating root directory";
|
||||
ret = bch2_btree_insert(c, BTREE_ID_INODES,
|
||||
&packed_inode.inode.k_i,
|
||||
NULL, NULL, NULL, 0);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
bch2_inode_init(c, &lostfound_inode, 0, 0,
|
||||
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0,
|
||||
&root_inode);
|
||||
lostfound_inode.bi_inum = BCACHEFS_ROOT_INO + 1;
|
||||
bch2_inode_pack(&packed_inode, &lostfound_inode);
|
||||
|
||||
err = "error creating lost+found";
|
||||
ret = bch2_btree_insert(c, BTREE_ID_INODES,
|
||||
&packed_inode.inode.k_i,
|
||||
NULL, NULL, NULL, 0);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
root_hash_info = bch2_hash_info_init(c, &root_inode);
|
||||
|
||||
ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR,
|
||||
&lostfound, lostfound_inode.bi_inum, NULL,
|
||||
BTREE_INSERT_NOFAIL);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
atomic_long_set(&c->nr_inodes, 2);
|
||||
|
||||
if (enabled_qtypes(c)) {
|
||||
ret = bch2_fs_quota_read(c);
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
|
||||
err = "error writing first journal entry";
|
||||
ret = bch2_journal_meta(&c->journal);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
|
||||
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
|
||||
|
||||
bch2_write_super(c);
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
return 0;
|
||||
err:
|
||||
BUG_ON(!ret);
|
||||
return ret;
|
||||
}
|
8
fs/bcachefs/recovery.h
Normal file
8
fs/bcachefs/recovery.h
Normal file
@ -0,0 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_RECOVERY_H
#define _BCACHEFS_RECOVERY_H

int bch2_fs_recovery(struct bch_fs *);
int bch2_fs_initialize(struct bch_fs *);

#endif /* _BCACHEFS_RECOVERY_H */

698
fs/bcachefs/replicas.c
Normal file
698
fs/bcachefs/replicas.c
Normal file
@ -0,0 +1,698 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include "bcachefs.h"
|
||||
#include "replicas.h"
|
||||
#include "super-io.h"
|
||||
|
||||
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
|
||||
struct bch_replicas_cpu *);
|
||||
|
||||
/* Replicas tracking - in memory: */
|
||||
|
||||
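/*
 * In-memory representation: each entry is a data type plus a bitmap of
 * member devices, packed into entry_size bytes; the table is kept in
 * eytzinger order so lookups can use eytzinger0_find().
 */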
#define for_each_cpu_replicas_entry(_r, _i) \
|
||||
for (_i = (_r)->entries; \
|
||||
(void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
|
||||
_i = (void *) (_i) + (_r)->entry_size)
|
||||
|
||||
static inline struct bch_replicas_cpu_entry *
|
||||
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
|
||||
{
|
||||
return (void *) r->entries + r->entry_size * i;
|
||||
}
|
||||
|
||||
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
|
||||
{
|
||||
eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
|
||||
}
|
||||
|
||||
static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
|
||||
unsigned dev)
|
||||
{
|
||||
return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0;
|
||||
}
|
||||
|
||||
static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
|
||||
unsigned dev)
|
||||
{
|
||||
e->devs[dev >> 3] |= 1 << (dev & 7);
|
||||
}
|
||||
|
||||
static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
|
||||
{
|
||||
return (r->entry_size -
|
||||
offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
|
||||
}
|
||||
|
||||
int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *r,
|
||||
char *buf, size_t size)
|
||||
{
|
||||
char *out = buf, *end = out + size;
|
||||
struct bch_replicas_cpu_entry *e;
|
||||
bool first = true;
|
||||
unsigned i;
|
||||
|
||||
for_each_cpu_replicas_entry(r, e) {
|
||||
bool first_e = true;
|
||||
|
||||
if (!first)
|
||||
out += scnprintf(out, end - out, " ");
|
||||
first = false;
|
||||
|
||||
out += scnprintf(out, end - out, "%u: [", e->data_type);
|
||||
|
||||
for (i = 0; i < replicas_dev_slots(r); i++)
|
||||
if (replicas_test_dev(e, i)) {
|
||||
if (!first_e)
|
||||
out += scnprintf(out, end - out, " ");
|
||||
first_e = false;
|
||||
out += scnprintf(out, end - out, "%u", i);
|
||||
}
|
||||
out += scnprintf(out, end - out, "]");
|
||||
}
|
||||
|
||||
return out - buf;
|
||||
}
|
||||
|
||||
static inline unsigned bkey_to_replicas(struct bkey_s_c_extent e,
|
||||
enum bch_data_type data_type,
|
||||
struct bch_replicas_cpu_entry *r,
|
||||
unsigned *max_dev)
|
||||
{
|
||||
const struct bch_extent_ptr *ptr;
|
||||
unsigned nr = 0;
|
||||
|
||||
BUG_ON(!data_type ||
|
||||
data_type == BCH_DATA_SB ||
|
||||
data_type >= BCH_DATA_NR);
|
||||
|
||||
memset(r, 0, sizeof(*r));
|
||||
r->data_type = data_type;
|
||||
|
||||
*max_dev = 0;
|
||||
|
||||
extent_for_each_ptr(e, ptr)
|
||||
if (!ptr->cached) {
|
||||
*max_dev = max_t(unsigned, *max_dev, ptr->dev);
|
||||
replicas_set_dev(r, ptr->dev);
|
||||
nr++;
|
||||
}
|
||||
return nr;
|
||||
}
|
||||
|
||||
static inline void devlist_to_replicas(struct bch_devs_list devs,
|
||||
enum bch_data_type data_type,
|
||||
struct bch_replicas_cpu_entry *r,
|
||||
unsigned *max_dev)
|
||||
{
|
||||
unsigned i;
|
||||
|
||||
BUG_ON(!data_type ||
|
||||
data_type == BCH_DATA_SB ||
|
||||
data_type >= BCH_DATA_NR);
|
||||
|
||||
memset(r, 0, sizeof(*r));
|
||||
r->data_type = data_type;
|
||||
|
||||
*max_dev = 0;
|
||||
|
||||
for (i = 0; i < devs.nr; i++) {
|
||||
*max_dev = max_t(unsigned, *max_dev, devs.devs[i]);
|
||||
replicas_set_dev(r, devs.devs[i]);
|
||||
}
|
||||
}
|
||||
|
||||
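/*
 * Returns a newly allocated copy of the table with new_entry appended,
 * widening entry_size if the new entry references a higher device index
 * than any existing one, then re-sorts; callers free the old table (via
 * kfree_rcu() once the new one is published).
 */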
static struct bch_replicas_cpu *
|
||||
cpu_replicas_add_entry(struct bch_replicas_cpu *old,
|
||||
struct bch_replicas_cpu_entry new_entry,
|
||||
unsigned max_dev)
|
||||
{
|
||||
struct bch_replicas_cpu *new;
|
||||
unsigned i, nr, entry_size;
|
||||
|
||||
entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
|
||||
DIV_ROUND_UP(max_dev + 1, 8);
|
||||
entry_size = max(entry_size, old->entry_size);
|
||||
nr = old->nr + 1;
|
||||
|
||||
new = kzalloc(sizeof(struct bch_replicas_cpu) +
|
||||
nr * entry_size, GFP_NOIO);
|
||||
if (!new)
|
||||
return NULL;
|
||||
|
||||
new->nr = nr;
|
||||
new->entry_size = entry_size;
|
||||
|
||||
for (i = 0; i < old->nr; i++)
|
||||
memcpy(cpu_replicas_entry(new, i),
|
||||
cpu_replicas_entry(old, i),
|
||||
min(new->entry_size, old->entry_size));
|
||||
|
||||
memcpy(cpu_replicas_entry(new, old->nr),
|
||||
&new_entry,
|
||||
new->entry_size);
|
||||
|
||||
bch2_cpu_replicas_sort(new);
|
||||
return new;
|
||||
}
|
||||
|
||||
static bool replicas_has_entry(struct bch_replicas_cpu *r,
|
||||
struct bch_replicas_cpu_entry search,
|
||||
unsigned max_dev)
|
||||
{
|
||||
return max_dev < replicas_dev_slots(r) &&
|
||||
eytzinger0_find(r->entries, r->nr,
|
||||
r->entry_size,
|
||||
memcmp, &search) < r->nr;
|
||||
}
|
||||
|
||||
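/*
 * Slow path for marking a replicas entry that isn't in the table yet: under
 * sb_lock, build new copies of the current table and (if active) the GC
 * table, persist the new superblock replicas section first, and only then
 * publish the new in-memory tables via rcu_assign_pointer().
 */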
noinline
|
||||
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
|
||||
struct bch_replicas_cpu_entry new_entry,
|
||||
unsigned max_dev)
|
||||
{
|
||||
struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r = NULL;
|
||||
int ret = -ENOMEM;
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
|
||||
old_gc = rcu_dereference_protected(c->replicas_gc,
|
||||
lockdep_is_held(&c->sb_lock));
|
||||
if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) {
|
||||
new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev);
|
||||
if (!new_gc)
|
||||
goto err;
|
||||
}
|
||||
|
||||
old_r = rcu_dereference_protected(c->replicas,
|
||||
lockdep_is_held(&c->sb_lock));
|
||||
if (!replicas_has_entry(old_r, new_entry, max_dev)) {
|
||||
new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev);
|
||||
if (!new_r)
|
||||
goto err;
|
||||
|
||||
ret = bch2_cpu_replicas_to_sb_replicas(c, new_r);
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
|
||||
/* allocations done, now commit: */
|
||||
|
||||
if (new_r)
|
||||
bch2_write_super(c);
|
||||
|
||||
/* don't update in memory replicas until changes are persistent */
|
||||
|
||||
if (new_gc) {
|
||||
rcu_assign_pointer(c->replicas_gc, new_gc);
|
||||
kfree_rcu(old_gc, rcu);
|
||||
}
|
||||
|
||||
if (new_r) {
|
||||
rcu_assign_pointer(c->replicas, new_r);
|
||||
kfree_rcu(old_r, rcu);
|
||||
}
|
||||
|
||||
mutex_unlock(&c->sb_lock);
|
||||
return 0;
|
||||
err:
|
||||
mutex_unlock(&c->sb_lock);
|
||||
kfree(new_gc);
|
||||
kfree(new_r);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int bch2_mark_replicas(struct bch_fs *c,
|
||||
enum bch_data_type data_type,
|
||||
struct bch_devs_list devs)
|
||||
{
|
||||
struct bch_replicas_cpu_entry search;
|
||||
struct bch_replicas_cpu *r, *gc_r;
|
||||
unsigned max_dev;
|
||||
bool marked;
|
||||
|
||||
if (!devs.nr)
|
||||
return 0;
|
||||
|
||||
BUG_ON(devs.nr >= BCH_REPLICAS_MAX);
|
||||
|
||||
devlist_to_replicas(devs, data_type, &search, &max_dev);
|
||||
|
||||
rcu_read_lock();
|
||||
r = rcu_dereference(c->replicas);
|
||||
gc_r = rcu_dereference(c->replicas_gc);
|
||||
marked = replicas_has_entry(r, search, max_dev) &&
|
||||
(!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev));
|
||||
rcu_read_unlock();
|
||||
|
||||
return likely(marked) ? 0
|
||||
: bch2_mark_replicas_slowpath(c, search, max_dev);
|
||||
}
|
||||
|
||||
int bch2_mark_bkey_replicas(struct bch_fs *c,
|
||||
enum bch_data_type data_type,
|
||||
struct bkey_s_c k)
|
||||
{
|
||||
struct bch_devs_list cached = bch2_bkey_cached_devs(k);
|
||||
unsigned i;
|
||||
int ret;
|
||||
|
||||
for (i = 0; i < cached.nr; i++)
|
||||
if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
|
||||
bch2_dev_list_single(cached.devs[i]))))
|
||||
return ret;
|
||||
|
||||
return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k));
|
||||
}
|
||||
|
||||
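/*
 * Replicas GC: gc_start() snapshots the current table minus the data types
 * being GC'd into replicas_gc; entries marked while GC runs are added to
 * both tables, and gc_end() persists the pruned GC table and installs it as
 * the authoritative one.
 */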
int bch2_replicas_gc_end(struct bch_fs *c, int ret)
|
||||
{
|
||||
struct bch_replicas_cpu *new_r, *old_r;
|
||||
|
||||
lockdep_assert_held(&c->replicas_gc_lock);
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
|
||||
new_r = rcu_dereference_protected(c->replicas_gc,
|
||||
lockdep_is_held(&c->sb_lock));
|
||||
rcu_assign_pointer(c->replicas_gc, NULL);
|
||||
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
|
||||
ret = -ENOSPC;
|
||||
goto err;
|
||||
}
|
||||
|
||||
bch2_write_super(c);
|
||||
|
||||
/* don't update in memory replicas until changes are persistent */
|
||||
|
||||
old_r = rcu_dereference_protected(c->replicas,
|
||||
lockdep_is_held(&c->sb_lock));
|
||||
|
||||
rcu_assign_pointer(c->replicas, new_r);
|
||||
kfree_rcu(old_r, rcu);
|
||||
out:
|
||||
mutex_unlock(&c->sb_lock);
|
||||
return ret;
|
||||
err:
|
||||
kfree_rcu(new_r, rcu);
|
||||
goto out;
|
||||
}
|
||||
|
||||
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
|
||||
{
|
||||
struct bch_replicas_cpu *dst, *src;
|
||||
struct bch_replicas_cpu_entry *e;
|
||||
|
||||
lockdep_assert_held(&c->replicas_gc_lock);
|
||||
|
||||
mutex_lock(&c->sb_lock);
|
||||
BUG_ON(c->replicas_gc);
|
||||
|
||||
src = rcu_dereference_protected(c->replicas,
|
||||
lockdep_is_held(&c->sb_lock));
|
||||
|
||||
dst = kzalloc(sizeof(struct bch_replicas_cpu) +
|
||||
src->nr * src->entry_size, GFP_NOIO);
|
||||
if (!dst) {
|
||||
mutex_unlock(&c->sb_lock);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
dst->nr = 0;
|
||||
dst->entry_size = src->entry_size;
|
||||
|
||||
for_each_cpu_replicas_entry(src, e)
|
||||
if (!((1 << e->data_type) & typemask))
|
||||
memcpy(cpu_replicas_entry(dst, dst->nr++),
|
||||
e, dst->entry_size);
|
||||
|
||||
bch2_cpu_replicas_sort(dst);
|
||||
|
||||
rcu_assign_pointer(c->replicas_gc, dst);
|
||||
mutex_unlock(&c->sb_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Replicas tracking - superblock: */
|
||||
|
||||
static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
|
||||
unsigned *nr,
|
||||
unsigned *bytes,
|
||||
unsigned *max_dev)
|
||||
{
|
||||
struct bch_replicas_entry *i;
|
||||
unsigned j;
|
||||
|
||||
*nr = 0;
|
||||
*bytes = sizeof(*r);
|
||||
*max_dev = 0;
|
||||
|
||||
if (!r)
|
||||
return;
|
||||
|
||||
for_each_replicas_entry(r, i) {
|
||||
for (j = 0; j < i->nr; j++)
|
||||
*max_dev = max_t(unsigned, *max_dev, i->devs[j]);
|
||||
(*nr)++;
|
||||
}
|
||||
|
||||
*bytes = (void *) i - (void *) r;
|
||||
}
|
||||
|
||||
static struct bch_replicas_cpu *
|
||||
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
|
||||
{
|
||||
struct bch_replicas_cpu *cpu_r;
|
||||
unsigned i, nr, bytes, max_dev, entry_size;
|
||||
|
||||
bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
|
||||
|
||||
entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
|
||||
DIV_ROUND_UP(max_dev + 1, 8);
|
||||
|
||||
cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) +
|
||||
nr * entry_size, GFP_NOIO);
|
||||
if (!cpu_r)
|
||||
return NULL;
|
||||
|
||||
cpu_r->nr = nr;
|
||||
cpu_r->entry_size = entry_size;
|
||||
|
||||
if (nr) {
|
||||
struct bch_replicas_cpu_entry *dst =
|
||||
cpu_replicas_entry(cpu_r, 0);
|
||||
struct bch_replicas_entry *src = sb_r->entries;
|
||||
|
||||
while (dst < cpu_replicas_entry(cpu_r, nr)) {
|
||||
dst->data_type = src->data_type;
|
||||
for (i = 0; i < src->nr; i++)
|
||||
replicas_set_dev(dst, src->devs[i]);
|
||||
|
||||
src = replicas_entry_next(src);
|
||||
dst = (void *) dst + entry_size;
|
||||
}
|
||||
}
|
||||
|
||||
bch2_cpu_replicas_sort(cpu_r);
|
||||
return cpu_r;
|
||||
}
|
||||
|
||||
int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
|
||||
{
|
||||
struct bch_sb_field_replicas *sb_r;
|
||||
struct bch_replicas_cpu *cpu_r, *old_r;
|
||||
|
||||
sb_r = bch2_sb_get_replicas(c->disk_sb.sb);
|
||||
cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
|
||||
if (!cpu_r)
|
||||
return -ENOMEM;
|
||||
|
||||
old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock));
|
||||
rcu_assign_pointer(c->replicas, cpu_r);
|
||||
if (old_r)
|
||||
kfree_rcu(old_r, rcu);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
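/*
 * Inverse of __bch2_sb_replicas_to_cpu_replicas(): sizes the superblock
 * replicas section from the in-memory table (a variable-length entry per
 * replicas entry, one byte per member device) and rewrites it; returns
 * -ENOSPC if the superblock section can't be resized.
 */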
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
|
||||
struct bch_replicas_cpu *r)
|
||||
{
|
||||
struct bch_sb_field_replicas *sb_r;
|
||||
struct bch_replicas_entry *sb_e;
|
||||
struct bch_replicas_cpu_entry *e;
|
||||
size_t i, bytes;
|
||||
|
||||
bytes = sizeof(struct bch_sb_field_replicas);
|
||||
|
||||
for_each_cpu_replicas_entry(r, e) {
|
||||
bytes += sizeof(struct bch_replicas_entry);
|
||||
for (i = 0; i < r->entry_size - 1; i++)
|
||||
bytes += hweight8(e->devs[i]);
|
||||
}
|
||||
|
||||
sb_r = bch2_sb_resize_replicas(&c->disk_sb,
|
||||
DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
|
||||
if (!sb_r)
|
||||
return -ENOSPC;
|
||||
|
||||
memset(&sb_r->entries, 0,
|
||||
vstruct_end(&sb_r->field) -
|
||||
(void *) &sb_r->entries);
|
||||
|
||||
sb_e = sb_r->entries;
|
||||
for_each_cpu_replicas_entry(r, e) {
|
||||
sb_e->data_type = e->data_type;
|
||||
|
||||
for (i = 0; i < replicas_dev_slots(r); i++)
|
||||
if (replicas_test_dev(e, i))
|
||||
sb_e->devs[sb_e->nr++] = i;
|
||||
|
||||
sb_e = replicas_entry_next(sb_e);
|
||||
|
||||
BUG_ON((void *) sb_e > vstruct_end(&sb_r->field));
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f)
|
||||
{
|
||||
struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
|
||||
struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
|
||||
struct bch_replicas_cpu *cpu_r = NULL;
|
||||
struct bch_replicas_entry *e;
|
||||
const char *err;
|
||||
unsigned i;
|
||||
|
||||
for_each_replicas_entry(sb_r, e) {
|
||||
err = "invalid replicas entry: invalid data type";
|
||||
if (e->data_type >= BCH_DATA_NR)
|
||||
goto err;
|
||||
|
||||
err = "invalid replicas entry: no devices";
|
||||
if (!e->nr)
|
||||
goto err;
|
||||
|
||||
err = "invalid replicas entry: too many devices";
|
||||
if (e->nr >= BCH_REPLICAS_MAX)
|
||||
goto err;
|
||||
|
||||
err = "invalid replicas entry: invalid device";
|
||||
for (i = 0; i < e->nr; i++)
|
||||
if (!bch2_dev_exists(sb, mi, e->devs[i]))
|
||||
goto err;
|
||||
}
|
||||
|
||||
err = "cannot allocate memory";
|
||||
cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
|
||||
if (!cpu_r)
|
||||
goto err;
|
||||
|
||||
sort_cmp_size(cpu_r->entries,
|
||||
cpu_r->nr,
|
||||
cpu_r->entry_size,
|
||||
memcmp, NULL);
|
||||
|
||||
for (i = 0; i + 1 < cpu_r->nr; i++) {
|
||||
struct bch_replicas_cpu_entry *l =
|
||||
cpu_replicas_entry(cpu_r, i);
|
||||
struct bch_replicas_cpu_entry *r =
|
||||
cpu_replicas_entry(cpu_r, i + 1);
|
||||
|
||||
BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
|
||||
|
||||
err = "duplicate replicas entry";
|
||||
if (!memcmp(l, r, cpu_r->entry_size))
|
||||
goto err;
|
||||
}
|
||||
|
||||
err = NULL;
|
||||
err:
|
||||
kfree(cpu_r);
|
||||
return err;
|
||||
}
|
||||
|
||||
const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
|
||||
.validate = bch2_sb_validate_replicas,
|
||||
};
|
||||
|
||||
int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t size)
|
||||
{
|
||||
char *out = buf, *end = out + size;
|
||||
struct bch_replicas_entry *e;
|
||||
bool first = true;
|
||||
unsigned i;
|
||||
|
||||
if (!r) {
|
||||
out += scnprintf(out, end - out, "(no replicas section found)");
|
||||
return out - buf;
|
||||
}
|
||||
|
||||
for_each_replicas_entry(r, e) {
|
||||
if (!first)
|
||||
out += scnprintf(out, end - out, " ");
|
||||
first = false;
|
||||
|
||||
out += scnprintf(out, end - out, "%u: [", e->data_type);
|
||||
|
||||
for (i = 0; i < e->nr; i++)
|
||||
out += scnprintf(out, end - out,
|
||||
i ? " %u" : "%u", e->devs[i]);
|
||||
out += scnprintf(out, end - out, "]");
|
||||
}
|
||||
|
||||
return out - buf;
|
||||
}
|
||||
|
||||
/* Query replicas: */
|
||||
|
||||
bool bch2_replicas_marked(struct bch_fs *c,
|
||||
enum bch_data_type data_type,
|
||||
struct bch_devs_list devs)
|
||||
{
|
||||
struct bch_replicas_cpu_entry search;
|
||||
unsigned max_dev;
|
||||
bool ret;
|
||||
|
||||
if (!devs.nr)
|
||||
return true;
|
||||
|
||||
devlist_to_replicas(devs, data_type, &search, &max_dev);
|
||||
|
||||
rcu_read_lock();
|
||||
ret = replicas_has_entry(rcu_dereference(c->replicas),
|
||||
search, max_dev);
|
||||
rcu_read_unlock();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool bch2_bkey_replicas_marked(struct bch_fs *c,
|
||||
enum bch_data_type data_type,
|
||||
struct bkey_s_c k)
|
||||
{
|
||||
struct bch_devs_list cached = bch2_bkey_cached_devs(k);
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < cached.nr; i++)
|
||||
if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
|
||||
bch2_dev_list_single(cached.devs[i])))
|
||||
return false;
|
||||
|
||||
return bch2_replicas_marked(c, data_type, bch2_bkey_dirty_devs(k));
|
||||
}
|
||||
|
||||
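/*
 * For each data type this records the minimum number of online devices and
 * the maximum number of offline devices over all replicas entries of that
 * type - the worst case that bch2_have_enough_devs() then checks against
 * the degraded-mount flags.
 */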
struct replicas_status __bch2_replicas_status(struct bch_fs *c,
|
||||
struct bch_devs_mask online_devs)
|
||||
{
|
||||
struct bch_sb_field_members *mi;
|
||||
struct bch_replicas_cpu_entry *e;
|
||||
struct bch_replicas_cpu *r;
|
||||
unsigned i, dev, dev_slots, nr_online, nr_offline;
|
||||
struct replicas_status ret;
|
||||
|
||||
memset(&ret, 0, sizeof(ret));
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
|
||||
ret.replicas[i].nr_online = UINT_MAX;
|
||||
|
||||
mi = bch2_sb_get_members(c->disk_sb.sb);
|
||||
rcu_read_lock();
|
||||
|
||||
r = rcu_dereference(c->replicas);
|
||||
dev_slots = replicas_dev_slots(r);
|
||||
|
||||
for_each_cpu_replicas_entry(r, e) {
|
||||
if (e->data_type >= ARRAY_SIZE(ret.replicas))
|
||||
panic("e %p data_type %u\n", e, e->data_type);
|
||||
|
||||
nr_online = nr_offline = 0;
|
||||
|
||||
for (dev = 0; dev < dev_slots; dev++) {
|
||||
if (!replicas_test_dev(e, dev))
|
||||
continue;
|
||||
|
||||
BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, dev));
|
||||
|
||||
if (test_bit(dev, online_devs.d))
|
||||
nr_online++;
|
||||
else
|
||||
nr_offline++;
|
||||
}
|
||||
|
||||
ret.replicas[e->data_type].nr_online =
|
||||
min(ret.replicas[e->data_type].nr_online,
|
||||
nr_online);
|
||||
|
||||
ret.replicas[e->data_type].nr_offline =
|
||||
max(ret.replicas[e->data_type].nr_offline,
|
||||
nr_offline);
|
||||
}
|
||||
|
||||
rcu_read_unlock();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct replicas_status bch2_replicas_status(struct bch_fs *c)
|
||||
{
|
||||
return __bch2_replicas_status(c, bch2_online_devs(c));
|
||||
}
|
||||
|
||||
static bool have_enough_devs(struct replicas_status s,
|
||||
enum bch_data_type type,
|
||||
bool force_if_degraded,
|
||||
bool force_if_lost)
|
||||
{
|
||||
return (!s.replicas[type].nr_offline || force_if_degraded) &&
|
||||
(s.replicas[type].nr_online || force_if_lost);
|
||||
}
|
||||
|
||||
bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
|
||||
{
|
||||
return (have_enough_devs(s, BCH_DATA_JOURNAL,
|
||||
flags & BCH_FORCE_IF_METADATA_DEGRADED,
|
||||
flags & BCH_FORCE_IF_METADATA_LOST) &&
|
||||
have_enough_devs(s, BCH_DATA_BTREE,
|
||||
flags & BCH_FORCE_IF_METADATA_DEGRADED,
|
||||
flags & BCH_FORCE_IF_METADATA_LOST) &&
|
||||
have_enough_devs(s, BCH_DATA_USER,
|
||||
flags & BCH_FORCE_IF_DATA_DEGRADED,
|
||||
flags & BCH_FORCE_IF_DATA_LOST));
|
||||
}
|
||||
|
||||
unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
|
||||
{
|
||||
struct replicas_status s = bch2_replicas_status(c);
|
||||
|
||||
return meta
|
||||
? min(s.replicas[BCH_DATA_JOURNAL].nr_online,
|
||||
s.replicas[BCH_DATA_BTREE].nr_online)
|
||||
: s.replicas[BCH_DATA_USER].nr_online;
|
||||
}
|
||||
|
||||
unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
|
||||
{
|
||||
struct bch_replicas_cpu_entry *e;
|
||||
struct bch_replicas_cpu *r;
|
||||
unsigned ret = 0;
|
||||
|
||||
rcu_read_lock();
|
||||
r = rcu_dereference(c->replicas);
|
||||
|
||||
if (ca->dev_idx >= replicas_dev_slots(r))
|
||||
goto out;
|
||||
|
||||
for_each_cpu_replicas_entry(r, e)
|
||||
if (replicas_test_dev(e, ca->dev_idx))
|
||||
ret |= 1 << e->data_type;
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
|
||||
return ret;
|
||||
}
|