// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_buf.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "disk_groups.h"
#include "inode.h"
#include "io.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
#include "replicas.h"
#include "super-io.h"
#include "trace.h"

#include <linux/ioprio.h>
#include <linux/kthread.h>

#define SECTORS_IN_FLIGHT_PER_DEVICE 2048

struct moving_io {
	struct list_head	list;
	struct closure		cl;
	bool			read_completed;

	unsigned		read_sectors;
	unsigned		write_sectors;

	struct bch_read_bio	rbio;

	struct migrate_write	write;
	/* Must be last since it is variable size */
	struct bio_vec		bi_inline_vecs[0];
};

struct moving_context {
	/* Closure for waiting on all reads and writes to complete */
	struct closure		cl;

	struct bch_move_stats	*stats;

	struct list_head	reads;

	/* in flight sectors: */
	atomic_t		read_sectors;
	atomic_t		write_sectors;

	wait_queue_head_t	wait;
};

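/*
 * bch2_migrate_index_update() - once data has been written to its new
 * location, update the extent btree so the key points at the new copy.
 * If the extent changed while the move was in flight (version mismatch, or
 * it no longer points at the location we read from), the update is dropped
 * and accounted as raced.
 */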
int bch2_migrate_index_update(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct btree_trans trans;
	struct btree_iter *iter;
	struct migrate_write *m =
		container_of(op, struct migrate_write, op);
	struct keylist *keys = &op->insert_keys;
	struct bkey_buf _new, _insert;
	int ret = 0;

	bch2_bkey_buf_init(&_new);
	bch2_bkey_buf_init(&_insert);
	bch2_bkey_buf_realloc(&_insert, c, U8_MAX);

	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);

	iter = bch2_trans_get_iter(&trans, m->btree_id,
				   bkey_start_pos(&bch2_keylist_front(keys)->k),
				   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);

	while (1) {
		struct bkey_s_c k;
		struct bkey_i *insert;
		struct bkey_i_extent *new;
		const union bch_extent_entry *entry;
		struct extent_ptr_decoded p;
		bool did_work = false;
		bool extending = false, should_check_enospc;
		s64 i_sectors_delta = 0, disk_sectors_delta = 0;

		bch2_trans_reset(&trans, 0);

		k = bch2_btree_iter_peek_slot(iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		new = bkey_i_to_extent(bch2_keylist_front(keys));

		if (bversion_cmp(k.k->version, new->k.version) ||
		    !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset))
			goto nomatch;

		bkey_reassemble(_insert.k, k);
		insert = _insert.k;

		bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
		new = bkey_i_to_extent(_new.k);
		bch2_cut_front(iter->pos, &new->k_i);

		bch2_cut_front(iter->pos, insert);
		bch2_cut_back(new->k.p, insert);
		bch2_cut_back(insert->k.p, &new->k_i);

		if (m->data_cmd == DATA_REWRITE) {
			struct bch_extent_ptr *new_ptr, *old_ptr = (void *)
				bch2_bkey_has_device(bkey_i_to_s_c(insert),
						     m->data_opts.rewrite_dev);
			if (!old_ptr)
				goto nomatch;

			if (old_ptr->cached)
				extent_for_each_ptr(extent_i_to_s(new), new_ptr)
					new_ptr->cached = true;

			bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr);
		}

		extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) {
			if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) {
				/*
				 * raced with another move op? extent already
				 * has a pointer to the device we just wrote
				 * data to
				 */
				continue;
			}

			bch2_extent_ptr_decoded_append(insert, &p);
			did_work = true;
		}

		if (!did_work)
			goto nomatch;

		bch2_bkey_narrow_crcs(insert,
				(struct bch_extent_crc_unpacked) { 0 });
		bch2_extent_normalize(c, bkey_i_to_s(insert));
		bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert),
					       op->opts.background_target,
					       op->opts.data_replicas);

		ret = bch2_sum_sector_overwrites(&trans, iter, insert,
						 &extending,
						 &should_check_enospc,
						 &i_sectors_delta,
						 &disk_sectors_delta);
		if (ret)
			goto err;

		if (disk_sectors_delta > (s64) op->res.sectors) {
			ret = bch2_disk_reservation_add(c, &op->res,
						disk_sectors_delta - op->res.sectors,
						!should_check_enospc
						? BCH_DISK_RESERVATION_NOFAIL : 0);
			if (ret)
				goto out;
		}

		bch2_trans_update(&trans, iter, insert, 0);

		ret = bch2_trans_commit(&trans, &op->res,
				op_journal_seq(op),
				BTREE_INSERT_NOFAIL|
				m->data_opts.btree_insert_flags);
err:
		if (!ret)
			atomic_long_inc(&c->extent_migrate_done);
		if (ret == -EINTR)
			ret = 0;
		if (ret)
			break;
next:
		while (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) >= 0) {
			bch2_keylist_pop_front(keys);
			if (bch2_keylist_empty(keys))
				goto out;
		}
		continue;
nomatch:
		if (m->ctxt) {
			BUG_ON(k.k->p.offset <= iter->pos.offset);
			atomic64_inc(&m->ctxt->stats->keys_raced);
			atomic64_add(k.k->p.offset - iter->pos.offset,
				     &m->ctxt->stats->sectors_raced);
		}
		atomic_long_inc(&c->extent_migrate_raced);
		trace_move_race(&new->k);
		bch2_btree_iter_next_slot(iter);
		goto next;
	}
out:
	bch2_trans_iter_put(&trans, iter);
	bch2_trans_exit(&trans);
	bch2_bkey_buf_exit(&_insert, c);
	bch2_bkey_buf_exit(&_new, c);
	BUG_ON(ret == -EINTR);
	return ret;
}

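/*
 * bch2_migrate_read_done() - fill in the write op from a completed read:
 * position, version, crc state and the devices that already have a copy,
 * so the data can be written back out unmodified.
 */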
void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio)
{
	/* write bio must own pages: */
	BUG_ON(!m->op.wbio.bio.bi_vcnt);

	m->ptr		= rbio->pick.ptr;
	m->offset	= rbio->data_pos.offset - rbio->pick.crc.offset;
	m->op.devs_have	= rbio->devs_have;
	m->op.pos	= rbio->data_pos;
	m->op.version	= rbio->version;
	m->op.crc	= rbio->pick.crc;
	m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;

	if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) {
		m->op.nonce	= m->op.crc.nonce + m->op.crc.offset;
		m->op.csum_type = m->op.crc.csum_type;
	}

	if (m->data_cmd == DATA_REWRITE)
		bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev);
}

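/*
 * bch2_migrate_write_init() - set up the write half of a data move according
 * to @data_cmd: compression settings, write flags, replica counts, and any
 * disk reservation needed (adding replicas, or rewriting compressed extents).
 */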
int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
			    struct write_point_specifier wp,
			    struct bch_io_opts io_opts,
			    enum data_cmd data_cmd,
			    struct data_opts data_opts,
			    enum btree_id btree_id,
			    struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	int ret;

	m->btree_id	= btree_id;
	m->data_cmd	= data_cmd;
	m->data_opts	= data_opts;
	m->nr_ptrs_reserved = 0;

	bch2_write_op_init(&m->op, c, io_opts);

	if (!bch2_bkey_is_incompressible(k))
		m->op.compression_type =
			bch2_compression_opt_to_type[io_opts.background_compression ?:
						     io_opts.compression];
	else
		m->op.incompressible = true;

	m->op.target	= data_opts.target,
	m->op.write_point = wp;

	if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) {
		m->op.alloc_reserve = RESERVE_MOVINGGC;
		m->op.flags |= BCH_WRITE_ALLOC_NOWAIT;
	} else {
		/* XXX: this should probably be passed in */
		m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
	}

	m->op.flags |= BCH_WRITE_PAGES_STABLE|
		BCH_WRITE_PAGES_OWNED|
		BCH_WRITE_DATA_ENCODED|
		BCH_WRITE_FROM_INTERNAL|
		BCH_WRITE_MOVE;

	m->op.nr_replicas	= data_opts.nr_replicas;
	m->op.nr_replicas_required = data_opts.nr_replicas;

	switch (data_cmd) {
	case DATA_ADD_REPLICAS: {
		/*
		 * DATA_ADD_REPLICAS is used for moving data to a different
		 * device in the background, and due to compression the new copy
		 * might take up more space than the old copy:
		 */
#if 0
		int nr = (int) io_opts.data_replicas -
			bch2_bkey_nr_ptrs_allocated(k);
#endif
		int nr = (int) io_opts.data_replicas;

		if (nr > 0) {
			m->op.nr_replicas = m->nr_ptrs_reserved = nr;

			ret = bch2_disk_reservation_get(c, &m->op.res,
					k.k->size, m->op.nr_replicas, 0);
			if (ret)
				return ret;
		}
		break;
	}
	case DATA_REWRITE: {
		unsigned compressed_sectors = 0;

		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
			if (p.ptr.dev == data_opts.rewrite_dev &&
			    !p.ptr.cached &&
			    crc_is_compressed(p.crc))
				compressed_sectors += p.crc.compressed_size;

		if (compressed_sectors) {
			ret = bch2_disk_reservation_add(c, &m->op.res,
					k.k->size * m->op.nr_replicas,
					BCH_DISK_RESERVATION_NOFAIL);
			if (ret)
				return ret;
		}
		break;
	}
	case DATA_PROMOTE:
		m->op.flags	|= BCH_WRITE_ALLOC_NOWAIT;
		m->op.flags	|= BCH_WRITE_CACHED;
		break;
	default:
		BUG();
	}

	return 0;
}

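/*
 * move_free() - final cleanup for a moving_io: drop the disk reservation,
 * free the bounce pages and the io itself, and wake up anyone waiting on
 * the moving_context.
 */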
static void move_free(struct closure *cl)
{
	struct moving_io *io = container_of(cl, struct moving_io, cl);
	struct moving_context *ctxt = io->write.ctxt;
	struct bvec_iter_all iter;
	struct bio_vec *bv;

	bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);

	bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter)
		if (bv->bv_page)
			__free_page(bv->bv_page);

	wake_up(&ctxt->wait);

	kfree(io);
}

static void move_write_done(struct closure *cl)
{
	struct moving_io *io = container_of(cl, struct moving_io, cl);

	atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
	closure_return_with_destructor(cl, move_free);
}

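/*
 * move_write() - called once the read for a moving_io has completed: if the
 * read failed or hit a hole just free it, otherwise set up and submit the
 * write.
 */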
static void move_write(struct closure *cl)
{
	struct moving_io *io = container_of(cl, struct moving_io, cl);

	if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
		closure_return_with_destructor(cl, move_free);
		return;
	}

	bch2_migrate_read_done(&io->write, &io->rbio);

	atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
	closure_call(&io->write.op.cl, bch2_write, NULL, cl);
	continue_at(cl, move_write_done, NULL);
}

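/*
 * Writes are issued in the same order the reads were started: return the
 * oldest read on the list, but only if it has completed.
 */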
static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
{
	struct moving_io *io =
		list_first_entry_or_null(&ctxt->reads, struct moving_io, list);

	return io && io->read_completed ? io : NULL;
}

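/*
 * Read completion: account the completed read, wake up whoever is driving
 * the moving_context if a write is now ready to issue, and drop the closure
 * ref taken in bch2_move_extent().
 */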
static void move_read_endio(struct bio *bio)
{
	struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
	struct moving_context *ctxt = io->write.ctxt;

	atomic_sub(io->read_sectors, &ctxt->read_sectors);
	io->read_completed = true;

	if (next_pending_write(ctxt))
		wake_up(&ctxt->wait);

	closure_put(&ctxt->cl);
}

static void do_pending_writes(struct moving_context *ctxt)
{
	struct moving_io *io;

	while ((io = next_pending_write(ctxt))) {
		list_del(&io->list);
		closure_call(&io->cl, move_write, NULL, &ctxt->cl);
	}
}

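/*
 * Issue any writes whose reads have completed, then wait until @_cond is
 * true, re-checking whenever another write becomes ready.
 */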
#define move_ctxt_wait_event(_ctxt, _cond)			\
do {								\
	do_pending_writes(_ctxt);				\
								\
	if (_cond)						\
		break;						\
	__wait_event((_ctxt)->wait,				\
		     next_pending_write(_ctxt) || (_cond));	\
} while (1)

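/*
 * Wait for outstanding write IO to make progress: returns once there is no
 * write IO in flight, or the amount in flight has changed since we were
 * called (i.e. some write completed).
 */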
static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
{
	unsigned sectors_pending = atomic_read(&ctxt->write_sectors);

	move_ctxt_wait_event(ctxt,
		!atomic_read(&ctxt->write_sectors) ||
		atomic_read(&ctxt->write_sectors) != sectors_pending);
}

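/*
 * bch2_move_extent() - start moving a single extent: allocate a moving_io
 * with enough bounce pages for the (possibly decompressed) data, then issue
 * the read with BCH_READ_NODECODE so the data is moved as-is; the write is
 * issued from move_write() once the read completes. Throttled by
 * SECTORS_IN_FLIGHT_PER_DEVICE.
 */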
static int bch2_move_extent(struct btree_trans *trans,
			    struct moving_context *ctxt,
			    struct write_point_specifier wp,
			    struct bch_io_opts io_opts,
			    enum btree_id btree_id,
			    struct bkey_s_c k,
			    enum data_cmd data_cmd,
			    struct data_opts data_opts)
{
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct moving_io *io;
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	unsigned sectors = k.k->size, pages;
	int ret = -ENOMEM;

	move_ctxt_wait_event(ctxt,
		atomic_read(&ctxt->write_sectors) <
		SECTORS_IN_FLIGHT_PER_DEVICE);

	move_ctxt_wait_event(ctxt,
		atomic_read(&ctxt->read_sectors) <
		SECTORS_IN_FLIGHT_PER_DEVICE);

	/* write path might have to decompress data: */
	bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
		sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);

	pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
	io = kzalloc(sizeof(struct moving_io) +
		     sizeof(struct bio_vec) * pages, GFP_KERNEL);
	if (!io)
		goto err;

	io->write.ctxt		= ctxt;
	io->read_sectors	= k.k->size;
	io->write_sectors	= k.k->size;

	bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
	bio_set_prio(&io->write.op.wbio.bio,
		     IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

	if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
				 GFP_KERNEL))
		goto err_free;

	io->rbio.c		= c;
	io->rbio.opts		= io_opts;
	bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
	io->rbio.bio.bi_vcnt = pages;
	bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
	io->rbio.bio.bi_iter.bi_size = sectors << 9;

	io->rbio.bio.bi_opf		= REQ_OP_READ;
	io->rbio.bio.bi_iter.bi_sector	= bkey_start_offset(k.k);
	io->rbio.bio.bi_end_io		= move_read_endio;

	ret = bch2_migrate_write_init(c, &io->write, wp, io_opts,
				      data_cmd, data_opts, btree_id, k);
	if (ret)
		goto err_free_pages;

	atomic64_inc(&ctxt->stats->keys_moved);
	atomic64_add(k.k->size, &ctxt->stats->sectors_moved);

	trace_move_extent(k.k);

	atomic_add(io->read_sectors, &ctxt->read_sectors);
	list_add_tail(&io->list, &ctxt->reads);

	/*
	 * dropped by move_read_endio() - guards against use after free of
	 * ctxt when doing wakeup
	 */
	closure_get(&ctxt->cl);
	bch2_read_extent(trans, &io->rbio,
			 bkey_start_pos(k.k),
			 btree_id, k, 0,
			 BCH_READ_NODECODE|
			 BCH_READ_LAST_FRAGMENT);
	return 0;
err_free_pages:
	bio_free_pages(&io->write.op.wbio.bio);
err_free:
	kfree(io);
err:
	trace_move_alloc_fail(k.k);
	return ret;
}

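/*
 * Look up and unpack the inode at @pos - used to pick up per-inode IO
 * options while walking extents.
 */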
static int lookup_inode(struct btree_trans *trans, struct bpos pos,
			struct bch_inode_unpacked *inode)
{
	struct btree_iter *iter;
	struct bkey_s_c k;
	int ret;

	iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, pos,
				   BTREE_ITER_ALL_SNAPSHOTS);
	k = bch2_btree_iter_peek(iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (!k.k || bkey_cmp(k.k->p, pos)) {
		ret = -ENOENT;
		goto err;
	}

	ret = k.k->type == KEY_TYPE_inode ? 0 : -EIO;
	if (ret)
		goto err;

	ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
	if (ret)
		goto err;
err:
	bch2_trans_iter_put(trans, iter);
	return ret;
}

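/*
 * __bch2_move_data() - walk one btree from @start to @end, applying @pred to
 * each extent and moving those it selects. Handles ratelimiting, kthread
 * stop requests and the freezer.
 */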
static int __bch2_move_data(struct bch_fs *c,
		struct moving_context *ctxt,
		struct bch_ratelimit *rate,
		struct write_point_specifier wp,
		struct bpos start,
		struct bpos end,
		move_pred_fn pred, void *arg,
		struct bch_move_stats *stats,
		enum btree_id btree_id)
{
	bool kthread = (current->flags & PF_KTHREAD) != 0;
	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
	struct bkey_buf sk;
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;
	struct data_opts data_opts;
	enum data_cmd data_cmd;
	u64 delay, cur_inum = U64_MAX;
	int ret = 0, ret2;

	bch2_bkey_buf_init(&sk);
	bch2_trans_init(&trans, c, 0, 0);

	stats->data_type = BCH_DATA_user;
	stats->btree_id	= btree_id;
	stats->pos	= start;

	iter = bch2_trans_get_iter(&trans, btree_id, start,
				   BTREE_ITER_PREFETCH);

	if (rate)
		bch2_ratelimit_reset(rate);

	while (1) {
		do {
			delay = rate ? bch2_ratelimit_delay(rate) : 0;

			if (delay) {
				bch2_trans_unlock(&trans);
				set_current_state(TASK_INTERRUPTIBLE);
			}

			if (kthread && (ret = kthread_should_stop())) {
				__set_current_state(TASK_RUNNING);
				goto out;
			}

			if (delay)
				schedule_timeout(delay);

			if (unlikely(freezing(current))) {
				bch2_trans_unlock(&trans);
				move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
				try_to_freeze();
			}
		} while (delay);

		k = bch2_btree_iter_peek(iter);

		stats->pos = iter->pos;

		if (!k.k)
			break;
		ret = bkey_err(k);
		if (ret)
			break;
		if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
			break;

		if (!bkey_extent_is_direct_data(k.k))
			goto next_nondata;

		if (btree_id == BTREE_ID_extents &&
		    cur_inum != k.k->p.inode) {
			struct bch_inode_unpacked inode;

			io_opts = bch2_opts_to_inode_opts(c->opts);

			ret = lookup_inode(&trans,
					SPOS(0, k.k->p.inode, k.k->p.snapshot),
					&inode);
			if (ret == -EINTR)
				continue;

			if (!ret)
				bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode));

			cur_inum = k.k->p.inode;
		}

		switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) {
		case DATA_SKIP:
			goto next;
		case DATA_SCRUB:
			BUG();
		case DATA_ADD_REPLICAS:
		case DATA_REWRITE:
		case DATA_PROMOTE:
			break;
		default:
			BUG();
		}

		/* unlock before doing IO: */
		bch2_bkey_buf_reassemble(&sk, c, k);
		k = bkey_i_to_s_c(sk.k);
		bch2_trans_unlock(&trans);

		ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k,
					data_cmd, data_opts);
		if (ret2) {
			if (ret2 == -EINTR) {
				bch2_trans_reset(&trans, 0);
				bch2_trans_cond_resched(&trans);
				continue;
			}

			if (ret2 == -ENOMEM) {
				/* memory allocation failure, wait for some IO to finish */
				bch2_move_ctxt_wait_for_io(ctxt);
				continue;
			}

			/* XXX signal failure */
			goto next;
		}

		if (rate)
			bch2_ratelimit_increment(rate, k.k->size);
next:
		atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k),
			     &stats->sectors_seen);
next_nondata:
		bch2_btree_iter_advance(iter);
		bch2_trans_cond_resched(&trans);
	}
out:
	bch2_trans_iter_put(&trans, iter);
	ret = bch2_trans_exit(&trans) ?: ret;
	bch2_bkey_buf_exit(&sk, c);

	return ret;
}

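/*
 * bch2_move_data() - move data in the given range, walking the extents and
 * reflink btrees; waits for all reads and writes to complete before
 * returning.
 */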
int bch2_move_data(struct bch_fs *c,
		   enum btree_id start_btree_id, struct bpos start_pos,
		   enum btree_id end_btree_id,   struct bpos end_pos,
		   struct bch_ratelimit *rate,
		   struct write_point_specifier wp,
		   move_pred_fn pred, void *arg,
		   struct bch_move_stats *stats)
{
	struct moving_context ctxt = { .stats = stats };
	enum btree_id id;
	int ret;

	closure_init_stack(&ctxt.cl);
	INIT_LIST_HEAD(&ctxt.reads);
	init_waitqueue_head(&ctxt.wait);

	stats->data_type = BCH_DATA_user;

	for (id = start_btree_id;
	     id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1);
	     id++) {
		stats->btree_id = id;

		if (id != BTREE_ID_extents &&
		    id != BTREE_ID_reflink)
			continue;

		ret = __bch2_move_data(c, &ctxt, rate, wp,
				       id == start_btree_id ? start_pos : POS_MIN,
				       id == end_btree_id   ? end_pos   : POS_MAX,
				       pred, arg, stats, id);
		if (ret)
			break;
	}

	move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
	closure_sync(&ctxt.cl);

	EBUG_ON(atomic_read(&ctxt.write_sectors));

	trace_move_data(c,
			atomic64_read(&stats->sectors_moved),
			atomic64_read(&stats->keys_moved));

	return ret;
}

typedef enum data_cmd (*move_btree_pred)(struct bch_fs *, void *,
					 struct btree *, struct bch_io_opts *,
					 struct data_opts *);

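/*
 * bch2_move_btree() - walk btree nodes in the given range and rewrite the
 * ones @pred selects.
 */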
static int bch2_move_btree(struct bch_fs *c,
			   enum btree_id start_btree_id, struct bpos start_pos,
			   enum btree_id end_btree_id,   struct bpos end_pos,
			   move_btree_pred pred, void *arg,
			   struct bch_move_stats *stats)
{
	bool kthread = (current->flags & PF_KTHREAD) != 0;
	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
	struct btree_trans trans;
	struct btree_iter *iter;
	struct btree *b;
	enum btree_id id;
	struct data_opts data_opts;
	enum data_cmd cmd;
	int ret = 0;

	bch2_trans_init(&trans, c, 0, 0);

	stats->data_type = BCH_DATA_btree;

	for (id = start_btree_id;
	     id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1);
	     id++) {
		stats->btree_id = id;

		for_each_btree_node(&trans, iter, id,
				    id == start_btree_id ? start_pos : POS_MIN,
				    BTREE_ITER_PREFETCH, b) {
			if (kthread && kthread_should_stop())
				break;

			if ((cmp_int(id, end_btree_id) ?:
			     bkey_cmp(b->key.k.p, end_pos)) > 0)
				break;

			stats->pos = iter->pos;

			switch ((cmd = pred(c, arg, b, &io_opts, &data_opts))) {
			case DATA_SKIP:
				goto next;
			case DATA_SCRUB:
				BUG();
			case DATA_ADD_REPLICAS:
			case DATA_REWRITE:
				break;
			default:
				BUG();
			}

			ret = bch2_btree_node_rewrite(c, iter,
					b->data->keys.seq, 0) ?: ret;
next:
			bch2_trans_cond_resched(&trans);
		}

		ret = bch2_trans_iter_free(&trans, iter) ?: ret;
		if (kthread && kthread_should_stop())
			break;
	}

	bch2_trans_exit(&trans);

	if (ret)
		bch_err(c, "error %i in bch2_move_btree", ret);

	return ret;
}

#if 0
static enum data_cmd scrub_pred(struct bch_fs *c, void *arg,
				struct bkey_s_c k,
				struct bch_io_opts *io_opts,
				struct data_opts *data_opts)
{
	return DATA_SCRUB;
}
#endif

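/*
 * Select keys whose durability (number of usable replicas) is below the
 * replica count configured for their type.
 */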
static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
				      struct bkey_s_c k,
				      struct bch_io_opts *io_opts,
				      struct data_opts *data_opts)
{
	unsigned nr_good = bch2_bkey_durability(c, k);
	unsigned replicas = 0;

	switch (k.k->type) {
	case KEY_TYPE_btree_ptr:
		replicas = c->opts.metadata_replicas;
		break;
	case KEY_TYPE_extent:
		replicas = io_opts->data_replicas;
		break;
	}

	if (!nr_good || nr_good >= replicas)
		return DATA_SKIP;

	data_opts->target		= 0;
	data_opts->nr_replicas		= 1;
	data_opts->btree_insert_flags	= 0;
	return DATA_ADD_REPLICAS;
}

static enum data_cmd migrate_pred(struct bch_fs *c, void *arg,
				  struct bkey_s_c k,
				  struct bch_io_opts *io_opts,
				  struct data_opts *data_opts)
{
	struct bch_ioctl_data *op = arg;

	if (!bch2_bkey_has_device(k, op->migrate.dev))
		return DATA_SKIP;

	data_opts->target		= 0;
	data_opts->nr_replicas		= 1;
	data_opts->btree_insert_flags	= 0;
	data_opts->rewrite_dev		= op->migrate.dev;
	return DATA_REWRITE;
}

static enum data_cmd rereplicate_btree_pred(struct bch_fs *c, void *arg,
					    struct btree *b,
					    struct bch_io_opts *io_opts,
					    struct data_opts *data_opts)
{
	return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

static enum data_cmd migrate_btree_pred(struct bch_fs *c, void *arg,
					struct btree *b,
					struct bch_io_opts *io_opts,
					struct data_opts *data_opts)
{
	return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}

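/*
 * Returns true if this bkey format could have produced packed keys that
 * don't round trip through the current unpacked representation (a field
 * wider than the in-memory field, or a field offset that can overflow) -
 * nodes with such a format need to be rewritten.
 */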
static bool bformat_needs_redo(struct bkey_format *f)
{
	unsigned i;

	for (i = 0; i < f->nr_fields; i++) {
		unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
		u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
		u64 field_offset = le64_to_cpu(f->field_offset[i]);

		if (f->bits_per_field[i] > unpacked_bits)
			return true;

		if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
			return true;

		if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
		     unpacked_mask) <
		    field_offset)
			return true;
	}

	return false;
}

static enum data_cmd rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
					    struct btree *b,
					    struct bch_io_opts *io_opts,
					    struct data_opts *data_opts)
{
	if (b->version_ondisk != c->sb.version ||
	    btree_node_need_rewrite(b) ||
	    bformat_needs_redo(&b->format)) {
		data_opts->target		= 0;
		data_opts->nr_replicas		= 1;
		data_opts->btree_insert_flags	= 0;
		return DATA_REWRITE;
	}

	return DATA_SKIP;
}

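/*
 * Rewrite any btree nodes written by an older version (or with a format
 * that needs redoing), then record that in the superblock compat bits.
 */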
int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
{
	int ret;

	ret = bch2_move_btree(c,
			      0,		POS_MIN,
			      BTREE_ID_NR,	POS_MAX,
			      rewrite_old_nodes_pred, c, stats);
	if (!ret) {
		mutex_lock(&c->sb_lock);
		c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done;
		c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done;
		c->disk_sb.sb->version_min = c->disk_sb.sb->version;
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);
	}

	return ret;
}

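/*
 * Entry point for the data job ioctl: dispatches to rereplicate, migrate off
 * a device, or rewrite old btree nodes.
 */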
int bch2_data_job(struct bch_fs *c,
		  struct bch_move_stats *stats,
		  struct bch_ioctl_data op)
{
	int ret = 0;

	switch (op.op) {
	case BCH_DATA_OP_REREPLICATE:
		stats->data_type = BCH_DATA_journal;
		ret = bch2_journal_flush_device_pins(&c->journal, -1);

		ret = bch2_move_btree(c,
				      op.start_btree,	op.start_pos,
				      op.end_btree,	op.end_pos,
				      rereplicate_btree_pred, c, stats) ?: ret;

		closure_wait_event(&c->btree_interior_update_wait,
				   !bch2_btree_interior_updates_nr_pending(c));

		ret = bch2_replicas_gc2(c) ?: ret;

		ret = bch2_move_data(c,
				     op.start_btree,	op.start_pos,
				     op.end_btree,	op.end_pos,
				     NULL, writepoint_hashed((unsigned long) current),
				     rereplicate_pred, c, stats) ?: ret;
		ret = bch2_replicas_gc2(c) ?: ret;
		break;
	case BCH_DATA_OP_MIGRATE:
		if (op.migrate.dev >= c->sb.nr_devices)
			return -EINVAL;

		stats->data_type = BCH_DATA_journal;
		ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);

		ret = bch2_move_btree(c,
				      op.start_btree,	op.start_pos,
				      op.end_btree,	op.end_pos,
				      migrate_btree_pred, &op, stats) ?: ret;
		ret = bch2_replicas_gc2(c) ?: ret;

		ret = bch2_move_data(c,
				     op.start_btree,	op.start_pos,
				     op.end_btree,	op.end_pos,
				     NULL, writepoint_hashed((unsigned long) current),
				     migrate_pred, &op, stats) ?: ret;
		ret = bch2_replicas_gc2(c) ?: ret;
		break;
	case BCH_DATA_OP_REWRITE_OLD_NODES:
		ret = bch2_scan_old_btree_nodes(c, stats);
		break;
	default:
		ret = -EINVAL;
	}

	return ret;
}