linux/net/ceph/osdmap.c
Ilya Dryomov 117d96a04f libceph: support for balanced and localized reads
OSD-side issues with reads from replica have been resolved in
Octopus.  Reading from replica should be safe wrt. unstable or
uncommitted state now, so add support for balanced and localized
reads.

There are two cases when a read from replica can't be served:

- OSD may silently drop the request, expecting the client to
  notice that the acting set has changed and resend via the usual
  means (handled with t->used_replica)

- OSD may return EAGAIN, expecting the client to resend to the
  primary, ignoring replica read flags (see handle_reply())

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
2020-06-01 13:22:53 +02:00

2936 lines
67 KiB
C

// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/ceph/libceph.h>
#include <linux/ceph/osdmap.h>
#include <linux/ceph/decode.h>
#include <linux/crush/hash.h>
#include <linux/crush/mapper.h>
/*
 * Format a human-readable description of the EXISTS/UP bits of @state
 * into @str (at most @len bytes, as per snprintf()).  @str is returned
 * untouched when @len is 0.
 */
char *ceph_osdmap_state_str(char *str, int len, u32 state)
{
	const char *desc;

	if (!len)
		return str;

	if (state & CEPH_OSD_EXISTS)
		desc = (state & CEPH_OSD_UP) ? "exists, up" : "exists";
	else
		desc = (state & CEPH_OSD_UP) ? "up" : "doesn't exist";

	snprintf(str, len, "%s", desc);
	return str;
}
/* maps */
/*
 * Number of bits needed to represent @t: the 1-based position of its
 * highest set bit, or 0 when @t is 0.
 */
static int calc_bits_of(unsigned int t)
{
	int bits;

	for (bits = 0; t; t >>= 1)
		bits++;

	return bits;
}
/*
 * Recompute pg_num_mask and pgp_num_mask.  Each mask is 2^n-1, where n
 * is the number of bits needed to represent (pg_num - 1) and
 * (pgp_num - 1) respectively.
 */
static void calc_pg_masks(struct ceph_pg_pool_info *pi)
{
	int pg_bits = calc_bits_of(pi->pg_num - 1);
	int pgp_bits = calc_bits_of(pi->pgp_num - 1);

	pi->pg_num_mask = (1 << pg_bits) - 1;
	pi->pgp_num_mask = (1 << pgp_bits) - 1;
}
/*
* decode crush map
*/
static int crush_decode_uniform_bucket(void **p, void *end,
struct crush_bucket_uniform *b)
{
dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
b->item_weight = ceph_decode_32(p);
return 0;
bad:
return -EINVAL;
}
static int crush_decode_list_bucket(void **p, void *end,
struct crush_bucket_list *b)
{
int j;
dout("crush_decode_list_bucket %p to %p\n", *p, end);
b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
if (b->item_weights == NULL)
return -ENOMEM;
b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
if (b->sum_weights == NULL)
return -ENOMEM;
ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
for (j = 0; j < b->h.size; j++) {
b->item_weights[j] = ceph_decode_32(p);
b->sum_weights[j] = ceph_decode_32(p);
}
return 0;
bad:
return -EINVAL;
}
static int crush_decode_tree_bucket(void **p, void *end,
struct crush_bucket_tree *b)
{
int j;
dout("crush_decode_tree_bucket %p to %p\n", *p, end);
ceph_decode_8_safe(p, end, b->num_nodes, bad);
b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
if (b->node_weights == NULL)
return -ENOMEM;
ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
for (j = 0; j < b->num_nodes; j++)
b->node_weights[j] = ceph_decode_32(p);
return 0;
bad:
return -EINVAL;
}
static int crush_decode_straw_bucket(void **p, void *end,
struct crush_bucket_straw *b)
{
int j;
dout("crush_decode_straw_bucket %p to %p\n", *p, end);
b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
if (b->item_weights == NULL)
return -ENOMEM;
b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
if (b->straws == NULL)
return -ENOMEM;
ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
for (j = 0; j < b->h.size; j++) {
b->item_weights[j] = ceph_decode_32(p);
b->straws[j] = ceph_decode_32(p);
}
return 0;
bad:
return -EINVAL;
}
static int crush_decode_straw2_bucket(void **p, void *end,
struct crush_bucket_straw2 *b)
{
int j;
dout("crush_decode_straw2_bucket %p to %p\n", *p, end);
b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
if (b->item_weights == NULL)
return -ENOMEM;
ceph_decode_need(p, end, b->h.size * sizeof(u32), bad);
for (j = 0; j < b->h.size; j++)
b->item_weights[j] = ceph_decode_32(p);
return 0;
bad:
return -EINVAL;
}
/*
 * One id -> name entry decoded from a CRUSH map.  Entries live in an
 * rb-tree keyed by cn_id (see the DEFINE_RB_FUNCS(crush_name, ...)
 * instantiation in this file).
 */
struct crush_name_node {
struct rb_node cn_node;	/* linkage in the owning rb-tree */
int cn_id;	/* lookup key */
char cn_name[];	/* NUL-terminated by decode_crush_names() */
};
/*
 * Allocate a crush_name_node with room for a @name_len byte name plus a
 * terminating NUL.  The rb-node is marked as unlinked; cn_id and cn_name
 * are left for the caller to fill in.  Returns NULL on allocation
 * failure.
 */
static struct crush_name_node *alloc_crush_name(size_t name_len)
{
	struct crush_name_node *cn = kmalloc(sizeof(*cn) + name_len + 1,
					     GFP_NOIO);

	if (cn)
		RB_CLEAR_NODE(&cn->cn_node);

	return cn;
}
/*
 * Free a crush_name_node.  Warns if the node still looks linked into an
 * rb-tree.
 */
static void free_crush_name(struct crush_name_node *cn)
{
	bool still_linked = !RB_EMPTY_NODE(&cn->cn_node);

	WARN_ON(still_linked);
	kfree(cn);
}
/*
 * rbtree helpers for crush_name_node, keyed by cn_id.  This file uses
 * the generated __insert_crush_name() and erase_crush_name().
 */
DEFINE_RB_FUNCS(crush_name, struct crush_name_node, cn_id, cn_node)
/*
 * Decode a u32 count followed by that many (s32 id, u32 len, name bytes)
 * records and insert each of them into @root.
 *
 * Entries inserted before a failure are left in @root.
 *
 * Returns 0, -EINVAL on a short buffer, -ENOMEM, or -EEXIST if an id
 * is already present in @root.
 */
static int decode_crush_names(void **p, void *end, struct rb_root *root)
{
	u32 remaining;

	ceph_decode_32_safe(p, end, remaining, e_inval);
	for (; remaining; remaining--) {
		struct crush_name_node *cn;
		u32 name_len;
		int id;

		ceph_decode_32_safe(p, end, id, e_inval);
		ceph_decode_32_safe(p, end, name_len, e_inval);
		ceph_decode_need(p, end, name_len, e_inval);

		cn = alloc_crush_name(name_len);
		if (!cn)
			return -ENOMEM;

		cn->cn_id = id;
		memcpy(cn->cn_name, *p, name_len);
		cn->cn_name[name_len] = '\0';
		*p += name_len;

		if (__insert_crush_name(root, cn))
			continue;

		/* duplicate id */
		free_crush_name(cn);
		return -EEXIST;
	}

	return 0;

e_inval:
	return -EINVAL;
}
/*
 * Remove and free every crush_name_node in @root, leaving it empty.
 */
void clear_crush_names(struct rb_root *root)
{
	struct rb_node *first;

	while ((first = rb_first(root)) != NULL) {
		struct crush_name_node *cn =
		    rb_entry(first, struct crush_name_node, cn_node);

		erase_crush_name(root, cn);
		free_crush_name(cn);
	}
}
/*
 * Allocate a zeroed crush_choose_arg_map whose rb-node is marked as
 * unlinked.  Returns NULL on allocation failure.
 */
static struct crush_choose_arg_map *alloc_choose_arg_map(void)
{
	struct crush_choose_arg_map *arg_map =
	    kzalloc(sizeof(*arg_map), GFP_NOIO);

	if (arg_map)
		RB_CLEAR_NODE(&arg_map->node);

	return arg_map;
}
/*
 * Free @arg_map together with every per-bucket arg it holds (weight
 * sets, their weight arrays, and ids).  NULL is accepted and ignored.
 * Warns if the map still looks linked into an rb-tree.
 */
static void free_choose_arg_map(struct crush_choose_arg_map *arg_map)
{
	int bucket, pos;

	if (!arg_map)
		return;

	WARN_ON(!RB_EMPTY_NODE(&arg_map->node));

	for (bucket = 0; bucket < arg_map->size; bucket++) {
		struct crush_choose_arg *arg = &arg_map->args[bucket];

		for (pos = 0; pos < arg->weight_set_size; pos++)
			kfree(arg->weight_set[pos].weights);
		kfree(arg->weight_set);
		kfree(arg->ids);
	}

	kfree(arg_map->args);
	kfree(arg_map);
}
/*
 * rbtree helpers for crush_choose_arg_map, keyed by choose_args_index.
 * This file uses the generated insert_choose_arg_map() and
 * erase_choose_arg_map().
 */
DEFINE_RB_FUNCS(choose_arg_map, struct crush_choose_arg_map, choose_args_index,
node);
/*
 * Remove and free every crush_choose_arg_map in c->choose_args.
 */
void clear_choose_args(struct crush_map *c)
{
	struct rb_root *root = &c->choose_args;
	struct rb_node *first;

	while ((first = rb_first(root)) != NULL) {
		struct crush_choose_arg_map *arg_map =
		    rb_entry(first, struct crush_choose_arg_map, node);

		erase_choose_arg_map(root, arg_map);
		free_choose_arg_map(arg_map);
	}
}
/*
 * Decode a u32 element count followed by that many u32 values into a
 * freshly allocated array.
 *
 * On success the count is stored in *@plen and the array is returned;
 * for a count of zero the return value is NULL.  On failure an ERR_PTR
 * (-EINVAL for a short buffer, -ENOMEM) is returned and *@plen is not
 * written.
 */
static u32 *decode_array_32_alloc(void **p, void *end, u32 *plen)
{
	u32 *arr = NULL;
	u32 count, i;
	int err = -EINVAL;

	ceph_decode_32_safe(p, end, count, out_err);
	if (!count)
		goto done;

	arr = kmalloc_array(count, sizeof(u32), GFP_NOIO);
	if (!arr) {
		err = -ENOMEM;
		goto out_err;
	}

	ceph_decode_need(p, end, count * sizeof(u32), out_err);
	for (i = 0; i < count; i++)
		arr[i] = ceph_decode_32(p);

done:
	*plen = count;
	return arr;

out_err:
	kfree(arr);
	return ERR_PTR(err);
}
/*
 * Decode one choose_arg: an optional array of weight sets followed by an
 * optional ids array.
 *
 * Assumes @arg is zero-initialized.
 *
 * On failure @arg is left in a state that free_choose_arg_map() can
 * safely tear down: the weight_set array is zero-filled on allocation,
 * so slots that were never decoded hold NULL rather than garbage, and
 * weight_set_size is reset if the array itself could not be allocated.
 *
 * Returns 0, -EINVAL on a short buffer, or -ENOMEM.
 */
static int decode_choose_arg(void **p, void *end, struct crush_choose_arg *arg)
{
	int ret;

	ceph_decode_32_safe(p, end, arg->weight_set_size, e_inval);
	if (arg->weight_set_size) {
		u32 i;

		arg->weight_set = kcalloc(arg->weight_set_size,
					  sizeof(*arg->weight_set),
					  GFP_NOIO);
		if (!arg->weight_set) {
			/* nothing to walk during cleanup */
			arg->weight_set_size = 0;
			return -ENOMEM;
		}

		for (i = 0; i < arg->weight_set_size; i++) {
			struct crush_weight_set *w = &arg->weight_set[i];

			w->weights = decode_array_32_alloc(p, end, &w->size);
			if (IS_ERR(w->weights)) {
				ret = PTR_ERR(w->weights);
				w->weights = NULL;
				return ret;
			}
		}
	}

	arg->ids = decode_array_32_alloc(p, end, &arg->ids_size);
	if (IS_ERR(arg->ids)) {
		ret = PTR_ERR(arg->ids);
		arg->ids = NULL;
		return ret;
	}

	return 0;

e_inval:
	return -EINVAL;
}
static int decode_choose_args(void **p, void *end, struct crush_map *c)
{
struct crush_choose_arg_map *arg_map = NULL;
u32 num_choose_arg_maps, num_buckets;
int ret;
ceph_decode_32_safe(p, end, num_choose_arg_maps, e_inval);
while (num_choose_arg_maps--) {
arg_map = alloc_choose_arg_map();
if (!arg_map) {
ret = -ENOMEM;
goto fail;
}
ceph_decode_64_safe(p, end, arg_map->choose_args_index,
e_inval);
arg_map->size = c->max_buckets;
arg_map->args = kcalloc(arg_map->size, sizeof(*arg_map->args),
GFP_NOIO);
if (!arg_map->args) {
ret = -ENOMEM;
goto fail;
}
ceph_decode_32_safe(p, end, num_buckets, e_inval);
while (num_buckets--) {
struct crush_choose_arg *arg;
u32 bucket_index;
ceph_decode_32_safe(p, end, bucket_index, e_inval);
if (bucket_index >= arg_map->size)
goto e_inval;
arg = &arg_map->args[bucket_index];
ret = decode_choose_arg(p, end, arg);
if (ret)
goto fail;
if (arg->ids_size &&
arg->ids_size != c->buckets[bucket_index]->size)
goto e_inval;
}
insert_choose_arg_map(&c->choose_args, arg_map);
}
return 0;
e_inval:
ret = -EINVAL;
fail:
free_choose_arg_map(arg_map);
return ret;
}
static void crush_finalize(struct crush_map *c)
{
__s32 b;
/* Space for the array of pointers to per-bucket workspace */
c->working_size = sizeof(struct crush_work) +
c->max_buckets * sizeof(struct crush_work_bucket *);
for (b = 0; b < c->max_buckets; b++) {
if (!c->buckets[b])
continue;
switch (c->buckets[b]->alg) {
default:
/*
* The base case, permutation variables and
* the pointer to the permutation array.
*/
c->working_size += sizeof(struct crush_work_bucket);
break;
}
/* Every bucket has a permutation array. */
c->working_size += c->buckets[b]->size * sizeof(__u32);
}
}
/*
 * Decode a CRUSH map from the buffer [@pbyval, @end).
 *
 * Sections are decoded in this order: header (magic, max_buckets,
 * max_rules, max_devices), buckets, rules, type names, names, the rule
 * name map (skipped), tunables, and -- if bytes remain -- the class
 * maps (skipped) and choose_args.  If the buffer ends anywhere within
 * the tunables, the tunables not yet decoded keep their initial values
 * and the decode still succeeds.
 *
 * Each bucket is preceded by a u32 algorithm id that selects the size
 * of the allocation; the bucket body then carries its own 8-bit alg
 * that selects the per-type decoder.  The two must agree: otherwise a
 * per-type decoder could store into fields that lie beyond a smaller
 * bucket allocation, so a mismatch is rejected with -EINVAL.
 *
 * Returns the new map, or ERR_PTR(-EINVAL / -ENOMEM / -EEXIST).  On
 * failure everything allocated so far is released via crush_destroy().
 */
static struct crush_map *crush_decode(void *pbyval, void *end)
{
	struct crush_map *c;
	int err;
	int i, j;
	void **p = &pbyval;
	void *start = pbyval;
	u32 magic;

	dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));

	c = kzalloc(sizeof(*c), GFP_NOFS);
	if (c == NULL)
		return ERR_PTR(-ENOMEM);

	c->type_names = RB_ROOT;
	c->names = RB_ROOT;
	c->choose_args = RB_ROOT;

	/* set tunables to default values */
	c->choose_local_tries = 2;
	c->choose_local_fallback_tries = 5;
	c->choose_total_tries = 19;
	c->chooseleaf_descend_once = 0;

	ceph_decode_need(p, end, 4*sizeof(u32), bad);
	magic = ceph_decode_32(p);
	if (magic != CRUSH_MAGIC) {
		pr_err("crush_decode magic %x != current %x\n",
		       (unsigned int)magic, (unsigned int)CRUSH_MAGIC);
		goto bad;
	}
	c->max_buckets = ceph_decode_32(p);
	c->max_rules = ceph_decode_32(p);
	c->max_devices = ceph_decode_32(p);

	c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
	if (c->buckets == NULL)
		goto badmem;
	c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
	if (c->rules == NULL)
		goto badmem;

	/* buckets */
	for (i = 0; i < c->max_buckets; i++) {
		int size = 0;
		u32 alg;
		struct crush_bucket *b;

		ceph_decode_32_safe(p, end, alg, bad);
		if (alg == 0) {
			c->buckets[i] = NULL;
			continue;
		}
		dout("crush_decode bucket %d off %x %p to %p\n",
		     i, (int)(*p-start), *p, end);

		switch (alg) {
		case CRUSH_BUCKET_UNIFORM:
			size = sizeof(struct crush_bucket_uniform);
			break;
		case CRUSH_BUCKET_LIST:
			size = sizeof(struct crush_bucket_list);
			break;
		case CRUSH_BUCKET_TREE:
			size = sizeof(struct crush_bucket_tree);
			break;
		case CRUSH_BUCKET_STRAW:
			size = sizeof(struct crush_bucket_straw);
			break;
		case CRUSH_BUCKET_STRAW2:
			size = sizeof(struct crush_bucket_straw2);
			break;
		default:
			goto bad;
		}
		BUG_ON(size == 0);
		b = c->buckets[i] = kzalloc(size, GFP_NOFS);
		if (b == NULL)
			goto badmem;

		ceph_decode_need(p, end, 4*sizeof(u32), bad);
		b->id = ceph_decode_32(p);
		b->type = ceph_decode_16(p);
		b->alg = ceph_decode_8(p);
		b->hash = ceph_decode_8(p);
		b->weight = ceph_decode_32(p);
		b->size = ceph_decode_32(p);

		/*
		 * The allocation above was sized for @alg; decoding as a
		 * different type could write past it.
		 */
		if (b->alg != alg)
			goto bad;

		dout("crush_decode bucket size %d off %x %p to %p\n",
		     b->size, (int)(*p-start), *p, end);

		b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
		if (b->items == NULL)
			goto badmem;

		ceph_decode_need(p, end, b->size*sizeof(u32), bad);
		for (j = 0; j < b->size; j++)
			b->items[j] = ceph_decode_32(p);

		switch (b->alg) {
		case CRUSH_BUCKET_UNIFORM:
			err = crush_decode_uniform_bucket(p, end,
				  (struct crush_bucket_uniform *)b);
			if (err < 0)
				goto fail;
			break;
		case CRUSH_BUCKET_LIST:
			err = crush_decode_list_bucket(p, end,
			       (struct crush_bucket_list *)b);
			if (err < 0)
				goto fail;
			break;
		case CRUSH_BUCKET_TREE:
			err = crush_decode_tree_bucket(p, end,
				(struct crush_bucket_tree *)b);
			if (err < 0)
				goto fail;
			break;
		case CRUSH_BUCKET_STRAW:
			err = crush_decode_straw_bucket(p, end,
				(struct crush_bucket_straw *)b);
			if (err < 0)
				goto fail;
			break;
		case CRUSH_BUCKET_STRAW2:
			err = crush_decode_straw2_bucket(p, end,
				(struct crush_bucket_straw2 *)b);
			if (err < 0)
				goto fail;
			break;
		}
	}

	/* rules */
	dout("rule vec is %p\n", c->rules);
	for (i = 0; i < c->max_rules; i++) {
		u32 yes;
		struct crush_rule *r;

		ceph_decode_32_safe(p, end, yes, bad);
		if (!yes) {
			dout("crush_decode NO rule %d off %x %p to %p\n",
			     i, (int)(*p-start), *p, end);
			c->rules[i] = NULL;
			continue;
		}

		dout("crush_decode rule %d off %x %p to %p\n",
		     i, (int)(*p-start), *p, end);

		/* len */
		ceph_decode_32_safe(p, end, yes, bad);
#if BITS_PER_LONG == 32
		if (yes > (ULONG_MAX - sizeof(*r))
			  / sizeof(struct crush_rule_step))
			goto bad;
#endif
		r = kmalloc(struct_size(r, steps, yes), GFP_NOFS);
		c->rules[i] = r;
		if (r == NULL)
			goto badmem;
		dout(" rule %d is at %p\n", i, r);
		r->len = yes;
		ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
		ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
		for (j = 0; j < r->len; j++) {
			r->steps[j].op = ceph_decode_32(p);
			r->steps[j].arg1 = ceph_decode_32(p);
			r->steps[j].arg2 = ceph_decode_32(p);
		}
	}

	err = decode_crush_names(p, end, &c->type_names);
	if (err)
		goto fail;

	err = decode_crush_names(p, end, &c->names);
	if (err)
		goto fail;

	ceph_decode_skip_map(p, end, 32, string, bad); /* rule_name_map */

	/* tunables */
	ceph_decode_need(p, end, 3*sizeof(u32), done);
	c->choose_local_tries = ceph_decode_32(p);
	c->choose_local_fallback_tries = ceph_decode_32(p);
	c->choose_total_tries = ceph_decode_32(p);
	dout("crush decode tunable choose_local_tries = %d\n",
	     c->choose_local_tries);
	dout("crush decode tunable choose_local_fallback_tries = %d\n",
	     c->choose_local_fallback_tries);
	dout("crush decode tunable choose_total_tries = %d\n",
	     c->choose_total_tries);

	ceph_decode_need(p, end, sizeof(u32), done);
	c->chooseleaf_descend_once = ceph_decode_32(p);
	dout("crush decode tunable chooseleaf_descend_once = %d\n",
	     c->chooseleaf_descend_once);

	ceph_decode_need(p, end, sizeof(u8), done);
	c->chooseleaf_vary_r = ceph_decode_8(p);
	dout("crush decode tunable chooseleaf_vary_r = %d\n",
	     c->chooseleaf_vary_r);

	/* skip straw_calc_version, allowed_bucket_algs */
	ceph_decode_need(p, end, sizeof(u8) + sizeof(u32), done);
	*p += sizeof(u8) + sizeof(u32);

	ceph_decode_need(p, end, sizeof(u8), done);
	c->chooseleaf_stable = ceph_decode_8(p);
	dout("crush decode tunable chooseleaf_stable = %d\n",
	     c->chooseleaf_stable);

	if (*p != end) {
		/* class_map */
		ceph_decode_skip_map(p, end, 32, 32, bad);
		/* class_name */
		ceph_decode_skip_map(p, end, 32, string, bad);
		/* class_bucket */
		ceph_decode_skip_map_of_map(p, end, 32, 32, 32, bad);
	}

	if (*p != end) {
		err = decode_choose_args(p, end, c);
		if (err)
			goto fail;
	}

done:
	crush_finalize(c);
	dout("crush_decode success\n");
	return c;

badmem:
	err = -ENOMEM;
fail:
	dout("crush_decode fail %d\n", err);
	crush_destroy(c);
	return ERR_PTR(err);

bad:
	err = -EINVAL;
	goto fail;
}
/*
 * Three-way comparison of two pgids: order by pool first, then by seed.
 * Returns -1, 0 or 1.
 */
int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs)
{
	if (lhs->pool != rhs->pool)
		return lhs->pool < rhs->pool ? -1 : 1;

	if (lhs->seed != rhs->seed)
		return lhs->seed < rhs->seed ? -1 : 1;

	return 0;
}
/*
 * Three-way comparison of two spgids: order by pgid (see
 * ceph_pg_compare()), then by shard.  Returns a negative value, 0 or a
 * positive value.
 */
int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs)
{
	int ret = ceph_pg_compare(&lhs->pgid, &rhs->pgid);

	if (ret)
		return ret;

	if (lhs->shard != rhs->shard)
		return lhs->shard < rhs->shard ? -1 : 1;

	return 0;
}
/*
 * Allocate a ceph_pg_mapping with @payload_len extra bytes following the
 * struct.  Only the rb-node is initialized (marked as unlinked); the
 * rest is left for the caller.  Returns NULL on allocation failure.
 */
static struct ceph_pg_mapping *alloc_pg_mapping(size_t payload_len)
{
	struct ceph_pg_mapping *pg = kmalloc(sizeof(*pg) + payload_len,
					     GFP_NOIO);

	if (pg)
		RB_CLEAR_NODE(&pg->node);

	return pg;
}
/*
 * Free a ceph_pg_mapping.  Warns if the node still looks linked into an
 * rb-tree.
 */
static void free_pg_mapping(struct ceph_pg_mapping *pg)
{
	bool still_linked = !RB_EMPTY_NODE(&pg->node);

	WARN_ON(still_linked);
	kfree(pg);
}
/*
 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
 * to a set of osds) and primary_temp (explicit primary setting).  The
 * same helpers are also used on the pg_upmap and pg_upmap_items trees
 * in this file.
 *
 * Nodes are keyed by pgid and ordered with ceph_pg_compare().  This
 * file uses the generated lookup_pg_mapping(), insert_pg_mapping() and
 * erase_pg_mapping().
 */
DEFINE_RB_FUNCS2(pg_mapping, struct ceph_pg_mapping, pgid, ceph_pg_compare,
RB_BYPTR, const struct ceph_pg *, node)
/*
 * rbtree of pg pool info, keyed by pool id.  This file uses the
 * generated lookup_pg_pool(), __insert_pg_pool() and erase_pg_pool().
 */
DEFINE_RB_FUNCS(pg_pool, struct ceph_pg_pool_info, id, node)
/*
 * Look up the pool with the given @id in map->pg_pools and return the
 * result of lookup_pg_pool() unchanged.  Other callers in this file
 * check that result for NULL, so a missing pool presumably yields NULL.
 */
struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id)
{
return lookup_pg_pool(&map->pg_pools, id);
}
/*
 * Return the name of pool @id, or NULL if @id is CEPH_NOPOOL, is larger
 * than INT_MAX (warns once), or no such pool is in the map.  The
 * returned pointer is the pool's own name field and may itself be NULL.
 */
const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id)
{
	struct ceph_pg_pool_info *pi;

	if (id == CEPH_NOPOOL || WARN_ON_ONCE(id > (u64) INT_MAX))
		return NULL;

	pi = lookup_pg_pool(&map->pg_pools, id);
	if (!pi)
		return NULL;

	return pi->name;
}
EXPORT_SYMBOL(ceph_pg_pool_name_by_id);
/*
 * Walk all pools in tree order and return the id of the first one whose
 * name equals @name.  Pools without a name are skipped.  Returns
 * -ENOENT if there is no match.
 */
int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
{
	struct rb_node *n = rb_first(&map->pg_pools);

	while (n) {
		struct ceph_pg_pool_info *pi =
		    rb_entry(n, struct ceph_pg_pool_info, node);

		if (pi->name && !strcmp(pi->name, name))
			return pi->id;

		n = rb_next(n);
	}

	return -ENOENT;
}
EXPORT_SYMBOL(ceph_pg_poolid_by_name);
/*
 * Return the flags of pool @id, or 0 if the pool is not in the map.
 */
u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id)
{
	struct ceph_pg_pool_info *pi = lookup_pg_pool(&map->pg_pools, id);

	if (!pi)
		return 0;

	return pi->flags;
}
EXPORT_SYMBOL(ceph_pg_pool_flags);
/*
 * Unlink @pi from @root and free it together with its name.
 */
static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
{
	char *name;

	erase_pg_pool(root, pi);

	name = pi->name;
	kfree(name);
	kfree(pi);
}
static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
{
u8 ev, cv;
unsigned len, num;
void *pool_end;
ceph_decode_need(p, end, 2 + 4, bad);
ev = ceph_decode_8(p); /* encoding version */
cv = ceph_decode_8(p); /* compat version */
if (ev < 5) {
pr_warn("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv);
return -EINVAL;
}
if (cv > 9) {
pr_warn("got v %d cv %d > 9 of ceph_pg_pool\n", ev, cv);
return -EINVAL;
}
len = ceph_decode_32(p);
ceph_decode_need(p, end, len, bad);
pool_end = *p + len;
pi->type = ceph_decode_8(p);
pi->size = ceph_decode_8(p);
pi->crush_ruleset = ceph_decode_8(p);
pi->object_hash = ceph_decode_8(p);
pi->pg_num = ceph_decode_32(p);
pi->pgp_num = ceph_decode_32(p);
*p += 4 + 4; /* skip lpg* */
*p += 4; /* skip last_change */
*p += 8 + 4; /* skip snap_seq, snap_epoch */
/* skip snaps */
num = ceph_decode_32(p);
while (num--) {
*p += 8; /* snapid key */
*p += 1 + 1; /* versions */
len = ceph_decode_32(p);
*p += len;
}
/* skip removed_snaps */
num = ceph_decode_32(p);
*p += num * (8 + 8);
*p += 8; /* skip auid */
pi->flags = ceph_decode_64(p);
*p += 4; /* skip crash_replay_interval */
if (ev >= 7)
pi->min_size = ceph_decode_8(p);
else
pi->min_size = pi->size - pi->size / 2;
if (ev >= 8)
*p += 8 + 8; /* skip quota_max_* */
if (ev >= 9) {
/* skip tiers */
num = ceph_decode_32(p);
*p += num * 8;
*p += 8; /* skip tier_of */
*p += 1; /* skip cache_mode */
pi->read_tier = ceph_decode_64(p);
pi->write_tier = ceph_decode_64(p);
} else {
pi->read_tier = -1;
pi->write_tier = -1;
}
if (ev >= 10) {
/* skip properties */
num = ceph_decode_32(p);
while (num--) {
len = ceph_decode_32(p);
*p += len; /* key */
len = ceph_decode_32(p);
*p += len; /* val */
}
}
if (ev >= 11) {
/* skip hit_set_params */
*p += 1 + 1; /* versions */
len = ceph_decode_32(p);
*p += len;
*p += 4; /* skip hit_set_period */
*p += 4; /* skip hit_set_count */
}
if (ev >= 12)
*p += 4; /* skip stripe_width */
if (ev >= 13) {
*p += 8; /* skip target_max_bytes */
*p += 8; /* skip target_max_objects */
*p += 4; /* skip cache_target_dirty_ratio_micro */
*p += 4; /* skip cache_target_full_ratio_micro */
*p += 4; /* skip cache_min_flush_age */
*p += 4; /* skip cache_min_evict_age */
}
if (ev >= 14) {
/* skip erasure_code_profile */
len = ceph_decode_32(p);
*p += len;
}
/*
* last_force_op_resend_preluminous, will be overridden if the
* map was encoded with RESEND_ON_SPLIT
*/
if (ev >= 15)
pi->last_force_request_resend = ceph_decode_32(p);
else
pi->last_force_request_resend = 0;
if (ev >= 16)
*p += 4; /* skip min_read_recency_for_promote */
if (ev >= 17)
*p += 8; /* skip expected_num_objects */
if (ev >= 19)
*p += 4; /* skip cache_target_dirty_high_ratio_micro */
if (ev >= 20)
*p += 4; /* skip min_write_recency_for_promote */
if (ev >= 21)
*p += 1; /* skip use_gmt_hitset */
if (ev >= 22)
*p += 1; /* skip fast_read */
if (ev >= 23) {
*p += 4; /* skip hit_set_grade_decay_rate */
*p += 4; /* skip hit_set_search_last_n */
}
if (ev >= 24) {
/* skip opts */
*p += 1 + 1; /* versions */
len = ceph_decode_32(p);
*p += len;
}
if (ev >= 25)
pi->last_force_request_resend = ceph_decode_32(p);
/* ignore the rest */
*p = pool_end;
calc_pg_masks(pi);
return 0;
bad:
return -EINVAL;
}
static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
{
struct ceph_pg_pool_info *pi;
u32 num, len;
u64 pool;
ceph_decode_32_safe(p, end, num, bad);
dout(" %d pool names\n", num);
while (num--) {
ceph_decode_64_safe(p, end, pool, bad);
ceph_decode_32_safe(p, end, len, bad);
dout(" pool %llu len %d\n", pool, len);
ceph_decode_need(p, end, len, bad);
pi = lookup_pg_pool(&map->pg_pools, pool);
if (pi) {
char *name = kstrndup(*p, len, GFP_NOFS);
if (!name)
return -ENOMEM;
kfree(pi->name);
pi->name = name;
dout(" name is %s\n", pi->name);
}
*p += len;
}
return 0;
bad:
return -EINVAL;
}
/*
* osd map
*/
/*
 * Allocate an empty osdmap: zeroed, with all rb-tree roots empty,
 * pool_max set to -1 and the crush workspace mutex initialized.
 * Returns NULL on allocation failure.
 */
struct ceph_osdmap *ceph_osdmap_alloc(void)
{
	struct ceph_osdmap *map = kzalloc(sizeof(*map), GFP_NOIO);

	if (!map)
		return NULL;

	map->pool_max = -1;

	map->pg_pools = RB_ROOT;
	map->pg_temp = RB_ROOT;
	map->primary_temp = RB_ROOT;
	map->pg_upmap = RB_ROOT;
	map->pg_upmap_items = RB_ROOT;

	mutex_init(&map->crush_workspace_mutex);

	return map;
}
/* Unlink and free every ceph_pg_mapping in @root, leaving it empty. */
static void destroy_pg_mapping_tree(struct rb_root *root)
{
	while (!RB_EMPTY_ROOT(root)) {
		struct ceph_pg_mapping *pg =
		    rb_entry(rb_first(root), struct ceph_pg_mapping, node);

		erase_pg_mapping(root, pg);
		free_pg_mapping(pg);
	}
}

/*
 * Free @map and everything it owns: the crush map (if any), all four
 * pg mapping trees, the pools, the per-osd arrays and the crush
 * workspace.
 *
 * All pg mapping trees are torn down through the same
 * erase_pg_mapping()/free_pg_mapping() pair.
 */
void ceph_osdmap_destroy(struct ceph_osdmap *map)
{
	dout("osdmap_destroy %p\n", map);

	if (map->crush)
		crush_destroy(map->crush);

	destroy_pg_mapping_tree(&map->pg_temp);
	destroy_pg_mapping_tree(&map->primary_temp);
	destroy_pg_mapping_tree(&map->pg_upmap);
	destroy_pg_mapping_tree(&map->pg_upmap_items);

	while (!RB_EMPTY_ROOT(&map->pg_pools)) {
		struct ceph_pg_pool_info *pi =
		    rb_entry(rb_first(&map->pg_pools),
			     struct ceph_pg_pool_info, node);

		__remove_pg_pool(&map->pg_pools, pi);
	}

	kvfree(map->osd_state);
	kvfree(map->osd_weight);
	kvfree(map->osd_addr);
	kvfree(map->osd_primary_affinity);
	kvfree(map->crush_workspace);
	kfree(map);
}
/*
 * Adjust max_osd value, (re)allocate arrays.
 *
 * The new elements are properly initialized: state 0, weight
 * CEPH_OSD_OUT, zeroed address and, if a primary affinity array exists,
 * CEPH_OSD_DEFAULT_PRIMARY_AFFINITY.
 *
 * All replacement arrays are allocated before the map is touched, so
 * that on -ENOMEM the map is left exactly as it was (the arrays and
 * max_osd can never disagree).
 *
 * Returns 0 or -ENOMEM.
 */
static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max)
{
	u32 *state;
	u32 *weight;
	struct ceph_entity_addr *addr;
	u32 *affinity = NULL;
	u32 to_copy;
	u32 i;

	dout("%s old %u new %u\n", __func__, map->max_osd, max);
	if (max == map->max_osd)
		return 0;

	state = ceph_kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS);
	weight = ceph_kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS);
	addr = ceph_kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS);
	if (map->osd_primary_affinity)
		affinity = ceph_kvmalloc(array_size(max, sizeof(*affinity)),
					 GFP_NOFS);
	if (!state || !weight || !addr ||
	    (map->osd_primary_affinity && !affinity)) {
		kvfree(state);
		kvfree(weight);
		kvfree(addr);
		kvfree(affinity);
		return -ENOMEM;
	}

	/* nothing can fail from here on */
	to_copy = min(map->max_osd, max);
	if (map->osd_state) {
		memcpy(state, map->osd_state, to_copy * sizeof(*state));
		memcpy(weight, map->osd_weight, to_copy * sizeof(*weight));
		memcpy(addr, map->osd_addr, to_copy * sizeof(*addr));
		kvfree(map->osd_state);
		kvfree(map->osd_weight);
		kvfree(map->osd_addr);
	}

	map->osd_state = state;
	map->osd_weight = weight;
	map->osd_addr = addr;
	for (i = map->max_osd; i < max; i++) {
		map->osd_state[i] = 0;
		map->osd_weight[i] = CEPH_OSD_OUT;
		memset(map->osd_addr + i, 0, sizeof(*map->osd_addr));
	}

	if (map->osd_primary_affinity) {
		memcpy(affinity, map->osd_primary_affinity,
		       to_copy * sizeof(*affinity));
		kvfree(map->osd_primary_affinity);

		map->osd_primary_affinity = affinity;
		for (i = map->max_osd; i < max; i++)
			map->osd_primary_affinity[i] =
			    CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
	}

	map->max_osd = max;

	return 0;
}
/*
 * Install @crush as the map's crush map, together with a freshly
 * allocated and initialized workspace for it.  The previous crush map
 * and workspace, if any, are released.
 *
 * @crush may be an ERR_PTR, in which case its error is returned and the
 * map is left alone.  If the workspace cannot be allocated, @crush is
 * destroyed and -ENOMEM is returned.
 */
static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
{
	size_t work_size;
	void *workspace;

	if (IS_ERR(crush))
		return PTR_ERR(crush);

	work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE);
	dout("%s work_size %zu bytes\n", __func__, work_size);

	workspace = ceph_kvmalloc(work_size, GFP_NOIO);
	if (workspace == NULL) {
		crush_destroy(crush);
		return -ENOMEM;
	}
	crush_init_workspace(crush, workspace);

	/* release the old pair before installing the new one */
	if (map->crush)
		crush_destroy(map->crush);
	kvfree(map->crush_workspace);

	map->crush = crush;
	map->crush_workspace = workspace;

	return 0;
}
#define OSDMAP_WRAPPER_COMPAT_VER	7
#define OSDMAP_CLIENT_DATA_COMPAT_VER	1

/*
 * Return 0 or error.  On success, *v is set to 0 for old (v6) osdmaps,
 * to struct_v of the client_data section for new (v7 and above)
 * osdmaps.
 *
 * For new osdmaps both the wrapper and the client data section start
 * with (struct_v, struct_compat, u32 struct_len); the lengths are
 * skipped, but only after checking that they are actually present.
 * Old osdmaps start with a 16-bit version instead, which must be >= 6.
 *
 * @prefix is only used in warning messages.
 */
static int get_osdmap_client_data_v(void **p, void *end,
				    const char *prefix, u8 *v)
{
	u8 struct_v;

	ceph_decode_8_safe(p, end, struct_v, e_inval);
	if (struct_v >= 7) {
		u8 struct_compat;

		ceph_decode_8_safe(p, end, struct_compat, e_inval);
		if (struct_compat > OSDMAP_WRAPPER_COMPAT_VER) {
			pr_warn("got v %d cv %d > %d of %s ceph_osdmap\n",
				struct_v, struct_compat,
				OSDMAP_WRAPPER_COMPAT_VER, prefix);
			return -EINVAL;
		}
		/* ignore wrapper struct_len */
		ceph_decode_need(p, end, sizeof(u32), e_inval);
		*p += sizeof(u32);

		ceph_decode_8_safe(p, end, struct_v, e_inval);
		ceph_decode_8_safe(p, end, struct_compat, e_inval);
		if (struct_compat > OSDMAP_CLIENT_DATA_COMPAT_VER) {
			pr_warn("got v %d cv %d > %d of %s ceph_osdmap client data\n",
				struct_v, struct_compat,
				OSDMAP_CLIENT_DATA_COMPAT_VER, prefix);
			return -EINVAL;
		}
		/* ignore client data struct_len */
		ceph_decode_need(p, end, sizeof(u32), e_inval);
		*p += sizeof(u32);
	} else {
		u16 version;

		/* the byte just read is the low half of a 16-bit version */
		*p -= 1;
		ceph_decode_16_safe(p, end, version, e_inval);
		if (version < 6) {
			pr_warn("got v %d < 6 of %s ceph_osdmap\n",
				version, prefix);
			return -EINVAL;
		}

		/* old osdmap encoding */
		struct_v = 0;
	}

	*v = struct_v;
	return 0;

e_inval:
	return -EINVAL;
}
/*
 * Decode a u32 count followed by that many (u64 pool id, encoded pool)
 * records into map->pg_pools.
 *
 * A new pool entry is allocated and inserted unless @incremental is set
 * and the pool already exists, in which case the existing entry is
 * decoded into.  For a non-incremental decode an already present id
 * therefore fails with -EEXIST.
 *
 * Returns 0, -EINVAL, -ENOMEM, -EEXIST, or the error from decode_pool().
 */
static int __decode_pools(void **p, void *end, struct ceph_osdmap *map,
			  bool incremental)
{
	u32 remaining;

	ceph_decode_32_safe(p, end, remaining, e_inval);
	for (; remaining; remaining--) {
		struct ceph_pg_pool_info *pi;
		u64 pool;
		int ret;

		ceph_decode_64_safe(p, end, pool, e_inval);

		pi = lookup_pg_pool(&map->pg_pools, pool);
		if (!pi || !incremental) {
			pi = kzalloc(sizeof(*pi), GFP_NOFS);
			if (!pi)
				return -ENOMEM;

			RB_CLEAR_NODE(&pi->node);
			pi->id = pool;

			if (!__insert_pg_pool(&map->pg_pools, pi)) {
				kfree(pi);
				return -EEXIST;
			}
		}

		ret = decode_pool(p, end, pi);
		if (ret)
			return ret;
	}

	return 0;

e_inval:
	return -EINVAL;
}
/*
 * Decode the pools section with incremental = false, i.e. every pool
 * listed gets a newly allocated entry (see __decode_pools()).
 */
static int decode_pools(void **p, void *end, struct ceph_osdmap *map)
{
return __decode_pools(p, end, map, false);
}
/*
 * Decode the pools section with incremental = true, i.e. pools that are
 * already in the map are updated in place and unknown ones are added
 * (see __decode_pools()).
 */
static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map)
{
return __decode_pools(p, end, map, true);
}
/*
 * Per-entry payload decoder used by decode_pg_mapping().  Called as
 * fn(p, end, incremental); returns a new mapping, NULL if the entry
 * should not be (re)inserted, or an ERR_PTR.
 */
typedef struct ceph_pg_mapping *(*decode_mapping_fn_t)(void **, void *, bool);

/*
 * Decode a u32 count followed by that many (pgid, payload) entries into
 * @mapping_root.
 *
 * For each entry any existing mapping for the pgid is removed first
 * (with a warning if this is not an incremental decode).  If @fn is
 * given it decodes the payload, and a non-NULL result is inserted under
 * the pgid.  With @fn == NULL entries consist of the pgid only and the
 * call merely removes mappings; that is only meaningful for incremental
 * decodes, hence the warning otherwise.
 *
 * Returns 0, -EINVAL, or an error from ceph_decode_pgid() or @fn.
 */
static int decode_pg_mapping(void **p, void *end, struct rb_root *mapping_root,
			     decode_mapping_fn_t fn, bool incremental)
{
	u32 remaining;

	WARN_ON(!incremental && !fn);

	ceph_decode_32_safe(p, end, remaining, e_inval);
	for (; remaining; remaining--) {
		struct ceph_pg_mapping *stale, *fresh;
		struct ceph_pg pgid;
		int ret;

		ret = ceph_decode_pgid(p, end, &pgid);
		if (ret)
			return ret;

		stale = lookup_pg_mapping(mapping_root, &pgid);
		if (stale) {
			WARN_ON(!incremental);
			erase_pg_mapping(mapping_root, stale);
			free_pg_mapping(stale);
		}

		if (!fn)
			continue;

		fresh = fn(p, end, incremental);
		if (IS_ERR(fresh))
			return PTR_ERR(fresh);
		if (!fresh)
			continue;

		fresh->pgid = pgid; /* struct */
		insert_pg_mapping(mapping_root, fresh);
	}

	return 0;

e_inval:
	return -EINVAL;
}
/*
 * Decode one pg_temp payload: a u32 length followed by that many u32
 * osd ids.
 *
 * Returns the new mapping (pgid not set), NULL for an empty list in an
 * incremental decode (meaning "remove"), or ERR_PTR(-EINVAL / -ENOMEM).
 */
static struct ceph_pg_mapping *__decode_pg_temp(void **p, void *end,
						bool incremental)
{
	struct ceph_pg_mapping *pg;
	u32 count, i;

	ceph_decode_32_safe(p, end, count, e_inval);
	if (incremental && !count)
		return NULL;	/* new_pg_temp: [] to remove */

	/* the allocation size below must not overflow */
	if (count > (SIZE_MAX - sizeof(*pg)) / sizeof(u32))
		goto e_inval;

	ceph_decode_need(p, end, count * sizeof(u32), e_inval);
	pg = alloc_pg_mapping(count * sizeof(u32));
	if (!pg)
		return ERR_PTR(-ENOMEM);

	pg->pg_temp.len = count;
	for (i = 0; i < count; i++)
		pg->pg_temp.osds[i] = ceph_decode_32(p);

	return pg;

e_inval:
	return ERR_PTR(-EINVAL);
}
/*
 * Decode a pg_temp section into map->pg_temp with incremental = false
 * (see decode_pg_mapping() and __decode_pg_temp()).
 */
static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map)
{
return decode_pg_mapping(p, end, &map->pg_temp, __decode_pg_temp,
false);
}
/*
 * Decode a pg_temp section into map->pg_temp with incremental = true:
 * existing entries are replaced, and an empty osd list removes the
 * entry (see decode_pg_mapping() and __decode_pg_temp()).
 */
static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map)
{
return decode_pg_mapping(p, end, &map->pg_temp, __decode_pg_temp,
true);
}
/*
 * Decode one primary_temp payload: a single u32 osd id.
 *
 * Returns the new mapping (pgid not set), NULL for an osd of -1 in an
 * incremental decode (meaning "remove"), or ERR_PTR(-EINVAL / -ENOMEM).
 */
static struct ceph_pg_mapping *__decode_primary_temp(void **p, void *end,
						     bool incremental)
{
	struct ceph_pg_mapping *pg;
	u32 osd;

	ceph_decode_32_safe(p, end, osd, e_inval);
	if (incremental && osd == (u32)-1)
		return NULL;	/* new_primary_temp: -1 to remove */

	pg = alloc_pg_mapping(0);
	if (!pg)
		return ERR_PTR(-ENOMEM);

	pg->primary_temp.osd = osd;

	return pg;

e_inval:
	return ERR_PTR(-EINVAL);
}
/*
 * Decode a primary_temp section into map->primary_temp with
 * incremental = false (see decode_pg_mapping() and
 * __decode_primary_temp()).
 */
static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map)
{
return decode_pg_mapping(p, end, &map->primary_temp,
__decode_primary_temp, false);
}
/*
 * Decode a primary_temp section into map->primary_temp with
 * incremental = true: existing entries are replaced, and an osd of -1
 * removes the entry (see decode_pg_mapping() and
 * __decode_primary_temp()).
 */
static int decode_new_primary_temp(void **p, void *end,
struct ceph_osdmap *map)
{
return decode_pg_mapping(p, end, &map->primary_temp,
__decode_primary_temp, true);
}
/*
 * Return the primary affinity of @osd, or
 * CEPH_OSD_DEFAULT_PRIMARY_AFFINITY if the map has no affinity array.
 * BUGs if @osd is not below map->max_osd.
 */
u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd)
{
	BUG_ON(osd >= map->max_osd);

	if (map->osd_primary_affinity)
		return map->osd_primary_affinity[osd];

	return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
}
/*
 * Set the primary affinity of @osd to @aff.  The affinity array is
 * allocated on first use, with every entry set to
 * CEPH_OSD_DEFAULT_PRIMARY_AFFINITY.
 *
 * BUGs if @osd is not below map->max_osd.  Returns 0 or -ENOMEM.
 */
static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff)
{
	BUG_ON(osd >= map->max_osd);

	if (!map->osd_primary_affinity) {
		size_t bytes = array_size(map->max_osd,
					  sizeof(*map->osd_primary_affinity));
		int i;

		map->osd_primary_affinity = ceph_kvmalloc(bytes, GFP_NOFS);
		if (!map->osd_primary_affinity)
			return -ENOMEM;

		for (i = 0; i < map->max_osd; i++)
			map->osd_primary_affinity[i] =
			    CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
	}

	map->osd_primary_affinity[osd] = aff;

	return 0;
}
/*
 * Decode the full primary affinity array: a u32 length followed by that
 * many u32 values.  A length of 0 drops the map's affinity array; any
 * other length must equal map->max_osd.
 *
 * Returns 0, -EINVAL, or the error from set_primary_affinity().
 */
static int decode_primary_affinity(void **p, void *end,
				   struct ceph_osdmap *map)
{
	u32 len, osd;

	ceph_decode_32_safe(p, end, len, e_inval);
	if (!len) {
		kvfree(map->osd_primary_affinity);
		map->osd_primary_affinity = NULL;
		return 0;
	}
	if (len != map->max_osd)
		goto e_inval;

	ceph_decode_need(p, end, map->max_osd*sizeof(u32), e_inval);

	for (osd = 0; osd < map->max_osd; osd++) {
		int ret = set_primary_affinity(map, osd, ceph_decode_32(p));

		if (ret)
			return ret;
	}

	return 0;

e_inval:
	return -EINVAL;
}
static int decode_new_primary_affinity(void **p, void *end,
struct ceph_osdmap *map)
{
u32 n;
ceph_decode_32_safe(p, end, n, e_inval);
while (n--) {
u32 osd, aff;
int ret;
ceph_decode_32_safe(p, end, osd, e_inval);
ceph_decode_32_safe(p, end, aff, e_inval);
ret = set_primary_affinity(map, osd, aff);
if (ret)
return ret;
pr_info("osd%d primary-affinity 0x%x\n", osd, aff);
}
return 0;
e_inval:
return -EINVAL;
}
/*
 * Decode one pg_upmap payload.  The layout is the same as for pg_temp,
 * so this defers to __decode_pg_temp(), but always with incremental =
 * false: the caller's flag is ignored, which means an empty osd list is
 * never treated as "remove" here.
 */
static struct ceph_pg_mapping *__decode_pg_upmap(void **p, void *end,
bool __unused)
{
return __decode_pg_temp(p, end, false);
}
/*
 * Decode a pg_upmap section into map->pg_upmap with incremental = false
 * (see decode_pg_mapping() and __decode_pg_upmap()).
 */
static int decode_pg_upmap(void **p, void *end, struct ceph_osdmap *map)
{
return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap,
false);
}
/*
 * Decode a pg_upmap section into map->pg_upmap with incremental = true:
 * existing entries for the listed pgids are replaced (see
 * decode_pg_mapping() and __decode_pg_upmap()).
 */
static int decode_new_pg_upmap(void **p, void *end, struct ceph_osdmap *map)
{
return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap,
true);
}
/*
 * Decode a list of pgids and remove their entries from map->pg_upmap.
 * No payload decoder is passed, so decode_pg_mapping() only erases the
 * existing mappings and inserts nothing.
 */
static int decode_old_pg_upmap(void **p, void *end, struct ceph_osdmap *map)
{
return decode_pg_mapping(p, end, &map->pg_upmap, NULL, true);
}
/*
 * Decode one pg_upmap_items payload: a u32 count followed by that many
 * (u32 from, u32 to) pairs.  The bool argument is unused.
 *
 * Returns the new mapping (pgid not set) or
 * ERR_PTR(-EINVAL / -ENOMEM).
 */
static struct ceph_pg_mapping *__decode_pg_upmap_items(void **p, void *end,
							bool __unused)
{
	struct ceph_pg_mapping *pg;
	u32 pairs, i;

	ceph_decode_32_safe(p, end, pairs, e_inval);

	/* the allocation size below must not overflow */
	if (pairs > (SIZE_MAX - sizeof(*pg)) / (2 * sizeof(u32)))
		goto e_inval;

	ceph_decode_need(p, end, 2 * pairs * sizeof(u32), e_inval);
	pg = alloc_pg_mapping(2 * pairs * sizeof(u32));
	if (!pg)
		return ERR_PTR(-ENOMEM);

	pg->pg_upmap_items.len = pairs;
	for (i = 0; i < pairs; i++) {
		pg->pg_upmap_items.from_to[i][0] = ceph_decode_32(p);
		pg->pg_upmap_items.from_to[i][1] = ceph_decode_32(p);
	}

	return pg;

e_inval:
	return ERR_PTR(-EINVAL);
}
/*
 * Decode the pg_upmap_items section of a full map into
 * map->pg_upmap_items, using __decode_pg_upmap_items() as the per-entry
 * decoder.
 *
 * NOTE(review): the trailing false is presumably decode_pg_mapping()'s
 * "incremental" flag; its definition is outside this chunk -- confirm.
 */
static int decode_pg_upmap_items(void **p, void *end, struct ceph_osdmap *map)
{
return decode_pg_mapping(p, end, &map->pg_upmap_items,
__decode_pg_upmap_items, false);
}
/*
 * Decode the new_pg_upmap_items section of an incremental map into
 * map->pg_upmap_items, using __decode_pg_upmap_items() as the per-entry
 * decoder.
 *
 * NOTE(review): the trailing true is presumably decode_pg_mapping()'s
 * "incremental" flag; its definition is outside this chunk -- confirm.
 */
static int decode_new_pg_upmap_items(void **p, void *end,
struct ceph_osdmap *map)
{
return decode_pg_mapping(p, end, &map->pg_upmap_items,
__decode_pg_upmap_items, true);
}
/*
 * Decode the old_pg_upmap_items section of an incremental map against
 * map->pg_upmap_items.  No per-entry decoder is supplied (NULL).
 *
 * NOTE(review): a NULL decoder presumably tells decode_pg_mapping() to
 * drop the listed mappings rather than add them -- confirm against its
 * definition, which is outside this chunk.
 */
static int decode_old_pg_upmap_items(void **p, void *end,
struct ceph_osdmap *map)
{
return decode_pg_mapping(p, end, &map->pg_upmap_items, NULL, true);
}
/*
 * Decode a full map into @map.
 *
 * Sections are consumed in wire order: header (fsid, epoch, created,
 * modified), pools, pool names, pool_max, flags, max_osd, per-OSD
 * state/weight/address arrays, pg_temp, primary_temp (struct_v >= 1),
 * primary_affinity (struct_v >= 2), the CRUSH map, erasure code
 * profiles (struct_v >= 3, skipped) and pg_upmap/pg_upmap_items
 * (struct_v >= 4).  Anything after that is ignored.
 *
 * Returns 0 with *p set to @end on success.  On failure a negative
 * error code is returned, the problem is logged and the buffer is
 * hex-dumped at KERN_DEBUG; @map may have been partially filled in.
 */
static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
{
	u8 struct_v;
	u32 epoch = 0;
	void *start = *p;
	u32 max;
	u32 len, i;
	int err;

	dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p));

	err = get_osdmap_client_data_v(p, end, "full", &struct_v);
	if (err)
		goto bad;

	/* fsid, epoch, created, modified */
	ceph_decode_need(p, end, sizeof(map->fsid) + sizeof(u32) +
			 sizeof(map->created) + sizeof(map->modified), e_inval);
	ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
	epoch = map->epoch = ceph_decode_32(p);
	ceph_decode_copy(p, &map->created, sizeof(map->created));
	ceph_decode_copy(p, &map->modified, sizeof(map->modified));

	/* pools */
	err = decode_pools(p, end, map);
	if (err)
		goto bad;

	/* pool_name */
	err = decode_pool_names(p, end, map);
	if (err)
		goto bad;

	ceph_decode_32_safe(p, end, map->pool_max, e_inval);

	ceph_decode_32_safe(p, end, map->flags, e_inval);

	/* max_osd */
	ceph_decode_32_safe(p, end, max, e_inval);

	/* (re)alloc osd arrays */
	err = osdmap_set_max_osd(map, max);
	if (err)
		goto bad;

	/*
	 * osd_state, osd_weight, osd_addrs->client_addr
	 *
	 * Three length prefixes plus, for every OSD, one state entry and
	 * one weight entry are read below with unchecked decodes, so all
	 * of that must be covered here.  The addresses are bounds-checked
	 * individually by ceph_decode_entity_addr().
	 */
	ceph_decode_need(p, end, 3*sizeof(u32) +
			 map->max_osd*((struct_v >= 5 ? sizeof(u32) :
							sizeof(u8)) +
				       sizeof(*map->osd_weight)), e_inval);

	if (ceph_decode_32(p) != map->max_osd)
		goto e_inval;

	if (struct_v >= 5) {
		for (i = 0; i < map->max_osd; i++)
			map->osd_state[i] = ceph_decode_32(p);
	} else {
		for (i = 0; i < map->max_osd; i++)
			map->osd_state[i] = ceph_decode_8(p);
	}

	if (ceph_decode_32(p) != map->max_osd)
		goto e_inval;

	for (i = 0; i < map->max_osd; i++)
		map->osd_weight[i] = ceph_decode_32(p);

	if (ceph_decode_32(p) != map->max_osd)
		goto e_inval;

	for (i = 0; i < map->max_osd; i++) {
		err = ceph_decode_entity_addr(p, end, &map->osd_addr[i]);
		if (err)
			goto bad;
	}

	/* pg_temp */
	err = decode_pg_temp(p, end, map);
	if (err)
		goto bad;

	/* primary_temp */
	if (struct_v >= 1) {
		err = decode_primary_temp(p, end, map);
		if (err)
			goto bad;
	}

	/* primary_affinity */
	if (struct_v >= 2) {
		err = decode_primary_affinity(p, end, map);
		if (err)
			goto bad;
	} else {
		WARN_ON(map->osd_primary_affinity);
	}

	/* crush */
	ceph_decode_32_safe(p, end, len, e_inval);
	err = osdmap_set_crush(map, crush_decode(*p, min(*p + len, end)));
	if (err)
		goto bad;

	*p += len;
	if (struct_v >= 3) {
		/* erasure_code_profiles */
		ceph_decode_skip_map_of_map(p, end, string, string, string,
					    e_inval);
	}

	if (struct_v >= 4) {
		err = decode_pg_upmap(p, end, map);
		if (err)
			goto bad;

		err = decode_pg_upmap_items(p, end, map);
		if (err)
			goto bad;
	} else {
		WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap));
		WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap_items));
	}

	/* ignore the rest */
	*p = end;

	dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd);
	return 0;

e_inval:
	err = -EINVAL;
bad:
	pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n",
	       err, epoch, (int)(*p - start), *p, start, end);
	print_hex_dump(KERN_DEBUG, "osdmap: ",
		       DUMP_PREFIX_OFFSET, 16, 1,
		       start, end - start, true);
	return err;
}
/*
 * Allocate a fresh map and decode a full map into it.
 *
 * Returns the new map, ERR_PTR(-ENOMEM) if the allocation fails, or
 * ERR_PTR() of the decode error, in which case the map is destroyed
 * before returning.
 */
struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
{
	struct ceph_osdmap *map = ceph_osdmap_alloc();
	int err;

	if (!map)
		return ERR_PTR(-ENOMEM);

	err = osdmap_decode(p, end, map);
	if (!err)
		return map;

	ceph_osdmap_destroy(map);
	return ERR_PTR(err);
}
/*
 * Encoding order is (new_up_client, new_state, new_weight).  Need to
 * apply in the (new_weight, new_state, new_up_client) order, because
 * an incremental map may look like e.g.
 *
 *     new_up_client: { osd=6, addr=... } # set osd_state and addr
 *     new_state: { osd=6, xorstate=EXISTS } # clear osd_state
 *
 * The first pass only walks new_up_client and new_state to validate
 * their extent and remember where they start; new_weight is decoded
 * and applied on the way.  The sections are then revisited in apply
 * order and *p is finally left just past new_weight.
 *
 * Returns 0 on success, -EINVAL on a short buffer or an OSD id outside
 * [0, max_osd), or the error from set_primary_affinity().  @map may
 * have been partially updated when an error is returned.
 */
static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
				      struct ceph_osdmap *map)
{
	void *new_up_client;
	void *new_state;
	void *new_weight_end;
	size_t state_entry_len;
	size_t state_len;
	u32 len;
	u32 i;

	new_up_client = *p;
	ceph_decode_32_safe(p, end, len, e_inval);
	for (i = 0; i < len; ++i) {
		struct ceph_entity_addr addr;

		ceph_decode_skip_32(p, end, e_inval);
		if (ceph_decode_entity_addr(p, end, &addr))
			goto e_inval;
	}

	new_state = *p;
	ceph_decode_32_safe(p, end, len, e_inval);
	/*
	 * Keep the byte count in a size_t: storing it back into the u32
	 * count could truncate and let the unchecked second pass below
	 * run past what was actually validated here.
	 */
	state_entry_len = sizeof(u32) +
			  (struct_v >= 5 ? sizeof(u32) : sizeof(u8));
	if (len > SIZE_MAX / state_entry_len)
		goto e_inval;
	state_len = (size_t)len * state_entry_len;
	ceph_decode_need(p, end, state_len, e_inval);
	*p += state_len;

	/* new_weight */
	ceph_decode_32_safe(p, end, len, e_inval);
	while (len--) {
		s32 osd;
		u32 w;

		ceph_decode_need(p, end, 2*sizeof(u32), e_inval);
		osd = ceph_decode_32(p);
		w = ceph_decode_32(p);
		/* the id comes off the wire: reject it instead of BUG()ing */
		if (osd < 0 || osd >= map->max_osd)
			goto e_inval;
		pr_info("osd%d weight 0x%x %s\n", osd, w,
			w == CEPH_OSD_IN ? "(in)" :
			(w == CEPH_OSD_OUT ? "(out)" : ""));
		map->osd_weight[osd] = w;

		/*
		 * If we are marking in, set the EXISTS, and clear the
		 * AUTOOUT and NEW bits.
		 */
		if (w) {
			map->osd_state[osd] |= CEPH_OSD_EXISTS;
			map->osd_state[osd] &= ~(CEPH_OSD_AUTOOUT |
						 CEPH_OSD_NEW);
		}
	}
	new_weight_end = *p;

	/* new_state (up/down) -- extent was validated in the first pass */
	*p = new_state;
	len = ceph_decode_32(p);
	while (len--) {
		s32 osd;
		u32 xorstate;
		int ret;

		osd = ceph_decode_32(p);
		if (struct_v >= 5)
			xorstate = ceph_decode_32(p);
		else
			xorstate = ceph_decode_8(p);
		/* an xorstate of 0 is treated as a toggle of UP */
		if (xorstate == 0)
			xorstate = CEPH_OSD_UP;
		if (osd < 0 || osd >= map->max_osd)
			goto e_inval;
		if ((map->osd_state[osd] & CEPH_OSD_UP) &&
		    (xorstate & CEPH_OSD_UP))
			pr_info("osd%d down\n", osd);
		if ((map->osd_state[osd] & CEPH_OSD_EXISTS) &&
		    (xorstate & CEPH_OSD_EXISTS)) {
			pr_info("osd%d does not exist\n", osd);
			ret = set_primary_affinity(map, osd,
						   CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
			if (ret)
				return ret;
			memset(map->osd_addr + osd, 0, sizeof(*map->osd_addr));
			map->osd_state[osd] = 0;
		} else {
			map->osd_state[osd] ^= xorstate;
		}
	}

	/* new_up_client */
	*p = new_up_client;
	len = ceph_decode_32(p);
	while (len--) {
		s32 osd;
		struct ceph_entity_addr addr;

		osd = ceph_decode_32(p);
		if (osd < 0 || osd >= map->max_osd)
			goto e_inval;
		if (ceph_decode_entity_addr(p, end, &addr))
			goto e_inval;
		pr_info("osd%d up\n", osd);
		map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
		map->osd_addr[osd] = addr;
	}

	*p = new_weight_end;
	return 0;

e_inval:
	return -EINVAL;
}
/*
 * Decode and apply an incremental map update.
 *
 * If the incremental embeds a full map, that map is decoded into a
 * newly allocated osdmap which is returned instead; @map has not been
 * modified at that point.  Otherwise the changes are applied to @map
 * in place and @map is returned.
 *
 * Returns an ERR_PTR on failure, after logging the problem and
 * hex-dumping the buffer at KERN_DEBUG.  In particular, an incremental
 * whose epoch is not exactly map->epoch + 1 is rejected with -EINVAL.
 * @map may have been partially updated when an error is returned.
 */
struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
					     struct ceph_osdmap *map)
{
	struct ceph_fsid fsid;
	u32 epoch = 0;
	struct ceph_timespec modified;
	s32 len;
	u64 pool;
	__s64 new_pool_max;
	__s32 new_flags, max;
	void *start = *p;
	int err;
	u8 struct_v;

	dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p));

	err = get_osdmap_client_data_v(p, end, "inc", &struct_v);
	if (err)
		goto bad;

	/* fsid, epoch, modified, new_pool_max, new_flags */
	ceph_decode_need(p, end, sizeof(fsid) + sizeof(u32) + sizeof(modified) +
			 sizeof(u64) + sizeof(u32), e_inval);
	ceph_decode_copy(p, &fsid, sizeof(fsid));
	epoch = ceph_decode_32(p);
	/* the epoch comes off the wire: fail the decode, don't BUG() */
	if (epoch != map->epoch + 1)
		goto e_inval;
	ceph_decode_copy(p, &modified, sizeof(modified));
	new_pool_max = ceph_decode_64(p);
	new_flags = ceph_decode_32(p);

	/* full map? */
	ceph_decode_32_safe(p, end, len, e_inval);
	if (len > 0) {
		dout("apply_incremental full map len %d, %p to %p\n",
		     len, *p, end);
		return ceph_osdmap_decode(p, min(*p+len, end));
	}

	/* new crush? */
	ceph_decode_32_safe(p, end, len, e_inval);
	if (len > 0) {
		err = osdmap_set_crush(map,
				       crush_decode(*p, min(*p + len, end)));
		if (err)
			goto bad;
		*p += len;
	}

	/* new flags? */
	if (new_flags >= 0)
		map->flags = new_flags;
	if (new_pool_max >= 0)
		map->pool_max = new_pool_max;

	/* new max? */
	ceph_decode_32_safe(p, end, max, e_inval);
	if (max >= 0) {
		err = osdmap_set_max_osd(map, max);
		if (err)
			goto bad;
	}

	map->epoch++;
	map->modified = modified;

	/* new_pools */
	err = decode_new_pools(p, end, map);
	if (err)
		goto bad;

	/* new_pool_names */
	err = decode_pool_names(p, end, map);
	if (err)
		goto bad;

	/* old_pool */
	ceph_decode_32_safe(p, end, len, e_inval);
	while (len--) {
		struct ceph_pg_pool_info *pi;

		ceph_decode_64_safe(p, end, pool, e_inval);
		pi = lookup_pg_pool(&map->pg_pools, pool);
		if (pi)
			__remove_pg_pool(&map->pg_pools, pi);
	}

	/* new_up_client, new_state, new_weight */
	err = decode_new_up_state_weight(p, end, struct_v, map);
	if (err)
		goto bad;

	/* new_pg_temp */
	err = decode_new_pg_temp(p, end, map);
	if (err)
		goto bad;

	/* new_primary_temp */
	if (struct_v >= 1) {
		err = decode_new_primary_temp(p, end, map);
		if (err)
			goto bad;
	}

	/* new_primary_affinity */
	if (struct_v >= 2) {
		err = decode_new_primary_affinity(p, end, map);
		if (err)
			goto bad;
	}

	if (struct_v >= 3) {
		/* new_erasure_code_profiles */
		ceph_decode_skip_map_of_map(p, end, string, string, string,
					    e_inval);
		/* old_erasure_code_profiles */
		ceph_decode_skip_set(p, end, string, e_inval);
	}

	if (struct_v >= 4) {
		err = decode_new_pg_upmap(p, end, map);
		if (err)
			goto bad;

		err = decode_old_pg_upmap(p, end, map);
		if (err)
			goto bad;

		err = decode_new_pg_upmap_items(p, end, map);
		if (err)
			goto bad;

		err = decode_old_pg_upmap_items(p, end, map);
		if (err)
			goto bad;
	}

	/* ignore the rest */
	*p = end;

	dout("inc osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd);
	return map;

e_inval:
	err = -EINVAL;
bad:
	pr_err("corrupt inc osdmap (%d) epoch %d off %d (%p of %p-%p)\n",
	       err, epoch, (int)(*p - start), *p, start, end);
	print_hex_dump(KERN_DEBUG, "osdmap: ",
		       DUMP_PREFIX_OFFSET, 16, 1,
		       start, end - start, true);
	return ERR_PTR(err);
}
/*
 * Make @dest a copy of @src.  Whatever @dest held before is released
 * with ceph_oloc_destroy() first; the pool namespace string, if any,
 * is shared via ceph_get_string() rather than duplicated.
 */
void ceph_oloc_copy(struct ceph_object_locator *dest,
		    const struct ceph_object_locator *src)
{
	ceph_oloc_destroy(dest);

	dest->pool = src->pool;
	dest->pool_ns = src->pool_ns ? ceph_get_string(src->pool_ns) : NULL;
}
EXPORT_SYMBOL(ceph_oloc_copy);
/*
 * Release the locator's pool namespace string with ceph_put_string().
 * oloc->pool_ns itself is left untouched.
 */
void ceph_oloc_destroy(struct ceph_object_locator *oloc)
{
ceph_put_string(oloc->pool_ns);
}
EXPORT_SYMBOL(ceph_oloc_destroy);
/*
 * Make @dest a copy of @src.  Whatever @dest held before is released
 * with ceph_oid_destroy() first.  If @src uses its inline buffer, so
 * does @dest; otherwise a buffer is allocated with __GFP_NOFAIL.  The
 * name is copied together with its terminating NUL.
 */
void ceph_oid_copy(struct ceph_object_id *dest,
		   const struct ceph_object_id *src)
{
	ceph_oid_destroy(dest);

	if (src->name == src->inline_name) {
		dest->name = dest->inline_name;
	} else {
		/* very rare, see ceph_object_id definition */
		dest->name = kmalloc(src->name_len + 1,
				     GFP_NOIO | __GFP_NOFAIL);
	}

	memcpy(dest->name, src->name, src->name_len + 1);
	dest->name_len = src->name_len;
}
EXPORT_SYMBOL(ceph_oid_copy);
/*
 * Format a name into the inline buffer of an (expected to be empty)
 * oid.
 *
 * Returns 0 and sets ->name_len if the formatted name fit.  Otherwise
 * returns the vsnprintf() result, i.e. the length that would have been
 * needed, and leaves ->name_len alone.
 */
static __printf(2, 0)
int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap)
{
	int needed;

	WARN_ON(!ceph_oid_empty(oid));

	needed = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap);
	if (needed < sizeof(oid->inline_name)) {
		oid->name_len = needed;
		return 0;
	}

	return needed;
}
/*
 * Format a name into the oid's inline buffer.  BUGs if the result
 * doesn't fit.
 */
void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...)
{
	va_list args;
	int ret;

	va_start(args, fmt);
	ret = oid_printf_vargs(oid, fmt, args);
	BUG_ON(ret);
	va_end(args);
}
EXPORT_SYMBOL(ceph_oid_printf);
/*
 * Format a name into the oid, trying the inline buffer first (on a
 * copy of @ap) and falling back to a kmalloc'ed buffer of exactly the
 * required size if it doesn't fit.
 *
 * Returns 0 on success or -ENOMEM if the external buffer can't be
 * allocated.
 */
static __printf(3, 0)
int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp,
		      const char *fmt, va_list ap)
{
	va_list probe;
	char *heap_name;
	int len;

	va_copy(probe, ap);
	len = oid_printf_vargs(oid, fmt, probe);
	va_end(probe);

	/* fit into the inline buffer, nothing more to do */
	if (!len)
		return 0;

	heap_name = kmalloc(len + 1, gfp);
	if (!heap_name)
		return -ENOMEM;

	oid->name = heap_name;
	WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len);
	oid->name_len = len;
	return 0;
}
/*
 * Format a name into the oid; if it doesn't fit into the inline
 * buffer, allocate one using @gfp.  Returns 0 or -ENOMEM.
 */
int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
		     const char *fmt, ...)
{
	va_list args;
	int err;

	va_start(args, fmt);
	err = oid_aprintf_vargs(oid, gfp, fmt, args);
	va_end(args);

	return err;
}
EXPORT_SYMBOL(ceph_oid_aprintf);
/*
 * Free the oid's name buffer unless it is the inline one.
 */
void ceph_oid_destroy(struct ceph_object_id *oid)
{
	if (oid->name == oid->inline_name)
		return;

	kfree(oid->name);
}
EXPORT_SYMBOL(ceph_oid_destroy);
/*
 * Compare the OSD arrays only (size and contents); ->primary is not
 * looked at.
 */
static bool __osds_equal(const struct ceph_osds *lhs,
			 const struct ceph_osds *rhs)
{
	if (lhs->size != rhs->size)
		return false;

	return memcmp(lhs->osds, rhs->osds,
		      rhs->size * sizeof(rhs->osds[0])) == 0;
}
/*
 * Compare the OSD arrays and the primary.
 */
static bool osds_equal(const struct ceph_osds *lhs,
		       const struct ceph_osds *rhs)
{
	return __osds_equal(lhs, rhs) && lhs->primary == rhs->primary;
}
/*
 * Sanity check an OSD set.  Accepted shapes are:
 *
 *   - non-empty with a primary (->primary >= 0)
 *   - size 0 with ->primary == -1
 *   - non-empty with ->primary == -1 where every slot is
 *     CRUSH_ITEM_NONE
 */
static bool osds_valid(const struct ceph_osds *set)
{
	int i;

	if (set->size > 0) {
		/* non-empty set */
		if (set->primary >= 0)
			return true;
		if (set->primary != -1)
			return false;

		/* empty !can_shift_osds set - all NONE */
		for (i = 0; i < set->size; i++) {
			if (set->osds[i] != CRUSH_ITEM_NONE)
				return false;
		}
		return true;
	}

	/* empty can_shift_osds set */
	return !set->size && set->primary == -1;
}
void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src)
{
memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0]));
dest->size = src->size;
dest->primary = src->primary;
}
/*
 * Determine whether @pgid gets split when the PG count goes from
 * @old_pg_num to @new_pg_num.
 *
 * Candidate seeds are formed by OR-ing successive multiples of the top
 * bit of @old_pg_num into pgid->seed.  A candidate in
 * [old_pg_num, new_pg_num) that maps back onto pgid->seed under
 * ceph_stable_mod() with the old PG count means the PG is split.  The
 * search stops at the first candidate >= @new_pg_num.
 */
bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num,
		      u32 new_pg_num)
{
	int old_bits = calc_bits_of(old_pg_num);
	int old_mask = (1 << old_bits) - 1;
	int n = 0;

	WARN_ON(pgid->seed >= old_pg_num);
	if (new_pg_num <= old_pg_num)
		return false;

	for (;;) {
		int next_bit;
		u32 candidate;

		n++;
		next_bit = n << (old_bits - 1);
		candidate = next_bit | pgid->seed;

		if (candidate < old_pg_num || candidate == pgid->seed)
			continue;
		if (candidate >= new_pg_num)
			return false;

		if (ceph_stable_mod(candidate, old_pg_num, old_mask) ==
		    pgid->seed)
			return true;
	}
}
/*
 * Decide whether the old and new state belong to different intervals.
 * Returns true as soon as any of the following differs, checked in
 * this order: acting set, up set, pool size, pool min_size, a PG split
 * (see ceph_pg_is_split()), the sort_bitwise flag, the
 * recovery_deletes flag.
 */
bool ceph_is_new_interval(const struct ceph_osds *old_acting,
			  const struct ceph_osds *new_acting,
			  const struct ceph_osds *old_up,
			  const struct ceph_osds *new_up,
			  int old_size,
			  int new_size,
			  int old_min_size,
			  int new_min_size,
			  u32 old_pg_num,
			  u32 new_pg_num,
			  bool old_sort_bitwise,
			  bool new_sort_bitwise,
			  bool old_recovery_deletes,
			  bool new_recovery_deletes,
			  const struct ceph_pg *pgid)
{
	if (!osds_equal(old_acting, new_acting))
		return true;
	if (!osds_equal(old_up, new_up))
		return true;
	if (old_size != new_size)
		return true;
	if (old_min_size != new_min_size)
		return true;
	if (ceph_pg_is_split(pgid, old_pg_num, new_pg_num))
		return true;
	if (old_sort_bitwise != new_sort_bitwise)
		return true;

	return old_recovery_deletes != new_recovery_deletes;
}
/*
 * Return the position of @osd in the acting set, or -1 if it isn't
 * there.
 */
static int calc_pg_rank(int osd, const struct ceph_osds *acting)
{
	int rank = 0;

	while (rank < acting->size) {
		if (acting->osds[rank] == osd)
			return rank;
		rank++;
	}

	return -1;
}
/*
 * Return true if the primary differs between the two acting sets:
 * one set is empty and the other isn't, the primary OSD is different,
 * or the primary sits at a different rank.
 */
static bool primary_changed(const struct ceph_osds *old_acting,
			    const struct ceph_osds *new_acting)
{
	bool old_empty = !old_acting->size;
	bool new_empty = !new_acting->size;

	if (old_empty && new_empty)
		return false; /* both still empty */

	if (old_empty != new_empty)
		return true; /* was empty, now not, or vice versa */

	if (old_acting->primary != new_acting->primary)
		return true; /* primary changed */

	/* same primary (tho replicas may have changed) unless rank moved */
	return calc_pg_rank(old_acting->primary, old_acting) !=
	       calc_pg_rank(new_acting->primary, new_acting);
}
/*
 * Return true if the primary changed or, when @any_change is set, if
 * the OSD arrays differ in any way.
 */
bool ceph_osds_changed(const struct ceph_osds *old_acting,
		       const struct ceph_osds *new_acting,
		       bool any_change)
{
	return primary_changed(old_acting, new_acting) ||
	       (any_change && !__osds_equal(old_acting, new_acting));
}
/*
* Map an object into a PG.
*
* Should only be called with target_oid and target_oloc (as opposed to
* base_oid and base_oloc), since tiering isn't taken into account.
*/
void __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi,
const struct ceph_object_id *oid,
const struct ceph_object_locator *oloc,
struct ceph_pg *raw_pgid)
{
WARN_ON(pi->id != oloc->pool);
if (!oloc->pool_ns) {
raw_pgid->pool = oloc->pool;
raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
oid->name_len);
dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name,
raw_pgid->pool, raw_pgid->seed);
} else {
char stack_buf[256];
char *buf = stack_buf;
int nsl = oloc->pool_ns->len;
size_t total = nsl + 1 + oid->name_len;
if (total > sizeof(stack_buf))
buf = kmalloc(total, GFP_NOIO | __GFP_NOFAIL);
memcpy(buf, oloc->pool_ns->str, nsl);
buf[nsl] = '\037';
memcpy(buf + nsl + 1, oid->name, oid->name_len);
raw_pgid->pool = oloc->pool;
raw_pgid->seed = ceph_str_hash(pi->object_hash, buf, total);
if (buf != stack_buf)
kfree(buf);
dout("%s %s ns %.*s -> raw_pgid %llu.%x\n", __func__,
oid->name, nsl, oloc->pool_ns->str,
raw_pgid->pool, raw_pgid->seed);
}
}
/*
 * Look up the pool named by @oloc in @osdmap and map the object into a
 * raw PG.  Returns 0, or -ENOENT if the pool doesn't exist.
 */
int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
			      const struct ceph_object_id *oid,
			      const struct ceph_object_locator *oloc,
			      struct ceph_pg *raw_pgid)
{
	struct ceph_pg_pool_info *pi = ceph_pg_pool_by_id(osdmap, oloc->pool);

	if (pi) {
		__ceph_object_locator_to_pg(pi, oid, oloc, raw_pgid);
		return 0;
	}

	return -ENOENT;
}
EXPORT_SYMBOL(ceph_object_locator_to_pg);
/*
 * Map a raw PG (full precision ps) into an actual PG: the seed is
 * folded with ceph_stable_mod() using the pool's pg_num and
 * pg_num_mask, the pool id is carried over unchanged.
 */
static void raw_pg_to_pg(struct ceph_pg_pool_info *pi,
			 const struct ceph_pg *raw_pgid,
			 struct ceph_pg *pgid)
{
	pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num,
				     pi->pg_num_mask);
	pgid->pool = raw_pgid->pool;
}
/*
 * Map a raw PG (full precision ps) into a placement ps (placement
 * seed).  Include pool id in that value so that different pools don't
 * use the same seeds.
 */
static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi,
			 const struct ceph_pg *raw_pgid)
{
	if (!(pi->flags & CEPH_POOL_FLAG_HASHPSPOOL)) {
		/*
		 * legacy behavior: add ps and pool together.  this is
		 * not a great approach because the PGs from each pool
		 * will overlap on top of each other: 0.5 == 1.4 ==
		 * 2.3 == ...
		 */
		return ceph_stable_mod(raw_pgid->seed, pi->pgp_num,
				       pi->pgp_num_mask) +
		       (unsigned)raw_pgid->pool;
	}

	/* hash pool id and seed so that pool PGs do not overlap */
	return crush_hash32_2(CRUSH_HASH_RJENKINS1,
			      ceph_stable_mod(raw_pgid->seed,
					      pi->pgp_num,
					      pi->pgp_num_mask),
			      raw_pgid->pool);
}
/*
 * Magic value used for a "default" fallback choose_args, used if the
 * crush_choose_arg_map passed to do_crush() does not exist.  If this
 * also doesn't exist, fall back to canonical weights.
 */
#define CEPH_DEFAULT_CHOOSE_ARGS	-1

/*
 * Run CRUSH rule @ruleno on input @x, storing up to @result_max items
 * in @result.  The choose_args for @choose_args_index are used if they
 * exist, then the default ones, then none.  crush_do_rule() is called
 * with map->crush_workspace_mutex held; its return value is passed
 * through.
 */
static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
		    int *result, int result_max,
		    const __u32 *weight, int weight_max,
		    s64 choose_args_index)
{
	struct crush_choose_arg_map *arg_map;
	int ret;

	BUG_ON(result_max > CEPH_PG_MAX_SIZE);

	arg_map = lookup_choose_arg_map(&map->crush->choose_args,
					choose_args_index);
	if (arg_map == NULL)
		arg_map = lookup_choose_arg_map(&map->crush->choose_args,
						CEPH_DEFAULT_CHOOSE_ARGS);

	mutex_lock(&map->crush_workspace_mutex);
	ret = crush_do_rule(map->crush, ruleno, x, result, result_max,
			    weight, weight_max, map->crush_workspace,
			    arg_map ? arg_map->args : NULL);
	mutex_unlock(&map->crush_workspace_mutex);

	return ret;
}
/*
 * Filter OSDs that don't exist out of @set.  If the pool allows
 * shifting, survivors are compacted to the left and ->size shrinks;
 * otherwise the slots are kept and set to CRUSH_ITEM_NONE.
 */
static void remove_nonexistent_osds(struct ceph_osdmap *osdmap,
				    struct ceph_pg_pool_info *pi,
				    struct ceph_osds *set)
{
	int removed = 0;
	int i;

	if (!ceph_can_shift_osds(pi)) {
		/* set dne devices to NONE */
		for (i = 0; i < set->size; i++) {
			if (!ceph_osd_exists(osdmap, set->osds[i]))
				set->osds[i] = CRUSH_ITEM_NONE;
		}
		return;
	}

	/* shift left */
	for (i = 0; i < set->size; i++) {
		if (ceph_osd_exists(osdmap, set->osds[i])) {
			if (removed)
				set->osds[i - removed] = set->osds[i];
		} else {
			removed++;
		}
	}
	set->size -= removed;
}
/*
 * Calculate raw set (CRUSH output) for given PG and filter out
 * nonexistent OSDs.  ->primary is undefined for a raw set.
 *
 * Placement seed (CRUSH input) is returned through @ppps.
 *
 * If there is no matching CRUSH rule, the pool is wider than the
 * osds array or CRUSH fails, an error is logged and @raw is left as
 * initialized by ceph_osds_init().
 */
static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
			   struct ceph_pg_pool_info *pi,
			   const struct ceph_pg *raw_pgid,
			   struct ceph_osds *raw,
			   u32 *ppps)
{
	u32 pps = raw_pg_to_pps(pi, raw_pgid);
	int ruleno;
	int nr_osds;

	ceph_osds_init(raw);
	if (ppps)
		*ppps = pps;

	ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type,
				 pi->size);
	if (ruleno < 0) {
		pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
		       pi->id, pi->crush_ruleset, pi->type, pi->size);
		return;
	}

	if (pi->size > ARRAY_SIZE(raw->osds)) {
		pr_err_ratelimited("pool %lld ruleset %d type %d too wide: size %d > %zu\n",
		       pi->id, pi->crush_ruleset, pi->type, pi->size,
		       ARRAY_SIZE(raw->osds));
		return;
	}

	nr_osds = do_crush(osdmap, ruleno, pps, raw->osds, pi->size,
			   osdmap->osd_weight, osdmap->max_osd, pi->id);
	if (nr_osds < 0) {
		pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
		       nr_osds, ruleno, pi->id, pi->crush_ruleset, pi->type,
		       pi->size);
		return;
	}

	raw->size = nr_osds;
	remove_nonexistent_osds(osdmap, pi, raw);
}
/*
 * apply pg_upmap[_items] mappings
 *
 * A pg_upmap entry for @pgid replaces the whole raw set.  If any of its
 * in-range targets has zero weight the function returns immediately,
 * applying neither the pg_upmap nor any pg_upmap_items entry.
 *
 * pg_upmap_items entries are then applied to @raw as individual
 * from -> to substitutions, each of which is skipped if "to" already
 * appears in the set, if "from" isn't found, or if "to" is an in-range
 * OSD with zero weight.
 */
static void apply_upmap(struct ceph_osdmap *osdmap,
const struct ceph_pg *pgid,
struct ceph_osds *raw)
{
struct ceph_pg_mapping *pg;
int i, j;
pg = lookup_pg_mapping(&osdmap->pg_upmap, pgid);
if (pg) {
/* make sure targets aren't marked out */
for (i = 0; i < pg->pg_upmap.len; i++) {
int osd = pg->pg_upmap.osds[i];
if (osd != CRUSH_ITEM_NONE &&
osd < osdmap->max_osd &&
osdmap->osd_weight[osd] == 0) {
/* reject/ignore explicit mapping */
return;
}
}
/* all targets acceptable: overwrite the raw set wholesale */
for (i = 0; i < pg->pg_upmap.len; i++)
raw->osds[i] = pg->pg_upmap.osds[i];
raw->size = pg->pg_upmap.len;
/* check and apply pg_upmap_items, if any */
}
pg = lookup_pg_mapping(&osdmap->pg_upmap_items, pgid);
if (pg) {
/*
* Note: this approach does not allow a bidirectional swap,
* e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
*/
for (i = 0; i < pg->pg_upmap_items.len; i++) {
int from = pg->pg_upmap_items.from_to[i][0];
int to = pg->pg_upmap_items.from_to[i][1];
/* index of the first "from" in raw, -1 until found */
int pos = -1;
bool exists = false;
/* make sure replacement doesn't already appear */
for (j = 0; j < raw->size; j++) {
int osd = raw->osds[j];
if (osd == to) {
exists = true;
break;
}
/* ignore mapping if target is marked out */
if (osd == from && pos < 0 &&
!(to != CRUSH_ITEM_NONE &&
to < osdmap->max_osd &&
osdmap->osd_weight[to] == 0)) {
pos = j;
}
}
/* substitute only if "from" was found and "to" is new */
if (!exists && pos >= 0)
raw->osds[pos] = to;
}
}
}
/*
 * Given raw set, calculate up set and up primary.  By definition of an
 * up set, the result won't contain nonexistent or down OSDs.
 *
 * This is done in-place - on return @set is the up set.  If it's
 * empty, ->primary will remain undefined.
 *
 * If the pool allows shifting, down OSDs are squeezed out and the
 * first survivor becomes primary.  Otherwise down OSDs are replaced
 * with CRUSH_ITEM_NONE and the lowest-indexed remaining OSD becomes
 * primary.
 */
static void raw_to_up_osds(struct ceph_osdmap *osdmap,
			   struct ceph_pg_pool_info *pi,
			   struct ceph_osds *set)
{
	int removed = 0;
	int i;

	/* ->primary is undefined for a raw set */
	BUG_ON(set->primary != -1);

	if (!ceph_can_shift_osds(pi)) {
		/*
		 * set down/dne devices to NONE; walking backwards leaves
		 * ->primary at the lowest-indexed OSD that isn't down
		 */
		for (i = set->size - 1; i >= 0; i--) {
			if (ceph_osd_is_down(osdmap, set->osds[i]))
				set->osds[i] = CRUSH_ITEM_NONE;
			else
				set->primary = set->osds[i];
		}
		return;
	}

	/* shift left */
	for (i = 0; i < set->size; i++) {
		if (!ceph_osd_is_down(osdmap, set->osds[i])) {
			if (removed)
				set->osds[i - removed] = set->osds[i];
		} else {
			removed++;
		}
	}
	set->size -= removed;

	if (set->size > 0)
		set->primary = set->osds[0];
}
/*
 * Re-pick the primary of the up set according to per-OSD primary
 * affinity.
 *
 * Nothing happens if the map has no osd_primary_affinity array or if
 * every OSD in @up has the default affinity.  Otherwise the OSDs are
 * walked in order: one with less than the maximum affinity is passed
 * over when the upper 16 bits of crush_hash32_2(pps, osd) are >= its
 * affinity.  The first OSD not passed over becomes primary; if all are
 * passed over, the first one that was passed over is used.
 *
 * For pools that allow shifting, the new primary is also moved to the
 * front of the array.
 */
static void apply_primary_affinity(struct ceph_osdmap *osdmap,
struct ceph_pg_pool_info *pi,
u32 pps,
struct ceph_osds *up)
{
int i;
/* index of the chosen primary in up->osds, -1 while none */
int pos = -1;
/*
* Do we have any non-default primary_affinity values for these
* osds?
*/
if (!osdmap->osd_primary_affinity)
return;
for (i = 0; i < up->size; i++) {
int osd = up->osds[i];
if (osd != CRUSH_ITEM_NONE &&
osdmap->osd_primary_affinity[osd] !=
CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
break;
}
}
/* loop ran to completion: only default affinities, keep primary */
if (i == up->size)
return;
/*
* Pick the primary. Feed both the seed (for the pg) and the
* osd into the hash/rng so that a proportional fraction of an
* osd's pgs get rejected as primary.
*/
for (i = 0; i < up->size; i++) {
int osd = up->osds[i];
u32 aff;
if (osd == CRUSH_ITEM_NONE)
continue;
aff = osdmap->osd_primary_affinity[osd];
if (aff < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
(crush_hash32_2(CRUSH_HASH_RJENKINS1,
pps, osd) >> 16) >= aff) {
/*
* We chose not to use this primary. Note it
* anyway as a fallback in case we don't pick
* anyone else, but keep looking.
*/
if (pos < 0)
pos = i;
} else {
pos = i;
break;
}
}
/* every slot was CRUSH_ITEM_NONE */
if (pos < 0)
return;
up->primary = up->osds[pos];
if (ceph_can_shift_osds(pi) && pos > 0) {
/* move the new primary to the front */
for (i = pos; i > 0; i--)
up->osds[i] = up->osds[i - 1];
up->osds[0] = up->primary;
}
}
/*
 * Get pg_temp and primary_temp mappings for given PG.
 *
 * Note that a PG may have none, only pg_temp, only primary_temp or
 * both pg_temp and primary_temp mappings.  This means @temp isn't
 * always a valid OSD set on return: in the "only primary_temp" case,
 * @temp will have its ->primary >= 0 but ->size == 0.
 *
 * Down OSDs in pg_temp are dropped if the pool allows shifting and
 * replaced with CRUSH_ITEM_NONE otherwise.
 */
static void get_temp_osds(struct ceph_osdmap *osdmap,
			  struct ceph_pg_pool_info *pi,
			  const struct ceph_pg *pgid,
			  struct ceph_osds *temp)
{
	struct ceph_pg_mapping *pg;
	int i;

	ceph_osds_init(temp);

	/* pg_temp? */
	pg = lookup_pg_mapping(&osdmap->pg_temp, pgid);
	if (pg) {
		for (i = 0; i < pg->pg_temp.len; i++) {
			if (!ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
				temp->osds[temp->size++] = pg->pg_temp.osds[i];
				continue;
			}

			if (!ceph_can_shift_osds(pi))
				temp->osds[temp->size++] = CRUSH_ITEM_NONE;
		}

		/* apply pg_temp's primary: first slot that isn't NONE */
		i = 0;
		while (i < temp->size && temp->osds[i] == CRUSH_ITEM_NONE)
			i++;
		if (i < temp->size)
			temp->primary = temp->osds[i];
	}

	/* primary_temp? */
	pg = lookup_pg_mapping(&osdmap->primary_temp, pgid);
	if (pg)
		temp->primary = pg->primary_temp.osd;
}
/*
 * Map a PG to its acting set as well as its up set.
 *
 * Acting set is used for data mapping purposes, while up set can be
 * recorded for detecting interval changes and deciding whether to
 * resend a request.
 *
 * The up set is CRUSH output with upmaps applied, down OSDs filtered
 * and primary affinity taken into account.  The acting set comes from
 * pg_temp/primary_temp; if there is no pg_temp, it is the up set, with
 * the up primary used unless primary_temp provided one.
 */
void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
			       struct ceph_pg_pool_info *pi,
			       const struct ceph_pg *raw_pgid,
			       struct ceph_osds *up,
			       struct ceph_osds *acting)
{
	struct ceph_pg pgid;
	u32 pps;

	WARN_ON(pi->id != raw_pgid->pool);
	raw_pg_to_pg(pi, raw_pgid, &pgid);

	/* up set */
	pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps);
	apply_upmap(osdmap, &pgid, up);
	raw_to_up_osds(osdmap, pi, up);
	apply_primary_affinity(osdmap, pi, pps, up);

	/* acting set */
	get_temp_osds(osdmap, pi, &pgid, acting);
	if (acting->size == 0) {
		memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0]));
		acting->size = up->size;
		if (acting->primary == -1)
			acting->primary = up->primary;
	}

	WARN_ON(!osds_valid(up) || !osds_valid(acting));
}
/*
 * Fill in @spgid for the primary of the given PG.
 *
 * For pools that allow shifting the shard is always CEPH_SPG_NOSHARD.
 * Otherwise the shard is the position of the acting primary in the
 * acting set.  Returns false if the acting primary isn't found there,
 * in which case @spgid is not written.
 */
bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
			      struct ceph_pg_pool_info *pi,
			      const struct ceph_pg *raw_pgid,
			      struct ceph_spg *spgid)
{
	struct ceph_pg pgid;
	struct ceph_osds up, acting;
	int shard;

	WARN_ON(pi->id != raw_pgid->pool);
	raw_pg_to_pg(pi, raw_pgid, &pgid);

	if (ceph_can_shift_osds(pi)) {
		spgid->pgid = pgid; /* struct */
		spgid->shard = CEPH_SPG_NOSHARD;
		return true;
	}

	ceph_pg_to_up_acting_osds(osdmap, pi, &pgid, &up, &acting);
	for (shard = 0; shard < acting.size; shard++) {
		if (acting.osds[shard] != acting.primary)
			continue;

		spgid->pgid = pgid; /* struct */
		spgid->shard = shard;
		return true;
	}

	return false;
}
/*
* Return acting primary for given PG, or -1 if none.
*/
int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
const struct ceph_pg *raw_pgid)
{
struct ceph_pg_pool_info *pi;
struct ceph_osds up, acting;
pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
if (!pi)
return -1;
ceph_pg_to_up_acting_osds(osdmap, pi, raw_pgid, &up, &acting);
return acting.primary;
}
EXPORT_SYMBOL(ceph_pg_to_acting_primary);
/*
 * Allocate a crush_loc_node with room for two strings of the given
 * lengths plus their NUL terminators.  The rbtree node is cleared, the
 * rest is left uninitialized.  Returns NULL on allocation failure.
 */
static struct crush_loc_node *alloc_crush_loc(size_t type_name_len,
					      size_t name_len)
{
	struct crush_loc_node *loc;

	loc = kmalloc(sizeof(*loc) + type_name_len + name_len + 2, GFP_NOIO);
	if (loc)
		RB_CLEAR_NODE(&loc->cl_node);

	return loc;
}
/*
 * Free a crush_loc_node.  Warns if its rbtree node isn't marked empty,
 * i.e. if it looks like it is still linked into a tree.
 */
static void free_crush_loc(struct crush_loc_node *loc)
{
WARN_ON(!RB_EMPTY_NODE(&loc->cl_node));
kfree(loc);
}
/*
 * Order locations by bucket type name first and bucket name second,
 * both with strcmp().
 */
static int crush_loc_compare(const struct crush_loc *loc1,
			     const struct crush_loc *loc2)
{
	int cmp = strcmp(loc1->cl_type_name, loc2->cl_type_name);

	if (cmp)
		return cmp;

	return strcmp(loc1->cl_name, loc2->cl_name);
}
/*
 * Instantiate rbtree helpers for struct crush_loc_node, keyed by the
 * embedded cl_loc and ordered with crush_loc_compare().
 *
 * NOTE(review): presumably this is what provides __insert_crush_loc(),
 * lookup_crush_loc() and erase_crush_loc() used in this file -- confirm
 * against the DEFINE_RB_FUNCS2 definition.
 */
DEFINE_RB_FUNCS2(crush_loc, struct crush_loc_node, cl_loc, crush_loc_compare,
RB_BYPTR, const struct crush_loc *, cl_node)
/*
 * Parses a set of <bucket type name>':'<bucket name> pairs separated
 * by '|', e.g. "rack:foo1|rack:foo2|datacenter:bar", inserting one
 * node per pair into @locs.
 *
 * Note that @crush_location is modified by strsep().
 *
 * Returns 0 on success, -EINVAL if a pair has no ':' or an empty type
 * name or name, -ENOMEM on allocation failure and -EEXIST if a pair
 * could not be inserted.  Nodes inserted before an error are left in
 * @locs.
 */
int ceph_parse_crush_location(char *crush_location, struct rb_root *locs)
{
	struct crush_loc_node *loc;
	const char *type_name, *name, *colon;
	size_t type_name_len, name_len;

	dout("%s '%s'\n", __func__, crush_location);

	for (type_name = strsep(&crush_location, "|");
	     type_name;
	     type_name = strsep(&crush_location, "|")) {
		colon = strchr(type_name, ':');
		if (!colon)
			return -EINVAL;

		type_name_len = colon - type_name;
		if (type_name_len == 0)
			return -EINVAL;

		name = colon + 1;
		name_len = strlen(name);
		if (name_len == 0)
			return -EINVAL;

		loc = alloc_crush_loc(type_name_len, name_len);
		if (!loc)
			return -ENOMEM;

		/* both strings live back to back in cl_data */
		loc->cl_loc.cl_type_name = loc->cl_data;
		memcpy(loc->cl_loc.cl_type_name, type_name, type_name_len);
		loc->cl_loc.cl_type_name[type_name_len] = '\0';

		loc->cl_loc.cl_name = loc->cl_data + type_name_len + 1;
		memcpy(loc->cl_loc.cl_name, name, name_len);
		loc->cl_loc.cl_name[name_len] = '\0';

		if (!__insert_crush_loc(locs, loc)) {
			free_crush_loc(loc);
			return -EEXIST;
		}

		dout("%s type_name '%s' name '%s'\n", __func__,
		     loc->cl_loc.cl_type_name, loc->cl_loc.cl_name);
	}

	return 0;
}
/*
 * Compare two location trees element by element in tree order.
 * Returns the first non-zero crush_loc_compare() result; if one tree
 * is a prefix of the other, the shorter one sorts first (-1 if @locs1
 * is shorter, 1 if @locs2 is).  Returns 0 if they are equal.
 */
int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2)
{
	struct rb_node *n1 = rb_first(locs1);
	struct rb_node *n2 = rb_first(locs2);

	while (n1 && n2) {
		struct crush_loc_node *loc1 =
		    rb_entry(n1, struct crush_loc_node, cl_node);
		struct crush_loc_node *loc2 =
		    rb_entry(n2, struct crush_loc_node, cl_node);
		int cmp = crush_loc_compare(&loc1->cl_loc, &loc2->cl_loc);

		if (cmp)
			return cmp;

		n1 = rb_next(n1);
		n2 = rb_next(n2);
	}

	/* at most one of the two is still non-NULL here */
	if (n1)
		return 1;
	if (n2)
		return -1;
	return 0;
}
void ceph_clear_crush_locs(struct rb_root *locs)
{
while (!RB_EMPTY_ROOT(locs)) {
struct crush_loc_node *loc =
rb_entry(rb_first(locs), struct crush_loc_node, cl_node);
erase_crush_loc(locs, loc);
free_crush_loc(loc);
}
}
/*
 * Check that @name matches [a-zA-Z0-9-_.]+, i.e. is non-empty and
 * consists only of ASCII letters, digits, '-', '_' and '.'.
 */
static bool is_valid_crush_name(const char *name)
{
	const char *c;

	/* '+', not '*': an empty name doesn't match */
	if (*name == '\0')
		return false;

	for (c = name; *c != '\0'; c++) {
		if (*c >= 'a' && *c <= 'z')
			continue;
		if (*c >= 'A' && *c <= 'Z')
			continue;
		if (*c >= '0' && *c <= '9')
			continue;
		if (*c == '-' || *c == '_' || *c == '.')
			continue;

		return false;
	}

	return true;
}
/*
 * Gets the parent of an item.  Returns its id (<0 because the
 * parent is always a bucket), type id (>0 for the same reason,
 * via @parent_type_id) and location (via @parent_loc).  If no
 * parent, returns 0.
 *
 * Does a linear search, as there are no parent pointers of any
 * kind.  Note that the result is ambiguous for items that occur
 * multiple times in the map.
 *
 * Buckets without a name, with a name that isn't a valid CRUSH name
 * or whose type has no name are not considered.  The strings returned
 * through @parent_loc point into the CRUSH map's name tables and are
 * not copied.
 */
static int get_immediate_parent(struct crush_map *c, int id,
				u16 *parent_type_id,
				struct crush_loc *parent_loc)
{
	struct crush_bucket *b;
	struct crush_name_node *type_cn, *cn;
	int i, j;

	for (i = 0; i < c->max_buckets; i++) {
		b = c->buckets[i];
		if (!b)
			continue;

		/* ignore per-class shadow hierarchy */
		cn = lookup_crush_name(&c->names, b->id);
		if (!cn || !is_valid_crush_name(cn->cn_name))
			continue;

		for (j = 0; j < b->size; j++) {
			if (b->items[j] != id)
				continue;

			/*
			 * A location needs a type name.  Don't assume
			 * the map has one for every bucket type: skip
			 * this bucket rather than dereference NULL.
			 */
			type_cn = lookup_crush_name(&c->type_names, b->type);
			if (!type_cn)
				break;

			*parent_type_id = b->type;
			parent_loc->cl_type_name = type_cn->cn_name;
			parent_loc->cl_name = cn->cn_name;
			return b->id;
		}
	}

	return 0;  /* no parent */
}
/*
 * Calculates the locality/distance from an item to a client
 * location expressed in terms of CRUSH hierarchy as a set of
 * (bucket type name, bucket name) pairs.  Specifically, looks
 * for the lowest-valued bucket type for which the location of
 * @id matches one of the locations in @locs, so for standard
 * bucket types (host = 1, rack = 3, datacenter = 8, zone = 9)
 * a matching host is closer than a matching rack and a matching
 * data center is closer than a matching zone.
 *
 * Specifying multiple locations (a "multipath" location) such
 * as "rack:foo1|rack:foo2|datacenter:bar" is allowed -- @locs
 * may hold several names for the same bucket type.  The locality
 * will be:
 *
 * - 3 for OSDs in racks foo1 and foo2
 * - 8 for OSDs in data center bar
 * - -1 for all other OSDs
 *
 * The lowest possible bucket type is 1, so the best locality
 * for an OSD is 1 (i.e. a matching host).  Locality 0 would be
 * the OSD itself.
 */
int ceph_get_crush_locality(struct ceph_osdmap *osdmap, int id,
			    struct rb_root *locs)
{
	struct crush_loc loc;
	u16 type_id;
	int cur = id;

	/*
	 * Instead of repeated get_immediate_parent() calls,
	 * the location of @id could be obtained with a single
	 * depth-first traversal.
	 */
	for (;;) {
		cur = get_immediate_parent(osdmap->crush, cur, &type_id, &loc);
		if (cur >= 0)
			return -1;  /* not local */

		if (lookup_crush_loc(locs, &loc))
			return type_id;
	}
}