2013-04-05 14:27:12 +08:00
|
|
|
|
2010-04-07 06:14:15 +08:00
|
|
|
#include <linux/ceph/ceph_debug.h>
|
2009-10-07 02:31:10 +08:00
|
|
|
|
2010-04-07 06:14:15 +08:00
|
|
|
#include <linux/module.h>
|
2009-10-07 02:31:10 +08:00
|
|
|
#include <linux/err.h>
|
|
|
|
#include <linux/highmem.h>
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/pagemap.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/uaccess.h>
|
2010-04-07 06:01:27 +08:00
|
|
|
#ifdef CONFIG_BLOCK
|
|
|
|
#include <linux/bio.h>
|
|
|
|
#endif
|
2009-10-07 02:31:10 +08:00
|
|
|
|
2010-04-07 06:14:15 +08:00
|
|
|
#include <linux/ceph/libceph.h>
|
|
|
|
#include <linux/ceph/osd_client.h>
|
|
|
|
#include <linux/ceph/messenger.h>
|
|
|
|
#include <linux/ceph/decode.h>
|
|
|
|
#include <linux/ceph/auth.h>
|
|
|
|
#include <linux/ceph/pagelist.h>
|
2009-10-07 02:31:10 +08:00
|
|
|
|
2010-03-02 05:02:00 +08:00
|
|
|
#define OSD_OPREPLY_FRONT_LEN 512
|
2010-01-14 09:03:23 +08:00
|
|
|
|
2013-05-02 01:43:04 +08:00
|
|
|
static struct kmem_cache *ceph_osd_request_cache;
|
|
|
|
|
2010-05-20 16:40:19 +08:00
|
|
|
static const struct ceph_connection_operations osd_con_ops;
|
2009-10-07 02:31:10 +08:00
|
|
|
|
2013-02-16 01:42:29 +08:00
|
|
|
static void __send_queued(struct ceph_osd_client *osdc);
|
2011-01-18 12:34:08 +08:00
|
|
|
static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd);
|
2011-03-22 06:07:16 +08:00
|
|
|
static void __register_request(struct ceph_osd_client *osdc,
|
|
|
|
struct ceph_osd_request *req);
|
2014-09-03 18:41:45 +08:00
|
|
|
static void __unregister_request(struct ceph_osd_client *osdc,
|
|
|
|
struct ceph_osd_request *req);
|
2011-03-22 06:07:16 +08:00
|
|
|
static void __unregister_linger_request(struct ceph_osd_client *osdc,
|
|
|
|
struct ceph_osd_request *req);
|
2014-09-03 18:41:45 +08:00
|
|
|
static void __enqueue_request(struct ceph_osd_request *req);
|
2009-10-07 02:31:10 +08:00
|
|
|
|
|
|
|
/*
 * Implement client access to a distributed object storage cluster.
 *
 * All data objects are stored within a cluster/cloud of OSDs, or
 * "object storage devices."  (Note that Ceph OSDs have _nothing_ to
 * do with the T10 OSD extensions to SCSI.)  Ceph OSDs are simply
 * remote daemons serving up and coordinating consistent and safe
 * access to storage.
 *
 * Cluster membership and the mapping of data objects onto storage devices
 * are described by the osd map.
 *
 * We keep track of pending OSD requests (read, write), resubmit
 * requests to different OSDs when the cluster topology/data layout
 * changes, or retry the affected requests when the communications
 * channel with an OSD is reset.
 */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* calculate the mapping of a file extent onto an object, and fill out the
|
|
|
|
* request accordingly. shorten extent as necessary if it crosses an
|
|
|
|
* object boundary.
|
|
|
|
*
|
|
|
|
* fill osd op in request message.
|
|
|
|
*/
|
2013-02-16 12:10:17 +08:00
|
|
|
static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen,
|
2013-03-14 09:50:01 +08:00
|
|
|
u64 *objnum, u64 *objoff, u64 *objlen)
|
2009-10-07 02:31:10 +08:00
|
|
|
{
|
2013-02-16 01:42:29 +08:00
|
|
|
u64 orig_len = *plen;
|
2012-09-25 11:59:48 +08:00
|
|
|
int r;
|
2009-10-07 02:31:10 +08:00
|
|
|
|
2013-02-16 01:42:29 +08:00
|
|
|
/* object extent? */
|
2013-03-14 09:50:00 +08:00
|
|
|
r = ceph_calc_file_object_mapping(layout, off, orig_len, objnum,
|
|
|
|
objoff, objlen);
|
2012-09-25 11:59:48 +08:00
|
|
|
if (r < 0)
|
|
|
|
return r;
|
2013-03-14 09:50:00 +08:00
|
|
|
if (*objlen < orig_len) {
|
|
|
|
*plen = *objlen;
|
2013-02-16 01:42:29 +08:00
|
|
|
dout(" skipping last %llu, final file extent %llu~%llu\n",
|
|
|
|
orig_len - *plen, off, *plen);
|
|
|
|
}
|
|
|
|
|
2013-03-14 09:50:00 +08:00
|
|
|
dout("calc_layout objnum=%llx %llu~%llu\n", *objnum, *objoff, *objlen);
|
2009-10-07 02:31:10 +08:00
|
|
|
|
2013-02-16 12:10:17 +08:00
|
|
|
return 0;
|
2009-10-07 02:31:10 +08:00
|
|
|
}
|
|
|
|
|
2013-04-03 14:28:57 +08:00
|
|
|
static void ceph_osd_data_init(struct ceph_osd_data *osd_data)
|
|
|
|
{
|
|
|
|
memset(osd_data, 0, sizeof (*osd_data));
|
|
|
|
osd_data->type = CEPH_OSD_DATA_TYPE_NONE;
|
|
|
|
}
|
|
|
|
|
2013-04-05 14:27:12 +08:00
|
|
|
/*
 * Make @osd_data describe a page vector.
 *
 * Records @pages together with @length and @alignment, plus the two
 * flags.  @own_pages is what ceph_osd_data_release() checks before it
 * releases the page vector.  Fields not listed here are left as they
 * were.
 */
static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data,
			struct page **pages, u64 length, u32 alignment,
			bool pages_from_pool, bool own_pages)
{
	osd_data->type = CEPH_OSD_DATA_TYPE_PAGES;

	/* the data itself */
	osd_data->pages = pages;
	osd_data->alignment = alignment;
	osd_data->length = length;

	/* provenance / ownership of the vector */
	osd_data->own_pages = own_pages;
	osd_data->pages_from_pool = pages_from_pool;
}
|
|
|
|
|
2013-04-05 14:27:12 +08:00
|
|
|
static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data,
|
2013-04-03 14:28:57 +08:00
|
|
|
struct ceph_pagelist *pagelist)
|
|
|
|
{
|
|
|
|
osd_data->type = CEPH_OSD_DATA_TYPE_PAGELIST;
|
|
|
|
osd_data->pagelist = pagelist;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_BLOCK
/*
 * Make @osd_data describe a bio: stores @bio and @bio_length and marks
 * the item as CEPH_OSD_DATA_TYPE_BIO.  Only built with CONFIG_BLOCK.
 */
static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data,
			struct bio *bio, size_t bio_length)
{
	osd_data->bio = bio;
	osd_data->bio_length = bio_length;
	osd_data->type = CEPH_OSD_DATA_TYPE_BIO;
}
#endif /* CONFIG_BLOCK */
|
|
|
|
|
2015-10-22 23:06:07 +08:00
|
|
|
/*
 * osd_req_op_data - address of a per-op data field
 * @oreq: the osd request
 * @whch: index of the op within @oreq
 * @typ:  name of the op-type member inside the op (e.g. extent, cls)
 * @fld:  name of the field inside that member
 *
 * Evaluates to &oreq->r_ops[whch].typ.fld.  @oreq and @whch are each
 * evaluated exactly once, and the index is checked with BUG_ON()
 * against r_num_ops before the address is formed.
 */
#define osd_req_op_data(oreq, whch, typ, fld)				\
({									\
	struct ceph_osd_request *__oreq = (oreq);			\
	unsigned int __whch = (whch);					\
									\
	BUG_ON(__whch >= __oreq->r_num_ops);				\
	&__oreq->r_ops[__whch].typ.fld;					\
})
|
2013-04-16 03:50:36 +08:00
|
|
|
|
2013-02-12 02:33:24 +08:00
|
|
|
static struct ceph_osd_data *
|
|
|
|
osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which)
|
|
|
|
{
|
|
|
|
BUG_ON(which >= osd_req->r_num_ops);
|
|
|
|
|
|
|
|
return &osd_req->r_ops[which].raw_data_in;
|
|
|
|
}
|
|
|
|
|
2013-04-05 14:27:12 +08:00
|
|
|
struct ceph_osd_data *
|
|
|
|
osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req,
|
2013-04-16 03:50:36 +08:00
|
|
|
unsigned int which)
|
2013-04-05 14:27:12 +08:00
|
|
|
{
|
2013-04-16 03:50:36 +08:00
|
|
|
return osd_req_op_data(osd_req, which, extent, osd_data);
|
2013-04-05 14:27:12 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(osd_req_op_extent_osd_data);
|
|
|
|
|
2013-02-12 02:33:24 +08:00
|
|
|
void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req,
|
|
|
|
unsigned int which, struct page **pages,
|
|
|
|
u64 length, u32 alignment,
|
|
|
|
bool pages_from_pool, bool own_pages)
|
|
|
|
{
|
|
|
|
struct ceph_osd_data *osd_data;
|
|
|
|
|
|
|
|
osd_data = osd_req_op_raw_data_in(osd_req, which);
|
|
|
|
ceph_osd_data_pages_init(osd_data, pages, length, alignment,
|
|
|
|
pages_from_pool, own_pages);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(osd_req_op_raw_data_in_pages);
|
|
|
|
|
2013-04-05 14:27:12 +08:00
|
|
|
void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req,
|
2013-04-16 03:50:36 +08:00
|
|
|
unsigned int which, struct page **pages,
|
|
|
|
u64 length, u32 alignment,
|
2013-04-05 14:27:12 +08:00
|
|
|
bool pages_from_pool, bool own_pages)
|
|
|
|
{
|
|
|
|
struct ceph_osd_data *osd_data;
|
|
|
|
|
2013-04-16 03:50:36 +08:00
|
|
|
osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
|
2013-04-05 14:27:12 +08:00
|
|
|
ceph_osd_data_pages_init(osd_data, pages, length, alignment,
|
|
|
|
pages_from_pool, own_pages);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages);
|
|
|
|
|
|
|
|
void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req,
|
2013-04-16 03:50:36 +08:00
|
|
|
unsigned int which, struct ceph_pagelist *pagelist)
|
2013-04-05 14:27:12 +08:00
|
|
|
{
|
|
|
|
struct ceph_osd_data *osd_data;
|
|
|
|
|
2013-04-16 03:50:36 +08:00
|
|
|
osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
|
2013-04-05 14:27:12 +08:00
|
|
|
ceph_osd_data_pagelist_init(osd_data, pagelist);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist);
|
|
|
|
|
|
|
|
#ifdef CONFIG_BLOCK
/*
 * Attach @bio (with its length @bio_length) to the extent data item of
 * op @which.  An out-of-range @which BUG()s in osd_req_op_data().
 * Only built with CONFIG_BLOCK.
 */
void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req,
				unsigned int which, struct bio *bio, size_t bio_length)
{
	struct ceph_osd_data *extent_data =
		osd_req_op_data(osd_req, which, extent, osd_data);

	ceph_osd_data_bio_init(extent_data, bio, bio_length);
}
EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio);
#endif /* CONFIG_BLOCK */
|
|
|
|
|
|
|
|
static void osd_req_op_cls_request_info_pagelist(
|
|
|
|
struct ceph_osd_request *osd_req,
|
|
|
|
unsigned int which, struct ceph_pagelist *pagelist)
|
|
|
|
{
|
|
|
|
struct ceph_osd_data *osd_data;
|
|
|
|
|
2013-04-16 03:50:36 +08:00
|
|
|
osd_data = osd_req_op_data(osd_req, which, cls, request_info);
|
2013-04-05 14:27:12 +08:00
|
|
|
ceph_osd_data_pagelist_init(osd_data, pagelist);
|
|
|
|
}
|
|
|
|
|
2013-04-06 03:46:02 +08:00
|
|
|
void osd_req_op_cls_request_data_pagelist(
|
|
|
|
struct ceph_osd_request *osd_req,
|
|
|
|
unsigned int which, struct ceph_pagelist *pagelist)
|
|
|
|
{
|
|
|
|
struct ceph_osd_data *osd_data;
|
|
|
|
|
2013-04-16 03:50:36 +08:00
|
|
|
osd_data = osd_req_op_data(osd_req, which, cls, request_data);
|
2013-04-06 03:46:02 +08:00
|
|
|
ceph_osd_data_pagelist_init(osd_data, pagelist);
|
2016-05-26 06:29:52 +08:00
|
|
|
osd_req->r_ops[which].cls.indata_len += pagelist->length;
|
|
|
|
osd_req->r_ops[which].indata_len += pagelist->length;
|
2013-04-06 03:46:02 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist);
|
|
|
|
|
2013-04-20 04:34:49 +08:00
|
|
|
void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
|
|
|
|
unsigned int which, struct page **pages, u64 length,
|
|
|
|
u32 alignment, bool pages_from_pool, bool own_pages)
|
|
|
|
{
|
|
|
|
struct ceph_osd_data *osd_data;
|
|
|
|
|
|
|
|
osd_data = osd_req_op_data(osd_req, which, cls, request_data);
|
|
|
|
ceph_osd_data_pages_init(osd_data, pages, length, alignment,
|
|
|
|
pages_from_pool, own_pages);
|
2016-05-26 06:29:52 +08:00
|
|
|
osd_req->r_ops[which].cls.indata_len += length;
|
|
|
|
osd_req->r_ops[which].indata_len += length;
|
2013-04-20 04:34:49 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);
|
|
|
|
|
2013-04-05 14:27:12 +08:00
|
|
|
void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req,
|
|
|
|
unsigned int which, struct page **pages, u64 length,
|
|
|
|
u32 alignment, bool pages_from_pool, bool own_pages)
|
|
|
|
{
|
|
|
|
struct ceph_osd_data *osd_data;
|
|
|
|
|
2013-04-16 03:50:36 +08:00
|
|
|
osd_data = osd_req_op_data(osd_req, which, cls, response_data);
|
2013-04-05 14:27:12 +08:00
|
|
|
ceph_osd_data_pages_init(osd_data, pages, length, alignment,
|
|
|
|
pages_from_pool, own_pages);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(osd_req_op_cls_response_data_pages);
|
|
|
|
|
2013-04-03 14:28:58 +08:00
|
|
|
static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data)
|
|
|
|
{
|
|
|
|
switch (osd_data->type) {
|
|
|
|
case CEPH_OSD_DATA_TYPE_NONE:
|
|
|
|
return 0;
|
|
|
|
case CEPH_OSD_DATA_TYPE_PAGES:
|
|
|
|
return osd_data->length;
|
|
|
|
case CEPH_OSD_DATA_TYPE_PAGELIST:
|
|
|
|
return (u64)osd_data->pagelist->length;
|
|
|
|
#ifdef CONFIG_BLOCK
|
|
|
|
case CEPH_OSD_DATA_TYPE_BIO:
|
|
|
|
return (u64)osd_data->bio_length;
|
|
|
|
#endif /* CONFIG_BLOCK */
|
|
|
|
default:
|
|
|
|
WARN(true, "unrecognized data type %d\n", (int)osd_data->type);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-04-03 14:28:57 +08:00
|
|
|
static void ceph_osd_data_release(struct ceph_osd_data *osd_data)
|
|
|
|
{
|
2013-04-05 14:27:12 +08:00
|
|
|
if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES && osd_data->own_pages) {
|
2013-04-03 14:28:57 +08:00
|
|
|
int num_pages;
|
|
|
|
|
|
|
|
num_pages = calc_pages_for((u64)osd_data->alignment,
|
|
|
|
(u64)osd_data->length);
|
|
|
|
ceph_release_page_vector(osd_data->pages, num_pages);
|
|
|
|
}
|
2013-04-05 14:27:12 +08:00
|
|
|
ceph_osd_data_init(osd_data);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
|
|
|
|
unsigned int which)
|
|
|
|
{
|
|
|
|
struct ceph_osd_req_op *op;
|
|
|
|
|
|
|
|
BUG_ON(which >= osd_req->r_num_ops);
|
|
|
|
op = &osd_req->r_ops[which];
|
|
|
|
|
|
|
|
switch (op->op) {
|
|
|
|
case CEPH_OSD_OP_READ:
|
|
|
|
case CEPH_OSD_OP_WRITE:
|
2015-10-07 23:27:17 +08:00
|
|
|
case CEPH_OSD_OP_WRITEFULL:
|
2013-04-05 14:27:12 +08:00
|
|
|
ceph_osd_data_release(&op->extent.osd_data);
|
|
|
|
break;
|
|
|
|
case CEPH_OSD_OP_CALL:
|
|
|
|
ceph_osd_data_release(&op->cls.request_info);
|
2013-04-06 03:46:02 +08:00
|
|
|
ceph_osd_data_release(&op->cls.request_data);
|
2013-04-05 14:27:12 +08:00
|
|
|
ceph_osd_data_release(&op->cls.response_data);
|
|
|
|
break;
|
2014-11-12 14:00:43 +08:00
|
|
|
case CEPH_OSD_OP_SETXATTR:
|
|
|
|
case CEPH_OSD_OP_CMPXATTR:
|
|
|
|
ceph_osd_data_release(&op->xattr.osd_data);
|
|
|
|
break;
|
2015-04-27 11:02:35 +08:00
|
|
|
case CEPH_OSD_OP_STAT:
|
|
|
|
ceph_osd_data_release(&op->raw_data_in);
|
|
|
|
break;
|
2013-04-05 14:27:12 +08:00
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
2013-04-03 14:28:57 +08:00
|
|
|
}
|
|
|
|
|
2016-04-28 22:07:23 +08:00
|
|
|
/*
|
|
|
|
* Assumes @t is zero-initialized.
|
|
|
|
*/
|
|
|
|
static void target_init(struct ceph_osd_request_target *t)
|
|
|
|
{
|
|
|
|
ceph_oid_init(&t->base_oid);
|
|
|
|
ceph_oloc_init(&t->base_oloc);
|
|
|
|
ceph_oid_init(&t->target_oid);
|
|
|
|
ceph_oloc_init(&t->target_oloc);
|
|
|
|
|
|
|
|
ceph_osds_init(&t->acting);
|
|
|
|
ceph_osds_init(&t->up);
|
|
|
|
t->size = -1;
|
|
|
|
t->min_size = -1;
|
|
|
|
|
|
|
|
t->osd = CEPH_HOMELESS_OSD;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void target_destroy(struct ceph_osd_request_target *t)
|
|
|
|
{
|
|
|
|
ceph_oid_destroy(&t->base_oid);
|
|
|
|
ceph_oid_destroy(&t->target_oid);
|
|
|
|
}
|
|
|
|
|
2009-10-07 02:31:10 +08:00
|
|
|
/*
|
|
|
|
* requests
|
|
|
|
*/
|
2014-06-20 18:14:42 +08:00
|
|
|
static void ceph_osdc_release_request(struct kref *kref)
|
2009-10-07 02:31:10 +08:00
|
|
|
{
|
2014-06-20 18:14:42 +08:00
|
|
|
struct ceph_osd_request *req = container_of(kref,
|
|
|
|
struct ceph_osd_request, r_kref);
|
2013-04-05 14:27:12 +08:00
|
|
|
unsigned int which;
|
2009-12-08 05:37:03 +08:00
|
|
|
|
2014-06-20 18:14:42 +08:00
|
|
|
dout("%s %p (r_request %p r_reply %p)\n", __func__, req,
|
|
|
|
req->r_request, req->r_reply);
|
2014-06-20 18:14:42 +08:00
|
|
|
WARN_ON(!RB_EMPTY_NODE(&req->r_node));
|
|
|
|
WARN_ON(!list_empty(&req->r_req_lru_item));
|
|
|
|
WARN_ON(!list_empty(&req->r_osd_item));
|
|
|
|
WARN_ON(!list_empty(&req->r_linger_item));
|
|
|
|
WARN_ON(!list_empty(&req->r_linger_osd_item));
|
|
|
|
WARN_ON(req->r_osd);
|
2014-06-20 18:14:42 +08:00
|
|
|
|
2009-12-08 05:37:03 +08:00
|
|
|
if (req->r_request)
|
|
|
|
ceph_msg_put(req->r_request);
|
2013-04-02 05:12:14 +08:00
|
|
|
if (req->r_reply) {
|
2012-06-02 03:56:43 +08:00
|
|
|
ceph_msg_revoke_incoming(req->r_reply);
|
2012-06-05 03:43:32 +08:00
|
|
|
ceph_msg_put(req->r_reply);
|
2013-04-02 05:12:14 +08:00
|
|
|
}
|
2013-02-15 02:16:43 +08:00
|
|
|
|
2013-04-05 14:27:12 +08:00
|
|
|
for (which = 0; which < req->r_num_ops; which++)
|
|
|
|
osd_req_op_data_release(req, which);
|
2013-02-15 02:16:43 +08:00
|
|
|
|
2016-04-28 22:07:23 +08:00
|
|
|
target_destroy(&req->r_t);
|
2009-12-08 05:37:03 +08:00
|
|
|
ceph_put_snap_context(req->r_snapc);
|
2016-04-30 01:54:20 +08:00
|
|
|
|
2009-12-08 05:37:03 +08:00
|
|
|
if (req->r_mempool)
|
|
|
|
mempool_free(req, req->r_osdc->req_mempool);
|
2016-02-10 00:50:15 +08:00
|
|
|
else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
|
2013-05-02 01:43:04 +08:00
|
|
|
kmem_cache_free(ceph_osd_request_cache, req);
|
2016-02-10 00:50:15 +08:00
|
|
|
else
|
|
|
|
kfree(req);
|
2009-10-07 02:31:10 +08:00
|
|
|
}
|
2014-06-20 18:14:42 +08:00
|
|
|
|
|
|
|
void ceph_osdc_get_request(struct ceph_osd_request *req)
|
|
|
|
{
|
|
|
|
dout("%s %p (was %d)\n", __func__, req,
|
|
|
|
atomic_read(&req->r_kref.refcount));
|
|
|
|
kref_get(&req->r_kref);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(ceph_osdc_get_request);
|
|
|
|
|
|
|
|
void ceph_osdc_put_request(struct ceph_osd_request *req)
|
|
|
|
{
|
2016-04-26 21:05:29 +08:00
|
|
|
if (req) {
|
|
|
|
dout("%s %p (was %d)\n", __func__, req,
|
|
|
|
atomic_read(&req->r_kref.refcount));
|
|
|
|
kref_put(&req->r_kref, ceph_osdc_release_request);
|
|
|
|
}
|
2014-06-20 18:14:42 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(ceph_osdc_put_request);
|
2010-04-07 06:01:27 +08:00
|
|
|
|
2010-04-07 05:51:47 +08:00
|
|
|
struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
|
2009-10-07 02:31:10 +08:00
|
|
|
struct ceph_snap_context *snapc,
|
2013-02-26 08:11:12 +08:00
|
|
|
unsigned int num_ops,
|
2010-04-07 05:51:47 +08:00
|
|
|
bool use_mempool,
|
2012-11-14 11:11:15 +08:00
|
|
|
gfp_t gfp_flags)
|
2009-10-07 02:31:10 +08:00
|
|
|
{
|
|
|
|
struct ceph_osd_request *req;
|
2013-02-26 08:11:12 +08:00
|
|
|
|
2009-10-07 02:31:10 +08:00
|
|
|
if (use_mempool) {
|
2016-02-10 00:50:15 +08:00
|
|
|
BUG_ON(num_ops > CEPH_OSD_SLAB_OPS);
|
2010-04-07 05:51:47 +08:00
|
|
|
req = mempool_alloc(osdc->req_mempool, gfp_flags);
|
2016-02-10 00:50:15 +08:00
|
|
|
} else if (num_ops <= CEPH_OSD_SLAB_OPS) {
|
|
|
|
req = kmem_cache_alloc(ceph_osd_request_cache, gfp_flags);
|
2009-10-07 02:31:10 +08:00
|
|
|
} else {
|
2016-02-10 00:50:15 +08:00
|
|
|
BUG_ON(num_ops > CEPH_OSD_MAX_OPS);
|
|
|
|
req = kmalloc(sizeof(*req) + num_ops * sizeof(req->r_ops[0]),
|
|
|
|
gfp_flags);
|
2009-10-07 02:31:10 +08:00
|
|
|
}
|
2016-02-10 00:50:15 +08:00
|
|
|
if (unlikely(!req))
|
2010-04-02 07:06:19 +08:00
|
|
|
return NULL;
|
2009-10-07 02:31:10 +08:00
|
|
|
|
2016-02-10 00:50:15 +08:00
|
|
|
/* req only, each op is zeroed in _osd_req_op_init() */
|
|
|
|
memset(req, 0, sizeof(*req));
|
|
|
|
|
2009-10-07 02:31:10 +08:00
|
|
|
req->r_osdc = osdc;
|
|
|
|
req->r_mempool = use_mempool;
|
2013-04-04 10:32:51 +08:00
|
|
|
req->r_num_ops = num_ops;
|
2016-04-26 21:39:47 +08:00
|
|
|
req->r_snapid = CEPH_NOSNAP;
|
|
|
|
req->r_snapc = ceph_get_snap_context(snapc);
|
2010-04-07 06:01:27 +08:00
|
|
|
|
2009-12-08 05:37:03 +08:00
|
|
|
kref_init(&req->r_kref);
|
2009-10-07 02:31:10 +08:00
|
|
|
init_completion(&req->r_completion);
|
|
|
|
init_completion(&req->r_safe_completion);
|
2012-12-18 02:23:48 +08:00
|
|
|
RB_CLEAR_NODE(&req->r_node);
|
2009-10-07 02:31:10 +08:00
|
|
|
INIT_LIST_HEAD(&req->r_unsafe_item);
|
2011-03-22 06:07:16 +08:00
|
|
|
INIT_LIST_HEAD(&req->r_linger_item);
|
2014-06-20 18:14:41 +08:00
|
|
|
INIT_LIST_HEAD(&req->r_linger_osd_item);
|
2011-09-17 02:13:17 +08:00
|
|
|
INIT_LIST_HEAD(&req->r_req_lru_item);
|
2012-07-10 05:31:41 +08:00
|
|
|
INIT_LIST_HEAD(&req->r_osd_item);
|
|
|
|
|
2016-04-28 22:07:23 +08:00
|
|
|
target_init(&req->r_t);
|
2014-01-27 23:40:18 +08:00
|
|
|
|
2016-04-27 20:15:51 +08:00
|
|
|
dout("%s req %p\n", __func__, req);
|
|
|
|
return req;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(ceph_osdc_alloc_request);
|
2016-02-10 00:50:15 +08:00
|
|
|
|
2016-04-27 20:15:51 +08:00
|
|
|
/*
 * Allocate the request and reply messages for @req.
 *
 * The front length of the request message is computed from the fixed
 * fields, the base oid name length, the number of ops and the number
 * of snaps in the snap context (if any).  The reply front is
 * OSD_OPREPLY_FRONT_LEN plus the oid name length and the ops.  For
 * mempool requests both messages come from the client's message pools
 * instead of ceph_msg_new().
 *
 * Warns if the base oid of @req is still empty.
 *
 * Returns 0 on success or -ENOMEM.  If only the reply allocation
 * fails, the request message stays attached to @req->r_request; it is
 * put when the request itself is released.
 */
int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
{
	struct ceph_osd_client *osdc = req->r_osdc;
	struct ceph_msg *request;
	struct ceph_msg *reply;
	int msg_size;

	WARN_ON(ceph_oid_empty(&req->r_base_oid));

	/* create request message */
	msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */
	msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */
	msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
	msg_size += 1 + 8 + 4 + 4; /* pgid */
	msg_size += 4 + req->r_base_oid.name_len; /* oid */
	msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
	msg_size += 8; /* snapid */
	msg_size += 8; /* snap_seq */
	msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0);
	msg_size += 4; /* retry_attempt */

	request = req->r_mempool ?
		ceph_msgpool_get(&osdc->msgpool_op, 0) :
		ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp, true);
	if (!request)
		return -ENOMEM;

	memset(request->front.iov_base, 0, request->front.iov_len);
	req->r_request = request;

	/* create reply message */
	msg_size = OSD_OPREPLY_FRONT_LEN;
	msg_size += req->r_base_oid.name_len;
	msg_size += req->r_num_ops * sizeof(struct ceph_osd_op);

	reply = req->r_mempool ?
		ceph_msgpool_get(&osdc->msgpool_op_reply, 0) :
		ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, gfp, true);
	if (!reply)
		return -ENOMEM;

	req->r_reply = reply;
	return 0;
}
EXPORT_SYMBOL(ceph_osdc_alloc_messages);
|
2010-04-07 05:51:47 +08:00
|
|
|
|
2013-03-14 09:50:00 +08:00
|
|
|
/*
 * Return true if @opcode matches one of the CEPH_OSD_OP_* values
 * enumerated by __CEPH_FORALL_OSD_OPS(), false otherwise.
 */
static bool osd_req_opcode_valid(u16 opcode)
{
	bool known = false;

	switch (opcode) {
	/* one case label per known op, all sharing the statement below */
#define GENERATE_CASE(op, opcode, str)	case CEPH_OSD_OP_##op:
__CEPH_FORALL_OSD_OPS(GENERATE_CASE)
#undef GENERATE_CASE
		known = true;
		break;
	default:
		break;
	}

	return known;
}
|
|
|
|
|
libceph: define source request op functions
The rbd code has a function that allocates and populates a
ceph_osd_req_op structure (the in-core version of an osd request
operation). When reviewed, Josh suggested two things: that the
big varargs function might be better split into type-specific
functions; and that this functionality really belongs in the osd
client rather than rbd.
This patch implements both of Josh's suggestions. It breaks
up the rbd function into separate functions and defines them
in the osd client module as exported interfaces. Unlike the
rbd version, however, the functions don't allocate an osd_req_op
structure; they are provided the address of one and that is
initialized instead.
The rbd function has been eliminated and calls to it have been
replaced by calls to the new routines. The rbd code now uses a
stack (struct) variable to hold the op rather than allocating and
freeing it each time.
For now only the capabilities used by rbd are implemented.
Implementing all the other osd op types, and making the rest of the
code use it will be done separately, in the next few patches.
Note that only the extent, cls, and watch portions of the
ceph_osd_req_op structure are currently used. Delete the others
(xattr, pgls, and snap) from its definition so nobody thinks it's
actually implemented or needed. We can add it back again later
if needed, when we know it's been tested.
This (and a few follow-on patches) resolves:
http://tracker.ceph.com/issues/3861
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
2013-03-14 09:50:00 +08:00
|
|
|
/*
|
|
|
|
* This is an osd op init function for opcodes that have no data or
|
|
|
|
* other information associated with them. It also serves as a
|
|
|
|
* common init routine for all the other init functions, below.
|
|
|
|
*/
|
2013-04-05 14:27:11 +08:00
|
|
|
static struct ceph_osd_req_op *
|
2013-02-12 02:33:24 +08:00
|
|
|
_osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which,
|
2015-04-27 11:09:54 +08:00
|
|
|
u16 opcode, u32 flags)
|
libceph: define source request op functions
The rbd code has a function that allocates and populates a
ceph_osd_req_op structure (the in-core version of an osd request
operation). When reviewed, Josh suggested two things: that the
big varargs function might be better split into type-specific
functions; and that this functionality really belongs in the osd
client rather than rbd.
This patch implements both of Josh's suggestions. It breaks
up the rbd function into separate functions and defines them
in the osd client module as exported interfaces. Unlike the
rbd version, however, the functions don't allocate an osd_req_op
structure; they are provided the address of one and that is
initialized instead.
The rbd function has been eliminated and calls to it have been
replaced by calls to the new routines. The rbd code now now use a
stack (struct) variable to hold the op rather than allocating and
freeing it each time.
For now only the capabilities used by rbd are implemented.
Implementing all the other osd op types, and making the rest of the
code use it will be done separately, in the next few patches.
Note that only the extent, cls, and watch portions of the
ceph_osd_req_op structure are currently used. Delete the others
(xattr, pgls, and snap) from its definition so nobody thinks it's
actually implemented or needed. We can add it back again later
if needed, when we know it's been tested.
This (and a few follow-on patches) resolves:
http://tracker.ceph.com/issues/3861
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
2013-03-14 09:50:00 +08:00
|
|
|
{
|
2013-04-05 14:27:11 +08:00
|
|
|
struct ceph_osd_req_op *op;
|
|
|
|
|
|
|
|
BUG_ON(which >= osd_req->r_num_ops);
|
libceph: define source request op functions
The rbd code has a function that allocates and populates a
ceph_osd_req_op structure (the in-core version of an osd request
operation). When reviewed, Josh suggested two things: that the
big varargs function might be better split into type-specific
functions; and that this functionality really belongs in the osd
client rather than rbd.
This patch implements both of Josh's suggestions. It breaks
up the rbd function into separate functions and defines them
in the osd client module as exported interfaces. Unlike the
rbd version, however, the functions don't allocate an osd_req_op
structure; they are provided the address of one and that is
initialized instead.
The rbd function has been eliminated and calls to it have been
replaced by calls to the new routines. The rbd code now now use a
stack (struct) variable to hold the op rather than allocating and
freeing it each time.
For now only the capabilities used by rbd are implemented.
Implementing all the other osd op types, and making the rest of the
code use it will be done separately, in the next few patches.
Note that only the extent, cls, and watch portions of the
ceph_osd_req_op structure are currently used. Delete the others
(xattr, pgls, and snap) from its definition so nobody thinks it's
actually implemented or needed. We can add it back again later
if needed, when we know it's been tested.
This (and a few follow-on patches) resolves:
http://tracker.ceph.com/issues/3861
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
2013-03-14 09:50:00 +08:00
|
|
|
BUG_ON(!osd_req_opcode_valid(opcode));
|
|
|
|
|
2013-04-05 14:27:11 +08:00
|
|
|
op = &osd_req->r_ops[which];
|
libceph: define source request op functions
The rbd code has a function that allocates and populates a
ceph_osd_req_op structure (the in-core version of an osd request
operation). When reviewed, Josh suggested two things: that the
big varargs function might be better split into type-specific
functions; and that this functionality really belongs in the osd
client rather than rbd.
This patch implements both of Josh's suggestions. It breaks
up the rbd function into separate functions and defines them
in the osd client module as exported interfaces. Unlike the
rbd version, however, the functions don't allocate an osd_req_op
structure; they are provided the address of one and that is
initialized instead.
The rbd function has been eliminated and calls to it have been
replaced by calls to the new routines. The rbd code now now use a
stack (struct) variable to hold the op rather than allocating and
freeing it each time.
For now only the capabilities used by rbd are implemented.
Implementing all the other osd op types, and making the rest of the
code use it will be done separately, in the next few patches.
Note that only the extent, cls, and watch portions of the
ceph_osd_req_op structure are currently used. Delete the others
(xattr, pgls, and snap) from its definition so nobody thinks it's
actually implemented or needed. We can add it back again later
if needed, when we know it's been tested.
This (and a few follow-on patches) resolves:
http://tracker.ceph.com/issues/3861
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
2013-03-14 09:50:00 +08:00
|
|
|
memset(op, 0, sizeof (*op));
|
|
|
|
op->op = opcode;
|
2015-04-27 11:09:54 +08:00
|
|
|
op->flags = flags;
|
2013-04-05 14:27:11 +08:00
|
|
|
|
|
|
|
return op;
|
libceph: define source request op functions
The rbd code has a function that allocates and populates a
ceph_osd_req_op structure (the in-core version of an osd request
operation). When reviewed, Josh suggested two things: that the
big varargs function might be better split into type-specific
functions; and that this functionality really belongs in the osd
client rather than rbd.
This patch implements both of Josh's suggestions. It breaks
up the rbd function into separate functions and defines them
in the osd client module as exported interfaces. Unlike the
rbd version, however, the functions don't allocate an osd_req_op
structure; they are provided the address of one and that is
initialized instead.
The rbd function has been eliminated and calls to it have been
replaced by calls to the new routines. The rbd code now now use a
stack (struct) variable to hold the op rather than allocating and
freeing it each time.
For now only the capabilities used by rbd are implemented.
Implementing all the other osd op types, and making the rest of the
code use it will be done separately, in the next few patches.
Note that only the extent, cls, and watch portions of the
ceph_osd_req_op structure are currently used. Delete the others
(xattr, pgls, and snap) from its definition so nobody thinks it's
actually implemented or needed. We can add it back again later
if needed, when we know it's been tested.
This (and a few follow-on patches) resolves:
http://tracker.ceph.com/issues/3861
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
2013-03-14 09:50:00 +08:00
|
|
|
}
|
|
|
|
|
2013-02-12 02:33:24 +08:00
|
|
|
/*
 * Exported wrapper around _osd_req_op_init(): initializes op @which of
 * @osd_req with @opcode and @flags and discards the returned op
 * pointer.
 */
void osd_req_op_init(struct ceph_osd_request *osd_req,
		     unsigned int which, u16 opcode, u32 flags)
{
	/* callers of the exported variant have no use for the op pointer */
	(void)_osd_req_op_init(osd_req, which, opcode, flags);
}
EXPORT_SYMBOL(osd_req_op_init);
|
|
|
|
|
2013-04-05 14:27:11 +08:00
|
|
|
/*
 * Initialize op @which of @osd_req as an extent operation.
 *
 * @opcode must be one of READ, WRITE, WRITEFULL, ZERO or TRUNCATE;
 * anything else trips a BUG_ON.  The extent fields are filled in from
 * @offset, @length, @truncate_size and @truncate_seq.  The op's
 * indata_len is set to @length for WRITE and WRITEFULL and to 0 for
 * the other opcodes.
 */
void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
			    unsigned int which, u16 opcode,
			    u64 offset, u64 length,
			    u64 truncate_size, u32 truncate_seq)
{
	struct ceph_osd_req_op *op;
	size_t data_len;
	bool is_write;

	op = _osd_req_op_init(osd_req, which, opcode, 0);

	is_write = opcode == CEPH_OSD_OP_WRITE ||
		   opcode == CEPH_OSD_OP_WRITEFULL;

	BUG_ON(!(is_write ||
		 opcode == CEPH_OSD_OP_READ ||
		 opcode == CEPH_OSD_OP_ZERO ||
		 opcode == CEPH_OSD_OP_TRUNCATE));

	op->extent.offset = offset;
	op->extent.length = length;
	op->extent.truncate_size = truncate_size;
	op->extent.truncate_seq = truncate_seq;

	/* only the write opcodes account the extent as incoming data */
	data_len = is_write ? length : 0;
	op->indata_len = data_len;
}
EXPORT_SYMBOL(osd_req_op_extent_init);
|
|
|
|
|
2013-04-05 14:27:11 +08:00
|
|
|
/*
 * Shrink the extent length of an extent op that was set up earlier.
 *
 * @osd_req: request owning the op
 * @which:   index into osd_req->r_ops; must be below r_num_ops (BUG_ON)
 * @length:  new extent length; must not exceed the current one (BUG_ON)
 *
 * If @length equals the current extent length nothing is changed.
 */
void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
				unsigned int which, u64 length)
{
	struct ceph_osd_req_op *op;
	u64 previous;

	BUG_ON(which >= osd_req->r_num_ops);
	op = &osd_req->r_ops[which];
	previous = op->extent.length;

	if (length == previous)
		return;		/* Nothing to do */
	BUG_ON(length > previous);

	op->extent.length = length;

	/*
	 * osd_req_op_extent_init() only adds the extent length to
	 * indata_len for WRITE and WRITEFULL, and
	 * osd_req_op_extent_dup_last() only trims it for those opcodes.
	 * Do the same here: for any other extent op indata_len does not
	 * include the extent, so subtracting would underflow it.
	 */
	if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL)
		op->indata_len -= previous - length;
}
EXPORT_SYMBOL(osd_req_op_extent_update);
|
|
|
|
|
2016-01-07 17:32:54 +08:00
|
|
|
/*
 * Initialize op @which + 1 of @osd_req as a copy of extent op @which,
 * shifted forward by @offset_inc.
 *
 * The new op takes the opcode, flags, indata_len, outdata_len and the
 * whole extent of op @which.  Its extent offset is then increased and
 * its extent length decreased by @offset_inc; for WRITE and WRITEFULL
 * the new op's indata_len is decreased by @offset_inc as well.
 *
 * @which + 1 must be a valid op index (BUG_ON otherwise).
 */
void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
				unsigned int which, u64 offset_inc)
{
	unsigned int dup_idx = which + 1;
	struct ceph_osd_req_op *src;
	struct ceph_osd_req_op *dup;

	BUG_ON(dup_idx >= osd_req->r_num_ops);

	src = &osd_req->r_ops[which];
	dup = _osd_req_op_init(osd_req, dup_idx, src->op, src->flags);

	/* clone the data lengths and the extent description */
	dup->indata_len = src->indata_len;
	dup->outdata_len = src->outdata_len;
	dup->extent = src->extent;

	/* move the start forward and take the same amount off the length */
	dup->extent.offset += offset_inc;
	dup->extent.length -= offset_inc;

	switch (dup->op) {
	case CEPH_OSD_OP_WRITE:
	case CEPH_OSD_OP_WRITEFULL:
		dup->indata_len -= offset_inc;
		break;
	default:
		break;
	}
}
EXPORT_SYMBOL(osd_req_op_extent_dup_last);
|
|
|
|
|
2013-04-05 14:27:11 +08:00
|
|
|
void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
|
2013-04-06 03:46:02 +08:00
|
|
|
u16 opcode, const char *class, const char *method)
|
libceph: define source request op functions
The rbd code has a function that allocates and populates a
ceph_osd_req_op structure (the in-core version of an osd request
operation). When reviewed, Josh suggested two things: that the
big varargs function might be better split into type-specific
functions; and that this functionality really belongs in the osd
client rather than rbd.
This patch implements both of Josh's suggestions. It breaks
up the rbd function into separate functions and defines them
in the osd client module as exported interfaces. Unlike the
rbd version, however, the functions don't allocate an osd_req_op
structure; they are provided the address of one and that is
initialized instead.
The rbd function has been eliminated and calls to it have been
replaced by calls to the new routines. The rbd code now now use a
stack (struct) variable to hold the op rather than allocating and
freeing it each time.
For now only the capabilities used by rbd are implemented.
Implementing all the other osd op types, and making the rest of the
code use it will be done separately, in the next few patches.
Note that only the extent, cls, and watch portions of the
ceph_osd_req_op structure are currently used. Delete the others
(xattr, pgls, and snap) from its definition so nobody thinks it's
actually implemented or needed. We can add it back again later
if needed, when we know it's been tested.
This (and a few follow-on patches) resolves:
http://tracker.ceph.com/issues/3861
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
2013-03-14 09:50:00 +08:00
|
|
|
{
|
2015-04-27 11:09:54 +08:00
|
|
|
struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
|
|
|
|
opcode, 0);
|
2013-04-05 14:27:12 +08:00
|
|
|
struct ceph_pagelist *pagelist;
|
libceph: define source request op functions
The rbd code has a function that allocates and populates a
ceph_osd_req_op structure (the in-core version of an osd request
operation). When reviewed, Josh suggested two things: that the
big varargs function might be better split into type-specific
functions; and that this functionality really belongs in the osd
client rather than rbd.
This patch implements both of Josh's suggestions. It breaks
up the rbd function into separate functions and defines them
in the osd client module as exported interfaces. Unlike the
rbd version, however, the functions don't allocate an osd_req_op
structure; they are provided the address of one and that is
initialized instead.
The rbd function has been eliminated and calls to it have been
replaced by calls to the new routines. The rbd code now now use a
stack (struct) variable to hold the op rather than allocating and
freeing it each time.
For now only the capabilities used by rbd are implemented.
Implementing all the other osd op types, and making the rest of the
code use it will be done separately, in the next few patches.
Note that only the extent, cls, and watch portions of the
ceph_osd_req_op structure are currently used. Delete the others
(xattr, pgls, and snap) from its definition so nobody thinks it's
actually implemented or needed. We can add it back again later
if needed, when we know it's been tested.
This (and a few follow-on patches) resolves:
http://tracker.ceph.com/issues/3861
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
2013-03-14 09:50:00 +08:00
|
|
|
size_t payload_len = 0;
|
|
|
|
size_t size;
|
|
|
|
|
|
|
|
BUG_ON(opcode != CEPH_OSD_OP_CALL);
|
|
|
|
|
2013-04-05 14:27:12 +08:00
|
|
|
pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
|
|
|
|
BUG_ON(!pagelist);
|
|
|
|
ceph_pagelist_init(pagelist);
|
|
|
|
|
libceph: define source request op functions
The rbd code has a function that allocates and populates a
ceph_osd_req_op structure (the in-core version of an osd request
operation). When reviewed, Josh suggested two things: that the
big varargs function might be better split into type-specific
functions; and that this functionality really belongs in the osd
client rather than rbd.
This patch implements both of Josh's suggestions. It breaks
up the rbd function into separate functions and defines them
in the osd client module as exported interfaces. Unlike the
rbd version, however, the functions don't allocate an osd_req_op
structure; they are provided the address of one and that is
initialized instead.
The rbd function has been eliminated and calls to it have been
replaced by calls to the new routines. The rbd code now now use a
stack (struct) variable to hold the op rather than allocating and
freeing it each time.
For now only the capabilities used by rbd are implemented.
Implementing all the other osd op types, and making the rest of the
code use it will be done separately, in the next few patches.
Note that only the extent, cls, and watch portions of the
ceph_osd_req_op structure are currently used. Delete the others
(xattr, pgls, and snap) from its definition so nobody thinks it's
actually implemented or needed. We can add it back again later
if needed, when we know it's been tested.
This (and a few follow-on patches) resolves:
http://tracker.ceph.com/issues/3861
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
2013-03-14 09:50:00 +08:00
|
|
|
op->cls.class_name = class;
|
|
|
|
size = strlen(class);
|
|
|
|
BUG_ON(size > (size_t) U8_MAX);
|
|
|
|
op->cls.class_len = size;
|
2013-04-05 14:27:12 +08:00
|
|
|
ceph_pagelist_append(pagelist, class, size);
|
libceph: define source request op functions
The rbd code has a function that allocates and populates a
ceph_osd_req_op structure (the in-core version of an osd request
operation). When reviewed, Josh suggested two things: that the
big varargs function might be better split into type-specific
functions; and that this functionality really belongs in the osd
client rather than rbd.
This patch implements both of Josh's suggestions. It breaks
up the rbd function into separate functions and defines them
in the osd client module as exported interfaces. Unlike the
rbd version, however, the functions don't allocate an osd_req_op
structure; they are provided the address of one and that is
initialized instead.
The rbd function has been eliminated and calls to it have been
replaced by calls to the new routines. The rbd code now now use a
stack (struct) variable to hold the op rather than allocating and
freeing it each time.
For now only the capabilities used by rbd are implemented.
Implementing all the other osd op types, and making the rest of the
code use it will be done separately, in the next few patches.
Note that only the extent, cls, and watch portions of the
ceph_osd_req_op structure are currently used. Delete the others
(xattr, pgls, and snap) from its definition so nobody thinks it's
actually implemented or needed. We can add it back again later
if needed, when we know it's been tested.
This (and a few follow-on patches) resolves:
http://tracker.ceph.com/issues/3861
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
2013-03-14 09:50:00 +08:00
|
|
|
payload_len += size;
|
|
|
|
|
|
|
|
op->cls.method_name = method;
|
|
|
|
size = strlen(method);
|
|
|
|
BUG_ON(size > (size_t) U8_MAX);
|
|
|
|
op->cls.method_len = size;
|
2013-04-05 14:27:12 +08:00
|
|
|
ceph_pagelist_append(pagelist, method, size);
|
libceph: define source request op functions
The rbd code has a function that allocates and populates a
ceph_osd_req_op structure (the in-core version of an osd request
operation). When reviewed, Josh suggested two things: that the
big varargs function might be better split into type-specific
functions; and that this functionality really belongs in the osd
client rather than rbd.
This patch implements both of Josh's suggestions. It breaks
up the rbd function into separate functions and defines them
in the osd client module as exported interfaces. Unlike the
rbd version, however, the functions don't allocate an osd_req_op
structure; they are provided the address of one and that is
initialized instead.
The rbd function has been eliminated and calls to it have been
replaced by calls to the new routines. The rbd code now now use a
stack (struct) variable to hold the op rather than allocating and
freeing it each time.
For now only the capabilities used by rbd are implemented.
Implementing all the other osd op types, and making the rest of the
code use it will be done separately, in the next few patches.
Note that only the extent, cls, and watch portions of the
ceph_osd_req_op structure are currently used. Delete the others
(xattr, pgls, and snap) from its definition so nobody thinks it's
actually implemented or needed. We can add it back again later
if needed, when we know it's been tested.
This (and a few follow-on patches) resolves:
http://tracker.ceph.com/issues/3861
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
2013-03-14 09:50:00 +08:00
|
|
|
payload_len += size;
|
|
|
|
|
2013-04-05 14:27:12 +08:00
|
|
|
osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
|
2013-04-05 14:27:12 +08:00
|
|
|
|
2016-02-08 20:39:46 +08:00
|
|
|
op->indata_len = payload_len;
|
libceph: define source request op functions
The rbd code has a function that allocates and populates a
ceph_osd_req_op structure (the in-core version of an osd request
operation). When reviewed, Josh suggested two things: that the
big varargs function might be better split into type-specific
functions; and that this functionality really belongs in the osd
client rather than rbd.
This patch implements both of Josh's suggestions. It breaks
up the rbd function into separate functions and defines them
in the osd client module as exported interfaces. Unlike the
rbd version, however, the functions don't allocate an osd_req_op
structure; they are provided the address of one and that is
initialized instead.
The rbd function has been eliminated and calls to it have been
replaced by calls to the new routines. The rbd code now now use a
stack (struct) variable to hold the op rather than allocating and
freeing it each time.
For now only the capabilities used by rbd are implemented.
Implementing all the other osd op types, and making the rest of the
code use it will be done separately, in the next few patches.
Note that only the extent, cls, and watch portions of the
ceph_osd_req_op structure are currently used. Delete the others
(xattr, pgls, and snap) from its definition so nobody thinks it's
actually implemented or needed. We can add it back again later
if needed, when we know it's been tested.
This (and a few follow-on patches) resolves:
http://tracker.ceph.com/issues/3861
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
2013-03-14 09:50:00 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(osd_req_op_cls_init);
|
2013-04-03 14:28:58 +08:00
|
|
|
|
2014-11-12 14:00:43 +08:00
|
|
|
int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
|
|
|
|
u16 opcode, const char *name, const void *value,
|
|
|
|
size_t size, u8 cmp_op, u8 cmp_mode)
|
|
|
|
{
|
2015-04-27 11:09:54 +08:00
|
|
|
struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
|
|
|
|
opcode, 0);
|
2014-11-12 14:00:43 +08:00
|
|
|
struct ceph_pagelist *pagelist;
|
|
|
|
size_t payload_len;
|
|
|
|
|
|
|
|
BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR);
|
|
|
|
|
|
|
|
pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
|
|
|
|
if (!pagelist)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
ceph_pagelist_init(pagelist);
|
|
|
|
|
|
|
|
payload_len = strlen(name);
|
|
|
|
op->xattr.name_len = payload_len;
|
|
|
|
ceph_pagelist_append(pagelist, name, payload_len);
|
|
|
|
|
|
|
|
op->xattr.value_len = size;
|
|
|
|
ceph_pagelist_append(pagelist, value, size);
|
|
|
|
payload_len += size;
|
|
|
|
|
|
|
|
op->xattr.cmp_op = cmp_op;
|
|
|
|
op->xattr.cmp_mode = cmp_mode;
|
|
|
|
|
|
|
|
ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist);
|
2016-02-08 20:39:46 +08:00
|
|
|
op->indata_len = payload_len;
|
2014-11-12 14:00:43 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(osd_req_op_xattr_init);
|
|
|
|
|
2013-04-05 14:27:11 +08:00
|
|
|
/*
 * Initialize op @which of @osd_req as a watch-related operation.
 *
 * @opcode must be CEPH_OSD_OP_NOTIFY_ACK or CEPH_OSD_OP_WATCH (BUG_ON
 * otherwise).  @cookie and @version are stored in the op's watch
 * fields.  For CEPH_OSD_OP_WATCH with a non-zero @flag, watch.flag is
 * set to 1; in every other case this function leaves watch.flag as
 * _osd_req_op_init() left it.
 */
void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
			   unsigned int which, u16 opcode,
			   u64 cookie, u64 version, int flag)
{
	struct ceph_osd_req_op *op;

	op = _osd_req_op_init(osd_req, which, opcode, 0);

	BUG_ON(!(opcode == CEPH_OSD_OP_NOTIFY_ACK ||
		 opcode == CEPH_OSD_OP_WATCH));

	op->watch.cookie = cookie;
	op->watch.ver = version;

	/* the flag only applies to an actual watch request */
	if (opcode != CEPH_OSD_OP_WATCH)
		return;
	if (flag)
		op->watch.flag = (u8)1;
}
EXPORT_SYMBOL(osd_req_op_watch_init);
|
|
|
|
|
2014-02-25 22:22:27 +08:00
|
|
|
void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
|
|
|
|
unsigned int which,
|
|
|
|
u64 expected_object_size,
|
|
|
|
u64 expected_write_size)
|
|
|
|
{
|
|
|
|
struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
|
2015-04-27 11:09:54 +08:00
|
|
|
CEPH_OSD_OP_SETALLOCHINT,
|
|
|
|
0);
|
2014-02-25 22:22:27 +08:00
|
|
|
|
|
|
|
op->alloc_hint.expected_object_size = expected_object_size;
|
|
|
|
op->alloc_hint.expected_write_size = expected_write_size;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
|
|
|
|
* not worth a feature bit. Set FAILOK per-op flag to make
|
|
|
|
* sure older osds don't trip over an unsupported opcode.
|
|
|
|
*/
|
|
|
|
op->flags |= CEPH_OSD_OP_FLAG_FAILOK;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(osd_req_op_alloc_hint_init);
|
|
|
|
|
2013-04-06 03:46:01 +08:00
|
|
|
static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
|
2013-04-05 14:27:12 +08:00
|
|
|
struct ceph_osd_data *osd_data)
|
|
|
|
{
|
|
|
|
u64 length = ceph_osd_data_length(osd_data);
|
|
|
|
|
|
|
|
if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
|
|
|
|
BUG_ON(length > (u64) SIZE_MAX);
|
|
|
|
if (length)
|
2013-04-06 03:46:01 +08:00
|
|
|
ceph_msg_data_add_pages(msg, osd_data->pages,
|
2013-04-05 14:27:12 +08:00
|
|
|
length, osd_data->alignment);
|
|
|
|
} else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) {
|
|
|
|
BUG_ON(!length);
|
2013-04-06 03:46:01 +08:00
|
|
|
ceph_msg_data_add_pagelist(msg, osd_data->pagelist);
|
2013-04-05 14:27:12 +08:00
|
|
|
#ifdef CONFIG_BLOCK
|
|
|
|
} else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) {
|
2013-04-06 03:46:01 +08:00
|
|
|
ceph_msg_data_add_bio(msg, osd_data->bio, length);
|
2013-04-05 14:27:12 +08:00
|
|
|
#endif
|
|
|
|
} else {
|
|
|
|
BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-05-26 06:29:52 +08:00
|
|
|
static u32 osd_req_encode_op(struct ceph_osd_op *dst,
|
|
|
|
const struct ceph_osd_req_op *src)
|
2013-03-14 09:50:00 +08:00
|
|
|
{
|
|
|
|
if (WARN_ON(!osd_req_opcode_valid(src->op))) {
|
|
|
|
pr_err("unrecognized osd opcode %d\n", src->op);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
switch (src->op) {
|
|
|
|
case CEPH_OSD_OP_STAT:
|
|
|
|
break;
|
|
|
|
case CEPH_OSD_OP_READ:
|
|
|
|
case CEPH_OSD_OP_WRITE:
|
2015-10-07 23:27:17 +08:00
|
|
|
case CEPH_OSD_OP_WRITEFULL:
|
2013-08-15 11:51:44 +08:00
|
|
|
case CEPH_OSD_OP_ZERO:
|
|
|
|
case CEPH_OSD_OP_TRUNCATE:
|
2013-03-14 09:50:00 +08:00
|
|
|
dst->extent.offset = cpu_to_le64(src->extent.offset);
|
|
|
|
dst->extent.length = cpu_to_le64(src->extent.length);
|
|
|
|
dst->extent.truncate_size =
|
|
|
|
cpu_to_le64(src->extent.truncate_size);
|
|
|
|
dst->extent.truncate_seq =
|
|
|
|
cpu_to_le32(src->extent.truncate_seq);
|
|
|
|
break;
|
|
|
|
case CEPH_OSD_OP_CALL:
|
|
|
|
dst->cls.class_len = src->cls.class_len;
|
|
|
|
dst->cls.method_len = src->cls.method_len;
|
2016-05-26 06:29:52 +08:00
|
|
|
dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
|
2013-03-14 09:50:00 +08:00
|
|
|
break;
|
|
|
|
case CEPH_OSD_OP_STARTSYNC:
|
|
|
|
break;
|
|
|
|
case CEPH_OSD_OP_NOTIFY_ACK:
|
|
|
|
case CEPH_OSD_OP_WATCH:
|
|
|
|
dst->watch.cookie = cpu_to_le64(src->watch.cookie);
|
|
|
|
dst->watch.ver = cpu_to_le64(src->watch.ver);
|
|
|
|
dst->watch.flag = src->watch.flag;
|
|
|
|
break;
|
2014-02-25 22:22:27 +08:00
|
|
|
case CEPH_OSD_OP_SETALLOCHINT:
|
|
|
|
dst->alloc_hint.expected_object_size =
|
|
|
|
cpu_to_le64(src->alloc_hint.expected_object_size);
|
|
|
|
dst->alloc_hint.expected_write_size =
|
|
|
|
cpu_to_le64(src->alloc_hint.expected_write_size);
|
|
|
|
break;
|
2014-11-12 14:00:43 +08:00
|
|
|
case CEPH_OSD_OP_SETXATTR:
|
|
|
|
case CEPH_OSD_OP_CMPXATTR:
|
|
|
|
dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
|
|
|
|
dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
|
|
|
|
dst->xattr.cmp_op = src->xattr.cmp_op;
|
|
|
|
dst->xattr.cmp_mode = src->xattr.cmp_mode;
|
|
|
|
break;
|
2014-11-13 10:47:25 +08:00
|
|
|
case CEPH_OSD_OP_CREATE:
|
|
|
|
case CEPH_OSD_OP_DELETE:
|
|
|
|
break;
|
2013-03-14 09:50:00 +08:00
|
|
|
default:
|
2013-02-16 01:42:30 +08:00
|
|
|
pr_err("unsupported osd opcode %s\n",
|
2013-03-05 01:08:29 +08:00
|
|
|
ceph_osd_op_name(src->op));
|
2013-02-16 01:42:30 +08:00
|
|
|
WARN_ON(1);
|
2013-03-14 09:50:00 +08:00
|
|
|
|
|
|
|
return 0;
|
2010-04-07 06:01:27 +08:00
|
|
|
}
|
2014-02-25 22:22:26 +08:00
|
|
|
|
2013-03-14 09:50:00 +08:00
|
|
|
dst->op = cpu_to_le16(src->op);
|
2014-02-25 22:22:26 +08:00
|
|
|
dst->flags = cpu_to_le32(src->flags);
|
2016-02-08 20:39:46 +08:00
|
|
|
dst->payload_len = cpu_to_le32(src->indata_len);
|
2013-03-09 03:35:36 +08:00
|
|
|
|
2016-05-26 06:29:52 +08:00
|
|
|
return src->indata_len;
|
2010-04-07 06:01:27 +08:00
|
|
|
}
|
|
|
|
|
2010-04-07 05:51:47 +08:00
|
|
|
/*
|
|
|
|
* build new request AND message, calculate layout, and adjust file
|
|
|
|
* extent as needed.
|
|
|
|
*
|
|
|
|
* if the file was recently truncated, we include information about its
|
|
|
|
* old and new size so that the object can be updated appropriately. (we
|
|
|
|
* avoid synchronously deleting truncated objects because it's slow.)
|
|
|
|
*
|
|
|
|
* if @do_sync, include a 'startsync' command so that the osd will flush
|
|
|
|
* data quickly.
|
|
|
|
*/
|
|
|
|
struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
|
|
|
|
struct ceph_file_layout *layout,
|
|
|
|
struct ceph_vino vino,
|
2014-11-13 14:40:37 +08:00
|
|
|
u64 off, u64 *plen,
|
|
|
|
unsigned int which, int num_ops,
|
2010-04-07 05:51:47 +08:00
|
|
|
int opcode, int flags,
|
|
|
|
struct ceph_snap_context *snapc,
|
|
|
|
u32 truncate_seq,
|
|
|
|
u64 truncate_size,
|
libceph: don't assign page info in ceph_osdc_new_request()
Currently ceph_osdc_new_request() assigns an osd request's
r_num_pages and r_alignment fields. The only thing it does
after that is call ceph_osdc_build_request(), and that doesn't
need those fields to be assigned.
Move the assignment of those fields out of ceph_osdc_new_request()
and into its caller. As a result, the page_align parameter is no
longer used, so get rid of it.
Note that in ceph_sync_write(), the value for req->r_num_pages had
already been calculated earlier (as num_pages, and fortunately
it was computed the same way). So don't bother recomputing it,
but because it's not needed earlier, move that calculation after the
call to ceph_osdc_new_request(). Hold off making the assignment to
r_alignment, doing it instead r_pages and r_num_pages are
getting set.
Similarly, in start_read(), nr_pages already holds the number of
pages in the array (and is calculated the same way), so there's no
need to recompute it. Move the assignment of the page alignment
down with the others there as well.
This and the next few patches are preparation work for:
http://tracker.ceph.com/issues/4127
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
2013-03-02 08:00:15 +08:00
|
|
|
bool use_mempool)
|
2010-04-07 05:51:47 +08:00
|
|
|
{
|
2010-04-07 06:01:27 +08:00
|
|
|
struct ceph_osd_request *req;
|
2013-03-14 09:50:00 +08:00
|
|
|
u64 objnum = 0;
|
|
|
|
u64 objoff = 0;
|
|
|
|
u64 objlen = 0;
|
2012-09-25 12:01:02 +08:00
|
|
|
int r;
|
2010-04-07 06:01:27 +08:00
|
|
|
|
2013-08-15 11:51:44 +08:00
|
|
|
BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
|
2014-11-13 10:47:25 +08:00
|
|
|
opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE &&
|
|
|
|
opcode != CEPH_OSD_OP_CREATE && opcode != CEPH_OSD_OP_DELETE);
|
2010-04-07 06:01:27 +08:00
|
|
|
|
2013-03-15 03:09:05 +08:00
|
|
|
req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
|
2012-11-14 11:11:15 +08:00
|
|
|
GFP_NOFS);
|
2016-04-27 20:15:51 +08:00
|
|
|
if (!req) {
|
|
|
|
r = -ENOMEM;
|
|
|
|
goto fail;
|
|
|
|
}
|
2013-04-04 10:32:51 +08:00
|
|
|
|
2010-04-07 05:51:47 +08:00
|
|
|
/* calculate max write size */
|
2013-03-14 09:50:01 +08:00
|
|
|
r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
|
2016-04-27 20:15:51 +08:00
|
|
|
if (r)
|
|
|
|
goto fail;
|
2013-03-14 09:50:01 +08:00
|
|
|
|
2014-11-13 10:47:25 +08:00
|
|
|
if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
|
2015-04-27 11:09:54 +08:00
|
|
|
osd_req_op_init(req, which, opcode, 0);
|
2014-11-13 10:47:25 +08:00
|
|
|
} else {
|
|
|
|
u32 object_size = le32_to_cpu(layout->fl_object_size);
|
|
|
|
u32 object_base = off - objoff;
|
|
|
|
if (!(truncate_seq == 1 && truncate_size == -1ULL)) {
|
|
|
|
if (truncate_size <= object_base) {
|
|
|
|
truncate_size = 0;
|
|
|
|
} else {
|
|
|
|
truncate_size -= object_base;
|
|
|
|
if (truncate_size > object_size)
|
|
|
|
truncate_size = object_size;
|
|
|
|
}
|
2013-06-02 18:40:23 +08:00
|
|
|
}
|
2014-11-13 14:40:37 +08:00
|
|
|
osd_req_op_extent_init(req, which, opcode, objoff, objlen,
|
2014-11-13 10:47:25 +08:00
|
|
|
truncate_size, truncate_seq);
|
|
|
|
}
|
2013-03-14 09:50:01 +08:00
|
|
|
|
2016-05-26 06:29:52 +08:00
|
|
|
req->r_flags = flags;
|
2014-01-27 23:40:20 +08:00
|
|
|
req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout);
|
2016-04-30 01:54:20 +08:00
|
|
|
ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);
|
2013-02-16 12:10:17 +08:00
|
|
|
|
2016-05-26 06:29:52 +08:00
|
|
|
req->r_snapid = vino.snap;
|
|
|
|
if (flags & CEPH_OSD_FLAG_WRITE)
|
|
|
|
req->r_data_offset = off;
|
|
|
|
|
2016-04-27 20:15:51 +08:00
|
|
|
r = ceph_osdc_alloc_messages(req, GFP_NOFS);
|
|
|
|
if (r)
|
|
|
|
goto fail;
|
|
|
|
|
2009-10-07 02:31:10 +08:00
|
|
|
return req;
|
2016-04-27 20:15:51 +08:00
|
|
|
|
|
|
|
fail:
|
|
|
|
ceph_osdc_put_request(req);
|
|
|
|
return ERR_PTR(r);
|
2009-10-07 02:31:10 +08:00
|
|
|
}
|
2010-04-07 06:14:15 +08:00
|
|
|
EXPORT_SYMBOL(ceph_osdc_new_request);
|
2009-10-07 02:31:10 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We keep osd requests in an rbtree, sorted by ->r_tid.
|
|
|
|
*/
|
2016-04-28 22:07:22 +08:00
|
|
|
DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node)
|
2009-10-07 02:31:10 +08:00
|
|
|
|
|
|
|
static struct ceph_osd_request *
|
|
|
|
__lookup_request_ge(struct ceph_osd_client *osdc,
|
|
|
|
u64 tid)
|
|
|
|
{
|
|
|
|
struct ceph_osd_request *req;
|
|
|
|
struct rb_node *n = osdc->requests.rb_node;
|
|
|
|
|
|
|
|
while (n) {
|
|
|
|
req = rb_entry(n, struct ceph_osd_request, r_node);
|
|
|
|
if (tid < req->r_tid) {
|
|
|
|
if (!n->rb_left)
|
|
|
|
return req;
|
|
|
|
n = n->rb_left;
|
|
|
|
} else if (tid > req->r_tid) {
|
|
|
|
n = n->rb_right;
|
|
|
|
} else {
|
|
|
|
return req;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2014-09-03 18:41:45 +08:00
|
|
|
static void __kick_linger_request(struct ceph_osd_request *req)
|
|
|
|
{
|
|
|
|
struct ceph_osd_client *osdc = req->r_osdc;
|
|
|
|
struct ceph_osd *osd = req->r_osd;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Linger requests need to be resent with a new tid to avoid
|
|
|
|
* the dup op detection logic on the OSDs. Achieve this with
|
|
|
|
* a re-register dance instead of open-coding.
|
|
|
|
*/
|
|
|
|
ceph_osdc_get_request(req);
|
|
|
|
if (!list_empty(&req->r_linger_item))
|
|
|
|
__unregister_linger_request(osdc, req);
|
|
|
|
else
|
|
|
|
__unregister_request(osdc, req);
|
|
|
|
__register_request(osdc, req);
|
|
|
|
ceph_osdc_put_request(req);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Unless request has been registered as both normal and
|
|
|
|
* lingering, __unregister{,_linger}_request clears r_osd.
|
|
|
|
* However, here we need to preserve r_osd to make sure we
|
|
|
|
* requeue on the same OSD.
|
|
|
|
*/
|
|
|
|
WARN_ON(req->r_osd || !osd);
|
|
|
|
req->r_osd = osd;
|
|
|
|
|
|
|
|
dout("%s requeueing %p tid %llu\n", __func__, req, req->r_tid);
|
|
|
|
__enqueue_request(req);
|
|
|
|
}
|
|
|
|
|
2011-01-18 12:34:08 +08:00
|
|
|
/*
|
|
|
|
* Resubmit requests pending on the given osd.
|
|
|
|
*/
|
|
|
|
static void __kick_osd_requests(struct ceph_osd_client *osdc,
|
|
|
|
struct ceph_osd *osd)
|
|
|
|
{
|
2011-03-22 06:07:16 +08:00
|
|
|
struct ceph_osd_request *req, *nreq;
|
libceph: requeue only sent requests when kicking
The osd expects incoming requests for a given object from a given
client to arrive in order, with the tid for each request being
greater than the tid for requests that have already arrived. This
patch fixes two places the osd client might not maintain that
ordering.
For the osd client, the connection fault method is osd_reset().
That function calls __reset_osd() to close and re-open the
connection, then calls __kick_osd_requests() to cause all
outstanding requests for the affected osd to be re-sent after
the connection has been re-established.
When an osd is reset, any in-flight messages will need to be
re-sent. An osd client maintains distinct lists for unsent and
in-flight messages. Meanwhile, an osd maintains a single list of
all its requests (both sent and un-sent). (Each message is linked
into two lists--one for the osd client and one list for the osd.)
To process an osd "kick" operation, the request list for the *osd*
is traversed, and each request is moved off whichever osd *client*
list it was on (unsent or sent) and placed onto the osd client's
unsent list. (It remains where it is on the osd's request list.)
When that is done, osd_reset() calls __send_queued() to cause each
of the osd client's unsent messages to be sent.
OK, with that background...
As the osd request list is traversed each request is prepended to
the osd client's unsent list in the order they're seen. The effect
of this is to reverse the order of these requests as they are put
(back) onto the unsent list.
Instead, build up a list of only the requests for an osd that have
already been sent (by checking their r_sent flag values). Once an
unsent request is found, stop examining requests and prepend the
requests that need re-sending to the osd client's unsent list.
Preserve the original order of requests in the process (previously
re-queued requests were reversed in this process). Because they
have already been sent, they will have lower tids than any request
already present on the unsent list.
Just below that, traverse the linger list in forward order as
before, but add them to the *tail* of the list rather than the head.
These requests get re-registered, and in the process are give a new
(higher) tid, so the should go at the end.
This partially resolves:
http://tracker.ceph.com/issues/4392
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-off-by: Sage Weil <sage@inktank.com>
2013-03-26 07:16:11 +08:00
|
|
|
LIST_HEAD(resend);
|
2014-09-03 18:41:45 +08:00
|
|
|
LIST_HEAD(resend_linger);
|
2011-01-18 12:34:08 +08:00
|
|
|
int err;
|
|
|
|
|
2014-09-03 18:41:45 +08:00
|
|
|
dout("%s osd%d\n", __func__, osd->o_osd);
|
2011-01-18 12:34:08 +08:00
|
|
|
err = __reset_osd(osdc, osd);
|
2012-12-07 23:57:58 +08:00
|
|
|
if (err)
|
2011-01-18 12:34:08 +08:00
|
|
|
return;
|
2014-09-03 18:41:45 +08:00
|
|
|
|
libceph: requeue only sent requests when kicking
The osd expects incoming requests for a given object from a given
client to arrive in order, with the tid for each request being
greater than the tid for requests that have already arrived. This
patch fixes two places the osd client might not maintain that
ordering.
For the osd client, the connection fault method is osd_reset().
That function calls __reset_osd() to close and re-open the
connection, then calls __kick_osd_requests() to cause all
outstanding requests for the affected osd to be re-sent after
the connection has been re-established.
When an osd is reset, any in-flight messages will need to be
re-sent. An osd client maintains distinct lists for unsent and
in-flight messages. Meanwhile, an osd maintains a single list of
all its requests (both sent and un-sent). (Each message is linked
into two lists--one for the osd client and one list for the osd.)
To process an osd "kick" operation, the request list for the *osd*
is traversed, and each request is moved off whichever osd *client*
list it was on (unsent or sent) and placed onto the osd client's
unsent list. (It remains where it is on the osd's request list.)
When that is done, osd_reset() calls __send_queued() to cause each
of the osd client's unsent messages to be sent.
OK, with that background...
As the osd request list is traversed each request is prepended to
the osd client's unsent list in the order they're seen. The effect
of this is to reverse the order of these requests as they are put
(back) onto the unsent list.
Instead, build up a list of only the requests for an osd that have
already been sent (by checking their r_sent flag values). Once an
unsent request is found, stop examining requests and prepend the
requests that need re-sending to the osd client's unsent list.
Preserve the original order of requests in the process (previously
re-queued requests were reversed in this process). Because they
have already been sent, they will have lower tids than any request
already present on the unsent list.
Just below that, traverse the linger list in forward order as
before, but add them to the *tail* of the list rather than the head.
These requests get re-registered, and in the process are give a new
(higher) tid, so the should go at the end.
This partially resolves:
http://tracker.ceph.com/issues/4392
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-off-by: Sage Weil <sage@inktank.com>
2013-03-26 07:16:11 +08:00
|
|
|
/*
|
|
|
|
* Build up a list of requests to resend by traversing the
|
|
|
|
* osd's list of requests. Requests for a given object are
|
|
|
|
* sent in tid order, and that is also the order they're
|
|
|
|
* kept on this list. Therefore all requests that are in
|
|
|
|
* flight will be found first, followed by all requests that
|
|
|
|
* have not yet been sent. And to resend requests while
|
|
|
|
* preserving this order we will want to put any sent
|
|
|
|
* requests back on the front of the osd client's unsent
|
|
|
|
* list.
|
|
|
|
*
|
|
|
|
* So we build a separate ordered list of already-sent
|
|
|
|
* requests for the affected osd and splice it onto the
|
|
|
|
* front of the osd client's unsent list. Once we've seen a
|
|
|
|
* request that has not yet been sent we're done. Those
|
|
|
|
* requests are already sitting right where they belong.
|
|
|
|
*/
|
2011-01-18 12:34:08 +08:00
|
|
|
list_for_each_entry(req, &osd->o_requests, r_osd_item) {
|
libceph: requeue only sent requests when kicking
The osd expects incoming requests for a given object from a given
client to arrive in order, with the tid for each request being
greater than the tid for requests that have already arrived. This
patch fixes two places the osd client might not maintain that
ordering.
For the osd client, the connection fault method is osd_reset().
That function calls __reset_osd() to close and re-open the
connection, then calls __kick_osd_requests() to cause all
outstanding requests for the affected osd to be re-sent after
the connection has been re-established.
When an osd is reset, any in-flight messages will need to be
re-sent. An osd client maintains distinct lists for unsent and
in-flight messages. Meanwhile, an osd maintains a single list of
all its requests (both sent and un-sent). (Each message is linked
into two lists--one for the osd client and one list for the osd.)
To process an osd "kick" operation, the request list for the *osd*
is traversed, and each request is moved off whichever osd *client*
list it was on (unsent or sent) and placed onto the osd client's
unsent list. (It remains where it is on the osd's request list.)
When that is done, osd_reset() calls __send_queued() to cause each
of the osd client's unsent messages to be sent.
OK, with that background...
As the osd request list is traversed each request is prepended to
the osd client's unsent list in the order they're seen. The effect
of this is to reverse the order of these requests as they are put
(back) onto the unsent list.
Instead, build up a list of only the requests for an osd that have
already been sent (by checking their r_sent flag values). Once an
unsent request is found, stop examining requests and prepend the
requests that need re-sending to the osd client's unsent list.
Preserve the original order of requests in the process (previously
re-queued requests were reversed in this process). Because they
have already been sent, they will have lower tids than any request
already present on the unsent list.
Just below that, traverse the linger list in forward order as
before, but add them to the *tail* of the list rather than the head.
These requests get re-registered, and in the process are give a new
(higher) tid, so the should go at the end.
This partially resolves:
http://tracker.ceph.com/issues/4392
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-off-by: Sage Weil <sage@inktank.com>
2013-03-26 07:16:11 +08:00
|
|
|
if (!req->r_sent)
|
|
|
|
break;
|
2014-09-03 18:41:45 +08:00
|
|
|
|
|
|
|
if (!req->r_linger) {
|
|
|
|
dout("%s requeueing %p tid %llu\n", __func__, req,
|
|
|
|
req->r_tid);
|
|
|
|
list_move_tail(&req->r_req_lru_item, &resend);
|
2011-03-22 06:07:16 +08:00
|
|
|
req->r_flags |= CEPH_OSD_FLAG_RETRY;
|
2014-09-03 18:41:45 +08:00
|
|
|
} else {
|
|
|
|
list_move_tail(&req->r_req_lru_item, &resend_linger);
|
|
|
|
}
|
2011-03-22 06:07:16 +08:00
|
|
|
}
|
libceph: requeue only sent requests when kicking
The osd expects incoming requests for a given object from a given
client to arrive in order, with the tid for each request being
greater than the tid for requests that have already arrived. This
patch fixes two places the osd client might not maintain that
ordering.
For the osd client, the connection fault method is osd_reset().
That function calls __reset_osd() to close and re-open the
connection, then calls __kick_osd_requests() to cause all
outstanding requests for the affected osd to be re-sent after
the connection has been re-established.
When an osd is reset, any in-flight messages will need to be
re-sent. An osd client maintains distinct lists for unsent and
in-flight messages. Meanwhile, an osd maintains a single list of
all its requests (both sent and un-sent). (Each message is linked
into two lists--one for the osd client and one list for the osd.)
To process an osd "kick" operation, the request list for the *osd*
is traversed, and each request is moved off whichever osd *client*
list it was on (unsent or sent) and placed onto the osd client's
unsent list. (It remains where it is on the osd's request list.)
When that is done, osd_reset() calls __send_queued() to cause each
of the osd client's unsent messages to be sent.
OK, with that background...
As the osd request list is traversed each request is prepended to
the osd client's unsent list in the order they're seen. The effect
of this is to reverse the order of these requests as they are put
(back) onto the unsent list.
Instead, build up a list of only the requests for an osd that have
already been sent (by checking their r_sent flag values). Once an
unsent request is found, stop examining requests and prepend the
requests that need re-sending to the osd client's unsent list.
Preserve the original order of requests in the process (previously
re-queued requests were reversed in this process). Because they
have already been sent, they will have lower tids than any request
already present on the unsent list.
Just below that, traverse the linger list in forward order as
before, but add them to the *tail* of the list rather than the head.
These requests get re-registered, and in the process are give a new
(higher) tid, so the should go at the end.
This partially resolves:
http://tracker.ceph.com/issues/4392
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-off-by: Sage Weil <sage@inktank.com>
2013-03-26 07:16:11 +08:00
|
|
|
list_splice(&resend, &osdc->req_unsent);
|
2011-03-22 06:07:16 +08:00
|
|
|
|
libceph: requeue only sent requests when kicking
The osd expects incoming requests for a given object from a given
client to arrive in order, with the tid for each request being
greater than the tid for requests that have already arrived. This
patch fixes two places the osd client might not maintain that
ordering.
For the osd client, the connection fault method is osd_reset().
That function calls __reset_osd() to close and re-open the
connection, then calls __kick_osd_requests() to cause all
outstanding requests for the affected osd to be re-sent after
the connection has been re-established.
When an osd is reset, any in-flight messages will need to be
re-sent. An osd client maintains distinct lists for unsent and
in-flight messages. Meanwhile, an osd maintains a single list of
all its requests (both sent and un-sent). (Each message is linked
into two lists--one for the osd client and one list for the osd.)
To process an osd "kick" operation, the request list for the *osd*
is traversed, and each request is moved off whichever osd *client*
list it was on (unsent or sent) and placed onto the osd client's
unsent list. (It remains where it is on the osd's request list.)
When that is done, osd_reset() calls __send_queued() to cause each
of the osd client's unsent messages to be sent.
OK, with that background...
As the osd request list is traversed each request is prepended to
the osd client's unsent list in the order they're seen. The effect
of this is to reverse the order of these requests as they are put
(back) onto the unsent list.
Instead, build up a list of only the requests for an osd that have
already been sent (by checking their r_sent flag values). Once an
unsent request is found, stop examining requests and prepend the
requests that need re-sending to the osd client's unsent list.
Preserve the original order of requests in the process (previously
re-queued requests were reversed in this process). Because they
have already been sent, they will have lower tids than any request
already present on the unsent list.
Just below that, traverse the linger list in forward order as
before, but add them to the *tail* of the list rather than the head.
These requests get re-registered, and in the process are give a new
(higher) tid, so the should go at the end.
This partially resolves:
http://tracker.ceph.com/issues/4392
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-off-by: Sage Weil <sage@inktank.com>
2013-03-26 07:16:11 +08:00
|
|
|
/*
|
2014-09-03 18:41:45 +08:00
|
|
|
* Both registered and not yet registered linger requests are
|
|
|
|
* enqueued with a new tid on the same OSD. We add/move them
|
|
|
|
* to req_unsent/o_requests at the end to keep things in tid
|
|
|
|
* order.
|
libceph: requeue only sent requests when kicking
The osd expects incoming requests for a given object from a given
client to arrive in order, with the tid for each request being
greater than the tid for requests that have already arrived. This
patch fixes two places the osd client might not maintain that
ordering.
For the osd client, the connection fault method is osd_reset().
That function calls __reset_osd() to close and re-open the
connection, then calls __kick_osd_requests() to cause all
outstanding requests for the affected osd to be re-sent after
the connection has been re-established.
When an osd is reset, any in-flight messages will need to be
re-sent. An osd client maintains distinct lists for unsent and
in-flight messages. Meanwhile, an osd maintains a single list of
all its requests (both sent and un-sent). (Each message is linked
into two lists--one for the osd client and one list for the osd.)
To process an osd "kick" operation, the request list for the *osd*
is traversed, and each request is moved off whichever osd *client*
list it was on (unsent or sent) and placed onto the osd client's
unsent list. (It remains where it is on the osd's request list.)
When that is done, osd_reset() calls __send_queued() to cause each
of the osd client's unsent messages to be sent.
OK, with that background...
As the osd request list is traversed each request is prepended to
the osd client's unsent list in the order they're seen. The effect
of this is to reverse the order of these requests as they are put
(back) onto the unsent list.
Instead, build up a list of only the requests for an osd that have
already been sent (by checking their r_sent flag values). Once an
unsent request is found, stop examining requests and prepend the
requests that need re-sending to the osd client's unsent list.
Preserve the original order of requests in the process (previously
re-queued requests were reversed in this process). Because they
have already been sent, they will have lower tids than any request
already present on the unsent list.
Just below that, traverse the linger list in forward order as
before, but add them to the *tail* of the list rather than the head.
These requests get re-registered, and in the process are give a new
(higher) tid, so the should go at the end.
This partially resolves:
http://tracker.ceph.com/issues/4392
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-off-by: Sage Weil <sage@inktank.com>
2013-03-26 07:16:11 +08:00
|
|
|
*/
|
2011-03-22 06:07:16 +08:00
|
|
|
list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
|
2014-06-20 18:14:41 +08:00
|
|
|
r_linger_osd_item) {
|
2014-09-03 18:41:45 +08:00
|
|
|
WARN_ON(!list_empty(&req->r_req_lru_item));
|
|
|
|
__kick_linger_request(req);
|
2011-01-18 12:34:08 +08:00
|
|
|
}
|
2014-09-03 18:41:45 +08:00
|
|
|
|
|
|
|
list_for_each_entry_safe(req, nreq, &resend_linger, r_req_lru_item)
|
|
|
|
__kick_linger_request(req);
|
2011-01-18 12:34:08 +08:00
|
|
|
}
|
|
|
|
|
2009-10-07 02:31:10 +08:00
|
|
|
/*
|
2009-10-10 01:29:18 +08:00
|
|
|
* If the osd connection drops, we need to resubmit all requests.
|
2009-10-07 02:31:10 +08:00
|
|
|
*/
|
|
|
|
static void osd_reset(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
struct ceph_osd *osd = con->private;
|
|
|
|
struct ceph_osd_client *osdc;
|
|
|
|
|
|
|
|
if (!osd)
|
|
|
|
return;
|
|
|
|
dout("osd_reset osd%d\n", osd->o_osd);
|
|
|
|
osdc = osd->o_osdc;
|
|
|
|
down_read(&osdc->map_sem);
|
2012-11-29 04:28:24 +08:00
|
|
|
mutex_lock(&osdc->request_mutex);
|
|
|
|
__kick_osd_requests(osdc, osd);
|
2013-02-16 01:42:29 +08:00
|
|
|
__send_queued(osdc);
|
2012-11-29 04:28:24 +08:00
|
|
|
mutex_unlock(&osdc->request_mutex);
|
2009-10-07 02:31:10 +08:00
|
|
|
up_read(&osdc->map_sem);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Track open sessions with osds.
|
|
|
|
*/
|
2012-05-27 12:26:43 +08:00
|
|
|
static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
|
2009-10-07 02:31:10 +08:00
|
|
|
{
|
|
|
|
struct ceph_osd *osd;
|
|
|
|
|
|
|
|
osd = kzalloc(sizeof(*osd), GFP_NOFS);
|
|
|
|
if (!osd)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
atomic_set(&osd->o_ref, 1);
|
|
|
|
osd->o_osdc = osdc;
|
2012-05-27 12:26:43 +08:00
|
|
|
osd->o_osd = onum;
|
2012-12-06 21:22:04 +08:00
|
|
|
RB_CLEAR_NODE(&osd->o_node);
|
2009-10-07 02:31:10 +08:00
|
|
|
INIT_LIST_HEAD(&osd->o_requests);
|
2011-03-22 06:07:16 +08:00
|
|
|
INIT_LIST_HEAD(&osd->o_linger_requests);
|
2010-02-04 03:00:26 +08:00
|
|
|
INIT_LIST_HEAD(&osd->o_osd_lru);
|
2009-10-07 02:31:10 +08:00
|
|
|
osd->o_incarnation = 1;
|
|
|
|
|
2012-06-28 03:24:08 +08:00
|
|
|
ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);
|
2009-11-19 08:19:57 +08:00
|
|
|
|
2010-02-27 07:32:31 +08:00
|
|
|
INIT_LIST_HEAD(&osd->o_keepalive_item);
|
2009-10-07 02:31:10 +08:00
|
|
|
return osd;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct ceph_osd *get_osd(struct ceph_osd *osd)
|
|
|
|
{
|
|
|
|
if (atomic_inc_not_zero(&osd->o_ref)) {
|
|
|
|
dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
|
|
|
|
atomic_read(&osd->o_ref));
|
|
|
|
return osd;
|
|
|
|
} else {
|
|
|
|
dout("get_osd %p FAIL\n", osd);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void put_osd(struct ceph_osd *osd)
|
|
|
|
{
|
|
|
|
dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
|
|
|
|
atomic_read(&osd->o_ref) - 1);
|
2015-02-16 16:49:42 +08:00
|
|
|
if (atomic_dec_and_test(&osd->o_ref)) {
|
|
|
|
if (osd->o_auth.authorizer)
|
2016-04-12 01:34:49 +08:00
|
|
|
ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
|
2009-10-07 02:31:10 +08:00
|
|
|
kfree(osd);
|
2010-05-28 05:15:49 +08:00
|
|
|
}
|
2009-10-07 02:31:10 +08:00
|
|
|
}
|
|
|
|
|
2016-04-28 22:07:22 +08:00
|
|
|
/*
 * Instantiate the rbtree helpers for struct ceph_osd, keyed by o_osd and
 * linked through o_node.
 * NOTE(review): presumably this is what provides the insert_osd(),
 * erase_osd() and lookup_osd() helpers called elsewhere in this file --
 * confirm against the DEFINE_RB_FUNCS definition.
 */
DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node)
|
|
|
|
|
2009-10-07 02:31:10 +08:00
|
|
|
/*
|
|
|
|
* remove an osd from our map
|
|
|
|
*/
|
2010-02-04 03:00:26 +08:00
|
|
|
static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
|
2009-10-07 02:31:10 +08:00
|
|
|
{
|
2015-02-18 00:37:15 +08:00
|
|
|
dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
|
2014-11-06 00:33:44 +08:00
|
|
|
WARN_ON(!list_empty(&osd->o_requests));
|
|
|
|
WARN_ON(!list_empty(&osd->o_linger_requests));
|
2014-06-18 17:02:12 +08:00
|
|
|
|
2010-02-04 03:00:26 +08:00
|
|
|
list_del_init(&osd->o_osd_lru);
|
2016-04-28 22:07:22 +08:00
|
|
|
erase_osd(&osdc->osds, osd);
|
2015-02-18 00:37:15 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
|
|
|
|
{
|
|
|
|
dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
|
|
|
|
|
|
|
|
if (!RB_EMPTY_NODE(&osd->o_node)) {
|
|
|
|
ceph_con_close(&osd->o_con);
|
|
|
|
__remove_osd(osdc, osd);
|
|
|
|
put_osd(osd);
|
|
|
|
}
|
2009-10-07 02:31:10 +08:00
|
|
|
}
|
|
|
|
|
2010-02-04 03:00:26 +08:00
|
|
|
static void __move_osd_to_lru(struct ceph_osd_client *osdc,
|
|
|
|
struct ceph_osd *osd)
|
|
|
|
{
|
2014-06-20 18:14:41 +08:00
|
|
|
dout("%s %p\n", __func__, osd);
|
2010-02-04 03:00:26 +08:00
|
|
|
BUG_ON(!list_empty(&osd->o_osd_lru));
|
2014-06-20 18:14:41 +08:00
|
|
|
|
2010-02-04 03:00:26 +08:00
|
|
|
list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
|
2015-05-15 17:02:17 +08:00
|
|
|
osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl;
|
2010-02-04 03:00:26 +08:00
|
|
|
}
|
|
|
|
|
2014-06-20 18:14:41 +08:00
|
|
|
static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc,
|
|
|
|
struct ceph_osd *osd)
|
|
|
|
{
|
|
|
|
dout("%s %p\n", __func__, osd);
|
|
|
|
|
|
|
|
if (list_empty(&osd->o_requests) &&
|
|
|
|
list_empty(&osd->o_linger_requests))
|
|
|
|
__move_osd_to_lru(osdc, osd);
|
|
|
|
}
|
|
|
|
|
2010-02-04 03:00:26 +08:00
|
|
|
static void __remove_osd_from_lru(struct ceph_osd *osd)
|
|
|
|
{
|
|
|
|
dout("__remove_osd_from_lru %p\n", osd);
|
|
|
|
if (!list_empty(&osd->o_osd_lru))
|
|
|
|
list_del_init(&osd->o_osd_lru);
|
|
|
|
}
|
|
|
|
|
2009-10-07 02:31:10 +08:00
|
|
|
/*
|
|
|
|
* reset osd connect
|
|
|
|
*/
|
2010-02-04 03:00:26 +08:00
|
|
|
static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
|
2009-10-07 02:31:10 +08:00
|
|
|
{
|
2012-12-07 23:57:58 +08:00
|
|
|
struct ceph_entity_addr *peer_addr;
|
2009-10-07 02:31:10 +08:00
|
|
|
|
2010-02-04 03:00:26 +08:00
|
|
|
dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
|
2011-03-22 06:07:16 +08:00
|
|
|
if (list_empty(&osd->o_requests) &&
|
|
|
|
list_empty(&osd->o_linger_requests)) {
|
2015-02-18 00:37:15 +08:00
|
|
|
remove_osd(osdc, osd);
|
2012-12-07 23:57:58 +08:00
|
|
|
return -ENODEV;
|
|
|
|
}
|
|
|
|
|
|
|
|
peer_addr = &osdc->osdmap->osd_addr[osd->o_osd];
|
|
|
|
if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
|
|
|
|
!ceph_con_opened(&osd->o_con)) {
|
|
|
|
struct ceph_osd_request *req;
|
|
|
|
|
2014-01-17 01:18:27 +08:00
|
|
|
dout("osd addr hasn't changed and connection never opened, "
|
|
|
|
"letting msgr retry\n");
|
ceph: avoid reopening osd connections when address hasn't changed
We get a fault callback on _every_ tcp connection fault. Normally, we
want to reopen the connection when that happens. If the address we have
is bad, however, and connection attempts always result in a connection
refused or similar error, explicitly closing and reopening the msgr
connection just prevents the messenger's backoff logic from kicking in.
The result can be a console full of
[ 3974.417106] ceph: osd11 10.3.14.138:6800 connection failed
[ 3974.423295] ceph: osd11 10.3.14.138:6800 connection failed
[ 3974.429709] ceph: osd11 10.3.14.138:6800 connection failed
Instead, if we get a fault, and have outstanding requests, but the osd
address hasn't changed and the connection never successfully connected in
the first place, do nothing to the osd connection. The messenger layer
will back off and retry periodically, because we never connected and thus
the lossy bit is not set.
Instead, touch each request's r_stamp so that handle_timeout can tell the
request is still alive and kicking.
Signed-off-by: Sage Weil <sage@newdream.net>
2010-03-23 05:51:18 +08:00
|
|
|
/* touch each r_stamp for handle_timeout()'s benfit */
|
|
|
|
list_for_each_entry(req, &osd->o_requests, r_osd_item)
|
|
|
|
req->r_stamp = jiffies;
|
2012-12-07 23:57:58 +08:00
|
|
|
|
|
|
|
return -EAGAIN;
|
2009-10-07 02:31:10 +08:00
|
|
|
}
|
2012-12-07 23:57:58 +08:00
|
|
|
|
|
|
|
ceph_con_close(&osd->o_con);
|
|
|
|
ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, peer_addr);
|
|
|
|
osd->o_incarnation++;
|
|
|
|
|
|
|
|
return 0;
|
2009-10-07 02:31:10 +08:00
|
|
|
}
|
|
|
|
|
2010-02-27 07:32:31 +08:00
|
|
|
static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
|
|
|
|
{
|
|
|
|
schedule_delayed_work(&osdc->timeout_work,
|
2015-05-15 17:02:17 +08:00
|
|
|
osdc->client->options->osd_keepalive_timeout);
|
2010-02-27 07:32:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
|
|
|
|
{
|
|
|
|
cancel_delayed_work(&osdc->timeout_work);
|
|
|
|
}
|
2009-10-07 02:31:10 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Register request, assign tid. If this is the first request, set up
|
|
|
|
* the timeout event.
|
|
|
|
*/
|
2011-03-22 06:07:16 +08:00
|
|
|
static void __register_request(struct ceph_osd_client *osdc,
|
|
|
|
struct ceph_osd_request *req)
|
2009-10-07 02:31:10 +08:00
|
|
|
{
|
|
|
|
req->r_tid = ++osdc->last_tid;
|
2009-12-23 03:24:33 +08:00
|
|
|
req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
|
2011-04-07 00:09:16 +08:00
|
|
|
dout("__register_request %p tid %lld\n", req, req->r_tid);
|
2016-04-28 22:07:22 +08:00
|
|
|
insert_request(&osdc->requests, req);
|
2009-10-07 02:31:10 +08:00
|
|
|
ceph_osdc_get_request(req);
|
|
|
|
osdc->num_requests++;
|
|
|
|
if (osdc->num_requests == 1) {
|
2010-02-27 07:32:31 +08:00
|
|
|
dout(" first request, scheduling timeout\n");
|
|
|
|
__schedule_osd_timeout(osdc);
|
2009-10-07 02:31:10 +08:00
|
|
|
}
|
2011-03-22 06:07:16 +08:00
|
|
|
}
|
|
|
|
|
2009-10-07 02:31:10 +08:00
|
|
|
/*
|
|
|
|
* called under osdc->request_mutex
|
|
|
|
*/
|
|
|
|
static void __unregister_request(struct ceph_osd_client *osdc,
|
|
|
|
struct ceph_osd_request *req)
|
|
|
|
{
|
2012-05-17 04:16:38 +08:00
|
|
|
if (RB_EMPTY_NODE(&req->r_node)) {
|
|
|
|
dout("__unregister_request %p tid %lld not registered\n",
|
|
|
|
req, req->r_tid);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2009-10-07 02:31:10 +08:00
|
|
|
dout("__unregister_request %p tid %lld\n", req, req->r_tid);
|
2016-04-28 22:07:22 +08:00
|
|
|
erase_request(&osdc->requests, req);
|
2009-10-07 02:31:10 +08:00
|
|
|
osdc->num_requests--;
|
|
|
|
|
2009-10-09 07:57:16 +08:00
|
|
|
if (req->r_osd) {
|
|
|
|
/* make sure the original request isn't in flight. */
|
2012-06-02 03:56:43 +08:00
|
|
|
ceph_msg_revoke(req->r_request);
|
2009-10-09 07:57:16 +08:00
|
|
|
|
|
|
|
list_del_init(&req->r_osd_item);
|
2014-06-20 18:14:41 +08:00
|
|
|
maybe_move_osd_to_lru(osdc, req->r_osd);
|
2014-06-20 22:29:20 +08:00
|
|
|
if (list_empty(&req->r_linger_osd_item))
|
2011-03-22 06:07:16 +08:00
|
|
|
req->r_osd = NULL;
|
2009-10-09 07:57:16 +08:00
|
|
|
}
|
2009-10-07 02:31:10 +08:00
|
|
|
|
2012-11-29 22:37:03 +08:00
|
|
|
list_del_init(&req->r_req_lru_item);
|
2009-10-07 02:31:10 +08:00
|
|
|
ceph_osdc_put_request(req);
|
|
|
|
|
2010-02-27 07:32:31 +08:00
|
|
|
if (osdc->num_requests == 0) {
|
|
|
|
dout(" no requests, canceling timeout\n");
|
|
|
|
__cancel_osd_timeout(osdc);
|
2009-10-07 02:31:10 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Cancel a previously queued request message
|
|
|
|
*/
|
|
|
|
static void __cancel_request(struct ceph_osd_request *req)
|
|
|
|
{
|
2010-09-28 01:18:52 +08:00
|
|
|
if (req->r_sent && req->r_osd) {
|
2012-06-02 03:56:43 +08:00
|
|
|
ceph_msg_revoke(req->r_request);
|
2009-10-07 02:31:10 +08:00
|
|
|
req->r_sent = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-03-22 06:07:16 +08:00
|
|
|
static void __register_linger_request(struct ceph_osd_client *osdc,
|
|
|
|
struct ceph_osd_request *req)
|
|
|
|
{
|
2014-06-20 22:29:20 +08:00
|
|
|
dout("%s %p tid %llu\n", __func__, req, req->r_tid);
|
|
|
|
WARN_ON(!req->r_linger);
|
|
|
|
|
2013-05-23 09:54:25 +08:00
|
|
|
ceph_osdc_get_request(req);
|
2011-03-22 06:07:16 +08:00
|
|
|
list_add_tail(&req->r_linger_item, &osdc->req_linger);
|
2012-07-31 07:19:28 +08:00
|
|
|
if (req->r_osd)
|
2014-06-20 18:14:41 +08:00
|
|
|
list_add_tail(&req->r_linger_osd_item,
|
2012-07-31 07:19:28 +08:00
|
|
|
&req->r_osd->o_linger_requests);
|
2011-03-22 06:07:16 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void __unregister_linger_request(struct ceph_osd_client *osdc,
|
|
|
|
struct ceph_osd_request *req)
|
|
|
|
{
|
2014-06-20 22:29:20 +08:00
|
|
|
WARN_ON(!req->r_linger);
|
|
|
|
|
|
|
|
if (list_empty(&req->r_linger_item)) {
|
|
|
|
dout("%s %p tid %llu not registered\n", __func__, req,
|
|
|
|
req->r_tid);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
dout("%s %p tid %llu\n", __func__, req, req->r_tid);
|
2012-12-06 23:37:23 +08:00
|
|
|
list_del_init(&req->r_linger_item);
|
2014-06-20 22:29:20 +08:00
|
|
|
|
2011-03-22 06:07:16 +08:00
|
|
|
if (req->r_osd) {
|
2014-06-20 18:14:41 +08:00
|
|
|
list_del_init(&req->r_linger_osd_item);
|
2014-06-20 18:14:41 +08:00
|
|
|
maybe_move_osd_to_lru(osdc, req->r_osd);
|
2011-03-30 03:11:06 +08:00
|
|
|
if (list_empty(&req->r_osd_item))
|
|
|
|
req->r_osd = NULL;
|
2011-03-22 06:07:16 +08:00
|
|
|
}
|
2013-05-23 09:54:25 +08:00
|
|
|
ceph_osdc_put_request(req);
|
2011-03-22 06:07:16 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
|
|
|
|
struct ceph_osd_request *req)
|
|
|
|
{
|
|
|
|
if (!req->r_linger) {
|
|
|
|
dout("set_request_linger %p\n", req);
|
|
|
|
req->r_linger = 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(ceph_osdc_set_request_linger);
|
|
|
|
|
2016-04-28 22:07:23 +08:00
|
|
|
static bool __pool_full(struct ceph_pg_pool_info *pi)
|
|
|
|
{
|
|
|
|
return pi->flags & CEPH_POOL_FLAG_FULL;
|
|
|
|
}
|
|
|
|
|
libceph: block I/O when PAUSE or FULL osd map flags are set
The PAUSEWR and PAUSERD flags are meant to stop the cluster from
processing writes and reads, respectively. The FULL flag is set when
the cluster determines that it is out of space, and will no longer
process writes. PAUSEWR and PAUSERD are purely client-side settings
already implemented in userspace clients. The osd does nothing special
with these flags.
When the FULL flag is set, however, the osd responds to all writes
with -ENOSPC. For cephfs, this makes sense, but for rbd the block
layer translates this into EIO. If a cluster goes from full to
non-full quickly, a filesystem on top of rbd will not behave well,
since some writes succeed while others get EIO.
Fix this by blocking any writes when the FULL flag is set in the osd
client. This is the same strategy used by userspace, so apply it by
default. A follow-on patch makes this configurable.
__map_request() is called to re-target osd requests in case the
available osds changed. Add a paused field to a ceph_osd_request, and
set it whenever an appropriate osd map flag is set. Avoid queueing
paused requests in __map_request(), but force them to be resent if
they become unpaused.
Also subscribe to the next osd map from the monitor if any of these
flags are set, so paused requests can be unblocked as soon as
possible.
Fixes: http://tracker.ceph.com/issues/6079
Reviewed-by: Sage Weil <sage@inktank.com>
Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
2013-12-03 11:11:48 +08:00
|
|
|
/*
|
|
|
|
* Returns whether a request should be blocked from being sent
|
|
|
|
* based on the current osdmap and osd_client settings.
|
|
|
|
*
|
|
|
|
* Caller should hold map_sem for read.
|
|
|
|
*/
|
2016-04-28 22:07:23 +08:00
|
|
|
static bool target_should_be_paused(struct ceph_osd_client *osdc,
|
|
|
|
const struct ceph_osd_request_target *t,
|
|
|
|
struct ceph_pg_pool_info *pi)
|
|
|
|
{
|
|
|
|
bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
|
|
|
|
bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
|
|
|
|
ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
|
|
|
|
__pool_full(pi);
|
|
|
|
|
|
|
|
WARN_ON(pi->id != t->base_oloc.pool);
|
|
|
|
return (t->flags & CEPH_OSD_FLAG_READ && pauserd) ||
|
|
|
|
(t->flags & CEPH_OSD_FLAG_WRITE && pausewr);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Outcome of calc_target().
 */
enum calc_target_result {
	/* mapping unchanged, nothing to do */
	CALC_TARGET_NO_ACTION = 0,
	/* target was (re)computed or unpaused; the request must be resent */
	CALC_TARGET_NEED_RESEND = 1,
	/* the pool could not be resolved in the current osdmap */
	CALC_TARGET_POOL_DNE = 2,
};
|
|
|
|
|
|
|
|
static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
|
|
|
|
struct ceph_osd_request_target *t,
|
|
|
|
u32 *last_force_resend,
|
|
|
|
bool any_change)
|
|
|
|
{
|
|
|
|
struct ceph_pg_pool_info *pi;
|
|
|
|
struct ceph_pg pgid, last_pgid;
|
|
|
|
struct ceph_osds up, acting;
|
|
|
|
bool force_resend = false;
|
|
|
|
bool need_check_tiering = false;
|
|
|
|
bool need_resend = false;
|
|
|
|
bool sort_bitwise = ceph_osdmap_flag(osdc->osdmap,
|
|
|
|
CEPH_OSDMAP_SORTBITWISE);
|
|
|
|
enum calc_target_result ct_res;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool);
|
|
|
|
if (!pi) {
|
|
|
|
t->osd = CEPH_HOMELESS_OSD;
|
|
|
|
ct_res = CALC_TARGET_POOL_DNE;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (osdc->osdmap->epoch == pi->last_force_request_resend) {
|
|
|
|
if (last_force_resend &&
|
|
|
|
*last_force_resend < pi->last_force_request_resend) {
|
|
|
|
*last_force_resend = pi->last_force_request_resend;
|
|
|
|
force_resend = true;
|
|
|
|
} else if (!last_force_resend) {
|
|
|
|
force_resend = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (ceph_oid_empty(&t->target_oid) || force_resend) {
|
|
|
|
ceph_oid_copy(&t->target_oid, &t->base_oid);
|
|
|
|
need_check_tiering = true;
|
|
|
|
}
|
|
|
|
if (ceph_oloc_empty(&t->target_oloc) || force_resend) {
|
|
|
|
ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
|
|
|
|
need_check_tiering = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (need_check_tiering &&
|
|
|
|
(t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
|
|
|
|
if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0)
|
|
|
|
t->target_oloc.pool = pi->read_tier;
|
|
|
|
if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
|
|
|
|
t->target_oloc.pool = pi->write_tier;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid,
|
|
|
|
&t->target_oloc, &pgid);
|
|
|
|
if (ret) {
|
|
|
|
WARN_ON(ret != -ENOENT);
|
|
|
|
t->osd = CEPH_HOMELESS_OSD;
|
|
|
|
ct_res = CALC_TARGET_POOL_DNE;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
last_pgid.pool = pgid.pool;
|
|
|
|
last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask);
|
|
|
|
|
|
|
|
ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting);
|
|
|
|
if (any_change &&
|
|
|
|
ceph_is_new_interval(&t->acting,
|
|
|
|
&acting,
|
|
|
|
&t->up,
|
|
|
|
&up,
|
|
|
|
t->size,
|
|
|
|
pi->size,
|
|
|
|
t->min_size,
|
|
|
|
pi->min_size,
|
|
|
|
t->pg_num,
|
|
|
|
pi->pg_num,
|
|
|
|
t->sort_bitwise,
|
|
|
|
sort_bitwise,
|
|
|
|
&last_pgid))
|
|
|
|
force_resend = true;
|
|
|
|
|
|
|
|
if (t->paused && !target_should_be_paused(osdc, t, pi)) {
|
|
|
|
t->paused = false;
|
|
|
|
need_resend = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ceph_pg_compare(&t->pgid, &pgid) ||
|
|
|
|
ceph_osds_changed(&t->acting, &acting, any_change) ||
|
|
|
|
force_resend) {
|
|
|
|
t->pgid = pgid; /* struct */
|
|
|
|
ceph_osds_copy(&t->acting, &acting);
|
|
|
|
ceph_osds_copy(&t->up, &up);
|
|
|
|
t->size = pi->size;
|
|
|
|
t->min_size = pi->min_size;
|
|
|
|
t->pg_num = pi->pg_num;
|
|
|
|
t->pg_num_mask = pi->pg_num_mask;
|
|
|
|
t->sort_bitwise = sort_bitwise;
|
|
|
|
|
|
|
|
t->osd = acting.primary;
|
|
|
|
need_resend = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
ct_res = need_resend ? CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION;
|
|
|
|
out:
|
|
|
|
dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd);
|
|
|
|
return ct_res;
|
|
|
|
}
|
|
|
|
|
2014-09-02 17:40:33 +08:00
|
|
|
static void __enqueue_request(struct ceph_osd_request *req)
|
|
|
|
{
|
|
|
|
struct ceph_osd_client *osdc = req->r_osdc;
|
|
|
|
|
|
|
|
dout("%s %p tid %llu to osd%d\n", __func__, req, req->r_tid,
|
|
|
|
req->r_osd ? req->r_osd->o_osd : -1);
|
|
|
|
|
|
|
|
if (req->r_osd) {
|
|
|
|
__remove_osd_from_lru(req->r_osd);
|
|
|
|
list_add_tail(&req->r_osd_item, &req->r_osd->o_requests);
|
|
|
|
list_move_tail(&req->r_req_lru_item, &osdc->req_unsent);
|
|
|
|
} else {
|
|
|
|
list_move_tail(&req->r_req_lru_item, &osdc->req_notarget);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-10-07 02:31:10 +08:00
|
|
|
/*
|
|
|
|
* Pick an osd (the first 'up' osd in the pg), allocate the osd struct
|
|
|
|
* (as needed), and set the request r_osd appropriately. If there is
|
2011-03-31 09:57:33 +08:00
|
|
|
* no up osd, set r_osd to NULL. Move the request to the appropriate list
|
2011-01-18 12:34:08 +08:00
|
|
|
* (unsent, homeless) or leave on in-flight lru.
|
2009-10-07 02:31:10 +08:00
|
|
|
*
|
|
|
|
* Return 0 if unchanged, 1 if changed, or negative on error.
|
|
|
|
*
|
|
|
|
* Caller should hold map_sem for read and request_mutex.
|
|
|
|
*/
|
2011-01-18 12:34:08 +08:00
|
|
|
static int __map_request(struct ceph_osd_client *osdc,
|
2011-10-15 04:33:55 +08:00
|
|
|
struct ceph_osd_request *req, int force_resend)
|
2009-10-07 02:31:10 +08:00
|
|
|
{
|
2016-04-28 22:07:23 +08:00
|
|
|
enum calc_target_result ct_res;
|
2009-10-07 02:31:10 +08:00
|
|
|
int err;
|
|
|
|
|
2011-01-18 12:34:08 +08:00
|
|
|
dout("map_request %p tid %lld\n", req, req->r_tid);
|
2014-01-27 23:40:19 +08:00
|
|
|
|
2016-04-28 22:07:23 +08:00
|
|
|
ct_res = calc_target(osdc, &req->r_t, NULL, force_resend);
|
|
|
|
switch (ct_res) {
|
|
|
|
case CALC_TARGET_POOL_DNE:
|
2011-01-18 12:34:08 +08:00
|
|
|
list_move(&req->r_req_lru_item, &osdc->req_notarget);
|
2016-04-28 22:07:23 +08:00
|
|
|
return -EIO;
|
|
|
|
case CALC_TARGET_NO_ACTION:
|
2009-10-07 02:31:10 +08:00
|
|
|
return 0; /* no change */
|
2016-04-28 22:07:23 +08:00
|
|
|
default:
|
|
|
|
BUG_ON(ct_res != CALC_TARGET_NEED_RESEND);
|
|
|
|
}
|
2009-10-07 02:31:10 +08:00
|
|
|
|
2013-02-24 02:38:16 +08:00
|
|
|
dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
|
2016-04-28 22:07:23 +08:00
|
|
|
req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed, req->r_t.osd,
|
2009-10-07 02:31:10 +08:00
|
|
|
req->r_osd ? req->r_osd->o_osd : -1);
|
|
|
|
|
|
|
|
if (req->r_osd) {
|
|
|
|
__cancel_request(req);
|
|
|
|
list_del_init(&req->r_osd_item);
|
2014-11-04 23:32:14 +08:00
|
|
|
list_del_init(&req->r_linger_osd_item);
|
2009-10-07 02:31:10 +08:00
|
|
|
req->r_osd = NULL;
|
|
|
|
}
|
|
|
|
|
2016-04-28 22:07:23 +08:00
|
|
|
req->r_osd = lookup_osd(&osdc->osds, req->r_t.osd);
|
|
|
|
if (!req->r_osd && req->r_t.osd >= 0) {
|
2010-02-27 01:37:33 +08:00
|
|
|
err = -ENOMEM;
|
2016-04-28 22:07:23 +08:00
|
|
|
req->r_osd = create_osd(osdc, req->r_t.osd);
|
2011-01-18 12:34:08 +08:00
|
|
|
if (!req->r_osd) {
|
|
|
|
list_move(&req->r_req_lru_item, &osdc->req_notarget);
|
2010-02-27 01:37:33 +08:00
|
|
|
goto out;
|
2011-01-18 12:34:08 +08:00
|
|
|
}
|
2009-10-07 02:31:10 +08:00
|
|
|
|
2016-04-28 22:07:22 +08:00
|
|
|
dout("map_request osd %p is osd%d\n", req->r_osd,
|
2016-04-28 22:07:23 +08:00
|
|
|
req->r_osd->o_osd);
|
2016-04-28 22:07:22 +08:00
|
|
|
insert_osd(&osdc->osds, req->r_osd);
|
2009-10-07 02:31:10 +08:00
|
|
|
|
2012-06-28 03:24:08 +08:00
|
|
|
ceph_con_open(&req->r_osd->o_con,
|
2016-04-28 22:07:23 +08:00
|
|
|
CEPH_ENTITY_TYPE_OSD, req->r_osd->o_osd,
|
|
|
|
&osdc->osdmap->osd_addr[req->r_osd->o_osd]);
|
2009-10-07 02:31:10 +08:00
|
|
|
}
|
|
|
|
|
2014-09-02 17:40:33 +08:00
|
|
|
__enqueue_request(req);
|
2010-05-11 01:24:48 +08:00
|
|
|
err = 1; /* osd or pg changed */
|
2009-10-07 02:31:10 +08:00
|
|
|
|
|
|
|
out:
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2016-05-26 06:29:52 +08:00
|
|
|
static void setup_request_data(struct ceph_osd_request *req,
|
|
|
|
struct ceph_msg *msg)
|
2009-10-07 02:31:10 +08:00
|
|
|
{
|
2016-05-26 06:29:52 +08:00
|
|
|
u32 data_len = 0;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (!list_empty(&msg->data))
|
|
|
|
return;
|
2009-10-07 02:31:10 +08:00
|
|
|
|
2016-05-26 06:29:52 +08:00
|
|
|
WARN_ON(msg->data_length);
|
|
|
|
for (i = 0; i < req->r_num_ops; i++) {
|
|
|
|
struct ceph_osd_req_op *op = &req->r_ops[i];
|
|
|
|
|
|
|
|
switch (op->op) {
|
|
|
|
/* request */
|
|
|
|
case CEPH_OSD_OP_WRITE:
|
|
|
|
case CEPH_OSD_OP_WRITEFULL:
|
|
|
|
WARN_ON(op->indata_len != op->extent.length);
|
|
|
|
ceph_osdc_msg_data_add(msg, &op->extent.osd_data);
|
|
|
|
break;
|
|
|
|
case CEPH_OSD_OP_SETXATTR:
|
|
|
|
case CEPH_OSD_OP_CMPXATTR:
|
|
|
|
WARN_ON(op->indata_len != op->xattr.name_len +
|
|
|
|
op->xattr.value_len);
|
|
|
|
ceph_osdc_msg_data_add(msg, &op->xattr.osd_data);
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* reply */
|
|
|
|
case CEPH_OSD_OP_STAT:
|
|
|
|
ceph_osdc_msg_data_add(req->r_reply,
|
|
|
|
&op->raw_data_in);
|
|
|
|
break;
|
|
|
|
case CEPH_OSD_OP_READ:
|
|
|
|
ceph_osdc_msg_data_add(req->r_reply,
|
|
|
|
&op->extent.osd_data);
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* both */
|
|
|
|
case CEPH_OSD_OP_CALL:
|
|
|
|
WARN_ON(op->indata_len != op->cls.class_len +
|
|
|
|
op->cls.method_len +
|
|
|
|
op->cls.indata_len);
|
|
|
|
ceph_osdc_msg_data_add(msg, &op->cls.request_info);
|
|
|
|
/* optional, can be NONE */
|
|
|
|
ceph_osdc_msg_data_add(msg, &op->cls.request_data);
|
|
|
|
/* optional, can be NONE */
|
|
|
|
ceph_osdc_msg_data_add(req->r_reply,
|
|
|
|
&op->cls.response_data);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
data_len += op->indata_len;
|
|
|
|
}
|
2013-02-26 08:11:12 +08:00
|
|
|
|
2016-05-26 06:29:52 +08:00
|
|
|
WARN_ON(data_len != msg->data_length);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Encode @req into the front section of @msg and fill in the message
 * header fields that depend on it (version, front_len, data_len,
 * data_off).
 *
 * The data items are attached first via setup_request_data().  The
 * front is then written field by field in the order below; the order
 * is the wire format and must not be changed.  The encoded length must
 * not exceed msg->front_alloc_len (checked with BUG_ON once all fields
 * have been written).
 */
static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	void *p = msg->front.iov_base;
	void *const end = p + msg->front_alloc_len;
	u32 data_len = 0;
	int i;

	/* sanity-check fields that only make sense for one direction */
	if (req->r_flags & CEPH_OSD_FLAG_WRITE) {
		/* snapshots aren't writeable */
		WARN_ON(req->r_snapid != CEPH_NOSNAP);
	} else {
		/* non-writes carry no mtime, data offset or snap context */
		WARN_ON(req->r_mtime.tv_sec || req->r_mtime.tv_nsec ||
			req->r_data_offset || req->r_snapc);
	}

	setup_request_data(req, msg);

	ceph_encode_32(&p, 1); /* client_inc, always 1 */
	ceph_encode_32(&p, req->r_osdc->osdmap->epoch);
	ceph_encode_32(&p, req->r_flags);
	/* ceph_encode_timespec() does not advance p; do it by hand */
	ceph_encode_timespec(p, &req->r_mtime);
	p += sizeof(struct ceph_timespec);
	/* aka reassert_version */
	memcpy(p, &req->r_replay_version, sizeof(req->r_replay_version));
	p += sizeof(req->r_replay_version);

	/* oloc */
	/* NOTE(review): the two bytes and the length word look like an
	 * encoding version, compat version and payload length -- confirm */
	ceph_encode_8(&p, 4);
	ceph_encode_8(&p, 4);
	ceph_encode_32(&p, 8 + 4 + 4);
	ceph_encode_64(&p, req->r_t.target_oloc.pool);
	ceph_encode_32(&p, -1); /* preferred */
	ceph_encode_32(&p, 0); /* key len */

	/* pgid */
	ceph_encode_8(&p, 1);
	ceph_encode_64(&p, req->r_t.pgid.pool);
	ceph_encode_32(&p, req->r_t.pgid.seed);
	ceph_encode_32(&p, -1); /* preferred */

	/* oid */
	ceph_encode_32(&p, req->r_t.target_oid.name_len);
	memcpy(p, req->r_t.target_oid.name, req->r_t.target_oid.name_len);
	p += req->r_t.target_oid.name_len;

	/* ops, can imply data */
	ceph_encode_16(&p, req->r_num_ops);
	for (i = 0; i < req->r_num_ops; i++) {
		/* each op occupies one struct ceph_osd_op in the front;
		 * the return value is accumulated as the data length */
		data_len += osd_req_encode_op(p, &req->r_ops[i]);
		p += sizeof(struct ceph_osd_op);
	}

	ceph_encode_64(&p, req->r_snapid); /* snapid */
	if (req->r_snapc) {
		ceph_encode_64(&p, req->r_snapc->seq);
		ceph_encode_32(&p, req->r_snapc->num_snaps);
		for (i = 0; i < req->r_snapc->num_snaps; i++)
			ceph_encode_64(&p, req->r_snapc->snaps[i]);
	} else {
		ceph_encode_64(&p, 0); /* snap_seq */
		ceph_encode_32(&p, 0); /* snaps len */
	}

	ceph_encode_32(&p, req->r_attempts); /* retry_attempt */

	/* this check runs after the writes above, so it can only detect
	 * an overrun of the front buffer, not prevent it */
	BUG_ON(p > end);
	msg->front.iov_len = p - msg->front.iov_base;
	msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */
	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
	msg->hdr.data_len = cpu_to_le32(data_len);
	/*
	 * The header "data_off" is a hint to the receiver allowing it
	 * to align received data into its buffers such that there's no
	 * need to re-copy it before writing it to disk (direct I/O).
	 */
	msg->hdr.data_off = cpu_to_le16(req->r_data_offset);

	dout("%s req %p oid %*pE oid_len %d front %zu data %u\n", __func__,
	     req, req->r_t.target_oid.name_len, req->r_t.target_oid.name,
	     req->r_t.target_oid.name_len, msg->front.iov_len, data_len);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* @req has to be assigned a tid and registered.
|
|
|
|
*/
|
|
|
|
static void send_request(struct ceph_osd_request *req)
|
|
|
|
{
|
|
|
|
struct ceph_osd *osd = req->r_osd;
|
|
|
|
|
|
|
|
WARN_ON(osd->o_osd != req->r_t.osd);
|
|
|
|
|
|
|
|
req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR;
|
|
|
|
if (req->r_attempts)
|
|
|
|
req->r_flags |= CEPH_OSD_FLAG_RETRY;
|
|
|
|
else
|
|
|
|
WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY);
|
|
|
|
|
|
|
|
encode_request(req, req->r_request);
|
|
|
|
|
|
|
|
dout("%s req %p tid %llu to pg %llu.%x osd%d flags 0x%x attempt %d\n",
|
|
|
|
__func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed,
|
|
|
|
req->r_t.osd, req->r_flags, req->r_attempts);
|
|
|
|
|
|
|
|
req->r_t.paused = false;
|
|
|
|
req->r_stamp = jiffies;
|
|
|
|
req->r_attempts++;
|
|
|
|
|
|
|
|
req->r_sent = osd->o_incarnation;
|
|
|
|
req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
|
|
|
|
ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request));
|
2009-10-07 02:31:10 +08:00
|
|
|
}
|
|
|
|
|
2011-01-18 12:34:08 +08:00
|
|
|
/*
|
|
|
|
* Send any requests in the queue (req_unsent).
|
|
|
|
*/
|
2013-02-16 01:42:29 +08:00
|
|
|
static void __send_queued(struct ceph_osd_client *osdc)
|
2011-01-18 12:34:08 +08:00
|
|
|
{
|
|
|
|
struct ceph_osd_request *req, *tmp;
|
|
|
|
|
2013-02-16 01:42:29 +08:00
|
|
|
dout("__send_queued\n");
|
2016-05-26 06:29:52 +08:00
|
|
|
list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) {
|
|
|
|
list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
|
|
|
|
send_request(req);
|
|
|
|
}
|
2011-01-18 12:34:08 +08:00
|
|
|
}
|
|
|
|
|
2014-02-01 01:33:39 +08:00
|
|
|
/*
|
|
|
|
* Caller should hold map_sem for read and request_mutex.
|
|
|
|
*/
|
|
|
|
static int __ceph_osdc_start_request(struct ceph_osd_client *osdc,
|
|
|
|
struct ceph_osd_request *req,
|
|
|
|
bool nofail)
|
|
|
|
{
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
__register_request(osdc, req);
|
|
|
|
req->r_sent = 0;
|
|
|
|
req->r_got_reply = 0;
|
|
|
|
rc = __map_request(osdc, req, 0);
|
|
|
|
if (rc < 0) {
|
|
|
|
if (nofail) {
|
|
|
|
dout("osdc_start_request failed map, "
|
|
|
|
" will retry %lld\n", req->r_tid);
|
|
|
|
rc = 0;
|
|
|
|
} else {
|
|
|
|
__unregister_request(osdc, req);
|
|
|
|
}
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (req->r_osd == NULL) {
|
|
|
|
dout("send_request %p no up osds in pg\n", req);
|
|
|
|
ceph_monc_request_next_osdmap(&osdc->client->monc);
|
|
|
|
} else {
|
|
|
|
__send_queued(osdc);
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-04-28 22:07:24 +08:00
|
|
|
static void __complete_request(struct ceph_osd_request *req)
|
|
|
|
{
|
|
|
|
if (req->r_callback)
|
|
|
|
req->r_callback(req);
|
|
|
|
else
|
|
|
|
complete_all(&req->r_completion);
|
|
|
|
}
|
|
|
|
|
2009-10-07 02:31:10 +08:00
|
|
|
/*
|
|
|
|
* Timeout callback, called every N seconds when 1 or more osd
|
|
|
|
* requests has been active for more than N seconds. When this
|
|
|
|
* happens, we ping all OSDs with requests who have timed out to
|
|
|
|
* ensure any communications channel reset is detected. Reset the
|
|
|
|
* request timeouts another N seconds in the future as we go.
|
|
|
|
* Reschedule the timeout event another N seconds in future (unless
|
|
|
|
* there are no open requests).
|
|
|
|
*/
|
|
|
|
static void handle_timeout(struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct ceph_osd_client *osdc =
|
|
|
|
container_of(work, struct ceph_osd_client, timeout_work.work);
|
2015-05-15 17:02:17 +08:00
|
|
|
struct ceph_options *opts = osdc->client->options;
|
2012-11-29 04:28:24 +08:00
|
|
|
struct ceph_osd_request *req;
|
2009-10-07 02:31:10 +08:00
|
|
|
struct ceph_osd *osd;
|
2010-02-27 07:32:31 +08:00
|
|
|
struct list_head slow_osds;
|
2009-10-07 02:31:10 +08:00
|
|
|
dout("timeout\n");
|
|
|
|
down_read(&osdc->map_sem);
|
|
|
|
|
|
|
|
ceph_monc_request_next_osdmap(&osdc->client->monc);
|
|
|
|
|
|
|
|
mutex_lock(&osdc->request_mutex);
|
|
|
|
|
2010-02-27 07:32:31 +08:00
|
|
|
/*
|
|
|
|
* ping osds that are a bit slow. this ensures that if there
|
|
|
|
* is a break in the TCP connection we will notice, and reopen
|
|
|
|
* a connection with that osd (from the fault callback).
|
|
|
|
*/
|
|
|
|
INIT_LIST_HEAD(&slow_osds);
|
|
|
|
list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
|
2015-05-15 17:02:17 +08:00
|
|
|
if (time_before(jiffies,
|
|
|
|
req->r_stamp + opts->osd_keepalive_timeout))
|
2010-02-27 07:32:31 +08:00
|
|
|
break;
|
|
|
|
|
|
|
|
osd = req->r_osd;
|
|
|
|
BUG_ON(!osd);
|
|
|
|
dout(" tid %llu is slow, will send keepalive on osd%d\n",
|
2009-10-07 02:31:10 +08:00
|
|
|
req->r_tid, osd->o_osd);
|
2010-02-27 07:32:31 +08:00
|
|
|
list_move_tail(&osd->o_keepalive_item, &slow_osds);
|
|
|
|
}
|
|
|
|
while (!list_empty(&slow_osds)) {
|
|
|
|
osd = list_entry(slow_osds.next, struct ceph_osd,
|
|
|
|
o_keepalive_item);
|
|
|
|
list_del_init(&osd->o_keepalive_item);
|
2009-10-07 02:31:10 +08:00
|
|
|
ceph_con_keepalive(&osd->o_con);
|
|
|
|
}
|
|
|
|
|
2010-02-27 07:32:31 +08:00
|
|
|
__schedule_osd_timeout(osdc);
|
2013-02-16 01:42:29 +08:00
|
|
|
__send_queued(osdc);
|
2009-10-07 02:31:10 +08:00
|
|
|
mutex_unlock(&osdc->request_mutex);
|
|
|
|
up_read(&osdc->map_sem);
|
|
|
|
}
|
|
|
|
|
2010-02-04 03:00:26 +08:00
|
|
|
static void handle_osds_timeout(struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct ceph_osd_client *osdc =
|
|
|
|
container_of(work, struct ceph_osd_client,
|
|
|
|
osds_timeout_work.work);
|
2015-05-15 17:02:17 +08:00
|
|
|
unsigned long delay = osdc->client->options->osd_idle_ttl / 4;
|
2016-04-28 22:07:22 +08:00
|
|
|
struct ceph_osd *osd, *nosd;
|
2010-02-04 03:00:26 +08:00
|
|
|
|
2016-04-28 22:07:22 +08:00
|
|
|
dout("%s osdc %p\n", __func__, osdc);
|
2010-02-04 03:00:26 +08:00
|
|
|
down_read(&osdc->map_sem);
|
2016-04-28 22:07:22 +08:00
|
|
|
mutex_lock(&osdc->request_mutex);
|
|
|
|
|
|
|
|
list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
|
|
|
|
if (time_before(jiffies, osd->lru_ttl))
|
|
|
|
break;
|
|
|
|
|
|
|
|
remove_osd(osdc, osd);
|
|
|
|
}
|
2010-02-04 03:00:26 +08:00
|
|
|
|
2016-04-28 22:07:22 +08:00
|
|
|
mutex_unlock(&osdc->request_mutex);
|
|
|
|
up_read(&osdc->map_sem);
|
2010-02-04 03:00:26 +08:00
|
|
|
schedule_delayed_work(&osdc->osds_timeout_work,
|
|
|
|
round_jiffies_relative(delay));
|
|
|
|
}
|
|
|
|
|
libceph: follow redirect replies from osds
Follow redirect replies from osds, for details see ceph.git commit
fbbe3ad1220799b7bb00ea30fce581c5eadaf034.
v1 (current) version of redirect reply consists of oloc and oid, which
expands to pool, key, nspace, hash and oid. However, server-side code
that would populate anything other than pool doesn't exist yet, and
hence this commit adds support for pool redirects only. To make sure
that future server-side updates don't break us, we decode all fields
and, if any of key, nspace, hash or oid have a non-default value, error
out with "corrupt osd_op_reply ..." message.
Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
2014-01-27 23:40:20 +08:00
|
|
|
static int ceph_oloc_decode(void **p, void *end,
|
|
|
|
struct ceph_object_locator *oloc)
|
|
|
|
{
|
|
|
|
u8 struct_v, struct_cv;
|
|
|
|
u32 len;
|
|
|
|
void *struct_end;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
ceph_decode_need(p, end, 1 + 1 + 4, e_inval);
|
|
|
|
struct_v = ceph_decode_8(p);
|
|
|
|
struct_cv = ceph_decode_8(p);
|
|
|
|
if (struct_v < 3) {
|
|
|
|
pr_warn("got v %d < 3 cv %d of ceph_object_locator\n",
|
|
|
|
struct_v, struct_cv);
|
|
|
|
goto e_inval;
|
|
|
|
}
|
|
|
|
if (struct_cv > 6) {
|
|
|
|
pr_warn("got v %d cv %d > 6 of ceph_object_locator\n",
|
|
|
|
struct_v, struct_cv);
|
|
|
|
goto e_inval;
|
|
|
|
}
|
|
|
|
len = ceph_decode_32(p);
|
|
|
|
ceph_decode_need(p, end, len, e_inval);
|
|
|
|
struct_end = *p + len;
|
|
|
|
|
|
|
|
oloc->pool = ceph_decode_64(p);
|
|
|
|
*p += 4; /* skip preferred */
|
|
|
|
|
|
|
|
len = ceph_decode_32(p);
|
|
|
|
if (len > 0) {
|
|
|
|
pr_warn("ceph_object_locator::key is set\n");
|
|
|
|
goto e_inval;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (struct_v >= 5) {
|
|
|
|
len = ceph_decode_32(p);
|
|
|
|
if (len > 0) {
|
|
|
|
pr_warn("ceph_object_locator::nspace is set\n");
|
|
|
|
goto e_inval;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (struct_v >= 6) {
|
|
|
|
s64 hash = ceph_decode_64(p);
|
|
|
|
if (hash != -1) {
|
|
|
|
pr_warn("ceph_object_locator::hash is set\n");
|
|
|
|
goto e_inval;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* skip the rest */
|
|
|
|
*p = struct_end;
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
e_inval:
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int ceph_redirect_decode(void **p, void *end,
|
|
|
|
struct ceph_request_redirect *redir)
|
|
|
|
{
|
|
|
|
u8 struct_v, struct_cv;
|
|
|
|
u32 len;
|
|
|
|
void *struct_end;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ceph_decode_need(p, end, 1 + 1 + 4, e_inval);
|
|
|
|
struct_v = ceph_decode_8(p);
|
|
|
|
struct_cv = ceph_decode_8(p);
|
|
|
|
if (struct_cv > 1) {
|
|
|
|
pr_warn("got v %d cv %d > 1 of ceph_request_redirect\n",
|
|
|
|
struct_v, struct_cv);
|
|
|
|
goto e_inval;
|
|
|
|
}
|
|
|
|
len = ceph_decode_32(p);
|
|
|
|
ceph_decode_need(p, end, len, e_inval);
|
|
|
|
struct_end = *p + len;
|
|
|
|
|
|
|
|
ret = ceph_oloc_decode(p, end, &redir->oloc);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
len = ceph_decode_32(p);
|
|
|
|
if (len > 0) {
|
|
|
|
pr_warn("ceph_request_redirect::object_name is set\n");
|
|
|
|
goto e_inval;
|
|
|
|
}
|
|
|
|
|
|
|
|
len = ceph_decode_32(p);
|
|
|
|
*p += len; /* skip osd_instructions */
|
|
|
|
|
|
|
|
/* skip the rest */
|
|
|
|
*p = struct_end;
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
e_inval:
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2016-04-28 22:07:24 +08:00
|
|
|
/*
 * Decoded (CPU-endian, except where noted) form of an OSD op reply,
 * as filled in by decode_MOSDOpReply().
 */
struct MOSDOpReply {
	struct ceph_pg pgid;	/* decoded with ceph_decode_pgid() */
	u64 flags;		/* tested against CEPH_OSD_FLAG_ONDISK */
	int result;		/* negative values end the request */
	u32 epoch;
	int num_ops;		/* at most ARRAY_SIZE(outdata_len) */
	u32 outdata_len[CEPH_OSD_MAX_OPS];	/* per-op payload_len */
	s32 rval[CEPH_OSD_MAX_OPS];		/* per-op return value */
	int retry_attempt;
	struct ceph_eversion replay_version;	/* copied raw (little-endian) */
	u64 user_version;
	struct ceph_request_redirect redirect;	/* oloc is empty if no redirect */
};
|
2011-06-04 00:37:09 +08:00
|
|
|
|
2016-04-28 22:07:24 +08:00
|
|
|
static int decode_MOSDOpReply(const struct ceph_msg *msg, struct MOSDOpReply *m)
|
2009-10-07 02:31:10 +08:00
|
|
|
{
|
2016-04-28 22:07:24 +08:00
|
|
|
void *p = msg->front.iov_base;
|
|
|
|
void *const end = p + msg->front.iov_len;
|
|
|
|
u16 version = le16_to_cpu(msg->hdr.version);
|
|
|
|
struct ceph_eversion bad_replay_version;
|
2016-02-03 22:25:48 +08:00
|
|
|
u8 decode_redir;
|
2016-04-28 22:07:24 +08:00
|
|
|
u32 len;
|
|
|
|
int ret;
|
|
|
|
int i;
|
2013-02-26 08:11:12 +08:00
|
|
|
|
2016-04-28 22:07:24 +08:00
|
|
|
ceph_decode_32_safe(&p, end, len, e_inval);
|
|
|
|
ceph_decode_need(&p, end, len, e_inval);
|
|
|
|
p += len; /* skip oid */
|
2013-02-26 08:11:12 +08:00
|
|
|
|
2016-04-28 22:07:24 +08:00
|
|
|
ret = ceph_decode_pgid(&p, end, &m->pgid);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
2013-02-26 08:11:12 +08:00
|
|
|
|
2016-04-28 22:07:24 +08:00
|
|
|
ceph_decode_64_safe(&p, end, m->flags, e_inval);
|
|
|
|
ceph_decode_32_safe(&p, end, m->result, e_inval);
|
|
|
|
ceph_decode_need(&p, end, sizeof(bad_replay_version), e_inval);
|
|
|
|
memcpy(&bad_replay_version, p, sizeof(bad_replay_version));
|
|
|
|
p += sizeof(bad_replay_version);
|
|
|
|
ceph_decode_32_safe(&p, end, m->epoch, e_inval);
|
2013-02-26 08:11:12 +08:00
|
|
|
|
2016-04-28 22:07:24 +08:00
|
|
|
ceph_decode_32_safe(&p, end, m->num_ops, e_inval);
|
|
|
|
if (m->num_ops > ARRAY_SIZE(m->outdata_len))
|
|
|
|
goto e_inval;
|
2013-02-26 08:11:12 +08:00
|
|
|
|
2016-04-28 22:07:24 +08:00
|
|
|
ceph_decode_need(&p, end, m->num_ops * sizeof(struct ceph_osd_op),
|
|
|
|
e_inval);
|
|
|
|
for (i = 0; i < m->num_ops; i++) {
|
2013-02-26 08:11:12 +08:00
|
|
|
struct ceph_osd_op *op = p;
|
|
|
|
|
2016-04-28 22:07:24 +08:00
|
|
|
m->outdata_len[i] = le32_to_cpu(op->payload_len);
|
2013-02-26 08:11:12 +08:00
|
|
|
p += sizeof(*op);
|
|
|
|
}
|
|
|
|
|
2016-04-28 22:07:24 +08:00
|
|
|
ceph_decode_32_safe(&p, end, m->retry_attempt, e_inval);
|
|
|
|
for (i = 0; i < m->num_ops; i++)
|
|
|
|
ceph_decode_32_safe(&p, end, m->rval[i], e_inval);
|
2009-10-07 02:31:10 +08:00
|
|
|
|
2016-04-28 22:07:24 +08:00
|
|
|
if (version >= 5) {
|
|
|
|
ceph_decode_need(&p, end, sizeof(m->replay_version), e_inval);
|
|
|
|
memcpy(&m->replay_version, p, sizeof(m->replay_version));
|
|
|
|
p += sizeof(m->replay_version);
|
|
|
|
ceph_decode_64_safe(&p, end, m->user_version, e_inval);
|
|
|
|
} else {
|
|
|
|
m->replay_version = bad_replay_version; /* struct */
|
|
|
|
m->user_version = le64_to_cpu(m->replay_version.version);
|
|
|
|
}
|
2013-05-31 15:54:44 +08:00
|
|
|
|
2016-04-28 22:07:24 +08:00
|
|
|
if (version >= 6) {
|
|
|
|
if (version >= 7)
|
|
|
|
ceph_decode_8_safe(&p, end, decode_redir, e_inval);
|
2016-02-03 22:25:48 +08:00
|
|
|
else
|
|
|
|
decode_redir = 1;
|
|
|
|
} else {
|
|
|
|
decode_redir = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (decode_redir) {
|
2016-04-28 22:07:24 +08:00
|
|
|
ret = ceph_redirect_decode(&p, end, &m->redirect);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
libceph: follow redirect replies from osds
Follow redirect replies from osds, for details see ceph.git commit
fbbe3ad1220799b7bb00ea30fce581c5eadaf034.
v1 (current) version of redirect reply consists of oloc and oid, which
expands to pool, key, nspace, hash and oid. However, server-side code
that would populate anything other than pool doesn't exist yet, and
hence this commit adds support for pool redirects only. To make sure
that future server-side updates don't break us, we decode all fields
and, if any of key, nspace, hash or oid have a non-default value, error
out with "corrupt osd_op_reply ..." message.
Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
2014-01-27 23:40:20 +08:00
|
|
|
} else {
|
2016-04-28 22:07:24 +08:00
|
|
|
ceph_oloc_init(&m->redirect.oloc);
|
libceph: follow redirect replies from osds
Follow redirect replies from osds, for details see ceph.git commit
fbbe3ad1220799b7bb00ea30fce581c5eadaf034.
v1 (current) version of redirect reply consists of oloc and oid, which
expands to pool, key, nspace, hash and oid. However, server-side code
that would populate anything other than pool doesn't exist yet, and
hence this commit adds support for pool redirects only. To make sure
that future server-side updates don't break us, we decode all fields
and, if any of key, nspace, hash or oid have a non-default value, error
out with "corrupt osd_op_reply ..." message.
Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
2014-01-27 23:40:20 +08:00
|
|
|
}
|
2009-10-07 02:31:10 +08:00
|
|
|
|
2016-04-28 22:07:24 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
e_inval:
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We are done with @req if
|
|
|
|
* - @m is a safe reply, or
|
|
|
|
* - @m is an unsafe reply and we didn't want a safe one
|
|
|
|
*/
|
|
|
|
static bool done_request(const struct ceph_osd_request *req,
|
|
|
|
const struct MOSDOpReply *m)
|
|
|
|
{
|
|
|
|
return (m->result < 0 ||
|
|
|
|
(m->flags & CEPH_OSD_FLAG_ONDISK) ||
|
|
|
|
!(req->r_flags & CEPH_OSD_FLAG_ONDISK));
|
|
|
|
}
|
libceph: follow redirect replies from osds
Follow redirect replies from osds, for details see ceph.git commit
fbbe3ad1220799b7bb00ea30fce581c5eadaf034.
v1 (current) version of redirect reply consists of oloc and oid, which
expands to pool, key, nspace, hash and oid. However, server-side code
that would populate anything other than pool doesn't exist yet, and
hence this commit adds support for pool redirects only. To make sure
that future server-side updates don't break us, we decode all fields
and, if any of key, nspace, hash or oid have a non-default value, error
out with "corrupt osd_op_reply ..." message.
Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
2014-01-27 23:40:20 +08:00
|
|
|
|
2016-04-28 22:07:24 +08:00
|
|
|
/*
 * handle osd op reply.  either call the callback if it is specified,
 * or do the completion to wake up the waiting thread.
 *
 * ->r_unsafe_callback is set?     yes                   no
 *
 * first reply is OK (needed       r_cb/r_completion,    r_cb/r_completion,
 * any or needed/got safe)         r_safe_completion     r_safe_completion
 *
 * first reply is unsafe           r_unsafe_cb(true)     (nothing)
 *
 * when we get the safe reply      r_unsafe_cb(false),   r_cb/r_completion,
 *                                 r_safe_completion     r_safe_completion
 *
 * Takes map_sem for read and request_mutex itself.  On the normal
 * path both are dropped before any callback/completion runs; on the
 * fail_request path the request is completed with -EIO while they
 * are still held.
 */
static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
{
	struct ceph_osd_request *req;
	struct MOSDOpReply m;
	u64 tid = le64_to_cpu(msg->hdr.tid);
	u32 data_len = 0;
	bool already_acked;
	int ret;
	int i;

	dout("%s msg %p tid %llu\n", __func__, msg, tid);

	down_read(&osdc->map_sem);
	mutex_lock(&osdc->request_mutex);
	req = lookup_request(&osdc->requests, tid);
	if (!req) {
		dout("%s no tid %llu\n", __func__, tid);
		goto out_unlock;
	}
	/* hold our own ref; dropped at out_put or at the end */
	ceph_osdc_get_request(req);

	ret = decode_MOSDOpReply(msg, &m);
	if (ret) {
		pr_err("failed to decode MOSDOpReply for tid %llu: %d\n",
		       req->r_tid, ret);
		ceph_msg_dump(msg);
		goto fail_request;
	}
	dout("%s req %p tid %llu flags 0x%llx pgid %llu.%x epoch %u attempt %d v %u'%llu uv %llu\n",
	     __func__, req, req->r_tid, m.flags, m.pgid.pool, m.pgid.seed,
	     m.epoch, m.retry_attempt, le32_to_cpu(m.replay_version.epoch),
	     le64_to_cpu(m.replay_version.version), m.user_version);

	/* ignore replies that answer an earlier send of this request */
	if (m.retry_attempt >= 0) {
		if (m.retry_attempt != req->r_attempts - 1) {
			dout("req %p tid %llu retry_attempt %d != %d, ignoring\n",
			     req, req->r_tid, m.retry_attempt,
			     req->r_attempts - 1);
			goto out_put;
		}
	} else {
		WARN_ON(1); /* MOSDOpReply v4 is assumed */
	}

	/* redirect reply: retarget the request and start it over */
	if (!ceph_oloc_empty(&m.redirect.oloc)) {
		dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid,
		     m.redirect.oloc.pool);
		__unregister_request(osdc, req);

		ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc);

		/*
		 * Start redirect requests with nofail=true.  If
		 * mapping fails, request will end up on the notarget
		 * list, waiting for the new osdmap (which can take
		 * a while), even though the original request mapped
		 * successfully.  In the future we might want to follow
		 * original request's nofail setting here.
		 */
		ret = __ceph_osdc_start_request(osdc, req, true);
		BUG_ON(ret);

		goto out_put;
	}

	/* the reply must describe exactly the ops we sent */
	if (m.num_ops != req->r_num_ops) {
		pr_err("num_ops %d != %d for tid %llu\n", m.num_ops,
		       req->r_num_ops, req->r_tid);
		goto fail_request;
	}
	for (i = 0; i < req->r_num_ops; i++) {
		dout(" req %p tid %llu op %d rval %d len %u\n", req,
		     req->r_tid, i, m.rval[i], m.outdata_len[i]);
		req->r_ops[i].rval = m.rval[i];
		req->r_ops[i].outdata_len = m.outdata_len[i];
		data_len += m.outdata_len[i];
	}
	/* per-op lengths must add up to the message's data length */
	if (data_len != le32_to_cpu(msg->hdr.data_len)) {
		pr_err("sum of lens %u != %u for tid %llu\n", data_len,
		       le32_to_cpu(msg->hdr.data_len), req->r_tid);
		goto fail_request;
	}
	dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__,
	     req, req->r_tid, req->r_got_reply, m.result, data_len);

	already_acked = req->r_got_reply;
	if (!already_acked) {
		/* first reply: record the result (data_len when result is 0) */
		req->r_result = m.result ?: data_len;
		req->r_replay_version = m.replay_version; /* struct */
		req->r_got_reply = true;
	} else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) {
		/* a second reply that is not the safe one is a duplicate */
		dout("req %p tid %llu dup ack\n", req, req->r_tid);
		goto out_put;
	}

	if (done_request(req, &m)) {
		__unregister_request(osdc, req);
		if (req->r_linger) {
			WARN_ON(req->r_unsafe_callback);
			__register_linger_request(osdc, req);
		}
	}

	/* locks are dropped before callbacks/completions are invoked */
	mutex_unlock(&osdc->request_mutex);
	up_read(&osdc->map_sem);

	if (done_request(req, &m)) {
		if (already_acked && req->r_unsafe_callback) {
			dout("req %p tid %llu safe-cb\n", req, req->r_tid);
			req->r_unsafe_callback(req, false);
		} else {
			dout("req %p tid %llu cb\n", req, req->r_tid);
			__complete_request(req);
		}
	} else {
		if (req->r_unsafe_callback) {
			dout("req %p tid %llu unsafe-cb\n", req, req->r_tid);
			req->r_unsafe_callback(req, true);
		} else {
			WARN_ON(1);
		}
	}
	if (m.flags & CEPH_OSD_FLAG_ONDISK)
		complete_all(&req->r_safe_completion);

	ceph_osdc_put_request(req);
	return;

fail_request:
	/* malformed reply: fail the request with -EIO and wake all waiters */
	req->r_result = -EIO;
	__unregister_request(osdc, req);
	__complete_request(req);
	complete_all(&req->r_safe_completion);
out_put:
	ceph_osdc_put_request(req);
out_unlock:
	mutex_unlock(&osdc->request_mutex);
	up_read(&osdc->map_sem);
}
|
|
|
|
|
2011-01-18 12:34:08 +08:00
|
|
|
static void reset_changed_osds(struct ceph_osd_client *osdc)
|
2009-10-07 02:31:10 +08:00
|
|
|
{
|
|
|
|
struct rb_node *p, *n;
|
|
|
|
|
2015-02-18 00:37:15 +08:00
|
|
|
dout("%s %p\n", __func__, osdc);
|
2011-01-18 12:34:08 +08:00
|
|
|
for (p = rb_first(&osdc->osds); p; p = n) {
|
|
|
|
struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node);
|
2009-10-07 02:31:10 +08:00
|
|
|
|
2011-01-18 12:34:08 +08:00
|
|
|
n = rb_next(p);
|
|
|
|
if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
|
|
|
|
memcmp(&osd->o_con.peer_addr,
|
|
|
|
ceph_osd_addr(osdc->osdmap,
|
|
|
|
osd->o_osd),
|
|
|
|
sizeof(struct ceph_entity_addr)) != 0)
|
|
|
|
__reset_osd(osdc, osd);
|
2009-10-07 02:31:10 +08:00
|
|
|
}
|
2010-02-27 07:32:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2011-01-18 12:34:08 +08:00
|
|
|
* Requeue requests whose mapping to an OSD has changed. If requests map to
|
|
|
|
* no osd, request a new map.
|
2010-02-27 07:32:31 +08:00
|
|
|
*
|
2012-12-27 04:31:40 +08:00
|
|
|
* Caller should hold map_sem for read.
|
2010-02-27 07:32:31 +08:00
|
|
|
*/
|
2013-12-11 01:35:13 +08:00
|
|
|
static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
|
|
|
|
bool force_resend_writes)
|
2010-02-27 07:32:31 +08:00
|
|
|
{
|
2011-03-22 06:07:16 +08:00
|
|
|
struct ceph_osd_request *req, *nreq;
|
2011-01-18 12:34:08 +08:00
|
|
|
struct rb_node *p;
|
|
|
|
int needmap = 0;
|
|
|
|
int err;
|
2013-12-11 01:35:13 +08:00
|
|
|
bool force_resend_req;
|
2010-02-27 07:32:31 +08:00
|
|
|
|
2013-12-11 01:35:13 +08:00
|
|
|
dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "",
|
|
|
|
force_resend_writes ? " (force resend writes)" : "");
|
2010-02-27 07:32:31 +08:00
|
|
|
mutex_lock(&osdc->request_mutex);
|
2012-07-31 07:19:28 +08:00
|
|
|
for (p = rb_first(&osdc->requests); p; ) {
|
2011-01-18 12:34:08 +08:00
|
|
|
req = rb_entry(p, struct ceph_osd_request, r_node);
|
2012-07-31 07:19:28 +08:00
|
|
|
p = rb_next(p);
|
libceph: move linger requests sooner in kick_requests()
The kick_requests() function is called by ceph_osdc_handle_map()
when an osd map change has been indicated. Its purpose is to
re-queue any request whose target osd is different from what it
was when it was originally sent.
It is structured as two loops, one for incomplete but registered
requests, and a second for handling completed linger requests.
As a special case, in the first loop if a request marked to linger
has not yet completed, it is moved from the request list to the
linger list. This is as a quick and dirty way to have the second
loop handle sending the request along with all the other linger
requests.
Because of the way it's done now, however, this quick and dirty
solution can result in these incomplete linger requests never
getting re-sent as desired. The problem lies in the fact that
the second loop only arranges for a linger request to be sent
if it appears its target osd has changed. This is the proper
handling for *completed* linger requests (it avoids issuing
the same linger request twice to the same osd).
But although the linger requests added to the list in the first loop
may have been sent, they have not yet completed, so they need to be
re-sent regardless of whether their target osd has changed.
The first required fix is we need to avoid calling __map_request()
on any incomplete linger request. Otherwise the subsequent
__map_request() call in the second loop will find the target osd
has not changed and will therefore not re-send the request.
Second, we need to be sure that a sent but incomplete linger request
gets re-sent. If the target osd is the same with the new osd map as
it was when the request was originally sent, this won't happen.
This can be fixed through careful handling when we move these
requests from the request list to the linger list, by unregistering
the request *before* it is registered as a linger request. This
works because a side-effect of unregistering the request is to make
the request's r_osd pointer be NULL, and *that* will ensure the
second loop actually re-sends the linger request.
Processing of such a request is done at that point, so continue with
the next one once it's been moved.
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
2012-12-20 05:52:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* For linger requests that have not yet been
|
|
|
|
* registered, move them to the linger list; they'll
|
|
|
|
* be sent to the osd in the loop below. Unregister
|
|
|
|
* the request before re-registering it as a linger
|
|
|
|
* request to ensure the __map_request() below
|
|
|
|
* will decide it needs to be sent.
|
|
|
|
*/
|
|
|
|
if (req->r_linger && list_empty(&req->r_linger_item)) {
|
|
|
|
dout("%p tid %llu restart on osd%d\n",
|
|
|
|
req, req->r_tid,
|
|
|
|
req->r_osd ? req->r_osd->o_osd : -1);
|
2013-05-23 09:54:25 +08:00
|
|
|
ceph_osdc_get_request(req);
|
libceph: move linger requests sooner in kick_requests()
The kick_requests() function is called by ceph_osdc_handle_map()
when an osd map change has been indicated. Its purpose is to
re-queue any request whose target osd is different from what it
was when it was originally sent.
It is structured as two loops, one for incomplete but registered
requests, and a second for handling completed linger requests.
As a special case, in the first loop if a request marked to linger
has not yet completed, it is moved from the request list to the
linger list. This is as a quick and dirty way to have the second
loop handle sending the request along with all the other linger
requests.
Because of the way it's done now, however, this quick and dirty
solution can result in these incomplete linger requests never
getting re-sent as desired. The problem lies in the fact that
the second loop only arranges for a linger request to be sent
if it appears its target osd has changed. This is the proper
handling for *completed* linger requests (it avoids issuing
the same linger request twice to the same osd).
But although the linger requests added to the list in the first loop
may have been sent, they have not yet completed, so they need to be
re-sent regardless of whether their target osd has changed.
The first required fix is we need to avoid calling __map_request()
on any incomplete linger request. Otherwise the subsequent
__map_request() call in the second loop will find the target osd
has not changed and will therefore not re-send the request.
Second, we need to be sure that a sent but incomplete linger request
gets re-sent. If the target osd is the same with the new osd map as
it was when the request was originally sent, this won't happen.
This can be fixed through careful handling when we move these
requests from the request list to the linger list, by unregistering
the request *before* it is registered as a linger request. This
works because a side-effect of unregistering the request is to make
the request's r_osd pointer be NULL, and *that* will ensure the
second loop actually re-sends the linger request.
Processing of such a request is done at that point, so continue with
the next one once it's been moved.
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
2012-12-20 05:52:36 +08:00
|
|
|
__unregister_request(osdc, req);
|
|
|
|
__register_linger_request(osdc, req);
|
2013-05-23 09:54:25 +08:00
|
|
|
ceph_osdc_put_request(req);
|
libceph: move linger requests sooner in kick_requests()
The kick_requests() function is called by ceph_osdc_handle_map()
when an osd map change has been indicated. Its purpose is to
re-queue any request whose target osd is different from what it
was when it was originally sent.
It is structured as two loops, one for incomplete but registered
requests, and a second for handling completed linger requests.
As a special case, in the first loop if a request marked to linger
has not yet completed, it is moved from the request list to the
linger list. This is as a quick and dirty way to have the second
loop handle sending the request along with all the other linger
requests.
Because of the way it's done now, however, this quick and dirty
solution can result in these incomplete linger requests never
getting re-sent as desired. The problem lies in the fact that
the second loop only arranges for a linger request to be sent
if it appears its target osd has changed. This is the proper
handling for *completed* linger requests (it avoids issuing
the same linger request twice to the same osd).
But although the linger requests added to the list in the first loop
may have been sent, they have not yet completed, so they need to be
re-sent regardless of whether their target osd has changed.
The first required fix is we need to avoid calling __map_request()
on any incomplete linger request. Otherwise the subsequent
__map_request() call in the second loop will find the target osd
has not changed and will therefore not re-send the request.
Second, we need to be sure that a sent but incomplete linger request
gets re-sent. If the target osd is the same with the new osd map as
it was when the request was originally sent, this won't happen.
This can be fixed through careful handling when we move these
requests from the request list to the linger list, by unregistering
the request *before* it is registered as a linger request. This
works because a side-effect of unregistering the request is to make
the request's r_osd pointer be NULL, and *that* will ensure the
second loop actually re-sends the linger request.
Processing of such a request is done at that point, so continue with
the next one once it's been moved.
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
2012-12-20 05:52:36 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2013-12-11 01:35:13 +08:00
|
|
|
force_resend_req = force_resend ||
|
|
|
|
(force_resend_writes &&
|
|
|
|
req->r_flags & CEPH_OSD_FLAG_WRITE);
|
|
|
|
err = __map_request(osdc, req, force_resend_req);
|
2011-01-18 12:34:08 +08:00
|
|
|
if (err < 0)
|
|
|
|
continue; /* error */
|
|
|
|
if (req->r_osd == NULL) {
|
|
|
|
dout("%p tid %llu maps to no osd\n", req, req->r_tid);
|
|
|
|
needmap++; /* request a newer map */
|
|
|
|
} else if (err > 0) {
|
2012-07-31 07:19:28 +08:00
|
|
|
if (!req->r_linger) {
|
|
|
|
dout("%p tid %llu requeued on osd%d\n", req,
|
|
|
|
req->r_tid,
|
|
|
|
req->r_osd ? req->r_osd->o_osd : -1);
|
2011-03-22 06:07:16 +08:00
|
|
|
req->r_flags |= CEPH_OSD_FLAG_RETRY;
|
2012-07-31 07:19:28 +08:00
|
|
|
}
|
|
|
|
}
|
2011-03-22 06:07:16 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
list_for_each_entry_safe(req, nreq, &osdc->req_linger,
|
|
|
|
r_linger_item) {
|
|
|
|
dout("linger req=%p req->r_osd=%p\n", req, req->r_osd);
|
|
|
|
|
2013-12-11 01:35:13 +08:00
|
|
|
err = __map_request(osdc, req,
|
|
|
|
force_resend || force_resend_writes);
|
libceph: move linger requests sooner in kick_requests()
The kick_requests() function is called by ceph_osdc_handle_map()
when an osd map change has been indicated. Its purpose is to
re-queue any request whose target osd is different from what it
was when it was originally sent.
It is structured as two loops, one for incomplete but registered
requests, and a second for handling completed linger requests.
As a special case, in the first loop if a request marked to linger
has not yet completed, it is moved from the request list to the
linger list. This is as a quick and dirty way to have the second
loop handle sending the request along with all the other linger
requests.
Because of the way it's done now, however, this quick and dirty
solution can result in these incomplete linger requests never
getting re-sent as desired. The problem lies in the fact that
the second loop only arranges for a linger request to be sent
if it appears its target osd has changed. This is the proper
handling for *completed* linger requests (it avoids issuing
the same linger request twice to the same osd).
But although the linger requests added to the list in the first loop
may have been sent, they have not yet completed, so they need to be
re-sent regardless of whether their target osd has changed.
The first required fix is we need to avoid calling __map_request()
on any incomplete linger request. Otherwise the subsequent
__map_request() call in the second loop will find the target osd
has not changed and will therefore not re-send the request.
Second, we need to be sure that a sent but incomplete linger request
gets re-sent. If the target osd is the same with the new osd map as
it was when the request was originally sent, this won't happen.
This can be fixed through careful handling when we move these
requests from the request list to the linger list, by unregistering
the request *before* it is registered as a linger request. This
works because a side-effect of unregistering the request is to make
the request's r_osd pointer be NULL, and *that* will ensure the
second loop actually re-sends the linger request.
Processing of such a request is done at that point, so continue with
the next one once it's been moved.
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
2012-12-20 05:52:36 +08:00
|
|
|
dout("__map_request returned %d\n", err);
|
2011-03-22 06:07:16 +08:00
|
|
|
if (err < 0)
|
|
|
|
continue; /* hrm! */
|
2015-05-11 22:53:10 +08:00
|
|
|
if (req->r_osd == NULL || err > 0) {
|
|
|
|
if (req->r_osd == NULL) {
|
|
|
|
dout("lingering %p tid %llu maps to no osd\n",
|
|
|
|
req, req->r_tid);
|
|
|
|
/*
|
|
|
|
* A homeless lingering request makes
|
|
|
|
* no sense, as it's job is to keep
|
|
|
|
* a particular OSD connection open.
|
|
|
|
* Request a newer map and kick the
|
|
|
|
* request, knowing that it won't be
|
|
|
|
* resent until we actually get a map
|
|
|
|
* that can tell us where to send it.
|
|
|
|
*/
|
|
|
|
needmap++;
|
|
|
|
}
|
2011-03-22 06:07:16 +08:00
|
|
|
|
2015-05-11 22:53:10 +08:00
|
|
|
dout("kicking lingering %p tid %llu osd%d\n", req,
|
|
|
|
req->r_tid, req->r_osd ? req->r_osd->o_osd : -1);
|
|
|
|
__register_request(osdc, req);
|
|
|
|
__unregister_linger_request(osdc, req);
|
|
|
|
}
|
2011-01-18 12:34:08 +08:00
|
|
|
}
|
2013-05-16 05:28:33 +08:00
|
|
|
reset_changed_osds(osdc);
|
2009-10-07 02:31:10 +08:00
|
|
|
mutex_unlock(&osdc->request_mutex);
|
|
|
|
|
|
|
|
if (needmap) {
|
|
|
|
dout("%d requests for down osds, need new map\n", needmap);
|
|
|
|
ceph_monc_request_next_osdmap(&osdc->client->monc);
|
|
|
|
}
|
2010-02-27 07:32:31 +08:00
|
|
|
}
|
2011-01-18 12:34:08 +08:00
|
|
|
|
|
|
|
|
2009-10-07 02:31:10 +08:00
|
|
|
/*
|
|
|
|
* Process updated osd map.
|
|
|
|
*
|
|
|
|
* The message contains any number of incremental and full maps, normally
|
|
|
|
* indicating some sort of topology change in the cluster. Kick requests
|
|
|
|
* off to different OSDs as needed.
|
|
|
|
*/
|
|
|
|
void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
|
|
|
|
{
|
|
|
|
void *p, *end, *next;
|
|
|
|
u32 nr_maps, maplen;
|
|
|
|
u32 epoch;
|
|
|
|
struct ceph_osdmap *newmap = NULL, *oldmap;
|
|
|
|
int err;
|
|
|
|
struct ceph_fsid fsid;
|
2013-12-11 01:35:13 +08:00
|
|
|
bool was_full;
|
2009-10-07 02:31:10 +08:00
|
|
|
|
|
|
|
dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
|
|
|
|
p = msg->front.iov_base;
|
|
|
|
end = p + msg->front.iov_len;
|
|
|
|
|
|
|
|
/* verify fsid */
|
|
|
|
ceph_decode_need(&p, end, sizeof(fsid), bad);
|
|
|
|
ceph_decode_copy(&p, &fsid, sizeof(fsid));
|
2009-11-19 08:50:41 +08:00
|
|
|
if (ceph_check_fsid(osdc->client, &fsid) < 0)
|
|
|
|
return;
|
2009-10-07 02:31:10 +08:00
|
|
|
|
|
|
|
down_write(&osdc->map_sem);
|
|
|
|
|
2013-12-11 01:35:13 +08:00
|
|
|
was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
|
|
|
|
|
2009-10-07 02:31:10 +08:00
|
|
|
/* incremental maps */
|
|
|
|
ceph_decode_32_safe(&p, end, nr_maps, bad);
|
|
|
|
dout(" %d inc maps\n", nr_maps);
|
|
|
|
while (nr_maps > 0) {
|
|
|
|
ceph_decode_need(&p, end, 2*sizeof(u32), bad);
|
2009-10-15 00:59:09 +08:00
|
|
|
epoch = ceph_decode_32(&p);
|
|
|
|
maplen = ceph_decode_32(&p);
|
2009-10-07 02:31:10 +08:00
|
|
|
ceph_decode_need(&p, end, maplen, bad);
|
|
|
|
next = p + maplen;
|
|
|
|
if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
|
|
|
|
dout("applying incremental map %u len %d\n",
|
|
|
|
epoch, maplen);
|
|
|
|
newmap = osdmap_apply_incremental(&p, next,
|
2016-04-28 22:07:21 +08:00
|
|
|
osdc->osdmap);
|
2009-10-07 02:31:10 +08:00
|
|
|
if (IS_ERR(newmap)) {
|
|
|
|
err = PTR_ERR(newmap);
|
|
|
|
goto bad;
|
|
|
|
}
|
2009-12-22 06:49:37 +08:00
|
|
|
BUG_ON(!newmap);
|
2009-10-07 02:31:10 +08:00
|
|
|
if (newmap != osdc->osdmap) {
|
|
|
|
ceph_osdmap_destroy(osdc->osdmap);
|
|
|
|
osdc->osdmap = newmap;
|
|
|
|
}
|
2013-12-11 01:35:13 +08:00
|
|
|
was_full = was_full ||
|
|
|
|
ceph_osdmap_flag(osdc->osdmap,
|
|
|
|
CEPH_OSDMAP_FULL);
|
|
|
|
kick_requests(osdc, 0, was_full);
|
2009-10-07 02:31:10 +08:00
|
|
|
} else {
|
|
|
|
dout("ignoring incremental map %u len %d\n",
|
|
|
|
epoch, maplen);
|
|
|
|
}
|
|
|
|
p = next;
|
|
|
|
nr_maps--;
|
|
|
|
}
|
|
|
|
if (newmap)
|
|
|
|
goto done;
|
|
|
|
|
|
|
|
/* full maps */
|
|
|
|
ceph_decode_32_safe(&p, end, nr_maps, bad);
|
|
|
|
dout(" %d full maps\n", nr_maps);
|
|
|
|
while (nr_maps) {
|
|
|
|
ceph_decode_need(&p, end, 2*sizeof(u32), bad);
|
2009-10-15 00:59:09 +08:00
|
|
|
epoch = ceph_decode_32(&p);
|
|
|
|
maplen = ceph_decode_32(&p);
|
2009-10-07 02:31:10 +08:00
|
|
|
ceph_decode_need(&p, end, maplen, bad);
|
|
|
|
if (nr_maps > 1) {
|
|
|
|
dout("skipping non-latest full map %u len %d\n",
|
|
|
|
epoch, maplen);
|
|
|
|
} else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
|
|
|
|
dout("skipping full map %u len %d, "
|
|
|
|
"older than our %u\n", epoch, maplen,
|
|
|
|
osdc->osdmap->epoch);
|
|
|
|
} else {
|
2011-10-15 04:33:55 +08:00
|
|
|
int skipped_map = 0;
|
|
|
|
|
2009-10-07 02:31:10 +08:00
|
|
|
dout("taking full map %u len %d\n", epoch, maplen);
|
2014-03-13 22:36:13 +08:00
|
|
|
newmap = ceph_osdmap_decode(&p, p+maplen);
|
2009-10-07 02:31:10 +08:00
|
|
|
if (IS_ERR(newmap)) {
|
|
|
|
err = PTR_ERR(newmap);
|
|
|
|
goto bad;
|
|
|
|
}
|
2009-12-22 06:49:37 +08:00
|
|
|
BUG_ON(!newmap);
|
2009-10-07 02:31:10 +08:00
|
|
|
oldmap = osdc->osdmap;
|
|
|
|
osdc->osdmap = newmap;
|
2011-10-15 04:33:55 +08:00
|
|
|
if (oldmap) {
|
|
|
|
if (oldmap->epoch + 1 < newmap->epoch)
|
|
|
|
skipped_map = 1;
|
2009-10-07 02:31:10 +08:00
|
|
|
ceph_osdmap_destroy(oldmap);
|
2011-10-15 04:33:55 +08:00
|
|
|
}
|
2013-12-11 01:35:13 +08:00
|
|
|
was_full = was_full ||
|
|
|
|
ceph_osdmap_flag(osdc->osdmap,
|
|
|
|
CEPH_OSDMAP_FULL);
|
|
|
|
kick_requests(osdc, skipped_map, was_full);
|
2009-10-07 02:31:10 +08:00
|
|
|
}
|
|
|
|
p += maplen;
|
|
|
|
nr_maps--;
|
|
|
|
}
|
|
|
|
|
2013-08-15 13:52:48 +08:00
|
|
|
if (!osdc->osdmap)
|
|
|
|
goto bad;
|
2009-10-07 02:31:10 +08:00
|
|
|
done:
|
|
|
|
downgrade_write(&osdc->map_sem);
|
2016-01-19 23:19:06 +08:00
|
|
|
ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
|
|
|
|
osdc->osdmap->epoch);
|
2011-05-13 00:29:18 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* subscribe to subsequent osdmap updates if full to ensure
|
|
|
|
* we find out when we are no longer full and stop returning
|
|
|
|
* ENOSPC.
|
|
|
|
*/
|
libceph: block I/O when PAUSE or FULL osd map flags are set
The PAUSEWR and PAUSERD flags are meant to stop the cluster from
processing writes and reads, respectively. The FULL flag is set when
the cluster determines that it is out of space, and will no longer
process writes. PAUSEWR and PAUSERD are purely client-side settings
already implemented in userspace clients. The osd does nothing special
with these flags.
When the FULL flag is set, however, the osd responds to all writes
with -ENOSPC. For cephfs, this makes sense, but for rbd the block
layer translates this into EIO. If a cluster goes from full to
non-full quickly, a filesystem on top of rbd will not behave well,
since some writes succeed while others get EIO.
Fix this by blocking any writes when the FULL flag is set in the osd
client. This is the same strategy used by userspace, so apply it by
default. A follow-on patch makes this configurable.
__map_request() is called to re-target osd requests in case the
available osds changed. Add a paused field to a ceph_osd_request, and
set it whenever an appropriate osd map flag is set. Avoid queueing
paused requests in __map_request(), but force them to be resent if
they become unpaused.
Also subscribe to the next osd map from the monitor if any of these
flags are set, so paused requests can be unblocked as soon as
possible.
Fixes: http://tracker.ceph.com/issues/6079
Reviewed-by: Sage Weil <sage@inktank.com>
Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
2013-12-03 11:11:48 +08:00
|
|
|
if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
|
|
|
|
ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) ||
|
|
|
|
ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR))
|
2011-05-13 00:29:18 +08:00
|
|
|
ceph_monc_request_next_osdmap(&osdc->client->monc);
|
|
|
|
|
2013-02-16 01:42:29 +08:00
|
|
|
mutex_lock(&osdc->request_mutex);
|
|
|
|
__send_queued(osdc);
|
|
|
|
mutex_unlock(&osdc->request_mutex);
|
2009-10-07 02:31:10 +08:00
|
|
|
up_read(&osdc->map_sem);
|
2010-07-28 04:11:08 +08:00
|
|
|
wake_up_all(&osdc->client->auth_wq);
|
2009-10-07 02:31:10 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
bad:
|
|
|
|
pr_err("osdc handle_map corrupt msg\n");
|
2009-12-15 07:13:47 +08:00
|
|
|
ceph_msg_dump(msg);
|
2009-10-07 02:31:10 +08:00
|
|
|
up_write(&osdc->map_sem);
|
|
|
|
}
|
|
|
|
|
2011-03-22 06:07:16 +08:00
|
|
|
/*
|
|
|
|
* watch/notify callback event infrastructure
|
|
|
|
*
|
|
|
|
* These callbacks are used both for watch and notify operations.
|
|
|
|
*/
|
|
|
|
static void __release_event(struct kref *kref)
|
|
|
|
{
|
|
|
|
struct ceph_osd_event *event =
|
|
|
|
container_of(kref, struct ceph_osd_event, kref);
|
|
|
|
|
|
|
|
dout("__release_event %p\n", event);
|
|
|
|
kfree(event);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void get_event(struct ceph_osd_event *event)
|
|
|
|
{
|
|
|
|
kref_get(&event->kref);
|
|
|
|
}
|
|
|
|
|
|
|
|
void ceph_osdc_put_event(struct ceph_osd_event *event)
|
|
|
|
{
|
|
|
|
kref_put(&event->kref, __release_event);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(ceph_osdc_put_event);
|
|
|
|
|
|
|
|
static void __insert_event(struct ceph_osd_client *osdc,
|
|
|
|
struct ceph_osd_event *new)
|
|
|
|
{
|
|
|
|
struct rb_node **p = &osdc->event_tree.rb_node;
|
|
|
|
struct rb_node *parent = NULL;
|
|
|
|
struct ceph_osd_event *event = NULL;
|
|
|
|
|
|
|
|
while (*p) {
|
|
|
|
parent = *p;
|
|
|
|
event = rb_entry(parent, struct ceph_osd_event, node);
|
|
|
|
if (new->cookie < event->cookie)
|
|
|
|
p = &(*p)->rb_left;
|
|
|
|
else if (new->cookie > event->cookie)
|
|
|
|
p = &(*p)->rb_right;
|
|
|
|
else
|
|
|
|
BUG();
|
|
|
|
}
|
|
|
|
|
|
|
|
rb_link_node(&new->node, parent, p);
|
|
|
|
rb_insert_color(&new->node, &osdc->event_tree);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc,
|
|
|
|
u64 cookie)
|
|
|
|
{
|
|
|
|
struct rb_node **p = &osdc->event_tree.rb_node;
|
|
|
|
struct rb_node *parent = NULL;
|
|
|
|
struct ceph_osd_event *event = NULL;
|
|
|
|
|
|
|
|
while (*p) {
|
|
|
|
parent = *p;
|
|
|
|
event = rb_entry(parent, struct ceph_osd_event, node);
|
|
|
|
if (cookie < event->cookie)
|
|
|
|
p = &(*p)->rb_left;
|
|
|
|
else if (cookie > event->cookie)
|
|
|
|
p = &(*p)->rb_right;
|
|
|
|
else
|
|
|
|
return event;
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void __remove_event(struct ceph_osd_event *event)
|
|
|
|
{
|
|
|
|
struct ceph_osd_client *osdc = event->osdc;
|
|
|
|
|
|
|
|
if (!RB_EMPTY_NODE(&event->node)) {
|
|
|
|
dout("__remove_event removed %p\n", event);
|
|
|
|
rb_erase(&event->node, &osdc->event_tree);
|
|
|
|
ceph_osdc_put_event(event);
|
|
|
|
} else {
|
|
|
|
dout("__remove_event didn't remove %p\n", event);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
int ceph_osdc_create_event(struct ceph_osd_client *osdc,
|
|
|
|
void (*event_cb)(u64, u64, u8, void *),
|
2013-02-16 01:42:30 +08:00
|
|
|
void *data, struct ceph_osd_event **pevent)
|
2011-03-22 06:07:16 +08:00
|
|
|
{
|
|
|
|
struct ceph_osd_event *event;
|
|
|
|
|
|
|
|
event = kmalloc(sizeof(*event), GFP_NOIO);
|
|
|
|
if (!event)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
dout("create_event %p\n", event);
|
|
|
|
event->cb = event_cb;
|
2013-02-16 01:42:30 +08:00
|
|
|
event->one_shot = 0;
|
2011-03-22 06:07:16 +08:00
|
|
|
event->data = data;
|
|
|
|
event->osdc = osdc;
|
|
|
|
INIT_LIST_HEAD(&event->osd_node);
|
2012-12-18 02:23:48 +08:00
|
|
|
RB_CLEAR_NODE(&event->node);
|
2011-03-22 06:07:16 +08:00
|
|
|
kref_init(&event->kref); /* one ref for us */
|
|
|
|
kref_get(&event->kref); /* one ref for the caller */
|
|
|
|
|
|
|
|
spin_lock(&osdc->event_lock);
|
|
|
|
event->cookie = ++osdc->event_count;
|
|
|
|
__insert_event(osdc, event);
|
|
|
|
spin_unlock(&osdc->event_lock);
|
|
|
|
|
|
|
|
*pevent = event;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(ceph_osdc_create_event);
|
|
|
|
|
|
|
|
void ceph_osdc_cancel_event(struct ceph_osd_event *event)
|
|
|
|
{
|
|
|
|
struct ceph_osd_client *osdc = event->osdc;
|
|
|
|
|
|
|
|
dout("cancel_event %p\n", event);
|
|
|
|
spin_lock(&osdc->event_lock);
|
|
|
|
__remove_event(event);
|
|
|
|
spin_unlock(&osdc->event_lock);
|
|
|
|
ceph_osdc_put_event(event); /* caller's */
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(ceph_osdc_cancel_event);
|
|
|
|
|
|
|
|
|
|
|
|
static void do_event_work(struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct ceph_osd_event_work *event_work =
|
|
|
|
container_of(work, struct ceph_osd_event_work, work);
|
|
|
|
struct ceph_osd_event *event = event_work->event;
|
|
|
|
u64 ver = event_work->ver;
|
|
|
|
u64 notify_id = event_work->notify_id;
|
|
|
|
u8 opcode = event_work->opcode;
|
|
|
|
|
|
|
|
dout("do_event_work completing %p\n", event);
|
|
|
|
event->cb(ver, notify_id, opcode, event->data);
|
|
|
|
dout("do_event_work completed %p\n", event);
|
|
|
|
ceph_osdc_put_event(event);
|
|
|
|
kfree(event_work);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Process osd watch notifications
|
|
|
|
*/
|
2013-02-16 01:42:30 +08:00
|
|
|
static void handle_watch_notify(struct ceph_osd_client *osdc,
|
|
|
|
struct ceph_msg *msg)
|
2011-03-22 06:07:16 +08:00
|
|
|
{
|
|
|
|
void *p, *end;
|
|
|
|
u8 proto_ver;
|
|
|
|
u64 cookie, ver, notify_id;
|
|
|
|
u8 opcode;
|
|
|
|
struct ceph_osd_event *event;
|
|
|
|
struct ceph_osd_event_work *event_work;
|
|
|
|
|
|
|
|
p = msg->front.iov_base;
|
|
|
|
end = p + msg->front.iov_len;
|
|
|
|
|
|
|
|
ceph_decode_8_safe(&p, end, proto_ver, bad);
|
|
|
|
ceph_decode_8_safe(&p, end, opcode, bad);
|
|
|
|
ceph_decode_64_safe(&p, end, cookie, bad);
|
|
|
|
ceph_decode_64_safe(&p, end, ver, bad);
|
|
|
|
ceph_decode_64_safe(&p, end, notify_id, bad);
|
|
|
|
|
|
|
|
spin_lock(&osdc->event_lock);
|
|
|
|
event = __find_event(osdc, cookie);
|
|
|
|
if (event) {
|
2013-02-16 01:42:30 +08:00
|
|
|
BUG_ON(event->one_shot);
|
2011-03-22 06:07:16 +08:00
|
|
|
get_event(event);
|
|
|
|
}
|
|
|
|
spin_unlock(&osdc->event_lock);
|
|
|
|
dout("handle_watch_notify cookie %lld ver %lld event %p\n",
|
|
|
|
cookie, ver, event);
|
|
|
|
if (event) {
|
|
|
|
event_work = kmalloc(sizeof(*event_work), GFP_NOIO);
|
|
|
|
if (!event_work) {
|
2014-09-11 16:18:53 +08:00
|
|
|
pr_err("couldn't allocate event_work\n");
|
|
|
|
ceph_osdc_put_event(event);
|
|
|
|
return;
|
2011-03-22 06:07:16 +08:00
|
|
|
}
|
2011-03-27 02:29:34 +08:00
|
|
|
INIT_WORK(&event_work->work, do_event_work);
|
2011-03-22 06:07:16 +08:00
|
|
|
event_work->event = event;
|
|
|
|
event_work->ver = ver;
|
|
|
|
event_work->notify_id = notify_id;
|
|
|
|
event_work->opcode = opcode;
|
|
|
|
|
2014-09-11 16:18:53 +08:00
|
|
|
queue_work(osdc->notify_wq, &event_work->work);
|
|
|
|
}
|
2011-03-22 06:07:16 +08:00
|
|
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
bad:
|
|
|
|
pr_err("osdc handle_watch_notify corrupt msg\n");
|
|
|
|
}
|
|
|
|
|
2013-03-05 08:29:06 +08:00
|
|
|
/*
|
|
|
|
* Register request, send initial attempt.
|
|
|
|
*/
|
|
|
|
int ceph_osdc_start_request(struct ceph_osd_client *osdc,
|
|
|
|
struct ceph_osd_request *req,
|
|
|
|
bool nofail)
|
|
|
|
{
|
2014-02-01 01:33:39 +08:00
|
|
|
int rc;
|
2013-03-05 08:29:06 +08:00
|
|
|
|
2009-10-07 02:31:10 +08:00
|
|
|
down_read(&osdc->map_sem);
|
|
|
|
mutex_lock(&osdc->request_mutex);
|
2014-02-01 01:33:39 +08:00
|
|
|
|
|
|
|
rc = __ceph_osdc_start_request(osdc, req, nofail);
|
|
|
|
|
2009-10-07 02:31:10 +08:00
|
|
|
mutex_unlock(&osdc->request_mutex);
|
|
|
|
up_read(&osdc->map_sem);
|
2014-02-01 01:33:39 +08:00
|
|
|
|
2009-10-07 02:31:10 +08:00
|
|
|
return rc;
|
|
|
|
}
|
2010-04-07 06:14:15 +08:00
|
|
|
EXPORT_SYMBOL(ceph_osdc_start_request);
|
2009-10-07 02:31:10 +08:00
|
|
|
|
2014-06-19 15:38:13 +08:00
|
|
|
/*
|
|
|
|
* Unregister a registered request. The request is not completed (i.e.
|
|
|
|
* no callbacks or wakeups) - higher layers are supposed to know what
|
|
|
|
* they are canceling.
|
|
|
|
*/
|
|
|
|
void ceph_osdc_cancel_request(struct ceph_osd_request *req)
|
|
|
|
{
|
|
|
|
struct ceph_osd_client *osdc = req->r_osdc;
|
|
|
|
|
|
|
|
mutex_lock(&osdc->request_mutex);
|
|
|
|
if (req->r_linger)
|
|
|
|
__unregister_linger_request(osdc, req);
|
|
|
|
__unregister_request(osdc, req);
|
|
|
|
mutex_unlock(&osdc->request_mutex);
|
|
|
|
|
|
|
|
dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(ceph_osdc_cancel_request);
|
|
|
|
|
2009-10-07 02:31:10 +08:00
|
|
|
/*
|
|
|
|
* wait for a request to complete
|
|
|
|
*/
|
|
|
|
int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
|
|
|
|
struct ceph_osd_request *req)
|
|
|
|
{
|
|
|
|
int rc;
|
|
|
|
|
2014-06-19 15:38:13 +08:00
|
|
|
dout("%s %p tid %llu\n", __func__, req, req->r_tid);
|
|
|
|
|
2009-10-07 02:31:10 +08:00
|
|
|
rc = wait_for_completion_interruptible(&req->r_completion);
|
|
|
|
if (rc < 0) {
|
2014-06-19 15:38:13 +08:00
|
|
|
dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid);
|
|
|
|
ceph_osdc_cancel_request(req);
|
2016-04-28 22:07:24 +08:00
|
|
|
|
|
|
|
/* kludge - need to to wake ceph_osdc_sync() */
|
|
|
|
complete_all(&req->r_safe_completion);
|
2009-10-07 02:31:10 +08:00
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2014-06-19 15:38:13 +08:00
|
|
|
dout("%s %p tid %llu result %d\n", __func__, req, req->r_tid,
|
|
|
|
req->r_result);
|
2009-10-07 02:31:10 +08:00
|
|
|
return req->r_result;
|
|
|
|
}
|
2010-04-07 06:14:15 +08:00
|
|
|
EXPORT_SYMBOL(ceph_osdc_wait_request);
|
2009-10-07 02:31:10 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* sync - wait for all in-flight requests to flush. avoid starvation.
|
|
|
|
*/
|
|
|
|
void ceph_osdc_sync(struct ceph_osd_client *osdc)
|
|
|
|
{
|
|
|
|
struct ceph_osd_request *req;
|
|
|
|
u64 last_tid, next_tid = 0;
|
|
|
|
|
|
|
|
mutex_lock(&osdc->request_mutex);
|
|
|
|
last_tid = osdc->last_tid;
|
|
|
|
while (1) {
|
|
|
|
req = __lookup_request_ge(osdc, next_tid);
|
|
|
|
if (!req)
|
|
|
|
break;
|
|
|
|
if (req->r_tid > last_tid)
|
|
|
|
break;
|
|
|
|
|
|
|
|
next_tid = req->r_tid + 1;
|
|
|
|
if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
ceph_osdc_get_request(req);
|
|
|
|
mutex_unlock(&osdc->request_mutex);
|
|
|
|
dout("sync waiting on tid %llu (last is %llu)\n",
|
|
|
|
req->r_tid, last_tid);
|
|
|
|
wait_for_completion(&req->r_safe_completion);
|
|
|
|
mutex_lock(&osdc->request_mutex);
|
|
|
|
ceph_osdc_put_request(req);
|
|
|
|
}
|
|
|
|
mutex_unlock(&osdc->request_mutex);
|
|
|
|
dout("sync done (thru tid %llu)\n", last_tid);
|
|
|
|
}
|
2010-04-07 06:14:15 +08:00
|
|
|
EXPORT_SYMBOL(ceph_osdc_sync);
|
2009-10-07 02:31:10 +08:00
|
|
|
|
2013-08-29 12:43:09 +08:00
|
|
|
/*
|
|
|
|
* Call all pending notify callbacks - for use after a watch is
|
|
|
|
* unregistered, to make sure no more callbacks for it will be invoked
|
|
|
|
*/
|
2014-06-11 11:30:13 +08:00
|
|
|
void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc)
|
2013-08-29 12:43:09 +08:00
|
|
|
{
|
|
|
|
flush_workqueue(osdc->notify_wq);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(ceph_osdc_flush_notifies);
|
|
|
|
|
|
|
|
|
2009-10-07 02:31:10 +08:00
|
|
|
/*
|
|
|
|
* init, shutdown
|
|
|
|
*/
|
|
|
|
int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
|
|
|
|
{
|
|
|
|
int err;
|
|
|
|
|
|
|
|
dout("init\n");
|
|
|
|
osdc->client = client;
|
|
|
|
osdc->osdmap = NULL;
|
|
|
|
init_rwsem(&osdc->map_sem);
|
|
|
|
mutex_init(&osdc->request_mutex);
|
|
|
|
osdc->last_tid = 0;
|
|
|
|
osdc->osds = RB_ROOT;
|
2010-02-04 03:00:26 +08:00
|
|
|
INIT_LIST_HEAD(&osdc->osd_lru);
|
2009-10-07 02:31:10 +08:00
|
|
|
osdc->requests = RB_ROOT;
|
2010-02-27 07:32:31 +08:00
|
|
|
INIT_LIST_HEAD(&osdc->req_lru);
|
2011-01-18 12:34:08 +08:00
|
|
|
INIT_LIST_HEAD(&osdc->req_unsent);
|
|
|
|
INIT_LIST_HEAD(&osdc->req_notarget);
|
2011-03-22 06:07:16 +08:00
|
|
|
INIT_LIST_HEAD(&osdc->req_linger);
|
2009-10-07 02:31:10 +08:00
|
|
|
osdc->num_requests = 0;
|
|
|
|
INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
|
2010-02-04 03:00:26 +08:00
|
|
|
INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
|
2011-03-22 06:07:16 +08:00
|
|
|
spin_lock_init(&osdc->event_lock);
|
|
|
|
osdc->event_tree = RB_ROOT;
|
|
|
|
osdc->event_count = 0;
|
2010-02-04 03:00:26 +08:00
|
|
|
|
2009-11-19 06:52:18 +08:00
|
|
|
err = -ENOMEM;
|
2016-02-10 00:25:31 +08:00
|
|
|
osdc->req_mempool = mempool_create_slab_pool(10,
|
|
|
|
ceph_osd_request_cache);
|
2009-10-07 02:31:10 +08:00
|
|
|
if (!osdc->req_mempool)
|
2009-11-19 06:52:18 +08:00
|
|
|
goto out;
|
2009-10-07 02:31:10 +08:00
|
|
|
|
2012-07-10 05:22:34 +08:00
|
|
|
err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
|
2016-04-28 00:32:56 +08:00
|
|
|
PAGE_SIZE, 10, true, "osd_op");
|
2009-10-07 02:31:10 +08:00
|
|
|
if (err < 0)
|
2009-11-19 06:52:18 +08:00
|
|
|
goto out_mempool;
|
2012-07-10 05:22:34 +08:00
|
|
|
err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
|
2016-04-28 00:32:56 +08:00
|
|
|
PAGE_SIZE, 10, true, "osd_op_reply");
|
2010-03-02 05:02:00 +08:00
|
|
|
if (err < 0)
|
|
|
|
goto out_msgpool;
|
2011-03-22 06:07:16 +08:00
|
|
|
|
2013-08-15 13:58:59 +08:00
|
|
|
err = -ENOMEM;
|
2011-03-22 06:07:16 +08:00
|
|
|
osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify");
|
2013-08-15 13:58:59 +08:00
|
|
|
if (!osdc->notify_wq)
|
2014-01-31 23:49:22 +08:00
|
|
|
goto out_msgpool_reply;
|
|
|
|
|
2016-04-28 22:07:24 +08:00
|
|
|
schedule_delayed_work(&osdc->osds_timeout_work,
|
|
|
|
round_jiffies_relative(osdc->client->options->osd_idle_ttl));
|
|
|
|
|
2009-10-07 02:31:10 +08:00
|
|
|
return 0;
|
2009-11-19 06:52:18 +08:00
|
|
|
|
2014-01-31 23:49:22 +08:00
|
|
|
out_msgpool_reply:
|
|
|
|
ceph_msgpool_destroy(&osdc->msgpool_op_reply);
|
2010-03-02 05:02:00 +08:00
|
|
|
out_msgpool:
|
|
|
|
ceph_msgpool_destroy(&osdc->msgpool_op);
|
2009-11-19 06:52:18 +08:00
|
|
|
out_mempool:
|
|
|
|
mempool_destroy(osdc->req_mempool);
|
|
|
|
out:
|
|
|
|
return err;
|
2009-10-07 02:31:10 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void ceph_osdc_stop(struct ceph_osd_client *osdc)
|
|
|
|
{
|
2011-03-22 06:07:16 +08:00
|
|
|
flush_workqueue(osdc->notify_wq);
|
|
|
|
destroy_workqueue(osdc->notify_wq);
|
2009-10-07 02:31:10 +08:00
|
|
|
cancel_delayed_work_sync(&osdc->timeout_work);
|
2010-02-04 03:00:26 +08:00
|
|
|
cancel_delayed_work_sync(&osdc->osds_timeout_work);
|
2016-04-28 22:07:22 +08:00
|
|
|
|
|
|
|
mutex_lock(&osdc->request_mutex);
|
|
|
|
while (!RB_EMPTY_ROOT(&osdc->osds)) {
|
|
|
|
struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
|
|
|
|
struct ceph_osd, o_node);
|
|
|
|
remove_osd(osdc, osd);
|
|
|
|
}
|
|
|
|
mutex_unlock(&osdc->request_mutex);
|
|
|
|
|
2009-10-07 02:31:10 +08:00
|
|
|
if (osdc->osdmap) {
|
|
|
|
ceph_osdmap_destroy(osdc->osdmap);
|
|
|
|
osdc->osdmap = NULL;
|
|
|
|
}
|
|
|
|
mempool_destroy(osdc->req_mempool);
|
|
|
|
ceph_msgpool_destroy(&osdc->msgpool_op);
|
2010-03-02 05:02:00 +08:00
|
|
|
ceph_msgpool_destroy(&osdc->msgpool_op_reply);
|
2009-10-07 02:31:10 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Read some contiguous pages. If we cross a stripe boundary, shorten
|
|
|
|
* *plen. Return number of bytes read, or error.
|
|
|
|
*/
|
|
|
|
int ceph_osdc_readpages(struct ceph_osd_client *osdc,
|
|
|
|
struct ceph_vino vino, struct ceph_file_layout *layout,
|
|
|
|
u64 off, u64 *plen,
|
|
|
|
u32 truncate_seq, u64 truncate_size,
|
2010-11-10 04:43:12 +08:00
|
|
|
struct page **pages, int num_pages, int page_align)
|
2009-10-07 02:31:10 +08:00
|
|
|
{
|
|
|
|
struct ceph_osd_request *req;
|
|
|
|
int rc = 0;
|
|
|
|
|
|
|
|
dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
|
|
|
|
vino.snap, off, *plen);
|
2014-11-13 14:40:37 +08:00
|
|
|
req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1,
|
2009-10-07 02:31:10 +08:00
|
|
|
CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
|
2013-03-15 03:09:05 +08:00
|
|
|
NULL, truncate_seq, truncate_size,
|
libceph: don't assign page info in ceph_osdc_new_request()
Currently ceph_osdc_new_request() assigns an osd request's
r_num_pages and r_alignment fields. The only thing it does
after that is call ceph_osdc_build_request(), and that doesn't
need those fields to be assigned.
Move the assignment of those fields out of ceph_osdc_new_request()
and into its caller. As a result, the page_align parameter is no
longer used, so get rid of it.
Note that in ceph_sync_write(), the value for req->r_num_pages had
already been calculated earlier (as num_pages, and fortunately
it was computed the same way). So don't bother recomputing it,
but because it's not needed earlier, move that calculation after the
call to ceph_osdc_new_request(). Hold off making the assignment to
r_alignment, doing it instead r_pages and r_num_pages are
getting set.
Similarly, in start_read(), nr_pages already holds the number of
pages in the array (and is calculated the same way), so there's no
need to recompute it. Move the assignment of the page alignment
down with the others there as well.
This and the next few patches are preparation work for:
http://tracker.ceph.com/issues/4127
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
2013-03-02 08:00:15 +08:00
|
|
|
false);
|
2012-09-25 12:01:02 +08:00
|
|
|
if (IS_ERR(req))
|
|
|
|
return PTR_ERR(req);
|
2009-10-07 02:31:10 +08:00
|
|
|
|
|
|
|
/* it may be a short read due to an object boundary */
|
2013-04-16 03:50:36 +08:00
|
|
|
osd_req_op_extent_osd_data_pages(req, 0,
|
2013-04-05 14:27:12 +08:00
|
|
|
pages, *plen, page_align, false, false);
|
2009-10-07 02:31:10 +08:00
|
|
|
|
2013-03-08 05:38:25 +08:00
|
|
|
dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
|
2013-04-03 14:28:57 +08:00
|
|
|
off, *plen, *plen, page_align);
|
2009-10-07 02:31:10 +08:00
|
|
|
|
|
|
|
rc = ceph_osdc_start_request(osdc, req, false);
|
|
|
|
if (!rc)
|
|
|
|
rc = ceph_osdc_wait_request(osdc, req);
|
|
|
|
|
|
|
|
ceph_osdc_put_request(req);
|
|
|
|
dout("readpages result %d\n", rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2010-04-07 06:14:15 +08:00
|
|
|
EXPORT_SYMBOL(ceph_osdc_readpages);
|
2009-10-07 02:31:10 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* do a synchronous write on N pages
|
|
|
|
*/
|
|
|
|
int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
|
|
|
|
struct ceph_file_layout *layout,
|
|
|
|
struct ceph_snap_context *snapc,
|
|
|
|
u64 off, u64 len,
|
|
|
|
u32 truncate_seq, u64 truncate_size,
|
|
|
|
struct timespec *mtime,
|
2013-02-16 01:42:29 +08:00
|
|
|
struct page **pages, int num_pages)
|
2009-10-07 02:31:10 +08:00
|
|
|
{
|
|
|
|
struct ceph_osd_request *req;
|
|
|
|
int rc = 0;
|
2010-11-10 04:43:12 +08:00
|
|
|
int page_align = off & ~PAGE_MASK;
|
2009-10-07 02:31:10 +08:00
|
|
|
|
2014-11-13 14:40:37 +08:00
|
|
|
req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
|
2009-10-07 02:31:10 +08:00
|
|
|
CEPH_OSD_OP_WRITE,
|
2013-02-16 01:42:29 +08:00
|
|
|
CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
|
2013-03-15 03:09:05 +08:00
|
|
|
snapc, truncate_seq, truncate_size,
|
libceph: don't assign page info in ceph_osdc_new_request()
Currently ceph_osdc_new_request() assigns an osd request's
r_num_pages and r_alignment fields. The only thing it does
after that is call ceph_osdc_build_request(), and that doesn't
need those fields to be assigned.
Move the assignment of those fields out of ceph_osdc_new_request()
and into its caller. As a result, the page_align parameter is no
longer used, so get rid of it.
Note that in ceph_sync_write(), the value for req->r_num_pages had
already been calculated earlier (as num_pages, and fortunately
it was computed the same way). So don't bother recomputing it,
but because it's not needed earlier, move that calculation after the
call to ceph_osdc_new_request(). Hold off making the assignment to
r_alignment, doing it instead r_pages and r_num_pages are
getting set.
Similarly, in start_read(), nr_pages already holds the number of
pages in the array (and is calculated the same way), so there's no
need to recompute it. Move the assignment of the page alignment
down with the others there as well.
This and the next few patches are preparation work for:
http://tracker.ceph.com/issues/4127
Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
2013-03-02 08:00:15 +08:00
|
|
|
true);
|
2012-09-25 12:01:02 +08:00
|
|
|
if (IS_ERR(req))
|
|
|
|
return PTR_ERR(req);
|
2009-10-07 02:31:10 +08:00
|
|
|
|
|
|
|
/* it may be a short write due to an object boundary */
|
2013-04-16 03:50:36 +08:00
|
|
|
osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
|
2013-04-03 14:28:57 +08:00
|
|
|
false, false);
|
|
|
|
dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
|
2009-10-07 02:31:10 +08:00
|
|
|
|
2016-05-26 06:29:52 +08:00
|
|
|
req->r_mtime = *mtime;
|
2013-02-16 01:42:29 +08:00
|
|
|
rc = ceph_osdc_start_request(osdc, req, true);
|
2009-10-07 02:31:10 +08:00
|
|
|
if (!rc)
|
|
|
|
rc = ceph_osdc_wait_request(osdc, req);
|
|
|
|
|
|
|
|
ceph_osdc_put_request(req);
|
|
|
|
if (rc == 0)
|
|
|
|
rc = len;
|
|
|
|
dout("writepages result %d\n", rc);
|
|
|
|
return rc;
|
|
|
|
}
|
2010-04-07 06:14:15 +08:00
|
|
|
EXPORT_SYMBOL(ceph_osdc_writepages);
|
2009-10-07 02:31:10 +08:00
|
|
|
|
2013-05-02 01:43:04 +08:00
|
|
|
int ceph_osdc_setup(void)
|
|
|
|
{
|
2016-02-10 00:50:15 +08:00
|
|
|
size_t size = sizeof(struct ceph_osd_request) +
|
|
|
|
CEPH_OSD_SLAB_OPS * sizeof(struct ceph_osd_req_op);
|
|
|
|
|
2013-05-02 01:43:04 +08:00
|
|
|
BUG_ON(ceph_osd_request_cache);
|
2016-02-10 00:50:15 +08:00
|
|
|
ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", size,
|
|
|
|
0, 0, NULL);
|
2013-05-02 01:43:04 +08:00
|
|
|
|
|
|
|
return ceph_osd_request_cache ? 0 : -ENOMEM;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(ceph_osdc_setup);
|
|
|
|
|
|
|
|
void ceph_osdc_cleanup(void)
|
|
|
|
{
|
|
|
|
BUG_ON(!ceph_osd_request_cache);
|
|
|
|
kmem_cache_destroy(ceph_osd_request_cache);
|
|
|
|
ceph_osd_request_cache = NULL;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(ceph_osdc_cleanup);
|
|
|
|
|
2009-10-07 02:31:10 +08:00
|
|
|
/*
|
|
|
|
* handle incoming message
|
|
|
|
*/
|
|
|
|
static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
|
|
|
|
{
|
|
|
|
struct ceph_osd *osd = con->private;
|
2009-11-21 23:53:16 +08:00
|
|
|
struct ceph_osd_client *osdc;
|
2009-10-07 02:31:10 +08:00
|
|
|
int type = le16_to_cpu(msg->hdr.type);
|
|
|
|
|
|
|
|
if (!osd)
|
2010-06-14 01:27:53 +08:00
|
|
|
goto out;
|
2009-11-21 23:53:16 +08:00
|
|
|
osdc = osd->o_osdc;
|
2009-10-07 02:31:10 +08:00
|
|
|
|
|
|
|
switch (type) {
|
|
|
|
case CEPH_MSG_OSD_MAP:
|
|
|
|
ceph_osdc_handle_map(osdc, msg);
|
|
|
|
break;
|
|
|
|
case CEPH_MSG_OSD_OPREPLY:
|
2015-10-18 16:25:38 +08:00
|
|
|
handle_reply(osdc, msg);
|
2009-10-07 02:31:10 +08:00
|
|
|
break;
|
2011-03-22 06:07:16 +08:00
|
|
|
case CEPH_MSG_WATCH_NOTIFY:
|
|
|
|
handle_watch_notify(osdc, msg);
|
|
|
|
break;
|
2009-10-07 02:31:10 +08:00
|
|
|
|
|
|
|
default:
|
|
|
|
pr_err("received unknown message type %d %s\n", type,
|
|
|
|
ceph_msg_type_name(type));
|
|
|
|
}
|
2010-06-14 01:27:53 +08:00
|
|
|
out:
|
2009-10-07 02:31:10 +08:00
|
|
|
ceph_msg_put(msg);
|
|
|
|
}
|
|
|
|
|
2010-02-20 13:43:23 +08:00
|
|
|
/*
|
libceph: check data_len in ->alloc_msg()
Only ->alloc_msg() should check data_len of the incoming message
against the preallocated ceph_msg, doing it in the messenger is not
right. The contract is that either ->alloc_msg() returns a ceph_msg
which will fit all of the portions of the incoming message, or it
returns NULL and possibly sets skip, signaling whether NULL is due to
an -ENOMEM. ->alloc_msg() should be the only place where we make the
skip/no-skip decision.
I stumbled upon this while looking at con/osd ref counting. Right now,
if we get a non-extent message with a larger data portion than we are
prepared for, ->alloc_msg() returns a ceph_msg, and then, when we skip
it in the messenger, we don't put the con/osd ref acquired in
ceph_con_in_msg_alloc() (which is normally put in process_message()),
so this also fixes a memory leak.
An existing BUG_ON in ceph_msg_data_cursor_init() ensures we don't
corrupt random memory should a buggy ->alloc_msg() return an unfit
ceph_msg.
While at it, I changed the "unknown tid" dout() to a pr_warn() to make
sure all skips are seen and unified format strings.
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Alex Elder <elder@linaro.org>
2015-09-02 16:37:09 +08:00
|
|
|
* Lookup and return message for incoming reply. Don't try to do
|
|
|
|
* anything about a larger than preallocated data portion of the
|
|
|
|
* message at the moment - for now, just skip the message.
|
2010-02-20 13:43:23 +08:00
|
|
|
*/
|
|
|
|
static struct ceph_msg *get_reply(struct ceph_connection *con,
|
2010-01-09 05:58:34 +08:00
|
|
|
struct ceph_msg_header *hdr,
|
|
|
|
int *skip)
|
2009-10-07 02:31:10 +08:00
|
|
|
{
|
|
|
|
struct ceph_osd *osd = con->private;
|
|
|
|
struct ceph_osd_client *osdc = osd->o_osdc;
|
2010-01-09 05:58:34 +08:00
|
|
|
struct ceph_msg *m;
|
2010-01-12 06:47:13 +08:00
|
|
|
struct ceph_osd_request *req;
|
2014-01-10 02:08:21 +08:00
|
|
|
int front_len = le32_to_cpu(hdr->front_len);
|
2010-02-20 13:43:23 +08:00
|
|
|
int data_len = le32_to_cpu(hdr->data_len);
|
2010-01-12 06:47:13 +08:00
|
|
|
u64 tid;
|
2009-10-07 02:31:10 +08:00
|
|
|
|
2010-01-12 06:47:13 +08:00
|
|
|
tid = le64_to_cpu(hdr->tid);
|
|
|
|
mutex_lock(&osdc->request_mutex);
|
2016-04-28 22:07:22 +08:00
|
|
|
req = lookup_request(&osdc->requests, tid);
|
2010-01-12 06:47:13 +08:00
|
|
|
if (!req) {
|
2016-02-19 18:38:57 +08:00
|
|
|
dout("%s osd%d tid %llu unknown, skipping\n", __func__,
|
|
|
|
osd->o_osd, tid);
|
2010-01-12 06:47:13 +08:00
|
|
|
m = NULL;
|
libceph: check data_len in ->alloc_msg()
Only ->alloc_msg() should check data_len of the incoming message
against the preallocated ceph_msg, doing it in the messenger is not
right. The contract is that either ->alloc_msg() returns a ceph_msg
which will fit all of the portions of the incoming message, or it
returns NULL and possibly sets skip, signaling whether NULL is due to
an -ENOMEM. ->alloc_msg() should be the only place where we make the
skip/no-skip decision.
I stumbled upon this while looking at con/osd ref counting. Right now,
if we get a non-extent message with a larger data portion than we are
prepared for, ->alloc_msg() returns a ceph_msg, and then, when we skip
it in the messenger, we don't put the con/osd ref acquired in
ceph_con_in_msg_alloc() (which is normally put in process_message()),
so this also fixes a memory leak.
An existing BUG_ON in ceph_msg_data_cursor_init() ensures we don't
corrupt random memory should a buggy ->alloc_msg() return an unfit
ceph_msg.
While at it, I changed the "unknown tid" dout() to a pr_warn() to make
sure all skips are seen and unified format strings.
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Alex Elder <elder@linaro.org>
2015-09-02 16:37:09 +08:00
|
|
|
*skip = 1;
|
2010-01-12 06:47:13 +08:00
|
|
|
goto out;
|
|
|
|
}
|
2010-03-02 05:02:00 +08:00
|
|
|
|
2013-04-02 05:12:14 +08:00
|
|
|
ceph_msg_revoke_incoming(req->r_reply);
|
2010-01-12 06:47:13 +08:00
|
|
|
|
2014-01-10 02:08:21 +08:00
|
|
|
if (front_len > req->r_reply->front_alloc_len) {
|
libceph: check data_len in ->alloc_msg()
Only ->alloc_msg() should check data_len of the incoming message
against the preallocated ceph_msg, doing it in the messenger is not
right. The contract is that either ->alloc_msg() returns a ceph_msg
which will fit all of the portions of the incoming message, or it
returns NULL and possibly sets skip, signaling whether NULL is due to
an -ENOMEM. ->alloc_msg() should be the only place where we make the
skip/no-skip decision.
I stumbled upon this while looking at con/osd ref counting. Right now,
if we get a non-extent message with a larger data portion than we are
prepared for, ->alloc_msg() returns a ceph_msg, and then, when we skip
it in the messenger, we don't put the con/osd ref acquired in
ceph_con_in_msg_alloc() (which is normally put in process_message()),
so this also fixes a memory leak.
An existing BUG_ON in ceph_msg_data_cursor_init() ensures we don't
corrupt random memory should a buggy ->alloc_msg() return an unfit
ceph_msg.
While at it, I changed the "unknown tid" dout() to a pr_warn() to make
sure all skips are seen and unified format strings.
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Alex Elder <elder@linaro.org>
2015-09-02 16:37:09 +08:00
|
|
|
pr_warn("%s osd%d tid %llu front %d > preallocated %d\n",
|
|
|
|
__func__, osd->o_osd, req->r_tid, front_len,
|
|
|
|
req->r_reply->front_alloc_len);
|
2014-01-10 02:08:21 +08:00
|
|
|
m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
|
|
|
|
false);
|
2010-04-02 07:06:19 +08:00
|
|
|
if (!m)
|
2010-03-02 05:02:00 +08:00
|
|
|
goto out;
|
|
|
|
ceph_msg_put(req->r_reply);
|
|
|
|
req->r_reply = m;
|
|
|
|
}
|
2013-02-15 02:16:43 +08:00
|
|
|
|
libceph: check data_len in ->alloc_msg()
Only ->alloc_msg() should check data_len of the incoming message
against the preallocated ceph_msg, doing it in the messenger is not
right. The contract is that either ->alloc_msg() returns a ceph_msg
which will fit all of the portions of the incoming message, or it
returns NULL and possibly sets skip, signaling whether NULL is due to
an -ENOMEM. ->alloc_msg() should be the only place where we make the
skip/no-skip decision.
I stumbled upon this while looking at con/osd ref counting. Right now,
if we get a non-extent message with a larger data portion than we are
prepared for, ->alloc_msg() returns a ceph_msg, and then, when we skip
it in the messenger, we don't put the con/osd ref acquired in
ceph_con_in_msg_alloc() (which is normally put in process_message()),
so this also fixes a memory leak.
An existing BUG_ON in ceph_msg_data_cursor_init() ensures we don't
corrupt random memory should a buggy ->alloc_msg() return an unfit
ceph_msg.
While at it, I changed the "unknown tid" dout() to a pr_warn() to make
sure all skips are seen and unified format strings.
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Alex Elder <elder@linaro.org>
2015-09-02 16:37:09 +08:00
|
|
|
if (data_len > req->r_reply->data_length) {
|
|
|
|
pr_warn("%s osd%d tid %llu data %d > preallocated %zu, skipping\n",
|
|
|
|
__func__, osd->o_osd, req->r_tid, data_len,
|
|
|
|
req->r_reply->data_length);
|
|
|
|
m = NULL;
|
|
|
|
*skip = 1;
|
|
|
|
goto out;
|
2010-01-12 06:47:13 +08:00
|
|
|
}
|
libceph: check data_len in ->alloc_msg()
Only ->alloc_msg() should check data_len of the incoming message
against the preallocated ceph_msg, doing it in the messenger is not
right. The contract is that either ->alloc_msg() returns a ceph_msg
which will fit all of the portions of the incoming message, or it
returns NULL and possibly sets skip, signaling whether NULL is due to
an -ENOMEM. ->alloc_msg() should be the only place where we make the
skip/no-skip decision.
I stumbled upon this while looking at con/osd ref counting. Right now,
if we get a non-extent message with a larger data portion than we are
prepared for, ->alloc_msg() returns a ceph_msg, and then, when we skip
it in the messenger, we don't put the con/osd ref acquired in
ceph_con_in_msg_alloc() (which is normally put in process_message()),
so this also fixes a memory leak.
An existing BUG_ON in ceph_msg_data_cursor_init() ensures we don't
corrupt random memory should a buggy ->alloc_msg() return an unfit
ceph_msg.
While at it, I changed the "unknown tid" dout() to a pr_warn() to make
sure all skips are seen and unified format strings.
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Alex Elder <elder@linaro.org>
2015-09-02 16:37:09 +08:00
|
|
|
|
|
|
|
m = ceph_msg_get(req->r_reply);
|
2010-03-02 05:02:00 +08:00
|
|
|
dout("get_reply tid %lld %p\n", tid, m);
|
2010-01-12 06:47:13 +08:00
|
|
|
|
|
|
|
out:
|
|
|
|
mutex_unlock(&osdc->request_mutex);
|
2010-01-09 05:58:34 +08:00
|
|
|
return m;
|
2010-02-20 13:43:23 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static struct ceph_msg *alloc_msg(struct ceph_connection *con,
|
|
|
|
struct ceph_msg_header *hdr,
|
|
|
|
int *skip)
|
|
|
|
{
|
|
|
|
struct ceph_osd *osd = con->private;
|
|
|
|
int type = le16_to_cpu(hdr->type);
|
|
|
|
int front = le32_to_cpu(hdr->front_len);
|
|
|
|
|
2012-06-05 03:43:32 +08:00
|
|
|
*skip = 0;
|
2010-02-20 13:43:23 +08:00
|
|
|
switch (type) {
|
|
|
|
case CEPH_MSG_OSD_MAP:
|
2011-03-22 06:07:16 +08:00
|
|
|
case CEPH_MSG_WATCH_NOTIFY:
|
2011-08-10 06:03:46 +08:00
|
|
|
return ceph_msg_new(type, front, GFP_NOFS, false);
|
2010-02-20 13:43:23 +08:00
|
|
|
case CEPH_MSG_OSD_OPREPLY:
|
|
|
|
return get_reply(con, hdr, skip);
|
|
|
|
default:
|
|
|
|
pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
|
|
|
|
osd->o_osd);
|
|
|
|
*skip = 1;
|
|
|
|
return NULL;
|
|
|
|
}
|
2009-10-07 02:31:10 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Wrappers to refcount containing ceph_osd struct
|
|
|
|
*/
|
|
|
|
static struct ceph_connection *get_osd_con(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
struct ceph_osd *osd = con->private;
|
|
|
|
if (get_osd(osd))
|
|
|
|
return con;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void put_osd_con(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
struct ceph_osd *osd = con->private;
|
|
|
|
put_osd(osd);
|
|
|
|
}
|
|
|
|
|
2009-11-19 08:19:57 +08:00
|
|
|
/*
|
|
|
|
* authentication
|
|
|
|
*/
|
2012-05-17 04:16:39 +08:00
|
|
|
/*
|
|
|
|
* Note: returned pointer is the address of a structure that's
|
|
|
|
* managed separately. Caller must *not* attempt to free it.
|
|
|
|
*/
|
|
|
|
static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
|
2012-05-17 04:16:39 +08:00
|
|
|
int *proto, int force_new)
|
2009-11-19 08:19:57 +08:00
|
|
|
{
|
|
|
|
struct ceph_osd *o = con->private;
|
|
|
|
struct ceph_osd_client *osdc = o->o_osdc;
|
|
|
|
struct ceph_auth_client *ac = osdc->client->monc.auth;
|
2012-05-17 04:16:39 +08:00
|
|
|
struct ceph_auth_handshake *auth = &o->o_auth;
|
2009-11-19 08:19:57 +08:00
|
|
|
|
2012-05-17 04:16:39 +08:00
|
|
|
if (force_new && auth->authorizer) {
|
2016-04-12 01:34:49 +08:00
|
|
|
ceph_auth_destroy_authorizer(auth->authorizer);
|
2012-05-17 04:16:39 +08:00
|
|
|
auth->authorizer = NULL;
|
|
|
|
}
|
2013-03-26 01:26:14 +08:00
|
|
|
if (!auth->authorizer) {
|
|
|
|
int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
|
|
|
|
auth);
|
2009-11-19 08:19:57 +08:00
|
|
|
if (ret)
|
2012-05-17 04:16:39 +08:00
|
|
|
return ERR_PTR(ret);
|
2013-03-26 01:26:14 +08:00
|
|
|
} else {
|
|
|
|
int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
|
2013-03-26 01:26:01 +08:00
|
|
|
auth);
|
|
|
|
if (ret)
|
|
|
|
return ERR_PTR(ret);
|
2009-11-19 08:19:57 +08:00
|
|
|
}
|
|
|
|
*proto = ac->protocol;
|
2012-05-17 04:16:39 +08:00
|
|
|
|
2012-05-17 04:16:39 +08:00
|
|
|
return auth;
|
2009-11-19 08:19:57 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static int verify_authorizer_reply(struct ceph_connection *con, int len)
|
|
|
|
{
|
|
|
|
struct ceph_osd *o = con->private;
|
|
|
|
struct ceph_osd_client *osdc = o->o_osdc;
|
|
|
|
struct ceph_auth_client *ac = osdc->client->monc.auth;
|
|
|
|
|
2013-03-26 01:26:14 +08:00
|
|
|
return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer, len);
|
2009-11-19 08:19:57 +08:00
|
|
|
}
|
|
|
|
|
2010-02-03 08:21:06 +08:00
|
|
|
static int invalidate_authorizer(struct ceph_connection *con)
|
|
|
|
{
|
|
|
|
struct ceph_osd *o = con->private;
|
|
|
|
struct ceph_osd_client *osdc = o->o_osdc;
|
|
|
|
struct ceph_auth_client *ac = osdc->client->monc.auth;
|
|
|
|
|
2013-03-26 01:26:14 +08:00
|
|
|
ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
|
2010-02-03 08:21:06 +08:00
|
|
|
return ceph_monc_validate_auth(&osdc->client->monc);
|
|
|
|
}
|
2009-11-19 08:19:57 +08:00
|
|
|
|
2015-10-27 05:23:56 +08:00
|
|
|
static int osd_sign_message(struct ceph_msg *msg)
|
2014-11-04 16:33:37 +08:00
|
|
|
{
|
2015-10-27 05:23:56 +08:00
|
|
|
struct ceph_osd *o = msg->con->private;
|
2014-11-04 16:33:37 +08:00
|
|
|
struct ceph_auth_handshake *auth = &o->o_auth;
|
2015-10-27 05:23:56 +08:00
|
|
|
|
2014-11-04 16:33:37 +08:00
|
|
|
return ceph_auth_sign_message(auth, msg);
|
|
|
|
}
|
|
|
|
|
2015-10-27 05:23:56 +08:00
|
|
|
static int osd_check_message_signature(struct ceph_msg *msg)
|
2014-11-04 16:33:37 +08:00
|
|
|
{
|
2015-10-27 05:23:56 +08:00
|
|
|
struct ceph_osd *o = msg->con->private;
|
2014-11-04 16:33:37 +08:00
|
|
|
struct ceph_auth_handshake *auth = &o->o_auth;
|
2015-10-27 05:23:56 +08:00
|
|
|
|
2014-11-04 16:33:37 +08:00
|
|
|
return ceph_auth_check_message_signature(auth, msg);
|
|
|
|
}
|
|
|
|
|
2010-05-20 16:40:19 +08:00
|
|
|
static const struct ceph_connection_operations osd_con_ops = {
|
2009-10-07 02:31:10 +08:00
|
|
|
.get = get_osd_con,
|
|
|
|
.put = put_osd_con,
|
|
|
|
.dispatch = dispatch,
|
2009-11-19 08:19:57 +08:00
|
|
|
.get_authorizer = get_authorizer,
|
|
|
|
.verify_authorizer_reply = verify_authorizer_reply,
|
2010-02-03 08:21:06 +08:00
|
|
|
.invalidate_authorizer = invalidate_authorizer,
|
2009-10-07 02:31:10 +08:00
|
|
|
.alloc_msg = alloc_msg,
|
2015-10-27 05:23:56 +08:00
|
|
|
.sign_message = osd_sign_message,
|
|
|
|
.check_message_signature = osd_check_message_signature,
|
2009-10-10 01:29:18 +08:00
|
|
|
.fault = osd_reset,
|
2009-10-07 02:31:10 +08:00
|
|
|
};
|