mirror of
https://mirrors.bfsu.edu.cn/git/linux.git
synced 2024-11-30 07:34:12 +08:00
We have:
- a patch that removes crush_workspace_mutex (myself). CRUSH computations are no longer serialized and can run in parallel. - a couple new filesystem client metrics for "ceph fs top" command (Xiubo Li) - a fix for a very old messenger bug that affected the filesystem, marked for stable (myself) - assorted fixups and cleanups throughout the codebase from Jeff and others. -----BEGIN PGP SIGNATURE----- iQFHBAABCAAxFiEEydHwtzie9C7TfviiSn/eOAIR84sFAl+QN8cTHGlkcnlvbW92 QGdtYWlsLmNvbQAKCRBKf944AhHzi4iwCACLwUvO0ONTzUUb2D8ftfyXjo/jIod5 eO7RHfCHOP2A83GQpdYdktVDSVeOUIOiPhxAxo1dL4GieI2/saXrnoevam7ogZkA OmR4drdtRVUqF/aATrtjiDMg2ge0dnx5gfjMxSP/FiPPpXOdrtxng/7tv8yo+03q AlMqg/YcxO06t1M1qh9SEyfzjcHyPnJU2i0ienngxnGxQ7QiMOR6anF1LNhtN803 4fTBX2tLqDNAa+x5yF1kKSn9OmFNhc0oUsqef+Ck0Vw1LC0/SuxzE2J904iehAwy /HzJCer0zcp+eZhn3HhoHmp0UZpOC1dzMxa3CHyRJFrxCSTEhY8iqPiJ =wGr7 -----END PGP SIGNATURE----- Merge tag 'ceph-for-5.10-rc1' of git://github.com/ceph/ceph-client Pull ceph updates from Ilya Dryomov: - a patch that removes crush_workspace_mutex (myself). CRUSH computations are no longer serialized and can run in parallel. - a couple new filesystem client metrics for "ceph fs top" command (Xiubo Li) - a fix for a very old messenger bug that affected the filesystem, marked for stable (myself) - assorted fixups and cleanups throughout the codebase from Jeff and others. * tag 'ceph-for-5.10-rc1' of git://github.com/ceph/ceph-client: (27 commits) libceph: clear con->out_msg on Policy::stateful_server faults libceph: format ceph_entity_addr nonces as unsigned libceph: fix ENTITY_NAME format suggestion libceph: move a dout in queue_con_delay() ceph: comment cleanups and clarifications ceph: break up send_cap_msg ceph: drop separate mdsc argument from __send_cap ceph: promote to unsigned long long before shifting ceph: don't SetPageError on readpage errors ceph: mark ceph_fmt_xattr() as printf-like for better type checking ceph: fold ceph_update_writeable_page into ceph_write_begin ceph: fold ceph_sync_writepages into writepage_nounlock ceph: fold ceph_sync_readpages into ceph_readpage ceph: don't call ceph_update_writeable_page from page_mkwrite ceph: break out writeback of incompatible snap context to separate function ceph: add a note explaining session reject error string libceph: switch to the new "osd blocklist add" command libceph, rbd, ceph: "blacklist" -> "blocklist" ceph: have ceph_writepages_start call pagevec_lookup_range_tag ceph: use kill_anon_super helper ...
This commit is contained in:
commit
ed7cfefe44
@ -163,14 +163,14 @@ Mount Options
|
|||||||
to the default VFS implementation if this option is used.
|
to the default VFS implementation if this option is used.
|
||||||
|
|
||||||
recover_session=<no|clean>
|
recover_session=<no|clean>
|
||||||
Set auto reconnect mode in the case where the client is blacklisted. The
|
Set auto reconnect mode in the case where the client is blocklisted. The
|
||||||
available modes are "no" and "clean". The default is "no".
|
available modes are "no" and "clean". The default is "no".
|
||||||
|
|
||||||
* no: never attempt to reconnect when client detects that it has been
|
* no: never attempt to reconnect when client detects that it has been
|
||||||
blacklisted. Operations will generally fail after being blacklisted.
|
blocklisted. Operations will generally fail after being blocklisted.
|
||||||
|
|
||||||
* clean: client reconnects to the ceph cluster automatically when it
|
* clean: client reconnects to the ceph cluster automatically when it
|
||||||
detects that it has been blacklisted. During reconnect, client drops
|
detects that it has been blocklisted. During reconnect, client drops
|
||||||
dirty data/metadata, invalidates page caches and writable file handles.
|
dirty data/metadata, invalidates page caches and writable file handles.
|
||||||
After reconnect, file locks become stale because the MDS loses track
|
After reconnect, file locks become stale because the MDS loses track
|
||||||
of them. If an inode contains any stale file locks, read/write on the
|
of them. If an inode contains any stale file locks, read/write on the
|
||||||
|
@ -4010,10 +4010,10 @@ static int rbd_try_lock(struct rbd_device *rbd_dev)
|
|||||||
rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
|
rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
|
||||||
ENTITY_NAME(lockers[0].id.name));
|
ENTITY_NAME(lockers[0].id.name));
|
||||||
|
|
||||||
ret = ceph_monc_blacklist_add(&client->monc,
|
ret = ceph_monc_blocklist_add(&client->monc,
|
||||||
&lockers[0].info.addr);
|
&lockers[0].info.addr);
|
||||||
if (ret) {
|
if (ret) {
|
||||||
rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
|
rbd_warn(rbd_dev, "blocklist of %s%llu failed: %d",
|
||||||
ENTITY_NAME(lockers[0].id.name), ret);
|
ENTITY_NAME(lockers[0].id.name), ret);
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
@ -4077,7 +4077,7 @@ static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
|
|||||||
ret = rbd_try_lock(rbd_dev);
|
ret = rbd_try_lock(rbd_dev);
|
||||||
if (ret < 0) {
|
if (ret < 0) {
|
||||||
rbd_warn(rbd_dev, "failed to lock header: %d", ret);
|
rbd_warn(rbd_dev, "failed to lock header: %d", ret);
|
||||||
if (ret == -EBLACKLISTED)
|
if (ret == -EBLOCKLISTED)
|
||||||
goto out;
|
goto out;
|
||||||
|
|
||||||
ret = 1; /* request lock anyway */
|
ret = 1; /* request lock anyway */
|
||||||
@ -4613,7 +4613,7 @@ static void rbd_reregister_watch(struct work_struct *work)
|
|||||||
ret = __rbd_register_watch(rbd_dev);
|
ret = __rbd_register_watch(rbd_dev);
|
||||||
if (ret) {
|
if (ret) {
|
||||||
rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
|
rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
|
||||||
if (ret != -EBLACKLISTED && ret != -ENOENT) {
|
if (ret != -EBLOCKLISTED && ret != -ENOENT) {
|
||||||
queue_delayed_work(rbd_dev->task_wq,
|
queue_delayed_work(rbd_dev->task_wq,
|
||||||
&rbd_dev->watch_dwork,
|
&rbd_dev->watch_dwork,
|
||||||
RBD_RETRY_DELAY);
|
RBD_RETRY_DELAY);
|
||||||
|
416
fs/ceph/addr.c
416
fs/ceph/addr.c
@ -182,58 +182,15 @@ static int ceph_releasepage(struct page *page, gfp_t g)
|
|||||||
return !PagePrivate(page);
|
return !PagePrivate(page);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/* read a single page, without unlocking it. */
|
||||||
* Read some contiguous pages. If we cross a stripe boundary, shorten
|
|
||||||
* *plen. Return number of bytes read, or error.
|
|
||||||
*/
|
|
||||||
static int ceph_sync_readpages(struct ceph_fs_client *fsc,
|
|
||||||
struct ceph_vino vino,
|
|
||||||
struct ceph_file_layout *layout,
|
|
||||||
u64 off, u64 *plen,
|
|
||||||
u32 truncate_seq, u64 truncate_size,
|
|
||||||
struct page **pages, int num_pages,
|
|
||||||
int page_align)
|
|
||||||
{
|
|
||||||
struct ceph_osd_client *osdc = &fsc->client->osdc;
|
|
||||||
struct ceph_osd_request *req;
|
|
||||||
int rc = 0;
|
|
||||||
|
|
||||||
dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
|
|
||||||
vino.snap, off, *plen);
|
|
||||||
req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1,
|
|
||||||
CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
|
|
||||||
NULL, truncate_seq, truncate_size,
|
|
||||||
false);
|
|
||||||
if (IS_ERR(req))
|
|
||||||
return PTR_ERR(req);
|
|
||||||
|
|
||||||
/* it may be a short read due to an object boundary */
|
|
||||||
osd_req_op_extent_osd_data_pages(req, 0,
|
|
||||||
pages, *plen, page_align, false, false);
|
|
||||||
|
|
||||||
dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
|
|
||||||
off, *plen, *plen, page_align);
|
|
||||||
|
|
||||||
rc = ceph_osdc_start_request(osdc, req, false);
|
|
||||||
if (!rc)
|
|
||||||
rc = ceph_osdc_wait_request(osdc, req);
|
|
||||||
|
|
||||||
ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency,
|
|
||||||
req->r_end_latency, rc);
|
|
||||||
|
|
||||||
ceph_osdc_put_request(req);
|
|
||||||
dout("readpages result %d\n", rc);
|
|
||||||
return rc;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* read a single page, without unlocking it.
|
|
||||||
*/
|
|
||||||
static int ceph_do_readpage(struct file *filp, struct page *page)
|
static int ceph_do_readpage(struct file *filp, struct page *page)
|
||||||
{
|
{
|
||||||
struct inode *inode = file_inode(filp);
|
struct inode *inode = file_inode(filp);
|
||||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
|
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
|
||||||
|
struct ceph_osd_client *osdc = &fsc->client->osdc;
|
||||||
|
struct ceph_osd_request *req;
|
||||||
|
struct ceph_vino vino = ceph_vino(inode);
|
||||||
int err = 0;
|
int err = 0;
|
||||||
u64 off = page_offset(page);
|
u64 off = page_offset(page);
|
||||||
u64 len = PAGE_SIZE;
|
u64 len = PAGE_SIZE;
|
||||||
@ -260,19 +217,33 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
|
|||||||
if (err == 0)
|
if (err == 0)
|
||||||
return -EINPROGRESS;
|
return -EINPROGRESS;
|
||||||
|
|
||||||
dout("readpage inode %p file %p page %p index %lu\n",
|
dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n",
|
||||||
inode, filp, page, page->index);
|
vino.ino, vino.snap, filp, off, len, page, page->index);
|
||||||
err = ceph_sync_readpages(fsc, ceph_vino(inode),
|
req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, 0, 1,
|
||||||
&ci->i_layout, off, &len,
|
CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL,
|
||||||
ci->i_truncate_seq, ci->i_truncate_size,
|
ci->i_truncate_seq, ci->i_truncate_size,
|
||||||
&page, 1, 0);
|
false);
|
||||||
|
if (IS_ERR(req))
|
||||||
|
return PTR_ERR(req);
|
||||||
|
|
||||||
|
osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false);
|
||||||
|
|
||||||
|
err = ceph_osdc_start_request(osdc, req, false);
|
||||||
|
if (!err)
|
||||||
|
err = ceph_osdc_wait_request(osdc, req);
|
||||||
|
|
||||||
|
ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency,
|
||||||
|
req->r_end_latency, err);
|
||||||
|
|
||||||
|
ceph_osdc_put_request(req);
|
||||||
|
dout("readpage result %d\n", err);
|
||||||
|
|
||||||
if (err == -ENOENT)
|
if (err == -ENOENT)
|
||||||
err = 0;
|
err = 0;
|
||||||
if (err < 0) {
|
if (err < 0) {
|
||||||
SetPageError(page);
|
|
||||||
ceph_fscache_readpage_cancel(inode, page);
|
ceph_fscache_readpage_cancel(inode, page);
|
||||||
if (err == -EBLACKLISTED)
|
if (err == -EBLOCKLISTED)
|
||||||
fsc->blacklisted = true;
|
fsc->blocklisted = true;
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
if (err < PAGE_SIZE)
|
if (err < PAGE_SIZE)
|
||||||
@ -312,8 +283,8 @@ static void finish_read(struct ceph_osd_request *req)
|
|||||||
int i;
|
int i;
|
||||||
|
|
||||||
dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
|
dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
|
||||||
if (rc == -EBLACKLISTED)
|
if (rc == -EBLOCKLISTED)
|
||||||
ceph_inode_to_client(inode)->blacklisted = true;
|
ceph_inode_to_client(inode)->blocklisted = true;
|
||||||
|
|
||||||
/* unlock all pages, zeroing any data we didn't read */
|
/* unlock all pages, zeroing any data we didn't read */
|
||||||
osd_data = osd_req_op_extent_osd_data(req, 0);
|
osd_data = osd_req_op_extent_osd_data(req, 0);
|
||||||
@ -619,50 +590,6 @@ static u64 get_writepages_data_length(struct inode *inode,
|
|||||||
return end > start ? end - start : 0;
|
return end > start ? end - start : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* do a synchronous write on N pages
|
|
||||||
*/
|
|
||||||
static int ceph_sync_writepages(struct ceph_fs_client *fsc,
|
|
||||||
struct ceph_vino vino,
|
|
||||||
struct ceph_file_layout *layout,
|
|
||||||
struct ceph_snap_context *snapc,
|
|
||||||
u64 off, u64 len,
|
|
||||||
u32 truncate_seq, u64 truncate_size,
|
|
||||||
struct timespec64 *mtime,
|
|
||||||
struct page **pages, int num_pages)
|
|
||||||
{
|
|
||||||
struct ceph_osd_client *osdc = &fsc->client->osdc;
|
|
||||||
struct ceph_osd_request *req;
|
|
||||||
int rc = 0;
|
|
||||||
int page_align = off & ~PAGE_MASK;
|
|
||||||
|
|
||||||
req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
|
|
||||||
CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
|
|
||||||
snapc, truncate_seq, truncate_size,
|
|
||||||
true);
|
|
||||||
if (IS_ERR(req))
|
|
||||||
return PTR_ERR(req);
|
|
||||||
|
|
||||||
/* it may be a short write due to an object boundary */
|
|
||||||
osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
|
|
||||||
false, false);
|
|
||||||
dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
|
|
||||||
|
|
||||||
req->r_mtime = *mtime;
|
|
||||||
rc = ceph_osdc_start_request(osdc, req, true);
|
|
||||||
if (!rc)
|
|
||||||
rc = ceph_osdc_wait_request(osdc, req);
|
|
||||||
|
|
||||||
ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency,
|
|
||||||
req->r_end_latency, rc);
|
|
||||||
|
|
||||||
ceph_osdc_put_request(req);
|
|
||||||
if (rc == 0)
|
|
||||||
rc = len;
|
|
||||||
dout("writepages result %d\n", rc);
|
|
||||||
return rc;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Write a single page, but leave the page locked.
|
* Write a single page, but leave the page locked.
|
||||||
*
|
*
|
||||||
@ -671,20 +598,19 @@ static int ceph_sync_writepages(struct ceph_fs_client *fsc,
|
|||||||
*/
|
*/
|
||||||
static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
|
static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
|
||||||
{
|
{
|
||||||
struct inode *inode;
|
struct inode *inode = page->mapping->host;
|
||||||
struct ceph_inode_info *ci;
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
struct ceph_fs_client *fsc;
|
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
|
||||||
struct ceph_snap_context *snapc, *oldest;
|
struct ceph_snap_context *snapc, *oldest;
|
||||||
loff_t page_off = page_offset(page);
|
loff_t page_off = page_offset(page);
|
||||||
int err, len = PAGE_SIZE;
|
int err;
|
||||||
|
loff_t len = PAGE_SIZE;
|
||||||
struct ceph_writeback_ctl ceph_wbc;
|
struct ceph_writeback_ctl ceph_wbc;
|
||||||
|
struct ceph_osd_client *osdc = &fsc->client->osdc;
|
||||||
|
struct ceph_osd_request *req;
|
||||||
|
|
||||||
dout("writepage %p idx %lu\n", page, page->index);
|
dout("writepage %p idx %lu\n", page, page->index);
|
||||||
|
|
||||||
inode = page->mapping->host;
|
|
||||||
ci = ceph_inode(inode);
|
|
||||||
fsc = ceph_inode_to_client(inode);
|
|
||||||
|
|
||||||
/* verify this is a writeable snap context */
|
/* verify this is a writeable snap context */
|
||||||
snapc = page_snap_context(page);
|
snapc = page_snap_context(page);
|
||||||
if (!snapc) {
|
if (!snapc) {
|
||||||
@ -713,7 +639,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
|
|||||||
if (ceph_wbc.i_size < page_off + len)
|
if (ceph_wbc.i_size < page_off + len)
|
||||||
len = ceph_wbc.i_size - page_off;
|
len = ceph_wbc.i_size - page_off;
|
||||||
|
|
||||||
dout("writepage %p page %p index %lu on %llu~%u snapc %p seq %lld\n",
|
dout("writepage %p page %p index %lu on %llu~%llu snapc %p seq %lld\n",
|
||||||
inode, page, page->index, page_off, len, snapc, snapc->seq);
|
inode, page, page->index, page_off, len, snapc, snapc->seq);
|
||||||
|
|
||||||
if (atomic_long_inc_return(&fsc->writeback_count) >
|
if (atomic_long_inc_return(&fsc->writeback_count) >
|
||||||
@ -721,11 +647,33 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
|
|||||||
set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
|
set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
|
||||||
|
|
||||||
set_page_writeback(page);
|
set_page_writeback(page);
|
||||||
err = ceph_sync_writepages(fsc, ceph_vino(inode),
|
req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1,
|
||||||
&ci->i_layout, snapc, page_off, len,
|
CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc,
|
||||||
ceph_wbc.truncate_seq,
|
ceph_wbc.truncate_seq, ceph_wbc.truncate_size,
|
||||||
ceph_wbc.truncate_size,
|
true);
|
||||||
&inode->i_mtime, &page, 1);
|
if (IS_ERR(req)) {
|
||||||
|
redirty_page_for_writepage(wbc, page);
|
||||||
|
end_page_writeback(page);
|
||||||
|
return PTR_ERR(req);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* it may be a short write due to an object boundary */
|
||||||
|
WARN_ON_ONCE(len > PAGE_SIZE);
|
||||||
|
osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false);
|
||||||
|
dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len);
|
||||||
|
|
||||||
|
req->r_mtime = inode->i_mtime;
|
||||||
|
err = ceph_osdc_start_request(osdc, req, true);
|
||||||
|
if (!err)
|
||||||
|
err = ceph_osdc_wait_request(osdc, req);
|
||||||
|
|
||||||
|
ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency,
|
||||||
|
req->r_end_latency, err);
|
||||||
|
|
||||||
|
ceph_osdc_put_request(req);
|
||||||
|
if (err == 0)
|
||||||
|
err = len;
|
||||||
|
|
||||||
if (err < 0) {
|
if (err < 0) {
|
||||||
struct writeback_control tmp_wbc;
|
struct writeback_control tmp_wbc;
|
||||||
if (!wbc)
|
if (!wbc)
|
||||||
@ -737,8 +685,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
|
|||||||
end_page_writeback(page);
|
end_page_writeback(page);
|
||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
if (err == -EBLACKLISTED)
|
if (err == -EBLOCKLISTED)
|
||||||
fsc->blacklisted = true;
|
fsc->blocklisted = true;
|
||||||
dout("writepage setting page/mapping error %d %p\n",
|
dout("writepage setting page/mapping error %d %p\n",
|
||||||
err, page);
|
err, page);
|
||||||
mapping_set_error(&inode->i_data, err);
|
mapping_set_error(&inode->i_data, err);
|
||||||
@ -801,8 +749,8 @@ static void writepages_finish(struct ceph_osd_request *req)
|
|||||||
if (rc < 0) {
|
if (rc < 0) {
|
||||||
mapping_set_error(mapping, rc);
|
mapping_set_error(mapping, rc);
|
||||||
ceph_set_error_write(ci);
|
ceph_set_error_write(ci);
|
||||||
if (rc == -EBLACKLISTED)
|
if (rc == -EBLOCKLISTED)
|
||||||
fsc->blacklisted = true;
|
fsc->blocklisted = true;
|
||||||
} else {
|
} else {
|
||||||
ceph_clear_error_write(ci);
|
ceph_clear_error_write(ci);
|
||||||
}
|
}
|
||||||
@ -962,9 +910,8 @@ retry:
|
|||||||
max_pages = wsize >> PAGE_SHIFT;
|
max_pages = wsize >> PAGE_SHIFT;
|
||||||
|
|
||||||
get_more_pages:
|
get_more_pages:
|
||||||
pvec_pages = pagevec_lookup_range_nr_tag(&pvec, mapping, &index,
|
pvec_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
|
||||||
end, PAGECACHE_TAG_DIRTY,
|
end, PAGECACHE_TAG_DIRTY);
|
||||||
max_pages - locked_pages);
|
|
||||||
dout("pagevec_lookup_range_tag got %d\n", pvec_pages);
|
dout("pagevec_lookup_range_tag got %d\n", pvec_pages);
|
||||||
if (!pvec_pages && !locked_pages)
|
if (!pvec_pages && !locked_pages)
|
||||||
break;
|
break;
|
||||||
@ -1299,110 +1246,60 @@ static int context_is_writeable_or_written(struct inode *inode,
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/**
|
||||||
* We are only allowed to write into/dirty the page if the page is
|
* ceph_find_incompatible - find an incompatible context and return it
|
||||||
* clean, or already dirty within the same snap context.
|
* @page: page being dirtied
|
||||||
*
|
*
|
||||||
* called with page locked.
|
* We are only allowed to write into/dirty a page if the page is
|
||||||
* return success with page locked,
|
* clean, or already dirty within the same snap context. Returns a
|
||||||
* or any failure (incl -EAGAIN) with page unlocked.
|
* conflicting context if there is one, NULL if there isn't, or a
|
||||||
|
* negative error code on other errors.
|
||||||
|
*
|
||||||
|
* Must be called with page lock held.
|
||||||
*/
|
*/
|
||||||
static int ceph_update_writeable_page(struct file *file,
|
static struct ceph_snap_context *
|
||||||
loff_t pos, unsigned len,
|
ceph_find_incompatible(struct page *page)
|
||||||
struct page *page)
|
|
||||||
{
|
{
|
||||||
struct inode *inode = file_inode(file);
|
struct inode *inode = page->mapping->host;
|
||||||
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
|
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
|
||||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
loff_t page_off = pos & PAGE_MASK;
|
|
||||||
int pos_in_page = pos & ~PAGE_MASK;
|
|
||||||
int end_in_page = pos_in_page + len;
|
|
||||||
loff_t i_size;
|
|
||||||
int r;
|
|
||||||
struct ceph_snap_context *snapc, *oldest;
|
|
||||||
|
|
||||||
if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
|
if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
|
||||||
dout(" page %p forced umount\n", page);
|
dout(" page %p forced umount\n", page);
|
||||||
unlock_page(page);
|
return ERR_PTR(-EIO);
|
||||||
return -EIO;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
retry_locked:
|
for (;;) {
|
||||||
/* writepages currently holds page lock, but if we change that later, */
|
struct ceph_snap_context *snapc, *oldest;
|
||||||
wait_on_page_writeback(page);
|
|
||||||
|
wait_on_page_writeback(page);
|
||||||
|
|
||||||
|
snapc = page_snap_context(page);
|
||||||
|
if (!snapc || snapc == ci->i_head_snapc)
|
||||||
|
break;
|
||||||
|
|
||||||
snapc = page_snap_context(page);
|
|
||||||
if (snapc && snapc != ci->i_head_snapc) {
|
|
||||||
/*
|
/*
|
||||||
* this page is already dirty in another (older) snap
|
* this page is already dirty in another (older) snap
|
||||||
* context! is it writeable now?
|
* context! is it writeable now?
|
||||||
*/
|
*/
|
||||||
oldest = get_oldest_context(inode, NULL, NULL);
|
oldest = get_oldest_context(inode, NULL, NULL);
|
||||||
if (snapc->seq > oldest->seq) {
|
if (snapc->seq > oldest->seq) {
|
||||||
|
/* not writeable -- return it for the caller to deal with */
|
||||||
ceph_put_snap_context(oldest);
|
ceph_put_snap_context(oldest);
|
||||||
dout(" page %p snapc %p not current or oldest\n",
|
dout(" page %p snapc %p not current or oldest\n", page, snapc);
|
||||||
page, snapc);
|
return ceph_get_snap_context(snapc);
|
||||||
/*
|
|
||||||
* queue for writeback, and wait for snapc to
|
|
||||||
* be writeable or written
|
|
||||||
*/
|
|
||||||
snapc = ceph_get_snap_context(snapc);
|
|
||||||
unlock_page(page);
|
|
||||||
ceph_queue_writeback(inode);
|
|
||||||
r = wait_event_killable(ci->i_cap_wq,
|
|
||||||
context_is_writeable_or_written(inode, snapc));
|
|
||||||
ceph_put_snap_context(snapc);
|
|
||||||
if (r == -ERESTARTSYS)
|
|
||||||
return r;
|
|
||||||
return -EAGAIN;
|
|
||||||
}
|
}
|
||||||
ceph_put_snap_context(oldest);
|
ceph_put_snap_context(oldest);
|
||||||
|
|
||||||
/* yay, writeable, do it now (without dropping page lock) */
|
/* yay, writeable, do it now (without dropping page lock) */
|
||||||
dout(" page %p snapc %p not current, but oldest\n",
|
dout(" page %p snapc %p not current, but oldest\n", page, snapc);
|
||||||
page, snapc);
|
if (clear_page_dirty_for_io(page)) {
|
||||||
if (!clear_page_dirty_for_io(page))
|
int r = writepage_nounlock(page, NULL);
|
||||||
goto retry_locked;
|
if (r < 0)
|
||||||
r = writepage_nounlock(page, NULL);
|
return ERR_PTR(r);
|
||||||
if (r < 0)
|
}
|
||||||
goto fail_unlock;
|
|
||||||
goto retry_locked;
|
|
||||||
}
|
}
|
||||||
|
return NULL;
|
||||||
if (PageUptodate(page)) {
|
|
||||||
dout(" page %p already uptodate\n", page);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* full page? */
|
|
||||||
if (pos_in_page == 0 && len == PAGE_SIZE)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
/* past end of file? */
|
|
||||||
i_size = i_size_read(inode);
|
|
||||||
|
|
||||||
if (page_off >= i_size ||
|
|
||||||
(pos_in_page == 0 && (pos+len) >= i_size &&
|
|
||||||
end_in_page - pos_in_page != PAGE_SIZE)) {
|
|
||||||
dout(" zeroing %p 0 - %d and %d - %d\n",
|
|
||||||
page, pos_in_page, end_in_page, (int)PAGE_SIZE);
|
|
||||||
zero_user_segments(page,
|
|
||||||
0, pos_in_page,
|
|
||||||
end_in_page, PAGE_SIZE);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* we need to read it. */
|
|
||||||
r = ceph_do_readpage(file, page);
|
|
||||||
if (r < 0) {
|
|
||||||
if (r == -EINPROGRESS)
|
|
||||||
return -EAGAIN;
|
|
||||||
goto fail_unlock;
|
|
||||||
}
|
|
||||||
goto retry_locked;
|
|
||||||
fail_unlock:
|
|
||||||
unlock_page(page);
|
|
||||||
return r;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -1414,26 +1311,78 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
|
|||||||
struct page **pagep, void **fsdata)
|
struct page **pagep, void **fsdata)
|
||||||
{
|
{
|
||||||
struct inode *inode = file_inode(file);
|
struct inode *inode = file_inode(file);
|
||||||
struct page *page;
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
|
struct ceph_snap_context *snapc;
|
||||||
|
struct page *page = NULL;
|
||||||
pgoff_t index = pos >> PAGE_SHIFT;
|
pgoff_t index = pos >> PAGE_SHIFT;
|
||||||
int r;
|
int pos_in_page = pos & ~PAGE_MASK;
|
||||||
|
int r = 0;
|
||||||
|
|
||||||
do {
|
dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len);
|
||||||
/* get a page */
|
|
||||||
|
for (;;) {
|
||||||
page = grab_cache_page_write_begin(mapping, index, 0);
|
page = grab_cache_page_write_begin(mapping, index, 0);
|
||||||
if (!page)
|
if (!page) {
|
||||||
return -ENOMEM;
|
r = -ENOMEM;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
dout("write_begin file %p inode %p page %p %d~%d\n", file,
|
snapc = ceph_find_incompatible(page);
|
||||||
inode, page, (int)pos, (int)len);
|
if (snapc) {
|
||||||
|
if (IS_ERR(snapc)) {
|
||||||
r = ceph_update_writeable_page(file, pos, len, page);
|
r = PTR_ERR(snapc);
|
||||||
if (r < 0)
|
break;
|
||||||
|
}
|
||||||
|
unlock_page(page);
|
||||||
put_page(page);
|
put_page(page);
|
||||||
else
|
page = NULL;
|
||||||
*pagep = page;
|
ceph_queue_writeback(inode);
|
||||||
} while (r == -EAGAIN);
|
r = wait_event_killable(ci->i_cap_wq,
|
||||||
|
context_is_writeable_or_written(inode, snapc));
|
||||||
|
ceph_put_snap_context(snapc);
|
||||||
|
if (r != 0)
|
||||||
|
break;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (PageUptodate(page)) {
|
||||||
|
dout(" page %p already uptodate\n", page);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* In some cases we don't need to read at all:
|
||||||
|
* - full page write
|
||||||
|
* - write that lies completely beyond EOF
|
||||||
|
* - write that covers the the page from start to EOF or beyond it
|
||||||
|
*/
|
||||||
|
if ((pos_in_page == 0 && len == PAGE_SIZE) ||
|
||||||
|
(pos >= i_size_read(inode)) ||
|
||||||
|
(pos_in_page == 0 && (pos + len) >= i_size_read(inode))) {
|
||||||
|
zero_user_segments(page, 0, pos_in_page,
|
||||||
|
pos_in_page + len, PAGE_SIZE);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We need to read it. If we get back -EINPROGRESS, then the page was
|
||||||
|
* handed off to fscache and it will be unlocked when the read completes.
|
||||||
|
* Refind the page in that case so we can reacquire the page lock. Otherwise
|
||||||
|
* we got a hard error or the read was completed synchronously.
|
||||||
|
*/
|
||||||
|
r = ceph_do_readpage(file, page);
|
||||||
|
if (r != -EINPROGRESS)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (r < 0) {
|
||||||
|
if (page) {
|
||||||
|
unlock_page(page);
|
||||||
|
put_page(page);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
*pagep = page;
|
||||||
|
}
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1522,7 +1471,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
|
|||||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
struct ceph_file_info *fi = vma->vm_file->private_data;
|
struct ceph_file_info *fi = vma->vm_file->private_data;
|
||||||
struct page *pinned_page = NULL;
|
struct page *pinned_page = NULL;
|
||||||
loff_t off = vmf->pgoff << PAGE_SHIFT;
|
loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT;
|
||||||
int want, got, err;
|
int want, got, err;
|
||||||
sigset_t oldset;
|
sigset_t oldset;
|
||||||
vm_fault_t ret = VM_FAULT_SIGBUS;
|
vm_fault_t ret = VM_FAULT_SIGBUS;
|
||||||
@ -1668,6 +1617,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
|
|||||||
inode_inc_iversion_raw(inode);
|
inode_inc_iversion_raw(inode);
|
||||||
|
|
||||||
do {
|
do {
|
||||||
|
struct ceph_snap_context *snapc;
|
||||||
|
|
||||||
lock_page(page);
|
lock_page(page);
|
||||||
|
|
||||||
if (page_mkwrite_check_truncate(page, inode) < 0) {
|
if (page_mkwrite_check_truncate(page, inode) < 0) {
|
||||||
@ -1676,13 +1627,26 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
err = ceph_update_writeable_page(vma->vm_file, off, len, page);
|
snapc = ceph_find_incompatible(page);
|
||||||
if (err >= 0) {
|
if (!snapc) {
|
||||||
/* success. we'll keep the page locked. */
|
/* success. we'll keep the page locked. */
|
||||||
set_page_dirty(page);
|
set_page_dirty(page);
|
||||||
ret = VM_FAULT_LOCKED;
|
ret = VM_FAULT_LOCKED;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
} while (err == -EAGAIN);
|
|
||||||
|
unlock_page(page);
|
||||||
|
|
||||||
|
if (IS_ERR(snapc)) {
|
||||||
|
ret = VM_FAULT_SIGBUS;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
ceph_queue_writeback(inode);
|
||||||
|
err = wait_event_killable(ci->i_cap_wq,
|
||||||
|
context_is_writeable_or_written(inode, snapc));
|
||||||
|
ceph_put_snap_context(snapc);
|
||||||
|
} while (err == 0);
|
||||||
|
|
||||||
if (ret == VM_FAULT_LOCKED ||
|
if (ret == VM_FAULT_LOCKED ||
|
||||||
ci->i_inline_version != CEPH_INLINE_NONE) {
|
ci->i_inline_version != CEPH_INLINE_NONE) {
|
||||||
@ -2039,16 +2003,16 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
|
|||||||
if (err >= 0 || err == -ENOENT)
|
if (err >= 0 || err == -ENOENT)
|
||||||
have |= POOL_READ;
|
have |= POOL_READ;
|
||||||
else if (err != -EPERM) {
|
else if (err != -EPERM) {
|
||||||
if (err == -EBLACKLISTED)
|
if (err == -EBLOCKLISTED)
|
||||||
fsc->blacklisted = true;
|
fsc->blocklisted = true;
|
||||||
goto out_unlock;
|
goto out_unlock;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (err2 == 0 || err2 == -EEXIST)
|
if (err2 == 0 || err2 == -EEXIST)
|
||||||
have |= POOL_WRITE;
|
have |= POOL_WRITE;
|
||||||
else if (err2 != -EPERM) {
|
else if (err2 != -EPERM) {
|
||||||
if (err2 == -EBLACKLISTED)
|
if (err2 == -EBLOCKLISTED)
|
||||||
fsc->blacklisted = true;
|
fsc->blocklisted = true;
|
||||||
err = err2;
|
err = err2;
|
||||||
goto out_unlock;
|
goto out_unlock;
|
||||||
}
|
}
|
||||||
|
128
fs/ceph/caps.c
128
fs/ceph/caps.c
@ -1222,36 +1222,27 @@ struct cap_msg_args {
|
|||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Build and send a cap message to the given MDS.
|
* cap struct size + flock buffer size + inline version + inline data size +
|
||||||
*
|
* osd_epoch_barrier + oldest_flush_tid
|
||||||
* Caller should be holding s_mutex.
|
|
||||||
*/
|
*/
|
||||||
static int send_cap_msg(struct cap_msg_args *arg)
|
#define CAP_MSG_SIZE (sizeof(struct ceph_mds_caps) + \
|
||||||
|
4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4)
|
||||||
|
|
||||||
|
/* Marshal up the cap msg to the MDS */
|
||||||
|
static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
|
||||||
{
|
{
|
||||||
struct ceph_mds_caps *fc;
|
struct ceph_mds_caps *fc;
|
||||||
struct ceph_msg *msg;
|
|
||||||
void *p;
|
void *p;
|
||||||
size_t extra_len;
|
|
||||||
struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
|
struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
|
||||||
|
|
||||||
dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
|
dout("%s %s %llx %llx caps %s wanted %s dirty %s seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu xattr_ver %llu xattr_len %d\n",
|
||||||
" seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
|
__func__, ceph_cap_op_name(arg->op), arg->cid, arg->ino,
|
||||||
" xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(arg->op),
|
ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted),
|
||||||
arg->cid, arg->ino, ceph_cap_string(arg->caps),
|
ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq,
|
||||||
ceph_cap_string(arg->wanted), ceph_cap_string(arg->dirty),
|
arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows,
|
||||||
arg->seq, arg->issue_seq, arg->flush_tid, arg->oldest_flush_tid,
|
arg->size, arg->max_size, arg->xattr_version,
|
||||||
arg->mseq, arg->follows, arg->size, arg->max_size,
|
|
||||||
arg->xattr_version,
|
|
||||||
arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
|
arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
|
||||||
|
|
||||||
/* flock buffer size + inline version + inline data size +
|
|
||||||
* osd_epoch_barrier + oldest_flush_tid */
|
|
||||||
extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4;
|
|
||||||
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len,
|
|
||||||
GFP_NOFS, false);
|
|
||||||
if (!msg)
|
|
||||||
return -ENOMEM;
|
|
||||||
|
|
||||||
msg->hdr.version = cpu_to_le16(10);
|
msg->hdr.version = cpu_to_le16(10);
|
||||||
msg->hdr.tid = cpu_to_le64(arg->flush_tid);
|
msg->hdr.tid = cpu_to_le64(arg->flush_tid);
|
||||||
|
|
||||||
@ -1323,9 +1314,6 @@ static int send_cap_msg(struct cap_msg_args *arg)
|
|||||||
|
|
||||||
/* Advisory flags (version 10) */
|
/* Advisory flags (version 10) */
|
||||||
ceph_encode_32(&p, arg->flags);
|
ceph_encode_32(&p, arg->flags);
|
||||||
|
|
||||||
ceph_con_send(&arg->session->s_con, msg);
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -1454,25 +1442,25 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
|
|||||||
*
|
*
|
||||||
* Caller should hold snap_rwsem (read), s_mutex.
|
* Caller should hold snap_rwsem (read), s_mutex.
|
||||||
*/
|
*/
|
||||||
static void __send_cap(struct ceph_mds_client *mdsc, struct cap_msg_args *arg,
|
static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)
|
||||||
struct ceph_inode_info *ci)
|
|
||||||
{
|
{
|
||||||
|
struct ceph_msg *msg;
|
||||||
struct inode *inode = &ci->vfs_inode;
|
struct inode *inode = &ci->vfs_inode;
|
||||||
int ret;
|
|
||||||
|
|
||||||
ret = send_cap_msg(arg);
|
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
|
||||||
if (ret < 0) {
|
if (!msg) {
|
||||||
pr_err("error sending cap msg, ino (%llx.%llx) "
|
pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n",
|
||||||
"flushing %s tid %llu, requeue\n",
|
|
||||||
ceph_vinop(inode), ceph_cap_string(arg->dirty),
|
ceph_vinop(inode), ceph_cap_string(arg->dirty),
|
||||||
arg->flush_tid);
|
arg->flush_tid);
|
||||||
spin_lock(&ci->i_ceph_lock);
|
spin_lock(&ci->i_ceph_lock);
|
||||||
__cap_delay_requeue(mdsc, ci);
|
__cap_delay_requeue(arg->session->s_mdsc, ci);
|
||||||
spin_unlock(&ci->i_ceph_lock);
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
encode_cap_msg(msg, arg);
|
||||||
|
ceph_con_send(&arg->session->s_con, msg);
|
||||||
ceph_buffer_put(arg->old_xattr_buf);
|
ceph_buffer_put(arg->old_xattr_buf);
|
||||||
|
|
||||||
if (arg->wake)
|
if (arg->wake)
|
||||||
wake_up_all(&ci->i_cap_wq);
|
wake_up_all(&ci->i_cap_wq);
|
||||||
}
|
}
|
||||||
@ -1483,6 +1471,11 @@ static inline int __send_flush_snap(struct inode *inode,
|
|||||||
u32 mseq, u64 oldest_flush_tid)
|
u32 mseq, u64 oldest_flush_tid)
|
||||||
{
|
{
|
||||||
struct cap_msg_args arg;
|
struct cap_msg_args arg;
|
||||||
|
struct ceph_msg *msg;
|
||||||
|
|
||||||
|
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
|
||||||
|
if (!msg)
|
||||||
|
return -ENOMEM;
|
||||||
|
|
||||||
arg.session = session;
|
arg.session = session;
|
||||||
arg.ino = ceph_vino(inode).ino;
|
arg.ino = ceph_vino(inode).ino;
|
||||||
@ -1521,7 +1514,9 @@ static inline int __send_flush_snap(struct inode *inode,
|
|||||||
arg.flags = 0;
|
arg.flags = 0;
|
||||||
arg.wake = false;
|
arg.wake = false;
|
||||||
|
|
||||||
return send_cap_msg(&arg);
|
encode_cap_msg(msg, &arg);
|
||||||
|
ceph_con_send(&arg.session->s_con, msg);
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -1906,9 +1901,8 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
|
|||||||
void ceph_check_caps(struct ceph_inode_info *ci, int flags,
|
void ceph_check_caps(struct ceph_inode_info *ci, int flags,
|
||||||
struct ceph_mds_session *session)
|
struct ceph_mds_session *session)
|
||||||
{
|
{
|
||||||
struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
|
|
||||||
struct ceph_mds_client *mdsc = fsc->mdsc;
|
|
||||||
struct inode *inode = &ci->vfs_inode;
|
struct inode *inode = &ci->vfs_inode;
|
||||||
|
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
|
||||||
struct ceph_cap *cap;
|
struct ceph_cap *cap;
|
||||||
u64 flush_tid, oldest_flush_tid;
|
u64 flush_tid, oldest_flush_tid;
|
||||||
int file_wanted, used, cap_used;
|
int file_wanted, used, cap_used;
|
||||||
@ -1928,12 +1922,24 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
|
|||||||
retry:
|
retry:
|
||||||
spin_lock(&ci->i_ceph_lock);
|
spin_lock(&ci->i_ceph_lock);
|
||||||
retry_locked:
|
retry_locked:
|
||||||
|
/* Caps wanted by virtue of active open files. */
|
||||||
file_wanted = __ceph_caps_file_wanted(ci);
|
file_wanted = __ceph_caps_file_wanted(ci);
|
||||||
|
|
||||||
|
/* Caps which have active references against them */
|
||||||
used = __ceph_caps_used(ci);
|
used = __ceph_caps_used(ci);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* "issued" represents the current caps that the MDS wants us to have.
|
||||||
|
* "implemented" is the set that we have been granted, and includes the
|
||||||
|
* ones that have not yet been returned to the MDS (the "revoking" set,
|
||||||
|
* usually because they have outstanding references).
|
||||||
|
*/
|
||||||
issued = __ceph_caps_issued(ci, &implemented);
|
issued = __ceph_caps_issued(ci, &implemented);
|
||||||
revoking = implemented & ~issued;
|
revoking = implemented & ~issued;
|
||||||
|
|
||||||
want = file_wanted;
|
want = file_wanted;
|
||||||
|
|
||||||
|
/* The ones we currently want to retain (may be adjusted below) */
|
||||||
retain = file_wanted | used | CEPH_CAP_PIN;
|
retain = file_wanted | used | CEPH_CAP_PIN;
|
||||||
if (!mdsc->stopping && inode->i_nlink > 0) {
|
if (!mdsc->stopping && inode->i_nlink > 0) {
|
||||||
if (file_wanted) {
|
if (file_wanted) {
|
||||||
@ -2011,6 +2017,10 @@ retry_locked:
|
|||||||
|
|
||||||
/* NOTE: no side-effects allowed, until we take s_mutex */
|
/* NOTE: no side-effects allowed, until we take s_mutex */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If we have an auth cap, we don't need to consider any
|
||||||
|
* overlapping caps as used.
|
||||||
|
*/
|
||||||
cap_used = used;
|
cap_used = used;
|
||||||
if (ci->i_auth_cap && cap != ci->i_auth_cap)
|
if (ci->i_auth_cap && cap != ci->i_auth_cap)
|
||||||
cap_used &= ~ci->i_auth_cap->issued;
|
cap_used &= ~ci->i_auth_cap->issued;
|
||||||
@ -2148,7 +2158,7 @@ ack:
|
|||||||
want, retain, flushing, flush_tid, oldest_flush_tid);
|
want, retain, flushing, flush_tid, oldest_flush_tid);
|
||||||
spin_unlock(&ci->i_ceph_lock);
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
|
|
||||||
__send_cap(mdsc, &arg, ci);
|
__send_cap(&arg, ci);
|
||||||
|
|
||||||
goto retry; /* retake i_ceph_lock and restart our cap scan. */
|
goto retry; /* retake i_ceph_lock and restart our cap scan. */
|
||||||
}
|
}
|
||||||
@ -2222,7 +2232,7 @@ retry_locked:
|
|||||||
flushing, flush_tid, oldest_flush_tid);
|
flushing, flush_tid, oldest_flush_tid);
|
||||||
spin_unlock(&ci->i_ceph_lock);
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
|
|
||||||
__send_cap(mdsc, &arg, ci);
|
__send_cap(&arg, ci);
|
||||||
} else {
|
} else {
|
||||||
if (!list_empty(&ci->i_cap_flush_list)) {
|
if (!list_empty(&ci->i_cap_flush_list)) {
|
||||||
struct ceph_cap_flush *cf =
|
struct ceph_cap_flush *cf =
|
||||||
@ -2436,7 +2446,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
|
|||||||
(cap->issued | cap->implemented),
|
(cap->issued | cap->implemented),
|
||||||
cf->caps, cf->tid, oldest_flush_tid);
|
cf->caps, cf->tid, oldest_flush_tid);
|
||||||
spin_unlock(&ci->i_ceph_lock);
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
__send_cap(mdsc, &arg, ci);
|
__send_cap(&arg, ci);
|
||||||
} else {
|
} else {
|
||||||
struct ceph_cap_snap *capsnap =
|
struct ceph_cap_snap *capsnap =
|
||||||
container_of(cf, struct ceph_cap_snap,
|
container_of(cf, struct ceph_cap_snap,
|
||||||
@ -4284,13 +4294,30 @@ void __ceph_touch_fmode(struct ceph_inode_info *ci,
|
|||||||
|
|
||||||
void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
|
void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
|
||||||
{
|
{
|
||||||
int i;
|
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
|
||||||
int bits = (fmode << 1) | 1;
|
int bits = (fmode << 1) | 1;
|
||||||
|
bool is_opened = false;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
if (count == 1)
|
||||||
|
atomic64_inc(&mdsc->metric.opened_files);
|
||||||
|
|
||||||
spin_lock(&ci->i_ceph_lock);
|
spin_lock(&ci->i_ceph_lock);
|
||||||
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
|
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
|
||||||
if (bits & (1 << i))
|
if (bits & (1 << i))
|
||||||
ci->i_nr_by_mode[i] += count;
|
ci->i_nr_by_mode[i] += count;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If any of the mode ref is larger than 1,
|
||||||
|
* that means it has been already opened by
|
||||||
|
* others. Just skip checking the PIN ref.
|
||||||
|
*/
|
||||||
|
if (i && ci->i_nr_by_mode[i] > 1)
|
||||||
|
is_opened = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!is_opened)
|
||||||
|
percpu_counter_inc(&mdsc->metric.opened_inodes);
|
||||||
spin_unlock(&ci->i_ceph_lock);
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -4301,15 +4328,32 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
|
|||||||
*/
|
*/
|
||||||
void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
|
void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
|
||||||
{
|
{
|
||||||
int i;
|
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
|
||||||
int bits = (fmode << 1) | 1;
|
int bits = (fmode << 1) | 1;
|
||||||
|
bool is_closed = true;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
if (count == 1)
|
||||||
|
atomic64_dec(&mdsc->metric.opened_files);
|
||||||
|
|
||||||
spin_lock(&ci->i_ceph_lock);
|
spin_lock(&ci->i_ceph_lock);
|
||||||
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
|
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
|
||||||
if (bits & (1 << i)) {
|
if (bits & (1 << i)) {
|
||||||
BUG_ON(ci->i_nr_by_mode[i] < count);
|
BUG_ON(ci->i_nr_by_mode[i] < count);
|
||||||
ci->i_nr_by_mode[i] -= count;
|
ci->i_nr_by_mode[i] -= count;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If any of the mode ref is not 0 after
|
||||||
|
* decreased, that means it is still opened
|
||||||
|
* by others. Just skip checking the PIN ref.
|
||||||
|
*/
|
||||||
|
if (i && ci->i_nr_by_mode[i])
|
||||||
|
is_closed = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (is_closed)
|
||||||
|
percpu_counter_dec(&mdsc->metric.opened_inodes);
|
||||||
spin_unlock(&ci->i_ceph_lock);
|
spin_unlock(&ci->i_ceph_lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -148,6 +148,17 @@ static int metric_show(struct seq_file *s, void *p)
|
|||||||
int nr_caps = 0;
|
int nr_caps = 0;
|
||||||
s64 total, sum, avg, min, max, sq;
|
s64 total, sum, avg, min, max, sq;
|
||||||
|
|
||||||
|
sum = percpu_counter_sum(&m->total_inodes);
|
||||||
|
seq_printf(s, "item total\n");
|
||||||
|
seq_printf(s, "------------------------------------------\n");
|
||||||
|
seq_printf(s, "%-35s%lld / %lld\n", "opened files / total inodes",
|
||||||
|
atomic64_read(&m->opened_files), sum);
|
||||||
|
seq_printf(s, "%-35s%lld / %lld\n", "pinned i_caps / total inodes",
|
||||||
|
atomic64_read(&m->total_caps), sum);
|
||||||
|
seq_printf(s, "%-35s%lld / %lld\n", "opened inodes / total inodes",
|
||||||
|
percpu_counter_sum(&m->opened_inodes), sum);
|
||||||
|
|
||||||
|
seq_printf(s, "\n");
|
||||||
seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n");
|
seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n");
|
||||||
seq_printf(s, "-----------------------------------------------------------------------------------\n");
|
seq_printf(s, "-----------------------------------------------------------------------------------\n");
|
||||||
|
|
||||||
@ -202,7 +213,8 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
|
|||||||
{
|
{
|
||||||
struct seq_file *s = p;
|
struct seq_file *s = p;
|
||||||
|
|
||||||
seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_ino(inode),
|
seq_printf(s, "0x%-17llx%-3d%-17s%-17s\n", ceph_ino(inode),
|
||||||
|
cap->session->s_mds,
|
||||||
ceph_cap_string(cap->issued),
|
ceph_cap_string(cap->issued),
|
||||||
ceph_cap_string(cap->implemented));
|
ceph_cap_string(cap->implemented));
|
||||||
return 0;
|
return 0;
|
||||||
@ -222,8 +234,8 @@ static int caps_show(struct seq_file *s, void *p)
|
|||||||
"reserved\t%d\n"
|
"reserved\t%d\n"
|
||||||
"min\t\t%d\n\n",
|
"min\t\t%d\n\n",
|
||||||
total, avail, used, reserved, min);
|
total, avail, used, reserved, min);
|
||||||
seq_printf(s, "ino issued implemented\n");
|
seq_printf(s, "ino mds issued implemented\n");
|
||||||
seq_printf(s, "-----------------------------------------------\n");
|
seq_printf(s, "--------------------------------------------------\n");
|
||||||
|
|
||||||
mutex_lock(&mdsc->mutex);
|
mutex_lock(&mdsc->mutex);
|
||||||
for (i = 0; i < mdsc->max_sessions; i++) {
|
for (i = 0; i < mdsc->max_sessions; i++) {
|
||||||
|
@ -38,8 +38,7 @@ static int __dir_lease_try_check(const struct dentry *dentry);
|
|||||||
static int ceph_d_init(struct dentry *dentry)
|
static int ceph_d_init(struct dentry *dentry)
|
||||||
{
|
{
|
||||||
struct ceph_dentry_info *di;
|
struct ceph_dentry_info *di;
|
||||||
struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
|
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dentry->d_sb);
|
||||||
struct ceph_mds_client *mdsc = fsc->mdsc;
|
|
||||||
|
|
||||||
di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL);
|
di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL);
|
||||||
if (!di)
|
if (!di)
|
||||||
@ -738,7 +737,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
|
|||||||
unsigned int flags)
|
unsigned int flags)
|
||||||
{
|
{
|
||||||
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
|
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
|
||||||
struct ceph_mds_client *mdsc = fsc->mdsc;
|
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
|
||||||
struct ceph_mds_request *req;
|
struct ceph_mds_request *req;
|
||||||
int op;
|
int op;
|
||||||
int mask;
|
int mask;
|
||||||
@ -827,8 +826,7 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
|
|||||||
static int ceph_mknod(struct inode *dir, struct dentry *dentry,
|
static int ceph_mknod(struct inode *dir, struct dentry *dentry,
|
||||||
umode_t mode, dev_t rdev)
|
umode_t mode, dev_t rdev)
|
||||||
{
|
{
|
||||||
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
|
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
|
||||||
struct ceph_mds_client *mdsc = fsc->mdsc;
|
|
||||||
struct ceph_mds_request *req;
|
struct ceph_mds_request *req;
|
||||||
struct ceph_acl_sec_ctx as_ctx = {};
|
struct ceph_acl_sec_ctx as_ctx = {};
|
||||||
int err;
|
int err;
|
||||||
@ -889,8 +887,7 @@ static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode,
|
|||||||
static int ceph_symlink(struct inode *dir, struct dentry *dentry,
|
static int ceph_symlink(struct inode *dir, struct dentry *dentry,
|
||||||
const char *dest)
|
const char *dest)
|
||||||
{
|
{
|
||||||
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
|
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
|
||||||
struct ceph_mds_client *mdsc = fsc->mdsc;
|
|
||||||
struct ceph_mds_request *req;
|
struct ceph_mds_request *req;
|
||||||
struct ceph_acl_sec_ctx as_ctx = {};
|
struct ceph_acl_sec_ctx as_ctx = {};
|
||||||
int err;
|
int err;
|
||||||
@ -942,8 +939,7 @@ out:
|
|||||||
|
|
||||||
static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
|
static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
|
||||||
{
|
{
|
||||||
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
|
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
|
||||||
struct ceph_mds_client *mdsc = fsc->mdsc;
|
|
||||||
struct ceph_mds_request *req;
|
struct ceph_mds_request *req;
|
||||||
struct ceph_acl_sec_ctx as_ctx = {};
|
struct ceph_acl_sec_ctx as_ctx = {};
|
||||||
int err = -EROFS;
|
int err = -EROFS;
|
||||||
@ -1010,8 +1006,7 @@ out:
|
|||||||
static int ceph_link(struct dentry *old_dentry, struct inode *dir,
|
static int ceph_link(struct dentry *old_dentry, struct inode *dir,
|
||||||
struct dentry *dentry)
|
struct dentry *dentry)
|
||||||
{
|
{
|
||||||
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
|
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
|
||||||
struct ceph_mds_client *mdsc = fsc->mdsc;
|
|
||||||
struct ceph_mds_request *req;
|
struct ceph_mds_request *req;
|
||||||
int err;
|
int err;
|
||||||
|
|
||||||
@ -1192,8 +1187,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
|
|||||||
struct inode *new_dir, struct dentry *new_dentry,
|
struct inode *new_dir, struct dentry *new_dentry,
|
||||||
unsigned int flags)
|
unsigned int flags)
|
||||||
{
|
{
|
||||||
struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
|
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old_dir->i_sb);
|
||||||
struct ceph_mds_client *mdsc = fsc->mdsc;
|
|
||||||
struct ceph_mds_request *req;
|
struct ceph_mds_request *req;
|
||||||
int op = CEPH_MDS_OP_RENAME;
|
int op = CEPH_MDS_OP_RENAME;
|
||||||
int err;
|
int err;
|
||||||
|
@ -182,8 +182,7 @@ static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty)
|
|||||||
static struct ceph_mds_request *
|
static struct ceph_mds_request *
|
||||||
prepare_open_request(struct super_block *sb, int flags, int create_mode)
|
prepare_open_request(struct super_block *sb, int flags, int create_mode)
|
||||||
{
|
{
|
||||||
struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
|
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
|
||||||
struct ceph_mds_client *mdsc = fsc->mdsc;
|
|
||||||
struct ceph_mds_request *req;
|
struct ceph_mds_request *req;
|
||||||
int want_auth = USE_ANY_MDS;
|
int want_auth = USE_ANY_MDS;
|
||||||
int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
|
int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
|
||||||
@ -256,8 +255,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
|
|||||||
case S_IFDIR:
|
case S_IFDIR:
|
||||||
ret = ceph_init_file_info(inode, file, fmode,
|
ret = ceph_init_file_info(inode, file, fmode,
|
||||||
S_ISDIR(inode->i_mode));
|
S_ISDIR(inode->i_mode));
|
||||||
if (ret)
|
|
||||||
return ret;
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case S_IFLNK:
|
case S_IFLNK:
|
||||||
@ -285,7 +282,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
|
|||||||
*/
|
*/
|
||||||
int ceph_renew_caps(struct inode *inode, int fmode)
|
int ceph_renew_caps(struct inode *inode, int fmode)
|
||||||
{
|
{
|
||||||
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
|
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
|
||||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
struct ceph_mds_request *req;
|
struct ceph_mds_request *req;
|
||||||
int err, flags, wanted;
|
int err, flags, wanted;
|
||||||
@ -865,6 +862,8 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
|
|||||||
size_t page_off;
|
size_t page_off;
|
||||||
u64 i_size;
|
u64 i_size;
|
||||||
bool more;
|
bool more;
|
||||||
|
int idx;
|
||||||
|
size_t left;
|
||||||
|
|
||||||
req = ceph_osdc_new_request(osdc, &ci->i_layout,
|
req = ceph_osdc_new_request(osdc, &ci->i_layout,
|
||||||
ci->i_vino, off, &len, 0, 1,
|
ci->i_vino, off, &len, 0, 1,
|
||||||
@ -878,29 +877,13 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
|
|||||||
|
|
||||||
more = len < iov_iter_count(to);
|
more = len < iov_iter_count(to);
|
||||||
|
|
||||||
if (unlikely(iov_iter_is_pipe(to))) {
|
num_pages = calc_pages_for(off, len);
|
||||||
ret = iov_iter_get_pages_alloc(to, &pages, len,
|
page_off = off & ~PAGE_MASK;
|
||||||
&page_off);
|
pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
|
||||||
if (ret <= 0) {
|
if (IS_ERR(pages)) {
|
||||||
ceph_osdc_put_request(req);
|
ceph_osdc_put_request(req);
|
||||||
ret = -ENOMEM;
|
ret = PTR_ERR(pages);
|
||||||
break;
|
break;
|
||||||
}
|
|
||||||
num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
|
|
||||||
if (ret < len) {
|
|
||||||
len = ret;
|
|
||||||
osd_req_op_extent_update(req, 0, len);
|
|
||||||
more = false;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
num_pages = calc_pages_for(off, len);
|
|
||||||
page_off = off & ~PAGE_MASK;
|
|
||||||
pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
|
|
||||||
if (IS_ERR(pages)) {
|
|
||||||
ceph_osdc_put_request(req);
|
|
||||||
ret = PTR_ERR(pages);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
|
osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
|
||||||
@ -931,36 +914,27 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
|
|||||||
ret += zlen;
|
ret += zlen;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (unlikely(iov_iter_is_pipe(to))) {
|
idx = 0;
|
||||||
if (ret > 0) {
|
left = ret > 0 ? ret : 0;
|
||||||
iov_iter_advance(to, ret);
|
while (left > 0) {
|
||||||
off += ret;
|
size_t len, copied;
|
||||||
} else {
|
page_off = off & ~PAGE_MASK;
|
||||||
iov_iter_advance(to, 0);
|
len = min_t(size_t, left, PAGE_SIZE - page_off);
|
||||||
|
SetPageUptodate(pages[idx]);
|
||||||
|
copied = copy_page_to_iter(pages[idx++],
|
||||||
|
page_off, len, to);
|
||||||
|
off += copied;
|
||||||
|
left -= copied;
|
||||||
|
if (copied < len) {
|
||||||
|
ret = -EFAULT;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
ceph_put_page_vector(pages, num_pages, false);
|
|
||||||
} else {
|
|
||||||
int idx = 0;
|
|
||||||
size_t left = ret > 0 ? ret : 0;
|
|
||||||
while (left > 0) {
|
|
||||||
size_t len, copied;
|
|
||||||
page_off = off & ~PAGE_MASK;
|
|
||||||
len = min_t(size_t, left, PAGE_SIZE - page_off);
|
|
||||||
copied = copy_page_to_iter(pages[idx++],
|
|
||||||
page_off, len, to);
|
|
||||||
off += copied;
|
|
||||||
left -= copied;
|
|
||||||
if (copied < len) {
|
|
||||||
ret = -EFAULT;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ceph_release_page_vector(pages, num_pages);
|
|
||||||
}
|
}
|
||||||
|
ceph_release_page_vector(pages, num_pages);
|
||||||
|
|
||||||
if (ret < 0) {
|
if (ret < 0) {
|
||||||
if (ret == -EBLACKLISTED)
|
if (ret == -EBLOCKLISTED)
|
||||||
fsc->blacklisted = true;
|
fsc->blocklisted = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1052,8 +1026,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
|
|||||||
struct inode *inode = req->r_inode;
|
struct inode *inode = req->r_inode;
|
||||||
struct ceph_aio_request *aio_req = req->r_priv;
|
struct ceph_aio_request *aio_req = req->r_priv;
|
||||||
struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
|
struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
|
||||||
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
|
struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric;
|
||||||
struct ceph_client_metric *metric = &fsc->mdsc->metric;
|
|
||||||
|
|
||||||
BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
|
BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
|
||||||
BUG_ON(!osd_data->num_bvecs);
|
BUG_ON(!osd_data->num_bvecs);
|
||||||
|
@ -42,10 +42,13 @@ static void ceph_inode_work(struct work_struct *work);
|
|||||||
static int ceph_set_ino_cb(struct inode *inode, void *data)
|
static int ceph_set_ino_cb(struct inode *inode, void *data)
|
||||||
{
|
{
|
||||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
|
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
|
||||||
|
|
||||||
ci->i_vino = *(struct ceph_vino *)data;
|
ci->i_vino = *(struct ceph_vino *)data;
|
||||||
inode->i_ino = ceph_vino_to_ino_t(ci->i_vino);
|
inode->i_ino = ceph_vino_to_ino_t(ci->i_vino);
|
||||||
inode_set_iversion_raw(inode, 0);
|
inode_set_iversion_raw(inode, 0);
|
||||||
|
percpu_counter_inc(&mdsc->metric.total_inodes);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -538,11 +541,14 @@ void ceph_free_inode(struct inode *inode)
|
|||||||
void ceph_evict_inode(struct inode *inode)
|
void ceph_evict_inode(struct inode *inode)
|
||||||
{
|
{
|
||||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
|
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
|
||||||
struct ceph_inode_frag *frag;
|
struct ceph_inode_frag *frag;
|
||||||
struct rb_node *n;
|
struct rb_node *n;
|
||||||
|
|
||||||
dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
|
dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
|
||||||
|
|
||||||
|
percpu_counter_dec(&mdsc->metric.total_inodes);
|
||||||
|
|
||||||
truncate_inode_pages_final(&inode->i_data);
|
truncate_inode_pages_final(&inode->i_data);
|
||||||
clear_inode(inode);
|
clear_inode(inode);
|
||||||
|
|
||||||
@ -558,8 +564,6 @@ void ceph_evict_inode(struct inode *inode)
|
|||||||
* caps in i_snap_caps.
|
* caps in i_snap_caps.
|
||||||
*/
|
*/
|
||||||
if (ci->i_snap_realm) {
|
if (ci->i_snap_realm) {
|
||||||
struct ceph_mds_client *mdsc =
|
|
||||||
ceph_inode_to_client(inode)->mdsc;
|
|
||||||
if (ceph_snap(inode) == CEPH_NOSNAP) {
|
if (ceph_snap(inode) == CEPH_NOSNAP) {
|
||||||
struct ceph_snap_realm *realm = ci->i_snap_realm;
|
struct ceph_snap_realm *realm = ci->i_snap_realm;
|
||||||
dout(" dropping residual ref to snap realm %p\n",
|
dout(" dropping residual ref to snap realm %p\n",
|
||||||
@ -739,7 +743,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
|
|||||||
struct ceph_mds_session *session, int cap_fmode,
|
struct ceph_mds_session *session, int cap_fmode,
|
||||||
struct ceph_cap_reservation *caps_reservation)
|
struct ceph_cap_reservation *caps_reservation)
|
||||||
{
|
{
|
||||||
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
|
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
|
||||||
struct ceph_mds_reply_inode *info = iinfo->in;
|
struct ceph_mds_reply_inode *info = iinfo->in;
|
||||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||||
int issued, new_issued, info_caps;
|
int issued, new_issued, info_caps;
|
||||||
|
@ -63,7 +63,7 @@ static const struct file_lock_operations ceph_fl_lock_ops = {
|
|||||||
static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
|
static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
|
||||||
int cmd, u8 wait, struct file_lock *fl)
|
int cmd, u8 wait, struct file_lock *fl)
|
||||||
{
|
{
|
||||||
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
|
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
|
||||||
struct ceph_mds_request *req;
|
struct ceph_mds_request *req;
|
||||||
int err;
|
int err;
|
||||||
u64 length = 0;
|
u64 length = 0;
|
||||||
|
@ -3303,7 +3303,7 @@ bad:
|
|||||||
}
|
}
|
||||||
|
|
||||||
static int __decode_session_metadata(void **p, void *end,
|
static int __decode_session_metadata(void **p, void *end,
|
||||||
bool *blacklisted)
|
bool *blocklisted)
|
||||||
{
|
{
|
||||||
/* map<string,string> */
|
/* map<string,string> */
|
||||||
u32 n;
|
u32 n;
|
||||||
@ -3317,8 +3317,12 @@ static int __decode_session_metadata(void **p, void *end,
|
|||||||
*p += len;
|
*p += len;
|
||||||
ceph_decode_32_safe(p, end, len, bad);
|
ceph_decode_32_safe(p, end, len, bad);
|
||||||
ceph_decode_need(p, end, len, bad);
|
ceph_decode_need(p, end, len, bad);
|
||||||
|
/*
|
||||||
|
* Match "blocklisted (blacklisted)" from newer MDSes,
|
||||||
|
* or "blacklisted" from older MDSes.
|
||||||
|
*/
|
||||||
if (err_str && strnstr(*p, "blacklisted", len))
|
if (err_str && strnstr(*p, "blacklisted", len))
|
||||||
*blacklisted = true;
|
*blocklisted = true;
|
||||||
*p += len;
|
*p += len;
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
@ -3341,7 +3345,7 @@ static void handle_session(struct ceph_mds_session *session,
|
|||||||
u32 op;
|
u32 op;
|
||||||
u64 seq, features = 0;
|
u64 seq, features = 0;
|
||||||
int wake = 0;
|
int wake = 0;
|
||||||
bool blacklisted = false;
|
bool blocklisted = false;
|
||||||
|
|
||||||
/* decode */
|
/* decode */
|
||||||
ceph_decode_need(&p, end, sizeof(*h), bad);
|
ceph_decode_need(&p, end, sizeof(*h), bad);
|
||||||
@ -3354,7 +3358,7 @@ static void handle_session(struct ceph_mds_session *session,
|
|||||||
if (msg_version >= 3) {
|
if (msg_version >= 3) {
|
||||||
u32 len;
|
u32 len;
|
||||||
/* version >= 2, metadata */
|
/* version >= 2, metadata */
|
||||||
if (__decode_session_metadata(&p, end, &blacklisted) < 0)
|
if (__decode_session_metadata(&p, end, &blocklisted) < 0)
|
||||||
goto bad;
|
goto bad;
|
||||||
/* version >= 3, feature bits */
|
/* version >= 3, feature bits */
|
||||||
ceph_decode_32_safe(&p, end, len, bad);
|
ceph_decode_32_safe(&p, end, len, bad);
|
||||||
@ -3445,8 +3449,8 @@ static void handle_session(struct ceph_mds_session *session,
|
|||||||
session->s_state = CEPH_MDS_SESSION_REJECTED;
|
session->s_state = CEPH_MDS_SESSION_REJECTED;
|
||||||
cleanup_session_requests(mdsc, session);
|
cleanup_session_requests(mdsc, session);
|
||||||
remove_session_caps(session);
|
remove_session_caps(session);
|
||||||
if (blacklisted)
|
if (blocklisted)
|
||||||
mdsc->fsc->blacklisted = true;
|
mdsc->fsc->blocklisted = true;
|
||||||
wake = 2; /* for good measure */
|
wake = 2; /* for good measure */
|
||||||
break;
|
break;
|
||||||
|
|
||||||
@ -3612,6 +3616,39 @@ fail_msg:
|
|||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static struct dentry* d_find_primary(struct inode *inode)
|
||||||
|
{
|
||||||
|
struct dentry *alias, *dn = NULL;
|
||||||
|
|
||||||
|
if (hlist_empty(&inode->i_dentry))
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
spin_lock(&inode->i_lock);
|
||||||
|
if (hlist_empty(&inode->i_dentry))
|
||||||
|
goto out_unlock;
|
||||||
|
|
||||||
|
if (S_ISDIR(inode->i_mode)) {
|
||||||
|
alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
|
||||||
|
if (!IS_ROOT(alias))
|
||||||
|
dn = dget(alias);
|
||||||
|
goto out_unlock;
|
||||||
|
}
|
||||||
|
|
||||||
|
hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
|
||||||
|
spin_lock(&alias->d_lock);
|
||||||
|
if (!d_unhashed(alias) &&
|
||||||
|
(ceph_dentry(alias)->flags & CEPH_DENTRY_PRIMARY_LINK)) {
|
||||||
|
dn = dget_dlock(alias);
|
||||||
|
}
|
||||||
|
spin_unlock(&alias->d_lock);
|
||||||
|
if (dn)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
out_unlock:
|
||||||
|
spin_unlock(&inode->i_lock);
|
||||||
|
return dn;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Encode information about a cap for a reconnect with the MDS.
|
* Encode information about a cap for a reconnect with the MDS.
|
||||||
*/
|
*/
|
||||||
@ -3625,13 +3662,32 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
|
|||||||
struct ceph_inode_info *ci = cap->ci;
|
struct ceph_inode_info *ci = cap->ci;
|
||||||
struct ceph_reconnect_state *recon_state = arg;
|
struct ceph_reconnect_state *recon_state = arg;
|
||||||
struct ceph_pagelist *pagelist = recon_state->pagelist;
|
struct ceph_pagelist *pagelist = recon_state->pagelist;
|
||||||
int err;
|
struct dentry *dentry;
|
||||||
|
char *path;
|
||||||
|
int pathlen, err;
|
||||||
|
u64 pathbase;
|
||||||
u64 snap_follows;
|
u64 snap_follows;
|
||||||
|
|
||||||
dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
|
dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
|
||||||
inode, ceph_vinop(inode), cap, cap->cap_id,
|
inode, ceph_vinop(inode), cap, cap->cap_id,
|
||||||
ceph_cap_string(cap->issued));
|
ceph_cap_string(cap->issued));
|
||||||
|
|
||||||
|
dentry = d_find_primary(inode);
|
||||||
|
if (dentry) {
|
||||||
|
/* set pathbase to parent dir when msg_version >= 2 */
|
||||||
|
path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase,
|
||||||
|
recon_state->msg_version >= 2);
|
||||||
|
dput(dentry);
|
||||||
|
if (IS_ERR(path)) {
|
||||||
|
err = PTR_ERR(path);
|
||||||
|
goto out_err;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
path = NULL;
|
||||||
|
pathlen = 0;
|
||||||
|
pathbase = 0;
|
||||||
|
}
|
||||||
|
|
||||||
spin_lock(&ci->i_ceph_lock);
|
spin_lock(&ci->i_ceph_lock);
|
||||||
cap->seq = 0; /* reset cap seq */
|
cap->seq = 0; /* reset cap seq */
|
||||||
cap->issue_seq = 0; /* and issue_seq */
|
cap->issue_seq = 0; /* and issue_seq */
|
||||||
@ -3652,7 +3708,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
|
|||||||
rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
|
rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
|
||||||
rec.v2.issued = cpu_to_le32(cap->issued);
|
rec.v2.issued = cpu_to_le32(cap->issued);
|
||||||
rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
|
rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
|
||||||
rec.v2.pathbase = 0;
|
rec.v2.pathbase = cpu_to_le64(pathbase);
|
||||||
rec.v2.flock_len = (__force __le32)
|
rec.v2.flock_len = (__force __le32)
|
||||||
((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
|
((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
|
||||||
} else {
|
} else {
|
||||||
@ -3663,7 +3719,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
|
|||||||
ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
|
ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
|
||||||
ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
|
ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
|
||||||
rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
|
rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
|
||||||
rec.v1.pathbase = 0;
|
rec.v1.pathbase = cpu_to_le64(pathbase);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (list_empty(&ci->i_cap_snaps)) {
|
if (list_empty(&ci->i_cap_snaps)) {
|
||||||
@ -3725,7 +3781,7 @@ encode_again:
|
|||||||
sizeof(struct ceph_filelock);
|
sizeof(struct ceph_filelock);
|
||||||
rec.v2.flock_len = cpu_to_le32(struct_len);
|
rec.v2.flock_len = cpu_to_le32(struct_len);
|
||||||
|
|
||||||
struct_len += sizeof(u32) + sizeof(rec.v2);
|
struct_len += sizeof(u32) + pathlen + sizeof(rec.v2);
|
||||||
|
|
||||||
if (struct_v >= 2)
|
if (struct_v >= 2)
|
||||||
struct_len += sizeof(u64); /* snap_follows */
|
struct_len += sizeof(u64); /* snap_follows */
|
||||||
@ -3749,7 +3805,7 @@ encode_again:
|
|||||||
ceph_pagelist_encode_8(pagelist, 1);
|
ceph_pagelist_encode_8(pagelist, 1);
|
||||||
ceph_pagelist_encode_32(pagelist, struct_len);
|
ceph_pagelist_encode_32(pagelist, struct_len);
|
||||||
}
|
}
|
||||||
ceph_pagelist_encode_string(pagelist, NULL, 0);
|
ceph_pagelist_encode_string(pagelist, path, pathlen);
|
||||||
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
|
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
|
||||||
ceph_locks_to_pagelist(flocks, pagelist,
|
ceph_locks_to_pagelist(flocks, pagelist,
|
||||||
num_fcntl_locks, num_flock_locks);
|
num_fcntl_locks, num_flock_locks);
|
||||||
@ -3758,39 +3814,20 @@ encode_again:
|
|||||||
out_freeflocks:
|
out_freeflocks:
|
||||||
kfree(flocks);
|
kfree(flocks);
|
||||||
} else {
|
} else {
|
||||||
u64 pathbase = 0;
|
|
||||||
int pathlen = 0;
|
|
||||||
char *path = NULL;
|
|
||||||
struct dentry *dentry;
|
|
||||||
|
|
||||||
dentry = d_find_alias(inode);
|
|
||||||
if (dentry) {
|
|
||||||
path = ceph_mdsc_build_path(dentry,
|
|
||||||
&pathlen, &pathbase, 0);
|
|
||||||
dput(dentry);
|
|
||||||
if (IS_ERR(path)) {
|
|
||||||
err = PTR_ERR(path);
|
|
||||||
goto out_err;
|
|
||||||
}
|
|
||||||
rec.v1.pathbase = cpu_to_le64(pathbase);
|
|
||||||
}
|
|
||||||
|
|
||||||
err = ceph_pagelist_reserve(pagelist,
|
err = ceph_pagelist_reserve(pagelist,
|
||||||
sizeof(u64) + sizeof(u32) +
|
sizeof(u64) + sizeof(u32) +
|
||||||
pathlen + sizeof(rec.v1));
|
pathlen + sizeof(rec.v1));
|
||||||
if (err) {
|
if (err)
|
||||||
goto out_freepath;
|
goto out_err;
|
||||||
}
|
|
||||||
|
|
||||||
ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
|
ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
|
||||||
ceph_pagelist_encode_string(pagelist, path, pathlen);
|
ceph_pagelist_encode_string(pagelist, path, pathlen);
|
||||||
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
|
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
|
||||||
out_freepath:
|
|
||||||
ceph_mdsc_free_path(path, pathlen);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
out_err:
|
out_err:
|
||||||
if (err >= 0)
|
ceph_mdsc_free_path(path, pathlen);
|
||||||
|
if (!err)
|
||||||
recon_state->nr_caps++;
|
recon_state->nr_caps++;
|
||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
@ -4334,14 +4371,14 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
|
|||||||
if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED)
|
if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if (!READ_ONCE(fsc->blacklisted))
|
if (!READ_ONCE(fsc->blocklisted))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if (fsc->last_auto_reconnect &&
|
if (fsc->last_auto_reconnect &&
|
||||||
time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30))
|
time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
pr_info("auto reconnect after blacklisted\n");
|
pr_info("auto reconnect after blocklisted\n");
|
||||||
fsc->last_auto_reconnect = jiffies;
|
fsc->last_auto_reconnect = jiffies;
|
||||||
ceph_force_reconnect(fsc->sb);
|
ceph_force_reconnect(fsc->sb);
|
||||||
}
|
}
|
||||||
|
@ -393,7 +393,7 @@ struct ceph_mds_client {
|
|||||||
|
|
||||||
struct ceph_mds_session **sessions; /* NULL for mds if no session */
|
struct ceph_mds_session **sessions; /* NULL for mds if no session */
|
||||||
atomic_t num_sessions;
|
atomic_t num_sessions;
|
||||||
int max_sessions; /* len of s_mds_sessions */
|
int max_sessions; /* len of sessions array */
|
||||||
int stopping; /* true if shutting down */
|
int stopping; /* true if shutting down */
|
||||||
|
|
||||||
atomic64_t quotarealms_count; /* # realms with quota */
|
atomic64_t quotarealms_count; /* # realms with quota */
|
||||||
|
@ -192,11 +192,23 @@ int ceph_metric_init(struct ceph_client_metric *m)
|
|||||||
m->total_metadatas = 0;
|
m->total_metadatas = 0;
|
||||||
m->metadata_latency_sum = 0;
|
m->metadata_latency_sum = 0;
|
||||||
|
|
||||||
|
atomic64_set(&m->opened_files, 0);
|
||||||
|
ret = percpu_counter_init(&m->opened_inodes, 0, GFP_KERNEL);
|
||||||
|
if (ret)
|
||||||
|
goto err_opened_inodes;
|
||||||
|
ret = percpu_counter_init(&m->total_inodes, 0, GFP_KERNEL);
|
||||||
|
if (ret)
|
||||||
|
goto err_total_inodes;
|
||||||
|
|
||||||
m->session = NULL;
|
m->session = NULL;
|
||||||
INIT_DELAYED_WORK(&m->delayed_work, metric_delayed_work);
|
INIT_DELAYED_WORK(&m->delayed_work, metric_delayed_work);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
err_total_inodes:
|
||||||
|
percpu_counter_destroy(&m->opened_inodes);
|
||||||
|
err_opened_inodes:
|
||||||
|
percpu_counter_destroy(&m->i_caps_mis);
|
||||||
err_i_caps_mis:
|
err_i_caps_mis:
|
||||||
percpu_counter_destroy(&m->i_caps_hit);
|
percpu_counter_destroy(&m->i_caps_hit);
|
||||||
err_i_caps_hit:
|
err_i_caps_hit:
|
||||||
@ -212,6 +224,8 @@ void ceph_metric_destroy(struct ceph_client_metric *m)
|
|||||||
if (!m)
|
if (!m)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
percpu_counter_destroy(&m->total_inodes);
|
||||||
|
percpu_counter_destroy(&m->opened_inodes);
|
||||||
percpu_counter_destroy(&m->i_caps_mis);
|
percpu_counter_destroy(&m->i_caps_mis);
|
||||||
percpu_counter_destroy(&m->i_caps_hit);
|
percpu_counter_destroy(&m->i_caps_hit);
|
||||||
percpu_counter_destroy(&m->d_lease_mis);
|
percpu_counter_destroy(&m->d_lease_mis);
|
||||||
|
@ -115,6 +115,13 @@ struct ceph_client_metric {
|
|||||||
ktime_t metadata_latency_min;
|
ktime_t metadata_latency_min;
|
||||||
ktime_t metadata_latency_max;
|
ktime_t metadata_latency_max;
|
||||||
|
|
||||||
|
/* The total number of directories and files that are opened */
|
||||||
|
atomic64_t opened_files;
|
||||||
|
|
||||||
|
/* The total number of inodes that have opened files or directories */
|
||||||
|
struct percpu_counter opened_inodes;
|
||||||
|
struct percpu_counter total_inodes;
|
||||||
|
|
||||||
struct ceph_mds_session *session;
|
struct ceph_mds_session *session;
|
||||||
struct delayed_work delayed_work; /* delayed work */
|
struct delayed_work delayed_work; /* delayed work */
|
||||||
};
|
};
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
|
|
||||||
void ceph_adjust_quota_realms_count(struct inode *inode, bool inc)
|
void ceph_adjust_quota_realms_count(struct inode *inode, bool inc)
|
||||||
{
|
{
|
||||||
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
|
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
|
||||||
if (inc)
|
if (inc)
|
||||||
atomic64_inc(&mdsc->quotarealms_count);
|
atomic64_inc(&mdsc->quotarealms_count);
|
||||||
else
|
else
|
||||||
@ -21,8 +21,8 @@ void ceph_adjust_quota_realms_count(struct inode *inode, bool inc)
|
|||||||
|
|
||||||
static inline bool ceph_has_realms_with_quotas(struct inode *inode)
|
static inline bool ceph_has_realms_with_quotas(struct inode *inode)
|
||||||
{
|
{
|
||||||
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
|
struct super_block *sb = inode->i_sb;
|
||||||
struct super_block *sb = mdsc->fsc->sb;
|
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
|
||||||
struct inode *root = d_inode(sb->s_root);
|
struct inode *root = d_inode(sb->s_root);
|
||||||
|
|
||||||
if (atomic64_read(&mdsc->quotarealms_count) > 0)
|
if (atomic64_read(&mdsc->quotarealms_count) > 0)
|
||||||
@ -266,7 +266,7 @@ restart:
|
|||||||
|
|
||||||
static bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
|
static bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
|
||||||
{
|
{
|
||||||
struct ceph_mds_client *mdsc = ceph_inode_to_client(old)->mdsc;
|
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old->i_sb);
|
||||||
struct ceph_snap_realm *old_realm, *new_realm;
|
struct ceph_snap_realm *old_realm, *new_realm;
|
||||||
bool is_same;
|
bool is_same;
|
||||||
|
|
||||||
@ -313,7 +313,7 @@ enum quota_check_op {
|
|||||||
static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
|
static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
|
||||||
loff_t delta)
|
loff_t delta)
|
||||||
{
|
{
|
||||||
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
|
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
|
||||||
struct ceph_inode_info *ci;
|
struct ceph_inode_info *ci;
|
||||||
struct ceph_snap_realm *realm, *next;
|
struct ceph_snap_realm *realm, *next;
|
||||||
struct inode *in;
|
struct inode *in;
|
||||||
|
@ -602,7 +602,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
|
|||||||
struct ceph_cap_snap *capsnap)
|
struct ceph_cap_snap *capsnap)
|
||||||
{
|
{
|
||||||
struct inode *inode = &ci->vfs_inode;
|
struct inode *inode = &ci->vfs_inode;
|
||||||
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
|
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
|
||||||
|
|
||||||
BUG_ON(capsnap->writing);
|
BUG_ON(capsnap->writing);
|
||||||
capsnap->size = inode->i_size;
|
capsnap->size = inode->i_size;
|
||||||
|
@ -1205,14 +1205,13 @@ nomem:
|
|||||||
static void ceph_kill_sb(struct super_block *s)
|
static void ceph_kill_sb(struct super_block *s)
|
||||||
{
|
{
|
||||||
struct ceph_fs_client *fsc = ceph_sb_to_client(s);
|
struct ceph_fs_client *fsc = ceph_sb_to_client(s);
|
||||||
dev_t dev = s->s_dev;
|
|
||||||
|
|
||||||
dout("kill_sb %p\n", s);
|
dout("kill_sb %p\n", s);
|
||||||
|
|
||||||
ceph_mdsc_pre_umount(fsc->mdsc);
|
ceph_mdsc_pre_umount(fsc->mdsc);
|
||||||
flush_fs_workqueues(fsc);
|
flush_fs_workqueues(fsc);
|
||||||
|
|
||||||
generic_shutdown_super(s);
|
kill_anon_super(s);
|
||||||
|
|
||||||
fsc->client->extra_mon_dispatch = NULL;
|
fsc->client->extra_mon_dispatch = NULL;
|
||||||
ceph_fs_debugfs_cleanup(fsc);
|
ceph_fs_debugfs_cleanup(fsc);
|
||||||
@ -1220,7 +1219,6 @@ static void ceph_kill_sb(struct super_block *s)
|
|||||||
ceph_fscache_unregister_fs(fsc);
|
ceph_fscache_unregister_fs(fsc);
|
||||||
|
|
||||||
destroy_fs_client(fsc);
|
destroy_fs_client(fsc);
|
||||||
free_anon_bdev(dev);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct file_system_type ceph_fs_type = {
|
static struct file_system_type ceph_fs_type = {
|
||||||
@ -1243,13 +1241,13 @@ int ceph_force_reconnect(struct super_block *sb)
|
|||||||
* see remove_session_caps_cb() */
|
* see remove_session_caps_cb() */
|
||||||
flush_workqueue(fsc->inode_wq);
|
flush_workqueue(fsc->inode_wq);
|
||||||
|
|
||||||
/* In case that we were blacklisted. This also reset
|
/* In case that we were blocklisted. This also reset
|
||||||
* all mon/osd connections */
|
* all mon/osd connections */
|
||||||
ceph_reset_client_addr(fsc->client);
|
ceph_reset_client_addr(fsc->client);
|
||||||
|
|
||||||
ceph_osdc_clear_abort_err(&fsc->client->osdc);
|
ceph_osdc_clear_abort_err(&fsc->client->osdc);
|
||||||
|
|
||||||
fsc->blacklisted = false;
|
fsc->blocklisted = false;
|
||||||
fsc->mount_state = CEPH_MOUNT_MOUNTED;
|
fsc->mount_state = CEPH_MOUNT_MOUNTED;
|
||||||
|
|
||||||
if (sb->s_root) {
|
if (sb->s_root) {
|
||||||
|
@ -32,7 +32,7 @@
|
|||||||
#define CEPH_BLOCK_SHIFT 22 /* 4 MB */
|
#define CEPH_BLOCK_SHIFT 22 /* 4 MB */
|
||||||
#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
|
#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
|
||||||
|
|
||||||
#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blacklisted */
|
#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blocklisted */
|
||||||
#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
|
#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
|
||||||
#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
|
#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
|
||||||
#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
|
#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
|
||||||
@ -109,7 +109,7 @@ struct ceph_fs_client {
|
|||||||
unsigned long mount_state;
|
unsigned long mount_state;
|
||||||
|
|
||||||
unsigned long last_auto_reconnect;
|
unsigned long last_auto_reconnect;
|
||||||
bool blacklisted;
|
bool blocklisted;
|
||||||
|
|
||||||
bool have_copy_from2;
|
bool have_copy_from2;
|
||||||
|
|
||||||
@ -160,7 +160,8 @@ struct ceph_cap {
|
|||||||
int issued; /* latest, from the mds */
|
int issued; /* latest, from the mds */
|
||||||
int implemented; /* implemented superset of
|
int implemented; /* implemented superset of
|
||||||
issued (for revocation) */
|
issued (for revocation) */
|
||||||
int mds, mds_wanted;
|
int mds; /* mds index for this cap */
|
||||||
|
int mds_wanted; /* caps wanted from this mds */
|
||||||
};
|
};
|
||||||
/* caps to release */
|
/* caps to release */
|
||||||
struct {
|
struct {
|
||||||
@ -451,6 +452,12 @@ ceph_sb_to_client(const struct super_block *sb)
|
|||||||
return (struct ceph_fs_client *)sb->s_fs_info;
|
return (struct ceph_fs_client *)sb->s_fs_info;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline struct ceph_mds_client *
|
||||||
|
ceph_sb_to_mdsc(const struct super_block *sb)
|
||||||
|
{
|
||||||
|
return (struct ceph_mds_client *)ceph_sb_to_client(sb)->mdsc;
|
||||||
|
}
|
||||||
|
|
||||||
static inline struct ceph_vino
|
static inline struct ceph_vino
|
||||||
ceph_vino(const struct inode *inode)
|
ceph_vino(const struct inode *inode)
|
||||||
{
|
{
|
||||||
|
@ -116,7 +116,8 @@ static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
|
|||||||
* NULL terminates however, so call it on a temporary buffer and then memcpy
|
* NULL terminates however, so call it on a temporary buffer and then memcpy
|
||||||
* the result into place.
|
* the result into place.
|
||||||
*/
|
*/
|
||||||
static int ceph_fmt_xattr(char *val, size_t size, const char *fmt, ...)
|
static __printf(3, 4)
|
||||||
|
int ceph_fmt_xattr(char *val, size_t size, const char *fmt, ...)
|
||||||
{
|
{
|
||||||
int ret;
|
int ret;
|
||||||
va_list args;
|
va_list args;
|
||||||
|
@ -54,7 +54,7 @@ struct ceph_connection_operations {
|
|||||||
int (*check_message_signature) (struct ceph_msg *msg);
|
int (*check_message_signature) (struct ceph_msg *msg);
|
||||||
};
|
};
|
||||||
|
|
||||||
/* use format string %s%d */
|
/* use format string %s%lld */
|
||||||
#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
|
#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
|
||||||
|
|
||||||
struct ceph_messenger {
|
struct ceph_messenger {
|
||||||
|
@ -142,7 +142,7 @@ int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
|
|||||||
int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
|
int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
|
||||||
ceph_monc_callback_t cb, u64 private_data);
|
ceph_monc_callback_t cb, u64 private_data);
|
||||||
|
|
||||||
int ceph_monc_blacklist_add(struct ceph_mon_client *monc,
|
int ceph_monc_blocklist_add(struct ceph_mon_client *monc,
|
||||||
struct ceph_entity_addr *client_addr);
|
struct ceph_entity_addr *client_addr);
|
||||||
|
|
||||||
extern int ceph_monc_open_session(struct ceph_mon_client *monc);
|
extern int ceph_monc_open_session(struct ceph_mon_client *monc);
|
||||||
|
@ -137,6 +137,17 @@ int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
|
|||||||
const char *fmt, ...);
|
const char *fmt, ...);
|
||||||
void ceph_oid_destroy(struct ceph_object_id *oid);
|
void ceph_oid_destroy(struct ceph_object_id *oid);
|
||||||
|
|
||||||
|
struct workspace_manager {
|
||||||
|
struct list_head idle_ws;
|
||||||
|
spinlock_t ws_lock;
|
||||||
|
/* Number of free workspaces */
|
||||||
|
int free_ws;
|
||||||
|
/* Total number of allocated workspaces */
|
||||||
|
atomic_t total_ws;
|
||||||
|
/* Waiters for a free workspace */
|
||||||
|
wait_queue_head_t ws_wait;
|
||||||
|
};
|
||||||
|
|
||||||
struct ceph_pg_mapping {
|
struct ceph_pg_mapping {
|
||||||
struct rb_node node;
|
struct rb_node node;
|
||||||
struct ceph_pg pgid;
|
struct ceph_pg pgid;
|
||||||
@ -184,8 +195,7 @@ struct ceph_osdmap {
|
|||||||
* the list of osds that store+replicate them. */
|
* the list of osds that store+replicate them. */
|
||||||
struct crush_map *crush;
|
struct crush_map *crush;
|
||||||
|
|
||||||
struct mutex crush_workspace_mutex;
|
struct workspace_manager crush_wsm;
|
||||||
void *crush_workspace;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
|
static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
|
||||||
|
@ -424,7 +424,7 @@ enum {
|
|||||||
};
|
};
|
||||||
|
|
||||||
#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
|
#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
|
||||||
#define EBLACKLISTED ESHUTDOWN /* blacklisted */
|
#define EBLOCKLISTED ESHUTDOWN /* blocklisted */
|
||||||
|
|
||||||
/* xattr comparison */
|
/* xattr comparison */
|
||||||
enum {
|
enum {
|
||||||
|
@ -346,6 +346,9 @@ struct crush_work_bucket {
|
|||||||
|
|
||||||
struct crush_work {
|
struct crush_work {
|
||||||
struct crush_work_bucket **work; /* Per-bucket working store */
|
struct crush_work_bucket **work; /* Per-bucket working store */
|
||||||
|
#ifdef __KERNEL__
|
||||||
|
struct list_head item;
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
#ifdef __KERNEL__
|
#ifdef __KERNEL__
|
||||||
|
@ -2016,11 +2016,11 @@ static int process_banner(struct ceph_connection *con)
|
|||||||
sizeof(con->peer_addr)) != 0 &&
|
sizeof(con->peer_addr)) != 0 &&
|
||||||
!(addr_is_blank(&con->actual_peer_addr) &&
|
!(addr_is_blank(&con->actual_peer_addr) &&
|
||||||
con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
|
con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
|
||||||
pr_warn("wrong peer, want %s/%d, got %s/%d\n",
|
pr_warn("wrong peer, want %s/%u, got %s/%u\n",
|
||||||
ceph_pr_addr(&con->peer_addr),
|
ceph_pr_addr(&con->peer_addr),
|
||||||
(int)le32_to_cpu(con->peer_addr.nonce),
|
le32_to_cpu(con->peer_addr.nonce),
|
||||||
ceph_pr_addr(&con->actual_peer_addr),
|
ceph_pr_addr(&con->actual_peer_addr),
|
||||||
(int)le32_to_cpu(con->actual_peer_addr.nonce));
|
le32_to_cpu(con->actual_peer_addr.nonce));
|
||||||
con->error_msg = "wrong peer at address";
|
con->error_msg = "wrong peer at address";
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
@ -2811,13 +2811,13 @@ static int queue_con_delay(struct ceph_connection *con, unsigned long delay)
|
|||||||
return -ENOENT;
|
return -ENOENT;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
dout("%s %p %lu\n", __func__, con, delay);
|
||||||
if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) {
|
if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) {
|
||||||
dout("%s %p - already queued\n", __func__, con);
|
dout("%s %p - already queued\n", __func__, con);
|
||||||
con->ops->put(con);
|
con->ops->put(con);
|
||||||
return -EBUSY;
|
return -EBUSY;
|
||||||
}
|
}
|
||||||
|
|
||||||
dout("%s %p %lu\n", __func__, con, delay);
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2998,6 +2998,11 @@ static void con_fault(struct ceph_connection *con)
|
|||||||
ceph_msg_put(con->in_msg);
|
ceph_msg_put(con->in_msg);
|
||||||
con->in_msg = NULL;
|
con->in_msg = NULL;
|
||||||
}
|
}
|
||||||
|
if (con->out_msg) {
|
||||||
|
BUG_ON(con->out_msg->con != con);
|
||||||
|
ceph_msg_put(con->out_msg);
|
||||||
|
con->out_msg = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
/* Requeue anything that hasn't been acked */
|
/* Requeue anything that hasn't been acked */
|
||||||
list_splice_init(&con->out_sent, &con->out_queue);
|
list_splice_init(&con->out_sent, &con->out_queue);
|
||||||
|
@ -896,8 +896,9 @@ bad:
|
|||||||
ceph_msg_dump(msg);
|
ceph_msg_dump(msg);
|
||||||
}
|
}
|
||||||
|
|
||||||
int ceph_monc_blacklist_add(struct ceph_mon_client *monc,
|
static __printf(2, 0)
|
||||||
struct ceph_entity_addr *client_addr)
|
int do_mon_command_vargs(struct ceph_mon_client *monc, const char *fmt,
|
||||||
|
va_list ap)
|
||||||
{
|
{
|
||||||
struct ceph_mon_generic_request *req;
|
struct ceph_mon_generic_request *req;
|
||||||
struct ceph_mon_command *h;
|
struct ceph_mon_command *h;
|
||||||
@ -925,29 +926,65 @@ int ceph_monc_blacklist_add(struct ceph_mon_client *monc,
|
|||||||
h->monhdr.session_mon_tid = 0;
|
h->monhdr.session_mon_tid = 0;
|
||||||
h->fsid = monc->monmap->fsid;
|
h->fsid = monc->monmap->fsid;
|
||||||
h->num_strs = cpu_to_le32(1);
|
h->num_strs = cpu_to_le32(1);
|
||||||
len = sprintf(h->str, "{ \"prefix\": \"osd blacklist\", \
|
len = vsprintf(h->str, fmt, ap);
|
||||||
\"blacklistop\": \"add\", \
|
|
||||||
\"addr\": \"%pISpc/%u\" }",
|
|
||||||
&client_addr->in_addr, le32_to_cpu(client_addr->nonce));
|
|
||||||
h->str_len = cpu_to_le32(len);
|
h->str_len = cpu_to_le32(len);
|
||||||
send_generic_request(monc, req);
|
send_generic_request(monc, req);
|
||||||
mutex_unlock(&monc->mutex);
|
mutex_unlock(&monc->mutex);
|
||||||
|
|
||||||
ret = wait_generic_request(req);
|
ret = wait_generic_request(req);
|
||||||
if (!ret)
|
|
||||||
/*
|
|
||||||
* Make sure we have the osdmap that includes the blacklist
|
|
||||||
* entry. This is needed to ensure that the OSDs pick up the
|
|
||||||
* new blacklist before processing any future requests from
|
|
||||||
* this client.
|
|
||||||
*/
|
|
||||||
ret = ceph_wait_for_latest_osdmap(monc->client, 0);
|
|
||||||
|
|
||||||
out:
|
out:
|
||||||
put_generic_request(req);
|
put_generic_request(req);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL(ceph_monc_blacklist_add);
|
|
||||||
|
static __printf(2, 3)
|
||||||
|
int do_mon_command(struct ceph_mon_client *monc, const char *fmt, ...)
|
||||||
|
{
|
||||||
|
va_list ap;
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
va_start(ap, fmt);
|
||||||
|
ret = do_mon_command_vargs(monc, fmt, ap);
|
||||||
|
va_end(ap);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
int ceph_monc_blocklist_add(struct ceph_mon_client *monc,
|
||||||
|
struct ceph_entity_addr *client_addr)
|
||||||
|
{
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
ret = do_mon_command(monc,
|
||||||
|
"{ \"prefix\": \"osd blocklist\", \
|
||||||
|
\"blocklistop\": \"add\", \
|
||||||
|
\"addr\": \"%pISpc/%u\" }",
|
||||||
|
&client_addr->in_addr,
|
||||||
|
le32_to_cpu(client_addr->nonce));
|
||||||
|
if (ret == -EINVAL) {
|
||||||
|
/*
|
||||||
|
* The monitor returns EINVAL on an unrecognized command.
|
||||||
|
* Try the legacy command -- it is exactly the same except
|
||||||
|
* for the name.
|
||||||
|
*/
|
||||||
|
ret = do_mon_command(monc,
|
||||||
|
"{ \"prefix\": \"osd blacklist\", \
|
||||||
|
\"blacklistop\": \"add\", \
|
||||||
|
\"addr\": \"%pISpc/%u\" }",
|
||||||
|
&client_addr->in_addr,
|
||||||
|
le32_to_cpu(client_addr->nonce));
|
||||||
|
}
|
||||||
|
if (ret)
|
||||||
|
return ret;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Make sure we have the osdmap that includes the blocklist
|
||||||
|
* entry. This is needed to ensure that the OSDs pick up the
|
||||||
|
* new blocklist before processing any future requests from
|
||||||
|
* this client.
|
||||||
|
*/
|
||||||
|
return ceph_wait_for_latest_osdmap(monc->client, 0);
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(ceph_monc_blocklist_add);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Resend pending generic requests.
|
* Resend pending generic requests.
|
||||||
|
@ -964,6 +964,143 @@ bad:
|
|||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* CRUSH workspaces
|
||||||
|
*
|
||||||
|
* workspace_manager framework borrowed from fs/btrfs/compression.c.
|
||||||
|
* Two simplifications: there is only one type of workspace and there
|
||||||
|
* is always at least one workspace.
|
||||||
|
*/
|
||||||
|
static struct crush_work *alloc_workspace(const struct crush_map *c)
|
||||||
|
{
|
||||||
|
struct crush_work *work;
|
||||||
|
size_t work_size;
|
||||||
|
|
||||||
|
WARN_ON(!c->working_size);
|
||||||
|
work_size = crush_work_size(c, CEPH_PG_MAX_SIZE);
|
||||||
|
dout("%s work_size %zu bytes\n", __func__, work_size);
|
||||||
|
|
||||||
|
work = ceph_kvmalloc(work_size, GFP_NOIO);
|
||||||
|
if (!work)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
INIT_LIST_HEAD(&work->item);
|
||||||
|
crush_init_workspace(c, work);
|
||||||
|
return work;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void free_workspace(struct crush_work *work)
|
||||||
|
{
|
||||||
|
WARN_ON(!list_empty(&work->item));
|
||||||
|
kvfree(work);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void init_workspace_manager(struct workspace_manager *wsm)
|
||||||
|
{
|
||||||
|
INIT_LIST_HEAD(&wsm->idle_ws);
|
||||||
|
spin_lock_init(&wsm->ws_lock);
|
||||||
|
atomic_set(&wsm->total_ws, 0);
|
||||||
|
wsm->free_ws = 0;
|
||||||
|
init_waitqueue_head(&wsm->ws_wait);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void add_initial_workspace(struct workspace_manager *wsm,
|
||||||
|
struct crush_work *work)
|
||||||
|
{
|
||||||
|
WARN_ON(!list_empty(&wsm->idle_ws));
|
||||||
|
|
||||||
|
list_add(&work->item, &wsm->idle_ws);
|
||||||
|
atomic_set(&wsm->total_ws, 1);
|
||||||
|
wsm->free_ws = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void cleanup_workspace_manager(struct workspace_manager *wsm)
|
||||||
|
{
|
||||||
|
struct crush_work *work;
|
||||||
|
|
||||||
|
while (!list_empty(&wsm->idle_ws)) {
|
||||||
|
work = list_first_entry(&wsm->idle_ws, struct crush_work,
|
||||||
|
item);
|
||||||
|
list_del_init(&work->item);
|
||||||
|
free_workspace(work);
|
||||||
|
}
|
||||||
|
atomic_set(&wsm->total_ws, 0);
|
||||||
|
wsm->free_ws = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Finds an available workspace or allocates a new one. If it's not
|
||||||
|
* possible to allocate a new one, waits until there is one.
|
||||||
|
*/
|
||||||
|
static struct crush_work *get_workspace(struct workspace_manager *wsm,
|
||||||
|
const struct crush_map *c)
|
||||||
|
{
|
||||||
|
struct crush_work *work;
|
||||||
|
int cpus = num_online_cpus();
|
||||||
|
|
||||||
|
again:
|
||||||
|
spin_lock(&wsm->ws_lock);
|
||||||
|
if (!list_empty(&wsm->idle_ws)) {
|
||||||
|
work = list_first_entry(&wsm->idle_ws, struct crush_work,
|
||||||
|
item);
|
||||||
|
list_del_init(&work->item);
|
||||||
|
wsm->free_ws--;
|
||||||
|
spin_unlock(&wsm->ws_lock);
|
||||||
|
return work;
|
||||||
|
|
||||||
|
}
|
||||||
|
if (atomic_read(&wsm->total_ws) > cpus) {
|
||||||
|
DEFINE_WAIT(wait);
|
||||||
|
|
||||||
|
spin_unlock(&wsm->ws_lock);
|
||||||
|
prepare_to_wait(&wsm->ws_wait, &wait, TASK_UNINTERRUPTIBLE);
|
||||||
|
if (atomic_read(&wsm->total_ws) > cpus && !wsm->free_ws)
|
||||||
|
schedule();
|
||||||
|
finish_wait(&wsm->ws_wait, &wait);
|
||||||
|
goto again;
|
||||||
|
}
|
||||||
|
atomic_inc(&wsm->total_ws);
|
||||||
|
spin_unlock(&wsm->ws_lock);
|
||||||
|
|
||||||
|
work = alloc_workspace(c);
|
||||||
|
if (!work) {
|
||||||
|
atomic_dec(&wsm->total_ws);
|
||||||
|
wake_up(&wsm->ws_wait);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Do not return the error but go back to waiting. We
|
||||||
|
* have the inital workspace and the CRUSH computation
|
||||||
|
* time is bounded so we will get it eventually.
|
||||||
|
*/
|
||||||
|
WARN_ON(atomic_read(&wsm->total_ws) < 1);
|
||||||
|
goto again;
|
||||||
|
}
|
||||||
|
return work;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Puts a workspace back on the list or frees it if we have enough
|
||||||
|
* idle ones sitting around.
|
||||||
|
*/
|
||||||
|
static void put_workspace(struct workspace_manager *wsm,
|
||||||
|
struct crush_work *work)
|
||||||
|
{
|
||||||
|
spin_lock(&wsm->ws_lock);
|
||||||
|
if (wsm->free_ws <= num_online_cpus()) {
|
||||||
|
list_add(&work->item, &wsm->idle_ws);
|
||||||
|
wsm->free_ws++;
|
||||||
|
spin_unlock(&wsm->ws_lock);
|
||||||
|
goto wake;
|
||||||
|
}
|
||||||
|
spin_unlock(&wsm->ws_lock);
|
||||||
|
|
||||||
|
free_workspace(work);
|
||||||
|
atomic_dec(&wsm->total_ws);
|
||||||
|
wake:
|
||||||
|
if (wq_has_sleeper(&wsm->ws_wait))
|
||||||
|
wake_up(&wsm->ws_wait);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* osd map
|
* osd map
|
||||||
*/
|
*/
|
||||||
@ -981,7 +1118,8 @@ struct ceph_osdmap *ceph_osdmap_alloc(void)
|
|||||||
map->primary_temp = RB_ROOT;
|
map->primary_temp = RB_ROOT;
|
||||||
map->pg_upmap = RB_ROOT;
|
map->pg_upmap = RB_ROOT;
|
||||||
map->pg_upmap_items = RB_ROOT;
|
map->pg_upmap_items = RB_ROOT;
|
||||||
mutex_init(&map->crush_workspace_mutex);
|
|
||||||
|
init_workspace_manager(&map->crush_wsm);
|
||||||
|
|
||||||
return map;
|
return map;
|
||||||
}
|
}
|
||||||
@ -989,8 +1127,11 @@ struct ceph_osdmap *ceph_osdmap_alloc(void)
|
|||||||
void ceph_osdmap_destroy(struct ceph_osdmap *map)
|
void ceph_osdmap_destroy(struct ceph_osdmap *map)
|
||||||
{
|
{
|
||||||
dout("osdmap_destroy %p\n", map);
|
dout("osdmap_destroy %p\n", map);
|
||||||
|
|
||||||
if (map->crush)
|
if (map->crush)
|
||||||
crush_destroy(map->crush);
|
crush_destroy(map->crush);
|
||||||
|
cleanup_workspace_manager(&map->crush_wsm);
|
||||||
|
|
||||||
while (!RB_EMPTY_ROOT(&map->pg_temp)) {
|
while (!RB_EMPTY_ROOT(&map->pg_temp)) {
|
||||||
struct ceph_pg_mapping *pg =
|
struct ceph_pg_mapping *pg =
|
||||||
rb_entry(rb_first(&map->pg_temp),
|
rb_entry(rb_first(&map->pg_temp),
|
||||||
@ -1029,7 +1170,6 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
|
|||||||
kvfree(map->osd_weight);
|
kvfree(map->osd_weight);
|
||||||
kvfree(map->osd_addr);
|
kvfree(map->osd_addr);
|
||||||
kvfree(map->osd_primary_affinity);
|
kvfree(map->osd_primary_affinity);
|
||||||
kvfree(map->crush_workspace);
|
|
||||||
kfree(map);
|
kfree(map);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1104,26 +1244,22 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max)
|
|||||||
|
|
||||||
static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
|
static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
|
||||||
{
|
{
|
||||||
void *workspace;
|
struct crush_work *work;
|
||||||
size_t work_size;
|
|
||||||
|
|
||||||
if (IS_ERR(crush))
|
if (IS_ERR(crush))
|
||||||
return PTR_ERR(crush);
|
return PTR_ERR(crush);
|
||||||
|
|
||||||
work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE);
|
work = alloc_workspace(crush);
|
||||||
dout("%s work_size %zu bytes\n", __func__, work_size);
|
if (!work) {
|
||||||
workspace = ceph_kvmalloc(work_size, GFP_NOIO);
|
|
||||||
if (!workspace) {
|
|
||||||
crush_destroy(crush);
|
crush_destroy(crush);
|
||||||
return -ENOMEM;
|
return -ENOMEM;
|
||||||
}
|
}
|
||||||
crush_init_workspace(crush, workspace);
|
|
||||||
|
|
||||||
if (map->crush)
|
if (map->crush)
|
||||||
crush_destroy(map->crush);
|
crush_destroy(map->crush);
|
||||||
kvfree(map->crush_workspace);
|
cleanup_workspace_manager(&map->crush_wsm);
|
||||||
map->crush = crush;
|
map->crush = crush;
|
||||||
map->crush_workspace = workspace;
|
add_initial_workspace(&map->crush_wsm, work);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2322,6 +2458,7 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
|
|||||||
s64 choose_args_index)
|
s64 choose_args_index)
|
||||||
{
|
{
|
||||||
struct crush_choose_arg_map *arg_map;
|
struct crush_choose_arg_map *arg_map;
|
||||||
|
struct crush_work *work;
|
||||||
int r;
|
int r;
|
||||||
|
|
||||||
BUG_ON(result_max > CEPH_PG_MAX_SIZE);
|
BUG_ON(result_max > CEPH_PG_MAX_SIZE);
|
||||||
@ -2332,12 +2469,11 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
|
|||||||
arg_map = lookup_choose_arg_map(&map->crush->choose_args,
|
arg_map = lookup_choose_arg_map(&map->crush->choose_args,
|
||||||
CEPH_DEFAULT_CHOOSE_ARGS);
|
CEPH_DEFAULT_CHOOSE_ARGS);
|
||||||
|
|
||||||
mutex_lock(&map->crush_workspace_mutex);
|
work = get_workspace(&map->crush_wsm, map->crush);
|
||||||
r = crush_do_rule(map->crush, ruleno, x, result, result_max,
|
r = crush_do_rule(map->crush, ruleno, x, result, result_max,
|
||||||
weight, weight_max, map->crush_workspace,
|
weight, weight_max, work,
|
||||||
arg_map ? arg_map->args : NULL);
|
arg_map ? arg_map->args : NULL);
|
||||||
mutex_unlock(&map->crush_workspace_mutex);
|
put_workspace(&map->crush_wsm, work);
|
||||||
|
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user