xprtrdma: Reduce context switching due to Local Invalidation
Since commit ba69cd122e ("xprtrdma: Remove support for FMR memory
registration"), FRWR is the only supported memory registration mode.
We can take advantage of the asynchronous nature of FRWR's LOCAL_INV
Work Requests to get rid of the completion wait by having the
LOCAL_INV completion handler take care of DMA unmapping MRs and
waking the upper layer RPC waiter.
This eliminates two context switches when local invalidation is
necessary. As a side benefit, we will no longer need the per-xprt
deferred completion work queue.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
parent 40088f0e9b
commit d8099feda4
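For orientation before the diff, here is a minimal sketch of the chained
LOCAL_INV pattern the commit message describes: build one signaled LOCAL_INV
Work Request per MR, link the WRs into a chain, point the last WR's
completion handler at a "done" callback that wakes the waiter, and post the
whole chain with one ib_post_send(). This is NOT the patch's code; struct
demo_mr, demo_localinv(), demo_localinv_done() and demo_post_localinv_chain()
are invented names for illustration only. The real implementation is
frwr_unmap_async() / frwr_wc_localinv_done() in the frwr_ops.c hunk below.

#include <rdma/ib_verbs.h>
#include <linux/list.h>

/* One entry per registered memory region awaiting invalidation. */
struct demo_mr {
        struct ib_cqe           cqe;    /* carries the completion callback */
        struct ib_send_wr       inv_wr; /* the LOCAL_INV Work Request */
        u32                     rkey;   /* rkey to invalidate */
        struct list_head        list;
};

/* Completion handler for all but the last WR: release the MR, no wake-up. */
static void demo_localinv(struct ib_cq *cq, struct ib_wc *wc)
{
}

/* Completion handler for the last WR in the chain: release the MR and
 * wake the waiting RPC (the asynchronous part the patch adds).
 */
static void demo_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
{
}

/* Caller guarantees @mrs is non-empty (as frwr_unmap_async() does). */
static int demo_post_localinv_chain(struct ib_qp *qp, struct list_head *mrs)
{
        struct ib_send_wr *first, *last, **prev = &first;
        const struct ib_send_wr *bad_wr = NULL;
        struct demo_mr *mr;

        /* Chain one LOCAL_INV WR per MR. */
        list_for_each_entry(mr, mrs, list) {
                mr->cqe.done = demo_localinv;
                last = &mr->inv_wr;
                last->next = NULL;
                last->wr_cqe = &mr->cqe;
                last->sg_list = NULL;
                last->num_sge = 0;
                last->opcode = IB_WR_LOCAL_INV;
                last->send_flags = IB_SEND_SIGNALED;
                last->ex.invalidate_rkey = mr->rkey;

                *prev = last;
                prev = &last->next;
        }

        /* Send Queue ordering: when the last WR completes, all WRs in the
         * chain have completed, so only the last one needs the wake-up.
         */
        mr = list_last_entry(mrs, struct demo_mr, list);
        mr->cqe.done = demo_localinv_done;

        /* Post the entire chain with a single verb call. */
        return ib_post_send(qp, first, &bad_wr);
}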
include/trace/events/rpcrdma.h
@@ -712,6 +712,7 @@ TRACE_EVENT(xprtrdma_wc_receive,
 DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_fastreg);
 DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li);
 DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li_wake);
+DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li_done);
 
 TRACE_EVENT(xprtrdma_frwr_alloc,
         TP_PROTO(
net/sunrpc/xprtrdma/frwr_ops.c
@@ -542,7 +542,10 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
  * @req: rpcrdma_req with a non-empty list of MRs to process
  *
  * Sleeps until it is safe for the host CPU to access the previously mapped
- * memory regions.
+ * memory regions. This guarantees that registered MRs are properly fenced
+ * from the server before the RPC consumer accesses the data in them. It
+ * also ensures proper Send flow control: waking the next RPC waits until
+ * this RPC has relinquished all its Send Queue entries.
  */
 void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 {
@@ -616,3 +619,101 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
                 rpcrdma_mr_recycle(mr);
         }
 }
+
+/**
+ * frwr_wc_localinv_done - Invoked by RDMA provider for a signaled LOCAL_INV WC
+ * @cq: completion queue (ignored)
+ * @wc: completed WR
+ *
+ */
+static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+        struct ib_cqe *cqe = wc->wr_cqe;
+        struct rpcrdma_frwr *frwr =
+                container_of(cqe, struct rpcrdma_frwr, fr_cqe);
+        struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
+
+        /* WARNING: Only wr_cqe and status are reliable at this point */
+        trace_xprtrdma_wc_li_done(wc, frwr);
+        rpcrdma_complete_rqst(frwr->fr_req->rl_reply);
+        __frwr_release_mr(wc, mr);
+}
+
+/**
+ * frwr_unmap_async - invalidate memory regions that were registered for @req
+ * @r_xprt: controlling transport instance
+ * @req: rpcrdma_req with a non-empty list of MRs to process
+ *
+ * This guarantees that registered MRs are properly fenced from the
+ * server before the RPC consumer accesses the data in them. It also
+ * ensures proper Send flow control: waking the next RPC waits until
+ * this RPC has relinquished all its Send Queue entries.
+ */
+void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+{
+        struct ib_send_wr *first, *last, **prev;
+        const struct ib_send_wr *bad_wr;
+        struct rpcrdma_frwr *frwr;
+        struct rpcrdma_mr *mr;
+        int rc;
+
+        /* Chain the LOCAL_INV Work Requests and post them with
+         * a single ib_post_send() call.
+         */
+        frwr = NULL;
+        prev = &first;
+        while (!list_empty(&req->rl_registered)) {
+                mr = rpcrdma_mr_pop(&req->rl_registered);
+
+                trace_xprtrdma_mr_localinv(mr);
+                r_xprt->rx_stats.local_inv_needed++;
+
+                frwr = &mr->frwr;
+                frwr->fr_cqe.done = frwr_wc_localinv;
+                frwr->fr_req = req;
+                last = &frwr->fr_invwr;
+                last->next = NULL;
+                last->wr_cqe = &frwr->fr_cqe;
+                last->sg_list = NULL;
+                last->num_sge = 0;
+                last->opcode = IB_WR_LOCAL_INV;
+                last->send_flags = IB_SEND_SIGNALED;
+                last->ex.invalidate_rkey = mr->mr_handle;
+
+                *prev = last;
+                prev = &last->next;
+        }
+
+        /* Strong send queue ordering guarantees that when the
+         * last WR in the chain completes, all WRs in the chain
+         * are complete. The last completion will wake up the
+         * RPC waiter.
+         */
+        frwr->fr_cqe.done = frwr_wc_localinv_done;
+
+        /* Transport disconnect drains the receive CQ before it
+         * replaces the QP. The RPC reply handler won't call us
+         * unless ri_id->qp is a valid pointer.
+         */
+        bad_wr = NULL;
+        rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
+        trace_xprtrdma_post_send(req, rc);
+        if (!rc)
+                return;
+
+        /* Recycle MRs in the LOCAL_INV chain that did not get posted.
+         */
+        while (bad_wr) {
+                frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr);
+                mr = container_of(frwr, struct rpcrdma_mr, frwr);
+                bad_wr = bad_wr->next;
+
+                rpcrdma_mr_recycle(mr);
+        }
+
+        /* The final LOCAL_INV WR in the chain is supposed to
+         * do the wake. If it was never posted, the wake will
+         * not happen, so wake here in that case.
+         */
+        rpcrdma_complete_rqst(req->rl_reply);
+}
net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1268,24 +1268,15 @@ out_badheader:
         goto out;
 }
 
-void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+/* Ensure that any DMA mapped pages associated with
+ * the Send of the RPC Call have been unmapped before
+ * allowing the RPC to complete. This protects argument
+ * memory not controlled by the RPC client from being
+ * re-used before we're done with it.
+ */
+static void rpcrdma_release_tx(struct rpcrdma_xprt *r_xprt,
+                               struct rpcrdma_req *req)
 {
-        /* Invalidate and unmap the data payloads before waking
-         * the waiting application. This guarantees the memory
-         * regions are properly fenced from the server before the
-         * application accesses the data. It also ensures proper
-         * send flow control: waking the next RPC waits until this
-         * RPC has relinquished all its Send Queue entries.
-         */
-        if (!list_empty(&req->rl_registered))
-                frwr_unmap_sync(r_xprt, req);
-
-        /* Ensure that any DMA mapped pages associated with
-         * the Send of the RPC Call have been unmapped before
-         * allowing the RPC to complete. This protects argument
-         * memory not controlled by the RPC client from being
-         * re-used before we're done with it.
-         */
         if (test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) {
                 r_xprt->rx_stats.reply_waits_for_send++;
                 out_of_line_wait_on_bit(&req->rl_flags,
@@ -1295,24 +1286,23 @@ void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
         }
 }
 
-/* Reply handling runs in the poll worker thread. Anything that
- * might wait is deferred to a separate workqueue.
+/**
+ * rpcrdma_release_rqst - Release hardware resources
+ * @r_xprt: controlling transport instance
+ * @req: request with resources to release
+ *
  */
-void rpcrdma_deferred_completion(struct work_struct *work)
+void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 {
-        struct rpcrdma_rep *rep =
-                        container_of(work, struct rpcrdma_rep, rr_work);
-        struct rpcrdma_req *req = rpcr_to_rdmar(rep->rr_rqst);
-        struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
-
-        trace_xprtrdma_defer_cmp(rep);
-        if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
-                frwr_reminv(rep, &req->rl_registered);
-        rpcrdma_release_rqst(r_xprt, req);
-        rpcrdma_complete_rqst(rep);
+        if (!list_empty(&req->rl_registered))
+                frwr_unmap_sync(r_xprt, req);
+
+        rpcrdma_release_tx(r_xprt, req);
 }
 
-/* Process received RPC/RDMA messages.
+/**
+ * rpcrdma_reply_handler - Process received RPC/RDMA messages
+ * @rep: Incoming rpcrdma_rep object to process
  *
  * Errors must result in the RPC task either being awakened, or
  * allowed to timeout, to discover the errors at that time.
@@ -1374,7 +1364,16 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
         rep->rr_rqst = rqst;
 
         trace_xprtrdma_reply(rqst->rq_task, rep, req, credits);
-        queue_work(buf->rb_completion_wq, &rep->rr_work);
+
+        if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
+                frwr_reminv(rep, &req->rl_registered);
+        if (!list_empty(&req->rl_registered)) {
+                frwr_unmap_async(r_xprt, req);
+                /* LocalInv completion will complete the RPC */
+        } else {
+                rpcrdma_release_tx(r_xprt, req);
+                rpcrdma_complete_rqst(rep);
+        }
         return;
 
 out_badversion:
net/sunrpc/xprtrdma/verbs.c
@@ -89,14 +89,12 @@ static void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp);
  */
 static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt)
 {
-        struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
         struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 
         /* Flush Receives, then wait for deferred Reply work
          * to complete.
          */
         ib_drain_rq(ia->ri_id->qp);
-        drain_workqueue(buf->rb_completion_wq);
 
         /* Deferred Reply processing might have scheduled
          * local invalidations.
@@ -1056,7 +1054,6 @@ static bool rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, bool temp)
 
         rep->rr_cqe.done = rpcrdma_wc_receive;
         rep->rr_rxprt = r_xprt;
-        INIT_WORK(&rep->rr_work, rpcrdma_deferred_completion);
         rep->rr_recv_wr.next = NULL;
         rep->rr_recv_wr.wr_cqe = &rep->rr_cqe;
         rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
@@ -1117,15 +1114,6 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
         if (rc)
                 goto out;
 
-        buf->rb_completion_wq = alloc_workqueue("rpcrdma-%s",
-                                                WQ_MEM_RECLAIM | WQ_HIGHPRI,
-                                                0,
-                        r_xprt->rx_xprt.address_strings[RPC_DISPLAY_ADDR]);
-        if (!buf->rb_completion_wq) {
-                rc = -ENOMEM;
-                goto out;
-        }
-
         return 0;
 out:
         rpcrdma_buffer_destroy(buf);
@@ -1199,11 +1187,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 {
         cancel_delayed_work_sync(&buf->rb_refresh_worker);
 
-        if (buf->rb_completion_wq) {
-                destroy_workqueue(buf->rb_completion_wq);
-                buf->rb_completion_wq = NULL;
-        }
-
         rpcrdma_sendctxs_destroy(buf);
 
         while (!list_empty(&buf->rb_recv_bufs)) {
net/sunrpc/xprtrdma/xprt_rdma.h
@@ -202,10 +202,9 @@ struct rpcrdma_rep {
         bool rr_temp;
         struct rpcrdma_regbuf *rr_rdmabuf;
         struct rpcrdma_xprt *rr_rxprt;
-        struct work_struct rr_work;
+        struct rpc_rqst *rr_rqst;
         struct xdr_buf rr_hdrbuf;
         struct xdr_stream rr_stream;
-        struct rpc_rqst *rr_rqst;
         struct list_head rr_list;
         struct ib_recv_wr rr_recv_wr;
 };
@@ -240,10 +239,12 @@ struct rpcrdma_sendctx {
  * An external memory region is any buffer or page that is registered
  * on the fly (ie, not pre-registered).
  */
+struct rpcrdma_req;
 struct rpcrdma_frwr {
         struct ib_mr *fr_mr;
         struct ib_cqe fr_cqe;
         struct completion fr_linv_done;
+        struct rpcrdma_req *fr_req;
         union {
                 struct ib_reg_wr fr_regwr;
                 struct ib_send_wr fr_invwr;
@@ -388,7 +389,6 @@ struct rpcrdma_buffer {
         u32 rb_bc_srv_max_requests;
         u32 rb_bc_max_requests;
 
-        struct workqueue_struct *rb_completion_wq;
         struct delayed_work rb_refresh_worker;
 };
 
@@ -561,6 +561,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
 int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req);
 void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs);
 void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
+void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
 
 /*
  * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
@@ -585,7 +586,6 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep);
 void rpcrdma_reply_handler(struct rpcrdma_rep *rep);
 void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt,
                           struct rpcrdma_req *req);
-void rpcrdma_deferred_completion(struct work_struct *work);
 
 static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len)
 {