From 3821e232eb3b7591f07ce4c389313ab55ebee372 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 9 Nov 2020 14:39:15 -0500 Subject: [PATCH 01/90] xprtrdma: Replace dprintk call sites in ERR_CHUNK path Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 82 ++++++++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/rpc_rdma.c | 13 ++---- 2 files changed, 85 insertions(+), 10 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index bf1065772228..d5e66428e27e 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -1128,6 +1128,88 @@ DEFINE_REPLY_EVENT(xprtrdma_reply_rqst); DEFINE_REPLY_EVENT(xprtrdma_reply_short); DEFINE_REPLY_EVENT(xprtrdma_reply_hdr); +TRACE_EVENT(xprtrdma_err_vers, + TP_PROTO( + const struct rpc_rqst *rqst, + __be32 *min, + __be32 *max + ), + + TP_ARGS(rqst, min, max), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, xid) + __field(u32, min) + __field(u32, max) + ), + + TP_fast_assign( + __entry->task_id = rqst->rq_task->tk_pid; + __entry->client_id = rqst->rq_task->tk_client->cl_clid; + __entry->xid = be32_to_cpu(rqst->rq_xid); + __entry->min = be32_to_cpup(min); + __entry->max = be32_to_cpup(max); + ), + + TP_printk("task:%u@%u xid=0x%08x versions=[%u, %u]", + __entry->task_id, __entry->client_id, __entry->xid, + __entry->min, __entry->max + ) +); + +TRACE_EVENT(xprtrdma_err_chunk, + TP_PROTO( + const struct rpc_rqst *rqst + ), + + TP_ARGS(rqst), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, xid) + ), + + TP_fast_assign( + __entry->task_id = rqst->rq_task->tk_pid; + __entry->client_id = rqst->rq_task->tk_client->cl_clid; + __entry->xid = be32_to_cpu(rqst->rq_xid); + ), + + TP_printk("task:%u@%u xid=0x%08x", + __entry->task_id, __entry->client_id, __entry->xid + ) +); + +TRACE_EVENT(xprtrdma_err_unrecognized, + TP_PROTO( + const struct rpc_rqst *rqst, + __be32 *procedure + ), + + TP_ARGS(rqst, procedure), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, xid) + __field(u32, procedure) + ), + + TP_fast_assign( + __entry->task_id = rqst->rq_task->tk_pid; + __entry->client_id = rqst->rq_task->tk_client->cl_clid; + __entry->procedure = be32_to_cpup(procedure); + ), + + TP_printk("task:%u@%u xid=0x%08x procedure=%u", + __entry->task_id, __entry->client_id, __entry->xid, + __entry->procedure + ) +); + TRACE_EVENT(xprtrdma_fixup, TP_PROTO( const struct rpc_rqst *rqst, diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 0f5120c7668f..c178f93aa40b 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1322,20 +1322,13 @@ rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, p = xdr_inline_decode(xdr, 2 * sizeof(*p)); if (!p) break; - dprintk("RPC: %s: server reports " - "version error (%u-%u), xid %08x\n", __func__, - be32_to_cpup(p), be32_to_cpu(*(p + 1)), - be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_err_vers(rqst, p, p + 1); break; case err_chunk: - dprintk("RPC: %s: server reports " - "header decoding error, xid %08x\n", __func__, - be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_err_chunk(rqst); break; default: - dprintk("RPC: %s: server reports " - "unrecognized error %d, xid %08x\n", __func__, - be32_to_cpup(p), be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_err_unrecognized(rqst, p); } return -EIO; From af5865d2783958294179da56a4d073a9630b3068 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 9 Nov 2020 14:39:21 -0500 Subject: [PATCH 02/90] xprtrdma: Introduce Receive completion IDs Set up a completion ID in each rpcrdma_rep. The ID is used to match an incoming Receive completion to a transport and to a previous ib_post_recv(). Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 46 ++++++--------------------------- net/sunrpc/xprtrdma/verbs.c | 6 ++++- net/sunrpc/xprtrdma/xprt_rdma.h | 5 ++++ 3 files changed, 18 insertions(+), 39 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index d5e66428e27e..1c91c8e721e7 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -771,15 +771,17 @@ TRACE_EVENT(xprtrdma_post_recv, TP_ARGS(rep), TP_STRUCT__entry( - __field(const void *, rep) + __field(u32, cq_id) + __field(int, completion_id) ), TP_fast_assign( - __entry->rep = rep; + __entry->cq_id = rep->rr_cid.ci_queue_id; + __entry->completion_id = rep->rr_cid.ci_completion_id; ), - TP_printk("rep=%p", - __entry->rep + TP_printk("cq.id=%d cid=%d", + __entry->cq_id, __entry->completion_id ) ); @@ -845,6 +847,8 @@ TRACE_EVENT(xprtrdma_post_linv, ** Completion events **/ +DEFINE_COMPLETION_EVENT(xprtrdma_wc_receive); + TRACE_EVENT(xprtrdma_wc_send, TP_PROTO( const struct rpcrdma_sendctx *sc, @@ -876,40 +880,6 @@ TRACE_EVENT(xprtrdma_wc_send, ) ); -TRACE_EVENT(xprtrdma_wc_receive, - TP_PROTO( - const struct ib_wc *wc - ), - - TP_ARGS(wc), - - TP_STRUCT__entry( - __field(const void *, rep) - __field(u32, byte_len) - __field(unsigned int, status) - __field(u32, vendor_err) - ), - - TP_fast_assign( - __entry->rep = container_of(wc->wr_cqe, struct rpcrdma_rep, - rr_cqe); - __entry->status = wc->status; - if (wc->status) { - __entry->byte_len = 0; - __entry->vendor_err = wc->vendor_err; - } else { - __entry->byte_len = wc->byte_len; - __entry->vendor_err = 0; - } - ), - - TP_printk("rep=%p %u bytes: %s (%u/0x%x)", - __entry->rep, __entry->byte_len, - rdma_show_wc_status(__entry->status), - __entry->status, __entry->vendor_err - ) -); - DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_fastreg); DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li); DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li_wake); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index ad6e2e4994ce..2c8d2801ec4f 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -186,7 +186,7 @@ static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_xprt *r_xprt = cq->cq_context; /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_receive(wc); + trace_xprtrdma_wc_receive(wc, &rep->rr_cid); --r_xprt->rx_ep->re_receive_count; if (wc->status != IB_WC_SUCCESS) goto out_flushed; @@ -972,6 +972,9 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) goto out_free_regbuf; + rep->rr_cid.ci_completion_id = + atomic_inc_return(&r_xprt->rx_ep->re_completion_ids); + xdr_buf_init(&rep->rr_hdrbuf, rdmab_data(rep->rr_rdmabuf), rdmab_length(rep->rr_rdmabuf)); rep->rr_cqe.done = rpcrdma_wc_receive; @@ -1411,6 +1414,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) if (!rep) break; + rep->rr_cid.ci_queue_id = ep->re_attr.recv_cq->res.id; trace_xprtrdma_post_recv(rep); rep->rr_recv_wr.next = wr; wr = &rep->rr_recv_wr; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 43974ef39a50..b94940bc67aa 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -53,6 +53,7 @@ #include /* RDMA verbs api */ #include /* rpc_xprt */ +#include /* completion IDs */ #include /* RPC/RDMA protocol */ #include /* xprt parameters */ @@ -93,6 +94,8 @@ struct rpcrdma_ep { unsigned int re_max_requests; /* depends on device */ unsigned int re_inline_send; /* negotiated */ unsigned int re_inline_recv; /* negotiated */ + + atomic_t re_completion_ids; }; /* Pre-allocate extra Work Requests for handling backward receives @@ -180,6 +183,8 @@ enum { struct rpcrdma_rep { struct ib_cqe rr_cqe; + struct rpc_rdma_cid rr_cid; + __be32 rr_xid; __be32 rr_vers; __be32 rr_proc; From b2e7467f26d7813d98cbaad5e62b54960f2c071b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 9 Nov 2020 14:39:26 -0500 Subject: [PATCH 03/90] xprtrdma: Introduce Send completion IDs Set up a completion ID in each rpcrdma_req. The ID is used to match an incoming Send completion to a transport and to a previous ib_post_send(). Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 47 +++++++-------------------------- net/sunrpc/xprtrdma/verbs.c | 5 +++- net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 3 files changed, 14 insertions(+), 39 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 1c91c8e721e7..ab239f4f924e 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -735,8 +735,8 @@ TRACE_EVENT(xprtrdma_post_send, TP_ARGS(req), TP_STRUCT__entry( - __field(const void *, req) - __field(const void *, sc) + __field(u32, cq_id) + __field(int, completion_id) __field(unsigned int, task_id) __field(unsigned int, client_id) __field(int, num_sge) @@ -745,20 +745,21 @@ TRACE_EVENT(xprtrdma_post_send, TP_fast_assign( const struct rpc_rqst *rqst = &req->rl_slot; + const struct rpcrdma_sendctx *sc = req->rl_sendctx; + __entry->cq_id = sc->sc_cid.ci_queue_id; + __entry->completion_id = sc->sc_cid.ci_completion_id; __entry->task_id = rqst->rq_task->tk_pid; __entry->client_id = rqst->rq_task->tk_client ? rqst->rq_task->tk_client->cl_clid : -1; - __entry->req = req; - __entry->sc = req->rl_sendctx; __entry->num_sge = req->rl_wr.num_sge; __entry->signaled = req->rl_wr.send_flags & IB_SEND_SIGNALED; ), - TP_printk("task:%u@%u req=%p sc=%p (%d SGE%s) %s", + TP_printk("task:%u@%u cq.id=%u cid=%d (%d SGE%s) %s", __entry->task_id, __entry->client_id, - __entry->req, __entry->sc, __entry->num_sge, - (__entry->num_sge == 1 ? "" : "s"), + __entry->cq_id, __entry->completion_id, + __entry->num_sge, (__entry->num_sge == 1 ? "" : "s"), (__entry->signaled ? "signaled" : "") ) ); @@ -848,37 +849,7 @@ TRACE_EVENT(xprtrdma_post_linv, **/ DEFINE_COMPLETION_EVENT(xprtrdma_wc_receive); - -TRACE_EVENT(xprtrdma_wc_send, - TP_PROTO( - const struct rpcrdma_sendctx *sc, - const struct ib_wc *wc - ), - - TP_ARGS(sc, wc), - - TP_STRUCT__entry( - __field(const void *, req) - __field(const void *, sc) - __field(unsigned int, unmap_count) - __field(unsigned int, status) - __field(unsigned int, vendor_err) - ), - - TP_fast_assign( - __entry->req = sc->sc_req; - __entry->sc = sc; - __entry->unmap_count = sc->sc_unmap_count; - __entry->status = wc->status; - __entry->vendor_err = __entry->status ? wc->vendor_err : 0; - ), - - TP_printk("req=%p sc=%p unmapped=%u: %s (%u/0x%x)", - __entry->req, __entry->sc, __entry->unmap_count, - rdma_show_wc_status(__entry->status), - __entry->status, __entry->vendor_err - ) -); +DEFINE_COMPLETION_EVENT(xprtrdma_wc_send); DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_fastreg); DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 2c8d2801ec4f..63837b5d14e5 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -167,7 +167,7 @@ static void rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_xprt *r_xprt = cq->cq_context; /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_send(sc, wc); + trace_xprtrdma_wc_send(wc, &sc->sc_cid); rpcrdma_sendctx_put_locked(r_xprt, sc); rpcrdma_flush_disconnect(r_xprt, wc); } @@ -643,6 +643,9 @@ static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep) return NULL; sc->sc_cqe.done = rpcrdma_wc_send; + sc->sc_cid.ci_queue_id = ep->re_attr.send_cq->res.id; + sc->sc_cid.ci_completion_id = + atomic_inc_return(&ep->re_completion_ids); return sc; } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index b94940bc67aa..4eb8e32b9f4a 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -216,6 +216,7 @@ enum { struct rpcrdma_req; struct rpcrdma_sendctx { struct ib_cqe sc_cqe; + struct rpc_rdma_cid sc_cid; struct rpcrdma_req *sc_req; unsigned int sc_unmap_count; struct ib_sge sc_sges[]; From 5ecef9c84366955f61e379140c1065d576c66ada Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 9 Nov 2020 14:39:31 -0500 Subject: [PATCH 04/90] xprtrdma: Introduce FRWR completion IDs Set up a completion ID in each rpcrdma_frwr. The ID is used to match an incoming completion to a transport (CQ) and other MR-related activity. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 44 +++------------------------------ net/sunrpc/xprtrdma/frwr_ops.c | 29 ++++++++++++++++------ net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 3 files changed, 27 insertions(+), 47 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index ab239f4f924e..9e30f8aa3562 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -261,41 +261,6 @@ DECLARE_EVENT_CLASS(xprtrdma_wrch_event, ), \ TP_ARGS(task, mr, nsegs)) -DECLARE_EVENT_CLASS(xprtrdma_frwr_done, - TP_PROTO( - const struct ib_wc *wc, - const struct rpcrdma_frwr *frwr - ), - - TP_ARGS(wc, frwr), - - TP_STRUCT__entry( - __field(u32, mr_id) - __field(unsigned int, status) - __field(unsigned int, vendor_err) - ), - - TP_fast_assign( - __entry->mr_id = frwr->fr_mr->res.id; - __entry->status = wc->status; - __entry->vendor_err = __entry->status ? wc->vendor_err : 0; - ), - - TP_printk( - "mr.id=%u: %s (%u/0x%x)", - __entry->mr_id, rdma_show_wc_status(__entry->status), - __entry->status, __entry->vendor_err - ) -); - -#define DEFINE_FRWR_DONE_EVENT(name) \ - DEFINE_EVENT(xprtrdma_frwr_done, name, \ - TP_PROTO( \ - const struct ib_wc *wc, \ - const struct rpcrdma_frwr *frwr \ - ), \ - TP_ARGS(wc, frwr)) - TRACE_DEFINE_ENUM(DMA_BIDIRECTIONAL); TRACE_DEFINE_ENUM(DMA_TO_DEVICE); TRACE_DEFINE_ENUM(DMA_FROM_DEVICE); @@ -850,11 +815,10 @@ TRACE_EVENT(xprtrdma_post_linv, DEFINE_COMPLETION_EVENT(xprtrdma_wc_receive); DEFINE_COMPLETION_EVENT(xprtrdma_wc_send); - -DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_fastreg); -DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li); -DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li_wake); -DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li_done); +DEFINE_COMPLETION_EVENT(xprtrdma_wc_fastreg); +DEFINE_COMPLETION_EVENT(xprtrdma_wc_li); +DEFINE_COMPLETION_EVENT(xprtrdma_wc_li_wake); +DEFINE_COMPLETION_EVENT(xprtrdma_wc_li_done); TRACE_EVENT(xprtrdma_frwr_alloc, TP_PROTO( diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 44888f5badef..2cc6862a52dc 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -363,12 +363,21 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) container_of(cqe, struct rpcrdma_frwr, fr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_fastreg(wc, frwr); + trace_xprtrdma_wc_fastreg(wc, &frwr->fr_cid); /* The MR will get recycled when the associated req is retransmitted */ rpcrdma_flush_disconnect(cq->cq_context, wc); } +static void frwr_cid_init(struct rpcrdma_ep *ep, + struct rpcrdma_frwr *frwr) +{ + struct rpc_rdma_cid *cid = &frwr->fr_cid; + + cid->ci_queue_id = ep->re_attr.send_cq->res.id; + cid->ci_completion_id = frwr->fr_mr->res.id; +} + /** * frwr_send - post Send WRs containing the RPC Call message * @r_xprt: controlling transport instance @@ -385,6 +394,7 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) */ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { + struct rpcrdma_ep *ep = r_xprt->rx_ep; struct ib_send_wr *post_wr; struct rpcrdma_mr *mr; @@ -395,6 +405,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) frwr = &mr->frwr; frwr->fr_cqe.done = frwr_wc_fastreg; + frwr_cid_init(ep, frwr); frwr->fr_regwr.wr.next = post_wr; frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe; frwr->fr_regwr.wr.num_sge = 0; @@ -404,7 +415,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) post_wr = &frwr->fr_regwr.wr; } - return ib_post_send(r_xprt->rx_ep->re_id->qp, post_wr, NULL); + return ib_post_send(ep->re_id->qp, post_wr, NULL); } /** @@ -448,7 +459,7 @@ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_li(wc, frwr); + trace_xprtrdma_wc_li(wc, &frwr->fr_cid); __frwr_release_mr(wc, mr); rpcrdma_flush_disconnect(cq->cq_context, wc); @@ -469,7 +480,7 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_li_wake(wc, frwr); + trace_xprtrdma_wc_li_wake(wc, &frwr->fr_cid); __frwr_release_mr(wc, mr); complete(&frwr->fr_linv_done); @@ -490,6 +501,7 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { struct ib_send_wr *first, **prev, *last; + struct rpcrdma_ep *ep = r_xprt->rx_ep; const struct ib_send_wr *bad_wr; struct rpcrdma_frwr *frwr; struct rpcrdma_mr *mr; @@ -509,6 +521,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) frwr = &mr->frwr; frwr->fr_cqe.done = frwr_wc_localinv; + frwr_cid_init(ep, frwr); last = &frwr->fr_invwr; last->next = NULL; last->wr_cqe = &frwr->fr_cqe; @@ -534,7 +547,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * unless re_id->qp is a valid pointer. */ bad_wr = NULL; - rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr); + rc = ib_post_send(ep->re_id->qp, first, &bad_wr); /* The final LOCAL_INV WR in the chain is supposed to * do the wake. If it was never posted, the wake will @@ -574,7 +587,7 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_rep *rep = mr->mr_req->rl_reply; /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_li_done(wc, frwr); + trace_xprtrdma_wc_li_done(wc, &frwr->fr_cid); __frwr_release_mr(wc, mr); /* Ensure @rep is generated before __frwr_release_mr */ @@ -597,6 +610,7 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { struct ib_send_wr *first, *last, **prev; + struct rpcrdma_ep *ep = r_xprt->rx_ep; const struct ib_send_wr *bad_wr; struct rpcrdma_frwr *frwr; struct rpcrdma_mr *mr; @@ -614,6 +628,7 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) frwr = &mr->frwr; frwr->fr_cqe.done = frwr_wc_localinv; + frwr_cid_init(ep, frwr); last = &frwr->fr_invwr; last->next = NULL; last->wr_cqe = &frwr->fr_cqe; @@ -639,7 +654,7 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * unless re_id->qp is a valid pointer. */ bad_wr = NULL; - rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr); + rc = ib_post_send(ep->re_id->qp, first, &bad_wr); if (!rc) return; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 4eb8e32b9f4a..cef9d0f2e2c8 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -231,6 +231,7 @@ struct rpcrdma_sendctx { struct rpcrdma_frwr { struct ib_mr *fr_mr; struct ib_cqe fr_cqe; + struct rpc_rdma_cid fr_cid; struct completion fr_linv_done; union { struct ib_reg_wr fr_regwr; From 36a55edfc3d5b1c2735c088bcb06967de103f299 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 9 Nov 2020 14:39:37 -0500 Subject: [PATCH 05/90] xprtrdma: Clean up trace_xprtrdma_post_linv - Replace the display of kernel memory addresses - Add "_err" to the end of its name to indicate that it's a tracepoint that fires only when there's an error Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 16 +++++++++------- net/sunrpc/xprtrdma/frwr_ops.c | 4 ++-- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 9e30f8aa3562..b0750c0d2753 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -784,7 +784,7 @@ TRACE_EVENT(xprtrdma_post_recvs, ) ); -TRACE_EVENT(xprtrdma_post_linv, +TRACE_EVENT(xprtrdma_post_linv_err, TP_PROTO( const struct rpcrdma_req *req, int status @@ -793,19 +793,21 @@ TRACE_EVENT(xprtrdma_post_linv, TP_ARGS(req, status), TP_STRUCT__entry( - __field(const void *, req) + __field(unsigned int, task_id) + __field(unsigned int, client_id) __field(int, status) - __field(u32, xid) ), TP_fast_assign( - __entry->req = req; + const struct rpc_task *task = req->rl_slot.rq_task; + + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client->cl_clid; __entry->status = status; - __entry->xid = be32_to_cpu(req->rl_slot.rq_xid); ), - TP_printk("req=%p xid=0x%08x status=%d", - __entry->req, __entry->xid, __entry->status + TP_printk("task:%u@%u status=%d", + __entry->task_id, __entry->client_id, __entry->status ) ); diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 2cc6862a52dc..76322b1acf3d 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -560,7 +560,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* Recycle MRs in the LOCAL_INV chain that did not get posted. */ - trace_xprtrdma_post_linv(req, rc); + trace_xprtrdma_post_linv_err(req, rc); while (bad_wr) { frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr); @@ -660,7 +660,7 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* Recycle MRs in the LOCAL_INV chain that did not get posted. */ - trace_xprtrdma_post_linv(req, rc); + trace_xprtrdma_post_linv_err(req, rc); while (bad_wr) { frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr); mr = container_of(frwr, struct rpcrdma_mr, frwr); From 3a9568fedccc6cf26c1a87621c3bfed7b7432119 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 9 Nov 2020 14:39:42 -0500 Subject: [PATCH 06/90] xprtrdma: Clean up reply parsing error tracepoints - Rename the tracepoints with the "_err" suffix to indicate these are rare error events - Replace display of kernel memory addresses - Tie the XID and error to a connection IP address instead Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 27 ++++++++++++++------------- net/sunrpc/xprtrdma/rpc_rdma.c | 10 +++++----- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index b0750c0d2753..93d717d8139f 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -60,7 +60,7 @@ DECLARE_EVENT_CLASS(rpcrdma_completion_class, ), \ TP_ARGS(wc, cid)) -DECLARE_EVENT_CLASS(xprtrdma_reply_event, +DECLARE_EVENT_CLASS(xprtrdma_reply_class, TP_PROTO( const struct rpcrdma_rep *rep ), @@ -68,29 +68,30 @@ DECLARE_EVENT_CLASS(xprtrdma_reply_event, TP_ARGS(rep), TP_STRUCT__entry( - __field(const void *, rep) - __field(const void *, r_xprt) __field(u32, xid) __field(u32, version) __field(u32, proc) + __string(addr, rpcrdma_addrstr(rep->rr_rxprt)) + __string(port, rpcrdma_portstr(rep->rr_rxprt)) ), TP_fast_assign( - __entry->rep = rep; - __entry->r_xprt = rep->rr_rxprt; __entry->xid = be32_to_cpu(rep->rr_xid); __entry->version = be32_to_cpu(rep->rr_vers); __entry->proc = be32_to_cpu(rep->rr_proc); + __assign_str(addr, rpcrdma_addrstr(rep->rr_rxprt)); + __assign_str(port, rpcrdma_portstr(rep->rr_rxprt)); ), - TP_printk("rxprt %p xid=0x%08x rep=%p: version %u proc %u", - __entry->r_xprt, __entry->xid, __entry->rep, - __entry->version, __entry->proc + TP_printk("peer=[%s]:%s xid=0x%08x version=%u proc=%u", + __get_str(addr), __get_str(port), + __entry->xid, __entry->version, __entry->proc ) ); #define DEFINE_REPLY_EVENT(name) \ - DEFINE_EVENT(xprtrdma_reply_event, name, \ + DEFINE_EVENT(xprtrdma_reply_class, \ + xprtrdma_reply_##name##_err, \ TP_PROTO( \ const struct rpcrdma_rep *rep \ ), \ @@ -1030,10 +1031,10 @@ TRACE_EVENT(xprtrdma_defer_cmp, ) ); -DEFINE_REPLY_EVENT(xprtrdma_reply_vers); -DEFINE_REPLY_EVENT(xprtrdma_reply_rqst); -DEFINE_REPLY_EVENT(xprtrdma_reply_short); -DEFINE_REPLY_EVENT(xprtrdma_reply_hdr); +DEFINE_REPLY_EVENT(vers); +DEFINE_REPLY_EVENT(rqst); +DEFINE_REPLY_EVENT(short); +DEFINE_REPLY_EVENT(hdr); TRACE_EVENT(xprtrdma_err_vers, TP_PROTO( diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index c178f93aa40b..29f847c8f609 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* - * Copyright (c) 2014-2017 Oracle. All rights reserved. + * Copyright (c) 2014-2020, Oracle and/or its affiliates. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -1369,7 +1369,7 @@ out: return; out_badheader: - trace_xprtrdma_reply_hdr(rep); + trace_xprtrdma_reply_hdr_err(rep); r_xprt->rx_stats.bad_reply_count++; rqst->rq_task->tk_status = status; status = 0; @@ -1462,16 +1462,16 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) return; out_badversion: - trace_xprtrdma_reply_vers(rep); + trace_xprtrdma_reply_vers_err(rep); goto out; out_norqst: spin_unlock(&xprt->queue_lock); - trace_xprtrdma_reply_rqst(rep); + trace_xprtrdma_reply_rqst_err(rep); goto out; out_shortreply: - trace_xprtrdma_reply_short(rep); + trace_xprtrdma_reply_short_err(rep); out: rpcrdma_recv_buffer_put(rep); From 03ffd92494a53dcc4b98c909ae1f6787d1fec646 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 9 Nov 2020 14:39:47 -0500 Subject: [PATCH 07/90] xprtrdma: Clean up tracepoints in the reply path Replace unnecessary display of kernel memory addresses. Also, there are no longer any trace_xprtrdma_defer_cmp() call sites. And remove the trace_xprtrdma_leaked_rep() tracepoint because there doesn't seem to be an overwhelming need to have a tracepoint for catching a software bug that has long since been fixed. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 66 ++-------------------------------- net/sunrpc/xprtrdma/rpc_rdma.c | 6 ++-- 2 files changed, 5 insertions(+), 67 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 93d717d8139f..c28bf17e769b 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -974,17 +974,14 @@ TRACE_EVENT(xprtrdma_reply, TP_PROTO( const struct rpc_task *task, const struct rpcrdma_rep *rep, - const struct rpcrdma_req *req, unsigned int credits ), - TP_ARGS(task, rep, req, credits), + TP_ARGS(task, rep, credits), TP_STRUCT__entry( __field(unsigned int, task_id) __field(unsigned int, client_id) - __field(const void *, rep) - __field(const void *, req) __field(u32, xid) __field(unsigned int, credits) ), @@ -992,42 +989,13 @@ TRACE_EVENT(xprtrdma_reply, TP_fast_assign( __entry->task_id = task->tk_pid; __entry->client_id = task->tk_client->cl_clid; - __entry->rep = rep; - __entry->req = req; __entry->xid = be32_to_cpu(rep->rr_xid); __entry->credits = credits; ), - TP_printk("task:%u@%u xid=0x%08x, %u credits, rep=%p -> req=%p", + TP_printk("task:%u@%u xid=0x%08x credits=%u", __entry->task_id, __entry->client_id, __entry->xid, - __entry->credits, __entry->rep, __entry->req - ) -); - -TRACE_EVENT(xprtrdma_defer_cmp, - TP_PROTO( - const struct rpcrdma_rep *rep - ), - - TP_ARGS(rep), - - TP_STRUCT__entry( - __field(unsigned int, task_id) - __field(unsigned int, client_id) - __field(const void *, rep) - __field(u32, xid) - ), - - TP_fast_assign( - __entry->task_id = rep->rr_rqst->rq_task->tk_pid; - __entry->client_id = rep->rr_rqst->rq_task->tk_client->cl_clid; - __entry->rep = rep; - __entry->xid = be32_to_cpu(rep->rr_xid); - ), - - TP_printk("task:%u@%u xid=0x%08x rep=%p", - __entry->task_id, __entry->client_id, __entry->xid, - __entry->rep + __entry->credits ) ); @@ -1212,34 +1180,6 @@ TRACE_EVENT(xprtrdma_cb_setup, DEFINE_CB_EVENT(xprtrdma_cb_call); DEFINE_CB_EVENT(xprtrdma_cb_reply); -TRACE_EVENT(xprtrdma_leaked_rep, - TP_PROTO( - const struct rpc_rqst *rqst, - const struct rpcrdma_rep *rep - ), - - TP_ARGS(rqst, rep), - - TP_STRUCT__entry( - __field(unsigned int, task_id) - __field(unsigned int, client_id) - __field(u32, xid) - __field(const void *, rep) - ), - - TP_fast_assign( - __entry->task_id = rqst->rq_task->tk_pid; - __entry->client_id = rqst->rq_task->tk_client->cl_clid; - __entry->xid = be32_to_cpu(rqst->rq_xid); - __entry->rep = rep; - ), - - TP_printk("task:%u@%u xid=0x%08x rep=%p", - __entry->task_id, __entry->client_id, __entry->xid, - __entry->rep - ) -); - /** ** Server-side RPC/RDMA events **/ diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 29f847c8f609..8078559bdc31 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1443,14 +1443,12 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) rpcrdma_post_recvs(r_xprt, false); req = rpcr_to_rdmar(rqst); - if (req->rl_reply) { - trace_xprtrdma_leaked_rep(rqst, req->rl_reply); + if (unlikely(req->rl_reply)) rpcrdma_recv_buffer_put(req->rl_reply); - } req->rl_reply = rep; rep->rr_rqst = rqst; - trace_xprtrdma_reply(rqst->rq_task, rep, req, credits); + trace_xprtrdma_reply(rqst->rq_task, rep, credits); if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) frwr_reminv(rep, &req->rl_registered); From d11e934606ef6ce37a4bcbe89f34faf37347abb1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 9 Nov 2020 14:39:53 -0500 Subject: [PATCH 08/90] xprtrdma: Clean up xprtrdma callback tracepoints - Replace displayed kernel memory addresses - Tie the XID and event with the peer's IP address Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 31 ++++++++++++++++--------------- net/sunrpc/xprtrdma/backchannel.c | 6 +++--- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index c28bf17e769b..6bdbe1165270 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -313,38 +313,39 @@ DECLARE_EVENT_CLASS(xprtrdma_mr, ), \ TP_ARGS(mr)) -DECLARE_EVENT_CLASS(xprtrdma_cb_event, +DECLARE_EVENT_CLASS(xprtrdma_callback_class, TP_PROTO( + const struct rpcrdma_xprt *r_xprt, const struct rpc_rqst *rqst ), - TP_ARGS(rqst), + TP_ARGS(r_xprt, rqst), TP_STRUCT__entry( - __field(const void *, rqst) - __field(const void *, rep) - __field(const void *, req) __field(u32, xid) + __string(addr, rpcrdma_addrstr(r_xprt)) + __string(port, rpcrdma_portstr(r_xprt)) ), TP_fast_assign( - __entry->rqst = rqst; - __entry->req = rpcr_to_rdmar(rqst); - __entry->rep = rpcr_to_rdmar(rqst)->rl_reply; __entry->xid = be32_to_cpu(rqst->rq_xid); + __assign_str(addr, rpcrdma_addrstr(r_xprt)); + __assign_str(port, rpcrdma_portstr(r_xprt)); ), - TP_printk("xid=0x%08x, rqst=%p req=%p rep=%p", - __entry->xid, __entry->rqst, __entry->req, __entry->rep + TP_printk("peer=[%s]:%s xid=0x%08x", + __get_str(addr), __get_str(port), __entry->xid ) ); -#define DEFINE_CB_EVENT(name) \ - DEFINE_EVENT(xprtrdma_cb_event, name, \ +#define DEFINE_CALLBACK_EVENT(name) \ + DEFINE_EVENT(xprtrdma_callback_class, \ + xprtrdma_cb_##name, \ TP_PROTO( \ + const struct rpcrdma_xprt *r_xprt, \ const struct rpc_rqst *rqst \ ), \ - TP_ARGS(rqst)) + TP_ARGS(r_xprt, rqst)) /** ** Connection events @@ -1177,8 +1178,8 @@ TRACE_EVENT(xprtrdma_cb_setup, ) ); -DEFINE_CB_EVENT(xprtrdma_cb_call); -DEFINE_CB_EVENT(xprtrdma_cb_reply); +DEFINE_CALLBACK_EVENT(call); +DEFINE_CALLBACK_EVENT(reply); /** ** Server-side RPC/RDMA events diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index c92c1aac270a..946edf2db646 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2015-2020, Oracle and/or its affiliates. * * Support for backward direction RPCs on RPC/RDMA. */ @@ -82,7 +82,7 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) &rqst->rq_snd_buf, rpcrdma_noch_pullup)) return -EIO; - trace_xprtrdma_cb_reply(rqst); + trace_xprtrdma_cb_reply(r_xprt, rqst); return 0; } @@ -260,7 +260,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, */ req = rpcr_to_rdmar(rqst); req->rl_reply = rep; - trace_xprtrdma_cb_call(rqst); + trace_xprtrdma_cb_call(r_xprt, rqst); /* Queue rqst for ULP's callback service */ bc_serv = xprt->bc_serv; From 0307cdec7c3412d7665363ab0cd61fccf82bfb2d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 9 Nov 2020 14:39:58 -0500 Subject: [PATCH 09/90] xprtrdma: Clean up trace_xprtrdma_nomrs() - Rename it following the "_err" suffix convention - Replace display of kernel memory addresses - Tie MR exhaustion to a peer IP address, similar to the createmrs tracepoint Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 19 ++++++++++--------- net/sunrpc/xprtrdma/rpc_rdma.c | 2 +- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 6bdbe1165270..4fcda2a25bb8 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -545,32 +545,33 @@ TRACE_EVENT(xprtrdma_mr_get, ) ); -TRACE_EVENT(xprtrdma_nomrs, +TRACE_EVENT(xprtrdma_nomrs_err, TP_PROTO( + const struct rpcrdma_xprt *r_xprt, const struct rpcrdma_req *req ), - TP_ARGS(req), + TP_ARGS(r_xprt, req), TP_STRUCT__entry( - __field(const void *, req) __field(unsigned int, task_id) __field(unsigned int, client_id) - __field(u32, xid) + __string(addr, rpcrdma_addrstr(r_xprt)) + __string(port, rpcrdma_portstr(r_xprt)) ), TP_fast_assign( const struct rpc_rqst *rqst = &req->rl_slot; - __entry->req = req; __entry->task_id = rqst->rq_task->tk_pid; __entry->client_id = rqst->rq_task->tk_client->cl_clid; - __entry->xid = be32_to_cpu(rqst->rq_xid); + __assign_str(addr, rpcrdma_addrstr(r_xprt)); + __assign_str(port, rpcrdma_portstr(r_xprt)); ), - TP_printk("task:%u@%u xid=0x%08x req=%p", - __entry->task_id, __entry->client_id, __entry->xid, - __entry->req + TP_printk("peer=[%s]:%s task:%u@%u", + __get_str(addr), __get_str(port), + __entry->task_id, __entry->client_id ) ); diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 8078559bdc31..f27eb2322b38 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -323,7 +323,7 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr); out_getmr_err: - trace_xprtrdma_nomrs(req); + trace_xprtrdma_nomrs_err(r_xprt, req); xprt_wait_for_buffer_space(&r_xprt->rx_xprt); rpcrdma_mrs_refresh(r_xprt); return ERR_PTR(-EAGAIN); From 7703db978d4cf7c51426183b7c0d03c039757a44 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 9 Nov 2020 14:40:03 -0500 Subject: [PATCH 10/90] xprtrdma: Display the task ID when reporting MR events Tie each MR event to the requesting rpc_task to make it easier to follow MR ownership and control flow. MR unmapping and recycling can happen in the background, after an MR's mr_req field is stale, so set up a separate tracepoint class for those events. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 96 ++++++++++++++++++++-------------- net/sunrpc/xprtrdma/frwr_ops.c | 1 - net/sunrpc/xprtrdma/rpc_rdma.c | 1 - 3 files changed, 58 insertions(+), 40 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 4fcda2a25bb8..166bbeef996c 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -274,7 +274,55 @@ TRACE_DEFINE_ENUM(DMA_NONE); { DMA_FROM_DEVICE, "FROM_DEVICE" }, \ { DMA_NONE, "NONE" }) -DECLARE_EVENT_CLASS(xprtrdma_mr, +DECLARE_EVENT_CLASS(xprtrdma_mr_class, + TP_PROTO( + const struct rpcrdma_mr *mr + ), + + TP_ARGS(mr), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, mr_id) + __field(int, nents) + __field(u32, handle) + __field(u32, length) + __field(u64, offset) + __field(u32, dir) + ), + + TP_fast_assign( + const struct rpcrdma_req *req = mr->mr_req; + const struct rpc_task *task = req->rl_slot.rq_task; + + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client->cl_clid; + __entry->mr_id = mr->frwr.fr_mr->res.id; + __entry->nents = mr->mr_nents; + __entry->handle = mr->mr_handle; + __entry->length = mr->mr_length; + __entry->offset = mr->mr_offset; + __entry->dir = mr->mr_dir; + ), + + TP_printk("task:%u@%u mr.id=%u nents=%d %u@0x%016llx:0x%08x (%s)", + __entry->task_id, __entry->client_id, + __entry->mr_id, __entry->nents, __entry->length, + (unsigned long long)__entry->offset, __entry->handle, + xprtrdma_show_direction(__entry->dir) + ) +); + +#define DEFINE_MR_EVENT(name) \ + DEFINE_EVENT(xprtrdma_mr_class, \ + xprtrdma_mr_##name, \ + TP_PROTO( \ + const struct rpcrdma_mr *mr \ + ), \ + TP_ARGS(mr)) + +DECLARE_EVENT_CLASS(xprtrdma_anonymous_mr_class, TP_PROTO( const struct rpcrdma_mr *mr ), @@ -306,11 +354,12 @@ DECLARE_EVENT_CLASS(xprtrdma_mr, ) ); -#define DEFINE_MR_EVENT(name) \ - DEFINE_EVENT(xprtrdma_mr, xprtrdma_mr_##name, \ - TP_PROTO( \ - const struct rpcrdma_mr *mr \ - ), \ +#define DEFINE_ANON_MR_EVENT(name) \ + DEFINE_EVENT(xprtrdma_anonymous_mr_class, \ + xprtrdma_mr_##name, \ + TP_PROTO( \ + const struct rpcrdma_mr *mr \ + ), \ TP_ARGS(mr)) DECLARE_EVENT_CLASS(xprtrdma_callback_class, @@ -516,35 +565,6 @@ TRACE_EVENT(xprtrdma_createmrs, ) ); -TRACE_EVENT(xprtrdma_mr_get, - TP_PROTO( - const struct rpcrdma_req *req - ), - - TP_ARGS(req), - - TP_STRUCT__entry( - __field(const void *, req) - __field(unsigned int, task_id) - __field(unsigned int, client_id) - __field(u32, xid) - ), - - TP_fast_assign( - const struct rpc_rqst *rqst = &req->rl_slot; - - __entry->req = req; - __entry->task_id = rqst->rq_task->tk_pid; - __entry->client_id = rqst->rq_task->tk_client->cl_clid; - __entry->xid = be32_to_cpu(rqst->rq_xid); - ), - - TP_printk("task:%u@%u xid=0x%08x req=%p", - __entry->task_id, __entry->client_id, __entry->xid, - __entry->req - ) -); - TRACE_EVENT(xprtrdma_nomrs_err, TP_PROTO( const struct rpcrdma_xprt *r_xprt, @@ -946,9 +966,9 @@ TRACE_EVENT(xprtrdma_frwr_maperr, DEFINE_MR_EVENT(localinv); DEFINE_MR_EVENT(map); -DEFINE_MR_EVENT(unmap); -DEFINE_MR_EVENT(reminv); -DEFINE_MR_EVENT(recycle); + +DEFINE_ANON_MR_EVENT(unmap); +DEFINE_ANON_MR_EVENT(recycle); TRACE_EVENT(xprtrdma_dma_maperr, TP_PROTO( diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 76322b1acf3d..cb2f92409c2f 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -431,7 +431,6 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) list_for_each_entry(mr, mrs, mr_list) if (mr->mr_handle == rep->rr_inv_rkey) { list_del_init(&mr->mr_list); - trace_xprtrdma_mr_reminv(mr); rpcrdma_mr_put(mr); break; /* only one invalidated MR per RPC */ } diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index f27eb2322b38..9ed89872ec75 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -315,7 +315,6 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, *mr = rpcrdma_mr_get(r_xprt); if (!*mr) goto out_getmr_err; - trace_xprtrdma_mr_get(req); (*mr)->mr_req = req; } From 8e24e191d44f49f08f857f0ebc6fe91961cd1a09 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 9 Nov 2020 14:40:08 -0500 Subject: [PATCH 11/90] xprtrdma: Trace unmap_sync calls ->buf_free is called nearly once per RPC. Only rarely does xprt_rdma_free() have to do anything, thus tracing every one of these calls seems unnecessary. Instead, just throw a trace event when that one occasional RPC still has MRs that need to be released. xprt_rdma_free() is further micro-optimized to reduce the amount of work done in the common case. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 22 ++++++++++++++++++++++ net/sunrpc/xprtrdma/transport.c | 7 ++++--- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 166bbeef996c..69e1caf7e882 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -1167,6 +1167,28 @@ TRACE_EVENT(xprtrdma_decode_seg, ) ); +TRACE_EVENT(xprtrdma_mrs_zap, + TP_PROTO( + const struct rpc_task *task + ), + + TP_ARGS(task), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + ), + + TP_fast_assign( + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client->cl_clid; + ), + + TP_printk("task:%u@%u", + __entry->task_id, __entry->client_id + ) +); + /** ** Callback events **/ diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 8915e42240d3..bb3ed3db6c0a 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -599,11 +599,12 @@ static void xprt_rdma_free(struct rpc_task *task) { struct rpc_rqst *rqst = task->tk_rqstp; - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - if (!list_empty(&req->rl_registered)) - frwr_unmap_sync(r_xprt, req); + if (unlikely(!list_empty(&req->rl_registered))) { + trace_xprtrdma_mrs_zap(task); + frwr_unmap_sync(rpcx_to_rdmax(rqst->rq_xprt), req); + } /* XXX: If the RPC is completing because of a signal and * not because a reply was received, we ought to ensure From ef2be5918ff57bea9cd1e63968c73b1358a765ca Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 9 Nov 2020 14:40:14 -0500 Subject: [PATCH 12/90] xprtrdma: Move rpcrdma_mr_put() Clean up: This function is now invoked only in frwr_ops.c. The move enables deduplication of the trace_xprtrdma_mr_unmap() call site. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 41 ++++++++++++++++++++++----------- net/sunrpc/xprtrdma/verbs.c | 19 --------------- net/sunrpc/xprtrdma/xprt_rdma.h | 1 - 3 files changed, 28 insertions(+), 33 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index cb2f92409c2f..e93b3457b958 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -65,18 +65,23 @@ void frwr_release_mr(struct rpcrdma_mr *mr) kfree(mr); } -static void frwr_mr_recycle(struct rpcrdma_mr *mr) +static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) { - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - - trace_xprtrdma_mr_recycle(mr); - if (mr->mr_dir != DMA_NONE) { trace_xprtrdma_mr_unmap(mr); ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device, mr->mr_sg, mr->mr_nents, mr->mr_dir); mr->mr_dir = DMA_NONE; } +} + +static void frwr_mr_recycle(struct rpcrdma_mr *mr) +{ + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; + + trace_xprtrdma_mr_recycle(mr); + + frwr_mr_unmap(r_xprt, mr); spin_lock(&r_xprt->rx_buf.rb_lock); list_del(&mr->mr_all); @@ -86,6 +91,16 @@ static void frwr_mr_recycle(struct rpcrdma_mr *mr) frwr_release_mr(mr); } +static void frwr_mr_put(struct rpcrdma_mr *mr) +{ + frwr_mr_unmap(mr->mr_xprt, mr); + + /* The MR is returned to the req's MR free list instead + * of to the xprt's MR free list. No spinlock is needed. + */ + rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs); +} + /* frwr_reset - Place MRs back on the free list * @req: request to reset * @@ -101,7 +116,7 @@ void frwr_reset(struct rpcrdma_req *req) struct rpcrdma_mr *mr; while ((mr = rpcrdma_mr_pop(&req->rl_registered))) - rpcrdma_mr_put(mr); + frwr_mr_put(mr); } /** @@ -431,17 +446,17 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) list_for_each_entry(mr, mrs, mr_list) if (mr->mr_handle == rep->rr_inv_rkey) { list_del_init(&mr->mr_list); - rpcrdma_mr_put(mr); + frwr_mr_put(mr); break; /* only one invalidated MR per RPC */ } } -static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr) +static void frwr_mr_done(struct ib_wc *wc, struct rpcrdma_mr *mr) { if (wc->status != IB_WC_SUCCESS) frwr_mr_recycle(mr); else - rpcrdma_mr_put(mr); + frwr_mr_put(mr); } /** @@ -459,7 +474,7 @@ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_li(wc, &frwr->fr_cid); - __frwr_release_mr(wc, mr); + frwr_mr_done(wc, mr); rpcrdma_flush_disconnect(cq->cq_context, wc); } @@ -480,7 +495,7 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_li_wake(wc, &frwr->fr_cid); - __frwr_release_mr(wc, mr); + frwr_mr_done(wc, mr); complete(&frwr->fr_linv_done); rpcrdma_flush_disconnect(cq->cq_context, wc); @@ -587,9 +602,9 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_li_done(wc, &frwr->fr_cid); - __frwr_release_mr(wc, mr); + frwr_mr_done(wc, mr); - /* Ensure @rep is generated before __frwr_release_mr */ + /* Ensure @rep is generated before frwr_mr_done */ smp_rmb(); rpcrdma_complete_rqst(rep); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 63837b5d14e5..ec912cf9c618 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1184,25 +1184,6 @@ rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt) return mr; } -/** - * rpcrdma_mr_put - DMA unmap an MR and release it - * @mr: MR to release - * - */ -void rpcrdma_mr_put(struct rpcrdma_mr *mr) -{ - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - - if (mr->mr_dir != DMA_NONE) { - trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); - mr->mr_dir = DMA_NONE; - } - - rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs); -} - /** * rpcrdma_buffer_get - Get a request buffer * @buffers: Buffer pool from which to obtain a buffer diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index cef9d0f2e2c8..6a45bf241ec0 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -473,7 +473,6 @@ void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt); struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); -void rpcrdma_mr_put(struct rpcrdma_mr *mr); void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt); struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); From 7a03aeb66c410366acc5439ae2a341f110c4f845 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 9 Nov 2020 14:40:19 -0500 Subject: [PATCH 13/90] xprtrdma: Micro-optimize MR DMA-unmapping Now that rpcrdma_ep is no longer part of rpcrdma_xprt, there are four or five serial address dereferences needed to get to the IB device needed for DMA unmapping. Instead, let's use the same pattern that regbufs use: cache a pointer to the device in the MR, and use that as the indication that unmapping is necessary. This also guarantees that the exact same device is used for DMA mapping and unmapping, even if the r_xprt's ep has been replaced. I don't think this can happen today, but future changes might break this assumption. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 12 ++++++------ net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index e93b3457b958..baca49fe83af 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -67,11 +67,11 @@ void frwr_release_mr(struct rpcrdma_mr *mr) static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) { - if (mr->mr_dir != DMA_NONE) { + if (mr->mr_device) { trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); - mr->mr_dir = DMA_NONE; + ib_dma_unmap_sg(mr->mr_device, mr->mr_sg, mr->mr_nents, + mr->mr_dir); + mr->mr_device = NULL; } } @@ -145,7 +145,7 @@ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) mr->mr_xprt = r_xprt; mr->frwr.fr_mr = frmr; - mr->mr_dir = DMA_NONE; + mr->mr_device = NULL; INIT_LIST_HEAD(&mr->mr_list); init_completion(&mr->frwr.fr_linv_done); @@ -330,6 +330,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, mr->mr_dir); if (!dma_nents) goto out_dmamap_err; + mr->mr_device = ep->re_id->device; ibmr = mr->frwr.fr_mr; n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE); @@ -356,7 +357,6 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, return seg; out_dmamap_err: - mr->mr_dir = DMA_NONE; trace_xprtrdma_frwr_sgerr(mr, i); return ERR_PTR(-EIO); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 6a45bf241ec0..94b28657aeeb 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -243,6 +243,7 @@ struct rpcrdma_req; struct rpcrdma_mr { struct list_head mr_list; struct rpcrdma_req *mr_req; + struct ib_device *mr_device; struct scatterlist *mr_sg; int mr_nents; enum dma_data_direction mr_dir; From 0359af7ac318495432e5a06f671c80dc29274f18 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 30 Nov 2020 14:58:19 -0500 Subject: [PATCH 14/90] SUNRPC: Remove XDRBUF_SPARSE_PAGES flag in gss_proxy upcall There's no need to defer allocation of pages for the receive buffer. - This upcall is quite infrequent - gssp_alloc_receive_pages() can allocate the pages with GFP_KERNEL, unlike the transport - gssp_alloc_receive_pages() knows exactly how many pages are needed Signed-off-by: Chuck Lever Reviewed-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- net/sunrpc/auth_gss/gss_rpc_upcall.c | 15 ++++++++++----- net/sunrpc/auth_gss/gss_rpc_xdr.c | 1 - 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index af9c7f43859c..d1c003a25b0f 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -200,7 +200,7 @@ static int gssp_call(struct net *net, struct rpc_message *msg) static void gssp_free_receive_pages(struct gssx_arg_accept_sec_context *arg) { - int i; + unsigned int i; for (i = 0; i < arg->npages && arg->pages[i]; i++) __free_page(arg->pages[i]); @@ -210,14 +210,19 @@ static void gssp_free_receive_pages(struct gssx_arg_accept_sec_context *arg) static int gssp_alloc_receive_pages(struct gssx_arg_accept_sec_context *arg) { + unsigned int i; + arg->npages = DIV_ROUND_UP(NGROUPS_MAX * 4, PAGE_SIZE); arg->pages = kcalloc(arg->npages, sizeof(struct page *), GFP_KERNEL); - /* - * XXX: actual pages are allocated by xdr layer in - * xdr_partial_copy_from_skb. - */ if (!arg->pages) return -ENOMEM; + for (i = 0; i < arg->npages; i++) { + arg->pages[i] = alloc_page(GFP_KERNEL); + if (!arg->pages[i]) { + gssp_free_receive_pages(arg); + return -ENOMEM; + } + } return 0; } diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index 2ff7b7083eba..44838f6ea25e 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -771,7 +771,6 @@ void gssx_enc_accept_sec_context(struct rpc_rqst *req, xdr_inline_pages(&req->rq_rcv_buf, PAGE_SIZE/2 /* pretty arbitrary */, arg->pages, 0 /* page base */, arg->npages * PAGE_SIZE); - req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; done: if (err) dprintk("RPC: gssx_enc_accept_sec_context: %d\n", err); From 5482e09a8840ee3bb1848d73d05301eafc0e6199 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 24 Nov 2020 19:15:18 -0500 Subject: [PATCH 15/90] NFS: Fix rpcrdma_inline_fixup() crash with new LISTXATTRS operation By switching to an XFS-backed export, I am able to reproduce the ibcomp worker crash on my client with xfstests generic/013. For the failing LISTXATTRS operation, xdr_inline_pages() is called with page_len=12 and buflen=128. - When ->send_request() is called, rpcrdma_marshal_req() does not set up a Reply chunk because buflen is smaller than the inline threshold. Thus rpcrdma_convert_iovs() does not get invoked at all and the transport's XDRBUF_SPARSE_PAGES logic is not invoked on the receive buffer. - During reply processing, rpcrdma_inline_fixup() tries to copy received data into rq_rcv_buf->pages because page_len is positive. But there are no receive pages because rpcrdma_marshal_req() never allocated them. The result is that the ibcomp worker faults and dies. Sometimes that causes a visible crash, and sometimes it results in a transport hang without other symptoms. RPC/RDMA's XDRBUF_SPARSE_PAGES support is not entirely correct, and should eventually be fixed or replaced. However, my preference is that upper-layer operations should explicitly allocate their receive buffers (using GFP_KERNEL) when possible, rather than relying on XDRBUF_SPARSE_PAGES. Reported-by: Olga kornievskaia Suggested-by: Olga kornievskaia Fixes: c10a75145feb ("NFSv4.2: add the extended attribute proc functions.") Signed-off-by: Chuck Lever Reviewed-by: Olga kornievskaia Reviewed-by: Frank van der Linden Tested-by: Olga kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/nfs42proc.c | 21 +++++++++++++-------- fs/nfs/nfs42xdr.c | 1 - 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 2b2211d1234e..4fc61e3d098d 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -1241,12 +1241,13 @@ static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf, .rpc_resp = &res, }; u32 xdrlen; - int ret, np; + int ret, np, i; + ret = -ENOMEM; res.scratch = alloc_page(GFP_KERNEL); if (!res.scratch) - return -ENOMEM; + goto out; xdrlen = nfs42_listxattr_xdrsize(buflen); if (xdrlen > server->lxasize) @@ -1254,9 +1255,12 @@ static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf, np = xdrlen / PAGE_SIZE + 1; pages = kcalloc(np, sizeof(struct page *), GFP_KERNEL); - if (pages == NULL) { - __free_page(res.scratch); - return -ENOMEM; + if (!pages) + goto out_free_scratch; + for (i = 0; i < np; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (!pages[i]) + goto out_free_pages; } arg.xattr_pages = pages; @@ -1271,14 +1275,15 @@ static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf, *eofp = res.eof; } +out_free_pages: while (--np >= 0) { if (pages[np]) __free_page(pages[np]); } - - __free_page(res.scratch); kfree(pages); - +out_free_scratch: + __free_page(res.scratch); +out: return ret; } diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 6e060a88f98c..8432bd6b95f0 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -1528,7 +1528,6 @@ static void nfs4_xdr_enc_listxattrs(struct rpc_rqst *req, rpc_prepare_reply_pages(req, args->xattr_pages, 0, args->count, hdr.replen); - req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; encode_nops(&hdr); } From bd75475c2fa161acec0017e030f6e8c01cb0d107 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Mon, 23 Nov 2020 22:15:17 -0500 Subject: [PATCH 16/90] NFSv4.2: Fix 5 seconds delay when doing inter server copy Since commit b4868b44c5628 ("NFSv4: Wait for stateid updates after CLOSE/OPEN_DOWNGRADE"), every inter server copy operation suffers 5 seconds delay regardless of the size of the copy. The delay is from nfs_set_open_stateid_locked when the check by nfs_stateid_is_sequential fails because the seqid in both nfs4_state and nfs4_stateid are 0. Fix __nfs42_ssc_open to delay setting of NFS_OPEN_STATE in nfs4_state, until after the call to update_open_stateid, to indicate this is the 1st open. This fix is part of a 2 patches, the other patch is the fix in the source server to return the stateid for COPY_NOTIFY request with seqid 1 instead of 0. Fixes: ce0887ac96d3 ("NFSD add nfs4 inter ssc to nfsd4_copy") Signed-off-by: Dai Ngo Signed-off-by: Anna Schumaker --- fs/nfs/nfs4file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 9d354de613da..57b3821d975a 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -377,10 +377,10 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt, goto out_stateowner; set_bit(NFS_SRV_SSC_COPY_STATE, &ctx->state->flags); - set_bit(NFS_OPEN_STATE, &ctx->state->flags); memcpy(&ctx->state->open_stateid.other, &stateid->other, NFS4_STATEID_OTHER_SIZE); update_open_stateid(ctx->state, stateid, NULL, filep->f_mode); + set_bit(NFS_OPEN_STATE, &ctx->state->flags); nfs_file_set_open_context(filep, ctx); put_nfs_open_context(ctx); From 5f447cb88123857d69f0c1a665356688037c6ad3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 19 Oct 2020 12:40:59 -0400 Subject: [PATCH 17/90] NFSv3: Refactor nfs3_proc_lookup() to split out the dentry We want to reuse the lookup code in NFSv3 in order to emulate the NFSv4 lookupp operation. Signed-off-by: Trond Myklebust --- fs/nfs/nfs3proc.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 2397ceedba8a..acbdf7496d31 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -154,14 +154,14 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, } static int -nfs3_proc_lookup(struct inode *dir, struct dentry *dentry, - struct nfs_fh *fhandle, struct nfs_fattr *fattr, - struct nfs4_label *label) +__nfs3_proc_lookup(struct inode *dir, const char *name, size_t len, + struct nfs_fh *fhandle, struct nfs_fattr *fattr, + unsigned short task_flags) { struct nfs3_diropargs arg = { .fh = NFS_FH(dir), - .name = dentry->d_name.name, - .len = dentry->d_name.len + .name = name, + .len = len }; struct nfs3_diropres res = { .fh = fhandle, @@ -173,17 +173,11 @@ nfs3_proc_lookup(struct inode *dir, struct dentry *dentry, .rpc_resp = &res, }; int status; - unsigned short task_flags = 0; - - /* Is this is an attribute revalidation, subject to softreval? */ - if (nfs_lookup_is_soft_revalidate(dentry)) - task_flags |= RPC_TASK_TIMEOUT; res.dir_attr = nfs_alloc_fattr(); if (res.dir_attr == NULL) return -ENOMEM; - dprintk("NFS call lookup %pd2\n", dentry); nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(dir), &msg, task_flags); nfs_refresh_inode(dir, res.dir_attr); @@ -198,6 +192,23 @@ nfs3_proc_lookup(struct inode *dir, struct dentry *dentry, return status; } +static int +nfs3_proc_lookup(struct inode *dir, struct dentry *dentry, + struct nfs_fh *fhandle, struct nfs_fattr *fattr, + struct nfs4_label *label) +{ + unsigned short task_flags = 0; + + /* Is this is an attribute revalidation, subject to softreval? */ + if (nfs_lookup_is_soft_revalidate(dentry)) + task_flags |= RPC_TASK_TIMEOUT; + + dprintk("NFS call lookup %pd2\n", dentry); + return __nfs3_proc_lookup(dir, dentry->d_name.name, + dentry->d_name.len, fhandle, fattr, + task_flags); +} + static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) { struct nfs3_accessargs arg = { From 3c5e9a59faa6b1bb3110b961e695502c7ee8699b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 19 Oct 2020 12:45:53 -0400 Subject: [PATCH 18/90] NFSv3: Add emulation of the lookupp() operation In order to use the open_by_filehandle() operations on NFSv3, we need to be able to emulate lookupp() so that nfs_get_parent() can be used to convert disconnected dentries into connected ones. Signed-off-by: Trond Myklebust --- fs/nfs/nfs3proc.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index acbdf7496d31..6b66b73a50eb 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -209,6 +209,20 @@ nfs3_proc_lookup(struct inode *dir, struct dentry *dentry, task_flags); } +static int nfs3_proc_lookupp(struct inode *inode, struct nfs_fh *fhandle, + struct nfs_fattr *fattr, struct nfs4_label *label) +{ + const char dotdot[] = ".."; + const size_t len = strlen(dotdot); + unsigned short task_flags = 0; + + if (NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL) + task_flags |= RPC_TASK_TIMEOUT; + + return __nfs3_proc_lookup(inode, dotdot, len, fhandle, fattr, + task_flags); +} + static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) { struct nfs3_accessargs arg = { @@ -1015,6 +1029,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .getattr = nfs3_proc_getattr, .setattr = nfs3_proc_setattr, .lookup = nfs3_proc_lookup, + .lookupp = nfs3_proc_lookupp, .access = nfs3_proc_access, .readlink = nfs3_proc_readlink, .create = nfs3_proc_create, From 76998ebb91582812540f591f9e148daa0f08c76d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 20 Oct 2020 14:30:35 -0400 Subject: [PATCH 19/90] NFSv4: Observe the NFS_MOUNT_SOFTREVAL flag in _nfs4_proc_lookupp We need to respect the NFS_MOUNT_SOFTREVAL flag in _nfs4_proc_lookupp, by timing out if the server is unavailable. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 9e0ca9b2b210..a5b9356bee6a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4397,6 +4397,10 @@ static int _nfs4_proc_lookupp(struct inode *inode, .rpc_argp = &args, .rpc_resp = &res, }; + unsigned short task_flags = 0; + + if (NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL) + task_flags |= RPC_TASK_TIMEOUT; args.bitmask = nfs4_bitmask(server, label); @@ -4404,7 +4408,7 @@ static int _nfs4_proc_lookupp(struct inode *inode, dprintk("NFS call lookupp ino=0x%lx\n", inode->i_ino); status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, - &res.seq_res, 0); + &res.seq_res, task_flags); dprintk("NFS reply lookupp: %d\n", status); return status; } From e4c72201b6ec3173dfe13fa2e2335a3ad78d4921 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 22 Oct 2020 17:40:33 -0400 Subject: [PATCH 20/90] SUNRPC: rpc_wake_up() should wake up tasks in the correct order Currently, we wake up the tasks by priority queue ordering, which means that we ignore the batching that is supposed to help with QoS issues. Fixes: c049f8ea9a0d ("SUNRPC: Remove the bh-safe lock requirement on the rpc_wait_queue->lock") Signed-off-by: Trond Myklebust --- net/sunrpc/sched.c | 67 +++++++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index f06d7c315017..cf702a5f7fe5 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -675,6 +675,23 @@ struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *queue) } EXPORT_SYMBOL_GPL(rpc_wake_up_next); +/** + * rpc_wake_up_locked - wake up all rpc_tasks + * @queue: rpc_wait_queue on which the tasks are sleeping + * + */ +static void rpc_wake_up_locked(struct rpc_wait_queue *queue) +{ + struct rpc_task *task; + + for (;;) { + task = __rpc_find_next_queued(queue); + if (task == NULL) + break; + rpc_wake_up_task_queue_locked(queue, task); + } +} + /** * rpc_wake_up - wake up all rpc_tasks * @queue: rpc_wait_queue on which the tasks are sleeping @@ -683,26 +700,29 @@ EXPORT_SYMBOL_GPL(rpc_wake_up_next); */ void rpc_wake_up(struct rpc_wait_queue *queue) { - struct list_head *head; - spin_lock(&queue->lock); - head = &queue->tasks[queue->maxpriority]; - for (;;) { - while (!list_empty(head)) { - struct rpc_task *task; - task = list_first_entry(head, - struct rpc_task, - u.tk_wait.list); - rpc_wake_up_task_queue_locked(queue, task); - } - if (head == &queue->tasks[0]) - break; - head--; - } + rpc_wake_up_locked(queue); spin_unlock(&queue->lock); } EXPORT_SYMBOL_GPL(rpc_wake_up); +/** + * rpc_wake_up_status_locked - wake up all rpc_tasks and set their status value. + * @queue: rpc_wait_queue on which the tasks are sleeping + * @status: status value to set + */ +static void rpc_wake_up_status_locked(struct rpc_wait_queue *queue, int status) +{ + struct rpc_task *task; + + for (;;) { + task = __rpc_find_next_queued(queue); + if (task == NULL) + break; + rpc_wake_up_task_queue_set_status_locked(queue, task, status); + } +} + /** * rpc_wake_up_status - wake up all rpc_tasks and set their status value. * @queue: rpc_wait_queue on which the tasks are sleeping @@ -712,23 +732,8 @@ EXPORT_SYMBOL_GPL(rpc_wake_up); */ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status) { - struct list_head *head; - spin_lock(&queue->lock); - head = &queue->tasks[queue->maxpriority]; - for (;;) { - while (!list_empty(head)) { - struct rpc_task *task; - task = list_first_entry(head, - struct rpc_task, - u.tk_wait.list); - task->tk_status = status; - rpc_wake_up_task_queue_locked(queue, task); - } - if (head == &queue->tasks[0]) - break; - head--; - } + rpc_wake_up_status_locked(queue, status); spin_unlock(&queue->lock); } EXPORT_SYMBOL_GPL(rpc_wake_up_status); From 05ad917561fca39a03338cb21fe9622f998b0f9c Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 6 Nov 2020 16:03:38 -0500 Subject: [PATCH 21/90] NFSv4.2: condition READDIR's mask for security label based on LSM state Currently, the client will always ask for security_labels if the server returns that it supports that feature regardless of any LSM modules (such as Selinux) enforcing security policy. This adds performance penalty to the READDIR operation. Client adjusts superblock's support of the security_label based on the server's support but also current client's configuration of the LSM modules. Thus, prior to using the default bitmask in READDIR, this patch checks the server's capabilities and then instructs READDIR to remove FATTR4_WORD2_SECURITY_LABEL from the bitmask. v5: fixing silly mistakes of the rushed v4 v4: simplifying logic v3: changing label's initialization per Ondrej's comment v2: dropping selinux hook and using the sb cap. Suggested-by: Ondrej Mosnacek Suggested-by: Scott Mayhew Signed-off-by: Olga Kornievskaia Fixes: 2b0143b5c986 ("VFS: normal filesystems (and lustre): d_inode() annotations") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a5b9356bee6a..66f1f4a5c74c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4965,12 +4965,12 @@ static int _nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred, u64 cookie, struct page **pages, unsigned int count, bool plus) { struct inode *dir = d_inode(dentry); + struct nfs_server *server = NFS_SERVER(dir); struct nfs4_readdir_arg args = { .fh = NFS_FH(dir), .pages = pages, .pgbase = 0, .count = count, - .bitmask = NFS_SERVER(d_inode(dentry))->attr_bitmask, .plus = plus, }; struct nfs4_readdir_res res; @@ -4985,9 +4985,15 @@ static int _nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred, dprintk("%s: dentry = %pd2, cookie = %Lu\n", __func__, dentry, (unsigned long long)cookie); + if (!(server->caps & NFS_CAP_SECURITY_LABEL)) + args.bitmask = server->attr_bitmask_nl; + else + args.bitmask = server->attr_bitmask; + nfs4_setup_readdir(cookie, NFS_I(dir)->cookieverf, dentry, &args); res.pgbase = args.pgbase; - status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0); + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, + &res.seq_res, 0); if (status >= 0) { memcpy(NFS_I(dir)->cookieverf, res.verifier.data, NFS4_VERIFIER_SIZE); status += args.pgbase; From 2e7a46417952ae480cb5091ed5ade73078630b40 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Nov 2020 09:56:18 -0500 Subject: [PATCH 22/90] NFS: Ensure contents of struct nfs_open_dir_context are consistent Ensure that the contents of struct nfs_open_dir_context are consistent by setting them under the file->f_lock from a private copy (that is known to be consistent). Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski --- fs/nfs/dir.c | 72 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 29 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 4e011adaf967..67d8595cd6e5 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -144,20 +144,23 @@ struct nfs_cache_array { struct nfs_cache_array_entry array[]; }; -typedef struct { +typedef struct nfs_readdir_descriptor { struct file *file; struct page *page; struct dir_context *ctx; unsigned long page_index; - u64 *dir_cookie; + u64 dir_cookie; u64 last_cookie; + u64 dup_cookie; loff_t current_index; loff_t prev_index; unsigned long dir_verifier; unsigned long timestamp; unsigned long gencount; + unsigned long attr_gencount; unsigned int cache_entry_index; + signed char duped; bool plus; bool eof; } nfs_readdir_descriptor_t; @@ -273,7 +276,7 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri } index = (unsigned int)diff; - *desc->dir_cookie = array->array[index].cookie; + desc->dir_cookie = array->array[index].cookie; desc->cache_entry_index = index; return 0; out_eof: @@ -298,33 +301,32 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des int status = -EAGAIN; for (i = 0; i < array->size; i++) { - if (array->array[i].cookie == *desc->dir_cookie) { + if (array->array[i].cookie == desc->dir_cookie) { struct nfs_inode *nfsi = NFS_I(file_inode(desc->file)); - struct nfs_open_dir_context *ctx = desc->file->private_data; new_pos = desc->current_index + i; - if (ctx->attr_gencount != nfsi->attr_gencount || + if (desc->attr_gencount != nfsi->attr_gencount || !nfs_readdir_inode_mapping_valid(nfsi)) { - ctx->duped = 0; - ctx->attr_gencount = nfsi->attr_gencount; + desc->duped = 0; + desc->attr_gencount = nfsi->attr_gencount; } else if (new_pos < desc->prev_index) { - if (ctx->duped > 0 - && ctx->dup_cookie == *desc->dir_cookie) { + if (desc->duped > 0 + && desc->dup_cookie == desc->dir_cookie) { if (printk_ratelimit()) { pr_notice("NFS: directory %pD2 contains a readdir loop." "Please contact your server vendor. " "The file: %.*s has duplicate cookie %llu\n", desc->file, array->array[i].string.len, - array->array[i].string.name, *desc->dir_cookie); + array->array[i].string.name, desc->dir_cookie); } status = -ELOOP; goto out; } - ctx->dup_cookie = *desc->dir_cookie; - ctx->duped = -1; + desc->dup_cookie = desc->dir_cookie; + desc->duped = -1; } if (nfs_readdir_use_cookie(desc->file)) - desc->ctx->pos = *desc->dir_cookie; + desc->ctx->pos = desc->dir_cookie; else desc->ctx->pos = new_pos; desc->prev_index = new_pos; @@ -334,7 +336,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des } if (array->eof_index >= 0) { status = -EBADCOOKIE; - if (*desc->dir_cookie == array->last_cookie) + if (desc->dir_cookie == array->last_cookie) desc->eof = true; } out: @@ -349,7 +351,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) array = kmap(desc->page); - if (*desc->dir_cookie == 0) + if (desc->dir_cookie == 0) status = nfs_readdir_search_for_pos(array, desc); else status = nfs_readdir_search_for_cookie(array, desc); @@ -801,7 +803,6 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) int i = 0; int res = 0; struct nfs_cache_array *array = NULL; - struct nfs_open_dir_context *ctx = file->private_data; array = kmap(desc->page); for (i = desc->cache_entry_index; i < array->size; i++) { @@ -814,22 +815,22 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) break; } if (i < (array->size-1)) - *desc->dir_cookie = array->array[i+1].cookie; + desc->dir_cookie = array->array[i+1].cookie; else - *desc->dir_cookie = array->last_cookie; + desc->dir_cookie = array->last_cookie; if (nfs_readdir_use_cookie(file)) - desc->ctx->pos = *desc->dir_cookie; + desc->ctx->pos = desc->dir_cookie; else desc->ctx->pos++; - if (ctx->duped != 0) - ctx->duped = 1; + if (desc->duped != 0) + desc->duped = 1; } if (array->eof_index >= 0) desc->eof = true; kunmap(desc->page); dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", - (unsigned long long)*desc->dir_cookie, res); + (unsigned long long)desc->dir_cookie, res); return res; } @@ -851,10 +852,9 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc) struct page *page = NULL; int status; struct inode *inode = file_inode(desc->file); - struct nfs_open_dir_context *ctx = desc->file->private_data; dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", - (unsigned long long)*desc->dir_cookie); + (unsigned long long)desc->dir_cookie); page = alloc_page(GFP_HIGHUSER); if (!page) { @@ -863,9 +863,9 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc) } desc->page_index = 0; - desc->last_cookie = *desc->dir_cookie; + desc->last_cookie = desc->dir_cookie; desc->page = page; - ctx->duped = 0; + desc->duped = 0; status = nfs_readdir_xdr_to_array(desc, page, inode); if (status < 0) @@ -894,7 +894,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) nfs_readdir_descriptor_t my_desc = { .file = file, .ctx = ctx, - .dir_cookie = &dir_ctx->dir_cookie, .plus = nfs_use_readdirplus(inode, ctx), }, *desc = &my_desc; @@ -915,13 +914,20 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) if (res < 0) goto out; + spin_lock(&file->f_lock); + desc->dir_cookie = dir_ctx->dir_cookie; + desc->dup_cookie = dir_ctx->dup_cookie; + desc->duped = dir_ctx->duped; + desc->attr_gencount = dir_ctx->attr_gencount; + spin_unlock(&file->f_lock); + do { res = readdir_search_pagecache(desc); if (res == -EBADCOOKIE) { res = 0; /* This means either end of directory */ - if (*desc->dir_cookie && !desc->eof) { + if (desc->dir_cookie && !desc->eof) { /* Or that the server has 'lost' a cookie */ res = uncached_readdir(desc); if (res == 0) @@ -946,6 +952,14 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) if (res < 0) break; } while (!desc->eof); + + spin_lock(&file->f_lock); + dir_ctx->dir_cookie = desc->dir_cookie; + dir_ctx->dup_cookie = desc->dup_cookie; + dir_ctx->duped = desc->duped; + dir_ctx->attr_gencount = desc->attr_gencount; + spin_unlock(&file->f_lock); + out: if (res > 0) res = 0; From b1e21c97437f64d0a00f8fea1f9e64e77e0e4242 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Nov 2020 13:45:55 -0500 Subject: [PATCH 23/90] NFS: Clean up readdir struct nfs_cache_array Since the 'eof_index' is only ever used as a flag, make it so. Also add a flag to detect if the page has been completely filled. Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski --- fs/nfs/dir.c | 66 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 17 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 67d8595cd6e5..604ebe015387 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -138,9 +138,10 @@ struct nfs_cache_array_entry { }; struct nfs_cache_array { - int size; - int eof_index; u64 last_cookie; + unsigned int size; + unsigned char page_full : 1, + page_is_eof : 1; struct nfs_cache_array_entry array[]; }; @@ -172,7 +173,6 @@ void nfs_readdir_init_array(struct page *page) array = kmap_atomic(page); memset(array, 0, sizeof(struct nfs_cache_array)); - array->eof_index = -1; kunmap_atomic(array); } @@ -192,6 +192,17 @@ void nfs_readdir_clear_array(struct page *page) kunmap_atomic(array); } +static void nfs_readdir_array_set_eof(struct nfs_cache_array *array) +{ + array->page_is_eof = 1; + array->page_full = 1; +} + +static bool nfs_readdir_array_is_full(struct nfs_cache_array *array) +{ + return array->page_full; +} + /* * the caller is responsible for freeing qstr.name * when called by nfs_readdir_add_to_array, the strings will be freed in @@ -213,6 +224,23 @@ int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int le return 0; } +/* + * Check that the next array entry lies entirely within the page bounds + */ +static int nfs_readdir_array_can_expand(struct nfs_cache_array *array) +{ + struct nfs_cache_array_entry *cache_entry; + + if (array->page_full) + return -ENOSPC; + cache_entry = &array->array[array->size + 1]; + if ((char *)cache_entry - (char *)array > PAGE_SIZE) { + array->page_full = 1; + return -ENOSPC; + } + return 0; +} + static int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) { @@ -220,13 +248,11 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) struct nfs_cache_array_entry *cache_entry; int ret; - cache_entry = &array->array[array->size]; - - /* Check that this entry lies within the page bounds */ - ret = -ENOSPC; - if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE) + ret = nfs_readdir_array_can_expand(array); + if (ret) goto out; + cache_entry = &array->array[array->size]; cache_entry->cookie = entry->prev_cookie; cache_entry->ino = entry->ino; cache_entry->d_type = entry->d_type; @@ -236,12 +262,21 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) array->last_cookie = entry->cookie; array->size++; if (entry->eof != 0) - array->eof_index = array->size; + nfs_readdir_array_set_eof(array); out: kunmap(page); return ret; } +static void nfs_readdir_page_set_eof(struct page *page) +{ + struct nfs_cache_array *array; + + array = kmap_atomic(page); + nfs_readdir_array_set_eof(array); + kunmap_atomic(array); +} + static inline int is_32bit_api(void) { @@ -270,7 +305,7 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri if (diff < 0) goto out_eof; if (diff >= array->size) { - if (array->eof_index >= 0) + if (array->page_is_eof) goto out_eof; return -EAGAIN; } @@ -334,7 +369,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des return 0; } } - if (array->eof_index >= 0) { + if (array->page_is_eof) { status = -EBADCOOKIE; if (desc->dir_cookie == array->last_cookie) desc->eof = true; @@ -566,7 +601,6 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en struct xdr_stream stream; struct xdr_buf buf; struct page *scratch; - struct nfs_cache_array *array; unsigned int count = 0; int status; @@ -604,10 +638,8 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en out_nopages: if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { - array = kmap(page); - array->eof_index = array->size; + nfs_readdir_page_set_eof(page); status = 0; - kunmap(page); } put_page(scratch); @@ -689,7 +721,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, status = 0; break; } - } while (array->eof_index < 0); + } while (!nfs_readdir_array_is_full(array)); nfs_readdir_free_pages(pages, array_size); out_release_array: @@ -825,7 +857,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) if (desc->duped != 0) desc->duped = 1; } - if (array->eof_index >= 0) + if (array->page_is_eof) desc->eof = true; kunmap(desc->page); From 972bcdf233096d36b2f3e02f34a80d0f073b6b05 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Nov 2020 17:15:43 -0500 Subject: [PATCH 24/90] NFS: Clean up nfs_readdir_page_filler() Clean up handling of the case where there are no entries in the readdir reply. Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski --- fs/nfs/dir.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 604ebe015387..68acbde3f914 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -601,16 +601,12 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en struct xdr_stream stream; struct xdr_buf buf; struct page *scratch; - unsigned int count = 0; int status; scratch = alloc_page(GFP_KERNEL); if (scratch == NULL) return -ENOMEM; - if (buflen == 0) - goto out_nopages; - xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); @@ -619,27 +615,27 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en entry->label->len = NFS4_MAXLABELLEN; status = xdr_decode(desc, entry, &stream); - if (status != 0) { - if (status == -EAGAIN) - status = 0; + if (status != 0) break; - } - - count++; if (desc->plus) nfs_prime_dcache(file_dentry(desc->file), entry, desc->dir_verifier); status = nfs_readdir_add_to_array(entry, page); - if (status != 0) - break; - } while (!entry->eof); + } while (!status && !entry->eof); -out_nopages: - if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { - nfs_readdir_page_set_eof(page); + switch (status) { + case -EBADCOOKIE: + if (entry->eof) { + nfs_readdir_page_set_eof(page); + status = 0; + } + break; + case -ENOSPC: + case -EAGAIN: status = 0; + break; } put_page(scratch); @@ -714,14 +710,15 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, if (status < 0) break; + pglen = status; - status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen); - if (status < 0) { - if (status == -ENOSPC) - status = 0; + if (pglen == 0) { + nfs_readdir_page_set_eof(page); break; } - } while (!nfs_readdir_array_is_full(array)); + + status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen); + } while (!status && !nfs_readdir_array_is_full(array)); nfs_readdir_free_pages(pages, array_size); out_release_array: From 1f1d4aa4e4bcb4721d3c51f4c07dda790b6accd9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Nov 2020 12:34:43 -0500 Subject: [PATCH 25/90] NFS: Clean up directory array handling Refactor to use pagecache_get_page() so that we can fill the page in multiple stages. Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski --- fs/nfs/dir.c | 140 ++++++++++++++++++++++++++++----------------------- 1 file changed, 78 insertions(+), 62 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 68acbde3f914..842f69120a01 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -149,7 +149,7 @@ typedef struct nfs_readdir_descriptor { struct file *file; struct page *page; struct dir_context *ctx; - unsigned long page_index; + pgoff_t page_index; u64 dir_cookie; u64 last_cookie; u64 dup_cookie; @@ -166,13 +166,18 @@ typedef struct nfs_readdir_descriptor { bool eof; } nfs_readdir_descriptor_t; -static -void nfs_readdir_init_array(struct page *page) +static void nfs_readdir_array_init(struct nfs_cache_array *array) +{ + memset(array, 0, sizeof(struct nfs_cache_array)); +} + +static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie) { struct nfs_cache_array *array; array = kmap_atomic(page); - memset(array, 0, sizeof(struct nfs_cache_array)); + nfs_readdir_array_init(array); + array->last_cookie = last_cookie; kunmap_atomic(array); } @@ -188,7 +193,7 @@ void nfs_readdir_clear_array(struct page *page) array = kmap_atomic(page); for (i = 0; i < array->size; i++) kfree(array->array[i].string.name); - array->size = 0; + nfs_readdir_array_init(array); kunmap_atomic(array); } @@ -268,6 +273,44 @@ out: return ret; } +static struct page *nfs_readdir_page_get_locked(struct address_space *mapping, + pgoff_t index, u64 last_cookie) +{ + struct page *page; + + page = grab_cache_page(mapping, index); + if (page && !PageUptodate(page)) { + nfs_readdir_page_init_array(page, last_cookie); + if (invalidate_inode_pages2_range(mapping, index + 1, -1) < 0) + nfs_zap_mapping(mapping->host, mapping); + SetPageUptodate(page); + } + + return page; +} + +static u64 nfs_readdir_page_last_cookie(struct page *page) +{ + struct nfs_cache_array *array; + u64 ret; + + array = kmap_atomic(page); + ret = array->last_cookie; + kunmap_atomic(array); + return ret; +} + +static bool nfs_readdir_page_needs_filling(struct page *page) +{ + struct nfs_cache_array *array; + bool ret; + + array = kmap_atomic(page); + ret = !nfs_readdir_array_is_full(array); + kunmap_atomic(array); + return ret; +} + static void nfs_readdir_page_set_eof(struct page *page) { struct nfs_cache_array *array; @@ -682,10 +725,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, int status = -ENOMEM; unsigned int array_size = ARRAY_SIZE(pages); - nfs_readdir_init_array(page); - entry.prev_cookie = 0; - entry.cookie = desc->last_cookie; + entry.cookie = nfs_readdir_page_last_cookie(page); entry.eof = 0; entry.fh = nfs_alloc_fhandle(); entry.fattr = nfs_alloc_fattr(); @@ -730,48 +771,25 @@ out: return status; } -/* - * Now we cache directories properly, by converting xdr information - * to an array that can be used for lookups later. This results in - * fewer cache pages, since we can store more information on each page. - * We only need to convert from xdr once so future lookups are much simpler - */ -static -int nfs_readdir_filler(void *data, struct page* page) -{ - nfs_readdir_descriptor_t *desc = data; - struct inode *inode = file_inode(desc->file); - int ret; - - ret = nfs_readdir_xdr_to_array(desc, page, inode); - if (ret < 0) - goto error; - SetPageUptodate(page); - - if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) { - /* Should never happen */ - nfs_zap_mapping(inode, inode->i_mapping); - } - unlock_page(page); - return 0; - error: - nfs_readdir_clear_array(page); - unlock_page(page); - return ret; -} - -static -void cache_page_release(nfs_readdir_descriptor_t *desc) +static void nfs_readdir_page_put(struct nfs_readdir_descriptor *desc) { put_page(desc->page); desc->page = NULL; } -static -struct page *get_cache_page(nfs_readdir_descriptor_t *desc) +static void +nfs_readdir_page_unlock_and_put_cached(struct nfs_readdir_descriptor *desc) { - return read_cache_page(desc->file->f_mapping, desc->page_index, - nfs_readdir_filler, desc); + unlock_page(desc->page); + nfs_readdir_page_put(desc); +} + +static struct page * +nfs_readdir_page_get_cached(struct nfs_readdir_descriptor *desc) +{ + return nfs_readdir_page_get_locked(desc->file->f_mapping, + desc->page_index, + desc->last_cookie); } /* @@ -785,23 +803,21 @@ int find_and_lock_cache_page(nfs_readdir_descriptor_t *desc) struct nfs_inode *nfsi = NFS_I(inode); int res; - desc->page = get_cache_page(desc); - if (IS_ERR(desc->page)) - return PTR_ERR(desc->page); - res = lock_page_killable(desc->page); - if (res != 0) - goto error; - res = -EAGAIN; - if (desc->page->mapping != NULL) { - res = nfs_readdir_search_array(desc); - if (res == 0) { - nfsi->page_index = desc->page_index; - return 0; - } + desc->page = nfs_readdir_page_get_cached(desc); + if (!desc->page) + return -ENOMEM; + if (nfs_readdir_page_needs_filling(desc->page)) { + res = nfs_readdir_xdr_to_array(desc, desc->page, inode); + if (res < 0) + goto error; + } + res = nfs_readdir_search_array(desc); + if (res == 0) { + nfsi->page_index = desc->page_index; + return 0; } - unlock_page(desc->page); error: - cache_page_release(desc); + nfs_readdir_page_unlock_and_put_cached(desc); return res; } @@ -896,6 +912,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc) desc->page = page; desc->duped = 0; + nfs_readdir_page_init_array(page, desc->dir_cookie); status = nfs_readdir_xdr_to_array(desc, page, inode); if (status < 0) goto out_release; @@ -904,7 +921,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc) out_release: nfs_readdir_clear_array(desc->page); - cache_page_release(desc); + nfs_readdir_page_put(desc); out: dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status); @@ -976,8 +993,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) break; res = nfs_do_filldir(desc); - unlock_page(desc->page); - cache_page_release(desc); + nfs_readdir_page_unlock_and_put_cached(desc); if (res < 0) break; } while (!desc->eof); From 3b2a09f127e025674945e82c1ec0c88d6740280e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Nov 2020 13:14:10 -0500 Subject: [PATCH 26/90] NFS: Don't discard readdir results If a readdir call returns more data than we can fit into one page cache page, then allocate a new one for that data rather than discarding the data. Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski --- fs/nfs/dir.c | 46 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 842f69120a01..f7248145c333 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -320,6 +320,26 @@ static void nfs_readdir_page_set_eof(struct page *page) kunmap_atomic(array); } +static void nfs_readdir_page_unlock_and_put(struct page *page) +{ + unlock_page(page); + put_page(page); +} + +static struct page *nfs_readdir_page_get_next(struct address_space *mapping, + pgoff_t index, u64 cookie) +{ + struct page *page; + + page = nfs_readdir_page_get_locked(mapping, index, cookie); + if (page) { + if (nfs_readdir_page_last_cookie(page) == cookie) + return page; + nfs_readdir_page_unlock_and_put(page); + } + return NULL; +} + static inline int is_32bit_api(void) { @@ -637,13 +657,15 @@ out: } /* Perform conversion from xdr to cache array */ -static -int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, - struct page **xdr_pages, struct page *page, unsigned int buflen) +static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, + struct nfs_entry *entry, + struct page **xdr_pages, + struct page *fillme, unsigned int buflen) { + struct address_space *mapping = desc->file->f_mapping; struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; + struct page *scratch, *new, *page = fillme; int status; scratch = alloc_page(GFP_KERNEL); @@ -666,6 +688,19 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en desc->dir_verifier); status = nfs_readdir_add_to_array(entry, page); + if (status != -ENOSPC) + continue; + + if (page->mapping != mapping) + break; + new = nfs_readdir_page_get_next(mapping, page->index + 1, + entry->prev_cookie); + if (!new) + break; + if (page != fillme) + nfs_readdir_page_unlock_and_put(page); + page = new; + status = nfs_readdir_add_to_array(entry, page); } while (!status && !entry->eof); switch (status) { @@ -681,6 +716,9 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en break; } + if (page != fillme) + nfs_readdir_page_unlock_and_put(page); + put_page(scratch); return status; } From e762a639816015a70bb1af8ea4baf54f4facb591 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Nov 2020 13:31:20 -0500 Subject: [PATCH 27/90] NFS: Remove unnecessary kmap in nfs_readdir_xdr_to_array() The kmapped pointer is only used once per loop to check if we need to exit. Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski --- fs/nfs/dir.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f7248145c333..e8b0fcc1bc9e 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -759,7 +759,6 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct page *pages[NFS_MAX_READDIR_PAGES]; struct nfs_entry entry; struct file *file = desc->file; - struct nfs_cache_array *array; int status = -ENOMEM; unsigned int array_size = ARRAY_SIZE(pages); @@ -778,11 +777,9 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, goto out; } - array = kmap(page); - status = nfs_readdir_alloc_pages(pages, array_size); if (status < 0) - goto out_release_array; + goto out_release_label; do { unsigned int pglen; status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode); @@ -797,11 +794,10 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, } status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen); - } while (!status && !nfs_readdir_array_is_full(array)); + } while (!status && nfs_readdir_page_needs_filling(page)); nfs_readdir_free_pages(pages, array_size); -out_release_array: - kunmap(page); +out_release_label: nfs4_label_free(entry.label); out: nfs_free_fattr(entry.fattr); From ed09222d651dbd30e707f96180628229146b885c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Nov 2020 13:34:32 -0500 Subject: [PATCH 28/90] NFS: Replace kmap() with kmap_atomic() in nfs_readdir_search_array() Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski --- fs/nfs/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e8b0fcc1bc9e..b9001123ec84 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -447,7 +447,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) struct nfs_cache_array *array; int status; - array = kmap(desc->page); + array = kmap_atomic(desc->page); if (desc->dir_cookie == 0) status = nfs_readdir_search_for_pos(array, desc); @@ -459,7 +459,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) desc->current_index += array->size; desc->page_index++; } - kunmap(desc->page); + kunmap_atomic(array); return status; } From a52a8a6adad99e1162c27f70cd6495626a48064d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Nov 2020 19:17:29 -0500 Subject: [PATCH 29/90] NFS: Simplify struct nfs_cache_array_entry We don't need to store a hash, so replace struct qstr with a simple const char pointer and length. Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski --- fs/nfs/dir.c | 46 +++++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index b9001123ec84..be0e2891fecc 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -133,7 +133,8 @@ nfs_closedir(struct inode *inode, struct file *filp) struct nfs_cache_array_entry { u64 cookie; u64 ino; - struct qstr string; + const char *name; + unsigned int name_len; unsigned char d_type; }; @@ -192,7 +193,7 @@ void nfs_readdir_clear_array(struct page *page) array = kmap_atomic(page); for (i = 0; i < array->size; i++) - kfree(array->array[i].string.name); + kfree(array->array[i].name); nfs_readdir_array_init(array); kunmap_atomic(array); } @@ -213,20 +214,17 @@ static bool nfs_readdir_array_is_full(struct nfs_cache_array *array) * when called by nfs_readdir_add_to_array, the strings will be freed in * nfs_clear_readdir_array() */ -static -int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len) +static const char *nfs_readdir_copy_name(const char *name, unsigned int len) { - string->len = len; - string->name = kmemdup_nul(name, len, GFP_KERNEL); - if (string->name == NULL) - return -ENOMEM; + const char *ret = kmemdup_nul(name, len, GFP_KERNEL); + /* * Avoid a kmemleak false positive. The pointer to the name is stored * in a page cache page which kmemleak does not scan. */ - kmemleak_not_leak(string->name); - string->hash = full_name_hash(NULL, name, len); - return 0; + if (ret != NULL) + kmemleak_not_leak(ret); + return ret; } /* @@ -249,27 +247,34 @@ static int nfs_readdir_array_can_expand(struct nfs_cache_array *array) static int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) { - struct nfs_cache_array *array = kmap(page); + struct nfs_cache_array *array; struct nfs_cache_array_entry *cache_entry; + const char *name; int ret; + name = nfs_readdir_copy_name(entry->name, entry->len); + if (!name) + return -ENOMEM; + + array = kmap_atomic(page); ret = nfs_readdir_array_can_expand(array); - if (ret) + if (ret) { + kfree(name); goto out; + } cache_entry = &array->array[array->size]; cache_entry->cookie = entry->prev_cookie; cache_entry->ino = entry->ino; cache_entry->d_type = entry->d_type; - ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len); - if (ret) - goto out; + cache_entry->name_len = entry->len; + cache_entry->name = name; array->last_cookie = entry->cookie; array->size++; if (entry->eof != 0) nfs_readdir_array_set_eof(array); out: - kunmap(page); + kunmap_atomic(array); return ret; } @@ -413,9 +418,8 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des if (printk_ratelimit()) { pr_notice("NFS: directory %pD2 contains a readdir loop." "Please contact your server vendor. " - "The file: %.*s has duplicate cookie %llu\n", - desc->file, array->array[i].string.len, - array->array[i].string.name, desc->dir_cookie); + "The file: %s has duplicate cookie %llu\n", + desc->file, array->array[i].name, desc->dir_cookie); } status = -ELOOP; goto out; @@ -888,7 +892,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) struct nfs_cache_array_entry *ent; ent = &array->array[i]; - if (!dir_emit(desc->ctx, ent->string.name, ent->string.len, + if (!dir_emit(desc->ctx, ent->name, ent->name_len, nfs_compat_user_ino64(ent->ino), ent->d_type)) { desc->eof = true; break; From 1a34c8c9a49ee10ccaf91091ddd98c25e4d567dd Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Nov 2020 14:26:47 -0500 Subject: [PATCH 30/90] NFS: Support larger readdir buffers Support readdir buffers of up to 1MB in size so that we can read large directories using few RPC calls. Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski --- fs/nfs/client.c | 4 ++-- fs/nfs/dir.c | 33 +++++++++++++++++++-------------- fs/nfs/internal.h | 6 ------ 3 files changed, 21 insertions(+), 22 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 4b8cc93913f7..f6454ba53d05 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -781,8 +781,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); - if (server->dtsize > PAGE_SIZE * NFS_MAX_READDIR_PAGES) - server->dtsize = PAGE_SIZE * NFS_MAX_READDIR_PAGES; + if (server->dtsize > NFS_MAX_FILE_IO_SIZE) + server->dtsize = NFS_MAX_FILE_IO_SIZE; if (server->dtsize > server->rsize) server->dtsize = server->rsize; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index be0e2891fecc..438906dae083 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -727,44 +727,47 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, return status; } -static -void nfs_readdir_free_pages(struct page **pages, unsigned int npages) +static void nfs_readdir_free_pages(struct page **pages, size_t npages) { - unsigned int i; - for (i = 0; i < npages; i++) - put_page(pages[i]); + while (npages--) + put_page(pages[npages]); + kfree(pages); } /* * nfs_readdir_alloc_pages() will allocate pages that must be freed with a call * to nfs_readdir_free_pages() */ -static -int nfs_readdir_alloc_pages(struct page **pages, unsigned int npages) +static struct page **nfs_readdir_alloc_pages(size_t npages) { - unsigned int i; + struct page **pages; + size_t i; + pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return NULL; for (i = 0; i < npages; i++) { struct page *page = alloc_page(GFP_KERNEL); if (page == NULL) goto out_freepages; pages[i] = page; } - return 0; + return pages; out_freepages: nfs_readdir_free_pages(pages, i); - return -ENOMEM; + return NULL; } static int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) { - struct page *pages[NFS_MAX_READDIR_PAGES]; + struct page **pages; struct nfs_entry entry; struct file *file = desc->file; + size_t array_size; + size_t dtsize = NFS_SERVER(inode)->dtsize; int status = -ENOMEM; - unsigned int array_size = ARRAY_SIZE(pages); entry.prev_cookie = 0; entry.cookie = nfs_readdir_page_last_cookie(page); @@ -781,9 +784,11 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, goto out; } - status = nfs_readdir_alloc_pages(pages, array_size); - if (status < 0) + array_size = (dtsize + PAGE_SIZE - 1) >> PAGE_SHIFT; + pages = nfs_readdir_alloc_pages(array_size); + if (!pages) goto out_release_label; + do { unsigned int pglen; status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 6673a77884d9..b840d0a91c9d 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -56,12 +56,6 @@ static inline bool nfs_lookup_is_soft_revalidate(const struct dentry *dentry) #define NFS_UNSPEC_RETRANS (UINT_MAX) #define NFS_UNSPEC_TIMEO (UINT_MAX) -/* - * Maximum number of pages that readdir can use for creating - * a vmapped array of pages. - */ -#define NFS_MAX_READDIR_PAGES 8 - struct nfs_client_initdata { unsigned long init_flags; const char *hostname; /* Hostname of the server */ From 93b8959a0a8cf1b1a493efee9e8328681e111862 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Nov 2020 15:24:41 -0500 Subject: [PATCH 31/90] NFS: More readdir cleanups Remove the redundant caching of the credential in struct nfs_open_dir_context. Pass the buffer size as an argument to nfs_readdir_xdr_filler(). Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski --- fs/nfs/dir.c | 25 +++++++++++-------------- include/linux/nfs_fs.h | 1 - 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 438906dae083..bc366bd8e8f3 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -68,7 +68,7 @@ const struct address_space_operations nfs_dir_aops = { .freepage = nfs_readdir_clear_array, }; -static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, const struct cred *cred) +static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir) { struct nfs_inode *nfsi = NFS_I(dir); struct nfs_open_dir_context *ctx; @@ -78,7 +78,6 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir ctx->attr_gencount = nfsi->attr_gencount; ctx->dir_cookie = 0; ctx->dup_cookie = 0; - ctx->cred = get_cred(cred); spin_lock(&dir->i_lock); if (list_empty(&nfsi->open_files) && (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) @@ -96,7 +95,6 @@ static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_cont spin_lock(&dir->i_lock); list_del(&ctx->list); spin_unlock(&dir->i_lock); - put_cred(ctx->cred); kfree(ctx); } @@ -113,7 +111,7 @@ nfs_opendir(struct inode *inode, struct file *filp) nfs_inc_stats(inode, NFSIOS_VFSOPEN); - ctx = alloc_nfs_open_dir_context(inode, current_cred()); + ctx = alloc_nfs_open_dir_context(inode); if (IS_ERR(ctx)) { res = PTR_ERR(ctx); goto out; @@ -468,12 +466,12 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) } /* Fill a page with xdr information before transferring to the cache page */ -static -int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, - struct nfs_entry *entry, struct file *file, struct inode *inode) +static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, + u64 cookie, struct page **pages, + size_t bufsize) { - struct nfs_open_dir_context *ctx = file->private_data; - const struct cred *cred = ctx->cred; + struct file *file = desc->file; + struct inode *inode = file_inode(file); unsigned long timestamp, gencount; int error; @@ -481,8 +479,8 @@ int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, timestamp = jiffies; gencount = nfs_inc_attr_generation_counter(); desc->dir_verifier = nfs_save_change_attribute(inode); - error = NFS_PROTO(inode)->readdir(file_dentry(file), cred, entry->cookie, pages, - NFS_SERVER(inode)->dtsize, desc->plus); + error = NFS_PROTO(inode)->readdir(file_dentry(file), file->f_cred, + cookie, pages, bufsize, desc->plus); if (error < 0) { /* We requested READDIRPLUS, but the server doesn't grok it */ if (error == -ENOTSUPP && desc->plus) { @@ -764,7 +762,6 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, { struct page **pages; struct nfs_entry entry; - struct file *file = desc->file; size_t array_size; size_t dtsize = NFS_SERVER(inode)->dtsize; int status = -ENOMEM; @@ -791,8 +788,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, do { unsigned int pglen; - status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode); - + status = nfs_readdir_xdr_filler(desc, entry.cookie, + pages, dtsize); if (status < 0) break; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index a2c6455ea3fa..dd6b463dda80 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -88,7 +88,6 @@ struct nfs_open_context { struct nfs_open_dir_context { struct list_head list; - const struct cred *cred; unsigned long attr_gencount; __u64 dir_cookie; __u64 dup_cookie; From dbeaf8c984ca689c2c0966c41bd78dee178b5dfe Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Nov 2020 18:20:03 -0500 Subject: [PATCH 32/90] NFS: nfs_do_filldir() does not return a value Clean up nfs_do_filldir(). Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski --- fs/nfs/dir.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index bc366bd8e8f3..48856cee10de 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -881,13 +881,11 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) /* * Once we've found the start of the dirent within a page: fill 'er up... */ -static -int nfs_do_filldir(nfs_readdir_descriptor_t *desc) +static void nfs_do_filldir(struct nfs_readdir_descriptor *desc) { struct file *file = desc->file; - int i = 0; - int res = 0; - struct nfs_cache_array *array = NULL; + struct nfs_cache_array *array; + unsigned int i = 0; array = kmap(desc->page); for (i = desc->cache_entry_index; i < array->size; i++) { @@ -914,9 +912,8 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) desc->eof = true; kunmap(desc->page); - dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", - (unsigned long long)desc->dir_cookie, res); - return res; + dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %llu\n", + (unsigned long long)desc->dir_cookie); } /* @@ -957,7 +954,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc) if (status < 0) goto out_release; - status = nfs_do_filldir(desc); + nfs_do_filldir(desc); out_release: nfs_readdir_clear_array(desc->page); @@ -1032,10 +1029,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) if (res < 0) break; - res = nfs_do_filldir(desc); + nfs_do_filldir(desc); nfs_readdir_page_unlock_and_put_cached(desc); - if (res < 0) - break; } while (!desc->eof); spin_lock(&file->f_lock); @@ -1046,8 +1041,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) spin_unlock(&file->f_lock); out: - if (res > 0) - res = 0; dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res); return res; } From 6b75cf9e309d18664f964889ac026096ba0d1919 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Nov 2020 08:55:03 -0500 Subject: [PATCH 33/90] NFS: Reduce readdir stack usage The descriptor and the struct nfs_entry are both large structures, so don't allocate them from the stack. Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski --- fs/nfs/dir.c | 58 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 48856cee10de..c3af54640f6c 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -761,23 +761,24 @@ static int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) { struct page **pages; - struct nfs_entry entry; + struct nfs_entry *entry; size_t array_size; size_t dtsize = NFS_SERVER(inode)->dtsize; int status = -ENOMEM; - entry.prev_cookie = 0; - entry.cookie = nfs_readdir_page_last_cookie(page); - entry.eof = 0; - entry.fh = nfs_alloc_fhandle(); - entry.fattr = nfs_alloc_fattr(); - entry.server = NFS_SERVER(inode); - if (entry.fh == NULL || entry.fattr == NULL) + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + entry->cookie = nfs_readdir_page_last_cookie(page); + entry->fh = nfs_alloc_fhandle(); + entry->fattr = nfs_alloc_fattr(); + entry->server = NFS_SERVER(inode); + if (entry->fh == NULL || entry->fattr == NULL) goto out; - entry.label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT); - if (IS_ERR(entry.label)) { - status = PTR_ERR(entry.label); + entry->label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT); + if (IS_ERR(entry->label)) { + status = PTR_ERR(entry->label); goto out; } @@ -788,7 +789,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, do { unsigned int pglen; - status = nfs_readdir_xdr_filler(desc, entry.cookie, + status = nfs_readdir_xdr_filler(desc, entry->cookie, pages, dtsize); if (status < 0) break; @@ -799,15 +800,16 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, break; } - status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen); + status = nfs_readdir_page_filler(desc, entry, pages, page, pglen); } while (!status && nfs_readdir_page_needs_filling(page)); nfs_readdir_free_pages(pages, array_size); out_release_label: - nfs4_label_free(entry.label); + nfs4_label_free(entry->label); out: - nfs_free_fattr(entry.fattr); - nfs_free_fhandle(entry.fh); + nfs_free_fattr(entry->fattr); + nfs_free_fhandle(entry->fh); + kfree(entry); return status; } @@ -974,13 +976,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); struct nfs_open_dir_context *dir_ctx = file->private_data; - nfs_readdir_descriptor_t my_desc = { - .file = file, - .ctx = ctx, - .plus = nfs_use_readdirplus(inode, ctx), - }, - *desc = &my_desc; - int res = 0; + struct nfs_readdir_descriptor *desc; + int res; dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n", file, (long long)ctx->pos); @@ -992,10 +989,19 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) * to either find the entry with the appropriate number or * revalidate the cookie. */ - if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) + if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) { res = nfs_revalidate_mapping(inode, file->f_mapping); - if (res < 0) + if (res < 0) + goto out; + } + + res = -ENOMEM; + desc = kzalloc(sizeof(*desc), GFP_KERNEL); + if (!desc) goto out; + desc->file = file; + desc->ctx = ctx; + desc->plus = nfs_use_readdirplus(inode, ctx); spin_lock(&file->f_lock); desc->dir_cookie = dir_ctx->dir_cookie; @@ -1040,6 +1046,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) dir_ctx->attr_gencount = desc->attr_gencount; spin_unlock(&file->f_lock); + kfree(desc); + out: dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res); return res; From 6c981eff23b894ce429281dc45a5589359eef2c1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 3 Nov 2020 07:42:04 -0500 Subject: [PATCH 34/90] NFS: Cleanup to remove nfs_readdir_descriptor_t typedef Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski --- fs/nfs/dir.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c3af54640f6c..b226f6f3ae96 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -144,7 +144,7 @@ struct nfs_cache_array { struct nfs_cache_array_entry array[]; }; -typedef struct nfs_readdir_descriptor { +struct nfs_readdir_descriptor { struct file *file; struct page *page; struct dir_context *ctx; @@ -163,7 +163,7 @@ typedef struct nfs_readdir_descriptor { signed char duped; bool plus; bool eof; -} nfs_readdir_descriptor_t; +}; static void nfs_readdir_array_init(struct nfs_cache_array *array) { @@ -362,8 +362,8 @@ bool nfs_readdir_use_cookie(const struct file *filp) return true; } -static -int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) +static int nfs_readdir_search_for_pos(struct nfs_cache_array *array, + struct nfs_readdir_descriptor *desc) { loff_t diff = desc->ctx->pos - desc->current_index; unsigned int index; @@ -394,8 +394,8 @@ nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi) return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags); } -static -int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) +static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, + struct nfs_readdir_descriptor *desc) { int i; loff_t new_pos; @@ -443,8 +443,7 @@ out: return status; } -static -int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) +static int nfs_readdir_search_array(struct nfs_readdir_descriptor *desc) { struct nfs_cache_array *array; int status; @@ -497,7 +496,7 @@ error: return error; } -static int xdr_decode(nfs_readdir_descriptor_t *desc, +static int xdr_decode(struct nfs_readdir_descriptor *desc, struct nfs_entry *entry, struct xdr_stream *xdr) { struct inode *inode = file_inode(desc->file); @@ -757,8 +756,8 @@ out_freepages: return NULL; } -static -int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) +static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, + struct page *page, struct inode *inode) { struct page **pages; struct nfs_entry *entry; @@ -838,8 +837,7 @@ nfs_readdir_page_get_cached(struct nfs_readdir_descriptor *desc) * Returns 0 if desc->dir_cookie was found on page desc->page_index * and locks the page to prevent removal from the page cache. */ -static -int find_and_lock_cache_page(nfs_readdir_descriptor_t *desc) +static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) { struct inode *inode = file_inode(desc->file); struct nfs_inode *nfsi = NFS_I(inode); @@ -864,8 +862,7 @@ error: } /* Search for desc->dir_cookie from the beginning of the page cache */ -static inline -int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) +static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc) { int res; @@ -930,8 +927,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc) * we should already have a complete representation of the * directory in the page cache by the time we get here. */ -static inline -int uncached_readdir(nfs_readdir_descriptor_t *desc) +static int uncached_readdir(struct nfs_readdir_descriptor *desc) { struct page *page = NULL; int status; From 82e22a5e6245873779db1607d3b0fec6f9ca07d0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Nov 2020 17:34:23 -0500 Subject: [PATCH 35/90] NFS: Allow the NFS generic code to pass in a verifier to readdir If we're ever going to allow support for servers that use the readdir verifier, then that use needs to be managed by the middle layers as those need to be able to reject cookies from other verifiers. Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski --- fs/nfs/dir.c | 23 ++++++++++++++++++----- fs/nfs/nfs3proc.c | 35 +++++++++++++++++------------------ fs/nfs/nfs4proc.c | 36 +++++++++++++++++------------------- fs/nfs/proc.c | 18 +++++++++--------- include/linux/nfs_xdr.h | 17 +++++++++++++++-- 5 files changed, 76 insertions(+), 53 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index b226f6f3ae96..3ee0668a9719 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -469,8 +469,20 @@ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, u64 cookie, struct page **pages, size_t bufsize) { - struct file *file = desc->file; - struct inode *inode = file_inode(file); + struct inode *inode = file_inode(desc->file); + __be32 verf_res[2]; + struct nfs_readdir_arg arg = { + .dentry = file_dentry(desc->file), + .cred = desc->file->f_cred, + .verf = NFS_I(inode)->cookieverf, + .cookie = cookie, + .pages = pages, + .page_len = bufsize, + .plus = desc->plus, + }; + struct nfs_readdir_res res = { + .verf = verf_res, + }; unsigned long timestamp, gencount; int error; @@ -478,20 +490,21 @@ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, timestamp = jiffies; gencount = nfs_inc_attr_generation_counter(); desc->dir_verifier = nfs_save_change_attribute(inode); - error = NFS_PROTO(inode)->readdir(file_dentry(file), file->f_cred, - cookie, pages, bufsize, desc->plus); + error = NFS_PROTO(inode)->readdir(&arg, &res); if (error < 0) { /* We requested READDIRPLUS, but the server doesn't grok it */ if (error == -ENOTSUPP && desc->plus) { NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS; clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); - desc->plus = false; + desc->plus = arg.plus = false; goto again; } goto error; } desc->timestamp = timestamp; desc->gencount = gencount; + memcpy(NFS_I(inode)->cookieverf, res.verf, + sizeof(NFS_I(inode)->cookieverf)); error: return error; } diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 6b66b73a50eb..5c4e23abc345 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -662,37 +662,36 @@ out: * Also note that this implementation handles both plain readdir and * readdirplus. */ -static int -nfs3_proc_readdir(struct dentry *dentry, const struct cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int nfs3_proc_readdir(struct nfs_readdir_arg *nr_arg, + struct nfs_readdir_res *nr_res) { - struct inode *dir = d_inode(dentry); - __be32 *verf = NFS_I(dir)->cookieverf; + struct inode *dir = d_inode(nr_arg->dentry); struct nfs3_readdirargs arg = { .fh = NFS_FH(dir), - .cookie = cookie, - .verf = {verf[0], verf[1]}, - .plus = plus, - .count = count, - .pages = pages + .cookie = nr_arg->cookie, + .plus = nr_arg->plus, + .count = nr_arg->page_len, + .pages = nr_arg->pages }; struct nfs3_readdirres res = { - .verf = verf, - .plus = plus + .verf = nr_res->verf, + .plus = nr_arg->plus, }; struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_READDIR], .rpc_argp = &arg, .rpc_resp = &res, - .rpc_cred = cred, + .rpc_cred = nr_arg->cred, }; int status = -ENOMEM; - if (plus) + if (nr_arg->plus) msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS]; + if (arg.cookie) + memcpy(arg.verf, nr_arg->verf, sizeof(arg.verf)); - dprintk("NFS call readdir%s %d\n", - plus? "plus" : "", (unsigned int) cookie); + dprintk("NFS call readdir%s %llu\n", nr_arg->plus ? "plus" : "", + (unsigned long long)nr_arg->cookie); res.dir_attr = nfs_alloc_fattr(); if (res.dir_attr == NULL) @@ -705,8 +704,8 @@ nfs3_proc_readdir(struct dentry *dentry, const struct cred *cred, nfs_free_fattr(res.dir_attr); out: - dprintk("NFS reply readdir%s: %d\n", - plus? "plus" : "", status); + dprintk("NFS reply readdir%s: %d\n", nr_arg->plus ? "plus" : "", + status); return status; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 66f1f4a5c74c..adcaba68eaed 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4961,41 +4961,40 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, return err; } -static int _nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int _nfs4_proc_readdir(struct nfs_readdir_arg *nr_arg, + struct nfs_readdir_res *nr_res) { - struct inode *dir = d_inode(dentry); + struct inode *dir = d_inode(nr_arg->dentry); struct nfs_server *server = NFS_SERVER(dir); struct nfs4_readdir_arg args = { .fh = NFS_FH(dir), - .pages = pages, + .pages = nr_arg->pages, .pgbase = 0, - .count = count, - .plus = plus, + .count = nr_arg->page_len, + .plus = nr_arg->plus, }; struct nfs4_readdir_res res; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READDIR], .rpc_argp = &args, .rpc_resp = &res, - .rpc_cred = cred, + .rpc_cred = nr_arg->cred, }; int status; - dprintk("%s: dentry = %pd2, cookie = %Lu\n", __func__, - dentry, - (unsigned long long)cookie); + dprintk("%s: dentry = %pd2, cookie = %llu\n", __func__, + nr_arg->dentry, (unsigned long long)nr_arg->cookie); if (!(server->caps & NFS_CAP_SECURITY_LABEL)) args.bitmask = server->attr_bitmask_nl; else args.bitmask = server->attr_bitmask; - nfs4_setup_readdir(cookie, NFS_I(dir)->cookieverf, dentry, &args); + nfs4_setup_readdir(nr_arg->cookie, nr_arg->verf, nr_arg->dentry, &args); res.pgbase = args.pgbase; status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status >= 0) { - memcpy(NFS_I(dir)->cookieverf, res.verifier.data, NFS4_VERIFIER_SIZE); + memcpy(nr_res->verf, res.verifier.data, NFS4_VERIFIER_SIZE); status += args.pgbase; } @@ -5005,19 +5004,18 @@ static int _nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred, return status; } -static int nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int nfs4_proc_readdir(struct nfs_readdir_arg *arg, + struct nfs_readdir_res *res) { struct nfs4_exception exception = { .interruptible = true, }; int err; do { - err = _nfs4_proc_readdir(dentry, cred, cookie, - pages, count, plus); - trace_nfs4_readdir(d_inode(dentry), err); - err = nfs4_handle_exception(NFS_SERVER(d_inode(dentry)), err, - &exception); + err = _nfs4_proc_readdir(arg, res); + trace_nfs4_readdir(d_inode(arg->dentry), err); + err = nfs4_handle_exception(NFS_SERVER(d_inode(arg->dentry)), + err, &exception); } while (exception.retry); return err; } diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 15c865cc837f..73ab7c59d3a7 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -499,26 +499,26 @@ nfs_proc_rmdir(struct inode *dir, const struct qstr *name) * sure it is syntactically correct; the entries itself are decoded * from nfs_readdir by calling the decode_entry function directly. */ -static int -nfs_proc_readdir(struct dentry *dentry, const struct cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int nfs_proc_readdir(struct nfs_readdir_arg *nr_arg, + struct nfs_readdir_res *nr_res) { - struct inode *dir = d_inode(dentry); + struct inode *dir = d_inode(nr_arg->dentry); struct nfs_readdirargs arg = { .fh = NFS_FH(dir), - .cookie = cookie, - .count = count, - .pages = pages, + .cookie = nr_arg->cookie, + .count = nr_arg->page_len, + .pages = nr_arg->pages, }; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_READDIR], .rpc_argp = &arg, - .rpc_cred = cred, + .rpc_cred = nr_arg->cred, }; int status; - dprintk("NFS call readdir %d\n", (unsigned int)cookie); + dprintk("NFS call readdir %llu\n", (unsigned long long)nr_arg->cookie); status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nr_res->verf[0] = nr_res->verf[1] = 0; nfs_invalidate_atime(dir); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index d63cb862d58e..3327239fa2f9 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -750,6 +750,20 @@ struct nfs_entry { struct nfs_server * server; }; +struct nfs_readdir_arg { + struct dentry *dentry; + const struct cred *cred; + __be32 *verf; + u64 cookie; + struct page **pages; + unsigned int page_len; + bool plus; +}; + +struct nfs_readdir_res { + __be32 *verf; +}; + /* * The following types are for NFSv2 only. */ @@ -1744,8 +1758,7 @@ struct nfs_rpc_ops { unsigned int, struct iattr *); int (*mkdir) (struct inode *, struct dentry *, struct iattr *); int (*rmdir) (struct inode *, const struct qstr *); - int (*readdir) (struct dentry *, const struct cred *, - u64, struct page **, unsigned int, bool); + int (*readdir) (struct nfs_readdir_arg *, struct nfs_readdir_res *); int (*mknod) (struct inode *, struct dentry *, struct iattr *, dev_t); int (*statfs) (struct nfs_server *, struct nfs_fh *, From 9fff59ed4c4d239125f8529a9971c46defd2e2b0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Nov 2020 20:11:32 -0500 Subject: [PATCH 36/90] NFS: Handle NFS4ERR_NOT_SAME and NFSERR_BADCOOKIE from readdir calls If the server returns NFS4ERR_NOT_SAME or tells us that the cookie is bad in response to a READDIR call, then we should empty the page cache so that we can fill it from scratch again. Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski --- fs/nfs/dir.c | 24 ++++++++++++++++-------- fs/nfs/nfs4proc.c | 2 ++ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 3ee0668a9719..3b44bef3a1b4 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -861,15 +861,21 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) return -ENOMEM; if (nfs_readdir_page_needs_filling(desc->page)) { res = nfs_readdir_xdr_to_array(desc, desc->page, inode); - if (res < 0) - goto error; + if (res < 0) { + nfs_readdir_page_unlock_and_put_cached(desc); + if (res == -EBADCOOKIE || res == -ENOTSYNC) { + invalidate_inode_pages2(desc->file->f_mapping); + desc->page_index = 0; + return -EAGAIN; + } + return res; + } } res = nfs_readdir_search_array(desc); if (res == 0) { nfsi->page_index = desc->page_index; return 0; } -error: nfs_readdir_page_unlock_and_put_cached(desc); return res; } @@ -879,12 +885,12 @@ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc) { int res; - if (desc->page_index == 0) { - desc->current_index = 0; - desc->prev_index = 0; - desc->last_cookie = 0; - } do { + if (desc->page_index == 0) { + desc->current_index = 0; + desc->prev_index = 0; + desc->last_cookie = 0; + } res = find_and_lock_cache_page(desc); } while (res == -EAGAIN); return res; @@ -1030,6 +1036,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) res = uncached_readdir(desc); if (res == 0) continue; + if (res == -EBADCOOKIE || res == -ENOTSYNC) + res = 0; } break; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index adcaba68eaed..7ab40d0e6a74 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -184,6 +184,8 @@ static int nfs4_map_errors(int err) return -EPROTONOSUPPORT; case -NFS4ERR_FILE_OPEN: return -EBUSY; + case -NFS4ERR_NOT_SAME: + return -ENOTSYNC; default: dprintk("%s could not handle NFSv4 error %d\n", __func__, -err); From b593c09f83a2732a0f0298c8f3468236a83cdd9f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Nov 2020 20:06:12 -0500 Subject: [PATCH 37/90] NFS: Improve handling of directory verifiers If the server insists on using the readdir verifiers in order to allow cookies to expire, then we should ensure that we cache the verifier with the cookie, so that we can return an error if the application tries to use the expired cookie. Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski --- fs/nfs/dir.c | 35 +++++++++++++++++++++++------------ fs/nfs/inode.c | 7 ------- include/linux/nfs_fs.h | 8 +++++++- 3 files changed, 30 insertions(+), 20 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 3b44bef3a1b4..454377228167 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -155,6 +155,7 @@ struct nfs_readdir_descriptor { loff_t current_index; loff_t prev_index; + __be32 verf[NFS_DIR_VERIFIER_SIZE]; unsigned long dir_verifier; unsigned long timestamp; unsigned long gencount; @@ -466,15 +467,15 @@ static int nfs_readdir_search_array(struct nfs_readdir_descriptor *desc) /* Fill a page with xdr information before transferring to the cache page */ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, - u64 cookie, struct page **pages, - size_t bufsize) + __be32 *verf, u64 cookie, + struct page **pages, size_t bufsize, + __be32 *verf_res) { struct inode *inode = file_inode(desc->file); - __be32 verf_res[2]; struct nfs_readdir_arg arg = { .dentry = file_dentry(desc->file), .cred = desc->file->f_cred, - .verf = NFS_I(inode)->cookieverf, + .verf = verf, .cookie = cookie, .pages = pages, .page_len = bufsize, @@ -503,8 +504,6 @@ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, } desc->timestamp = timestamp; desc->gencount = gencount; - memcpy(NFS_I(inode)->cookieverf, res.verf, - sizeof(NFS_I(inode)->cookieverf)); error: return error; } @@ -770,11 +769,13 @@ out_freepages: } static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, - struct page *page, struct inode *inode) + struct page *page, __be32 *verf_arg, + __be32 *verf_res) { struct page **pages; struct nfs_entry *entry; size_t array_size; + struct inode *inode = file_inode(desc->file); size_t dtsize = NFS_SERVER(inode)->dtsize; int status = -ENOMEM; @@ -801,8 +802,9 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, do { unsigned int pglen; - status = nfs_readdir_xdr_filler(desc, entry->cookie, - pages, dtsize); + status = nfs_readdir_xdr_filler(desc, verf_arg, entry->cookie, + pages, dtsize, + verf_res); if (status < 0) break; @@ -854,13 +856,15 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) { struct inode *inode = file_inode(desc->file); struct nfs_inode *nfsi = NFS_I(inode); + __be32 verf[NFS_DIR_VERIFIER_SIZE]; int res; desc->page = nfs_readdir_page_get_cached(desc); if (!desc->page) return -ENOMEM; if (nfs_readdir_page_needs_filling(desc->page)) { - res = nfs_readdir_xdr_to_array(desc, desc->page, inode); + res = nfs_readdir_xdr_to_array(desc, desc->page, + nfsi->cookieverf, verf); if (res < 0) { nfs_readdir_page_unlock_and_put_cached(desc); if (res == -EBADCOOKIE || res == -ENOTSYNC) { @@ -870,6 +874,7 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) } return res; } + memcpy(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf)); } res = nfs_readdir_search_array(desc); if (res == 0) { @@ -902,6 +907,7 @@ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc) static void nfs_do_filldir(struct nfs_readdir_descriptor *desc) { struct file *file = desc->file; + struct nfs_inode *nfsi = NFS_I(file_inode(file)); struct nfs_cache_array *array; unsigned int i = 0; @@ -915,6 +921,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc) desc->eof = true; break; } + memcpy(desc->verf, nfsi->cookieverf, sizeof(desc->verf)); if (i < (array->size-1)) desc->dir_cookie = array->array[i+1].cookie; else @@ -949,8 +956,8 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc) static int uncached_readdir(struct nfs_readdir_descriptor *desc) { struct page *page = NULL; + __be32 verf[NFS_DIR_VERIFIER_SIZE]; int status; - struct inode *inode = file_inode(desc->file); dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", (unsigned long long)desc->dir_cookie); @@ -967,7 +974,7 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc) desc->duped = 0; nfs_readdir_page_init_array(page, desc->dir_cookie); - status = nfs_readdir_xdr_to_array(desc, page, inode); + status = nfs_readdir_xdr_to_array(desc, page, desc->verf, verf); if (status < 0) goto out_release; @@ -1023,6 +1030,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) desc->dup_cookie = dir_ctx->dup_cookie; desc->duped = dir_ctx->duped; desc->attr_gencount = dir_ctx->attr_gencount; + memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf)); spin_unlock(&file->f_lock); do { @@ -1061,6 +1069,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) dir_ctx->dup_cookie = desc->dup_cookie; dir_ctx->duped = desc->duped; dir_ctx->attr_gencount = desc->attr_gencount; + memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf)); spin_unlock(&file->f_lock); kfree(desc); @@ -1101,6 +1110,8 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) dir_ctx->dir_cookie = offset; else dir_ctx->dir_cookie = 0; + if (offset == 0) + memset(dir_ctx->verf, 0, sizeof(dir_ctx->verf)); dir_ctx->duped = 0; } spin_unlock(&filp->f_lock); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index aa6493905bbe..9b765a900b28 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -229,7 +229,6 @@ static void nfs_zap_caches_locked(struct inode *inode) nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = jiffies; - memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA @@ -1237,7 +1236,6 @@ EXPORT_SYMBOL_GPL(nfs_revalidate_inode); static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) { - struct nfs_inode *nfsi = NFS_I(inode); int ret; if (mapping->nrpages != 0) { @@ -1250,11 +1248,6 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map if (ret < 0) return ret; } - if (S_ISDIR(inode->i_mode)) { - spin_lock(&inode->i_lock); - memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); - spin_unlock(&inode->i_lock); - } nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); nfs_fscache_wait_on_invalidate(inode); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index dd6b463dda80..681ed98e4ba8 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -45,6 +45,11 @@ */ #define NFS_RPC_SWAPFLAGS (RPC_TASK_SWAPPER|RPC_TASK_ROOTCREDS) +/* + * Size of the NFS directory verifier + */ +#define NFS_DIR_VERIFIER_SIZE 2 + /* * NFSv3/v4 Access mode cache entry */ @@ -89,6 +94,7 @@ struct nfs_open_context { struct nfs_open_dir_context { struct list_head list; unsigned long attr_gencount; + __be32 verf[NFS_DIR_VERIFIER_SIZE]; __u64 dir_cookie; __u64 dup_cookie; signed char duped; @@ -156,7 +162,7 @@ struct nfs_inode { * This is the cookie verifier used for NFSv3 readdir * operations */ - __be32 cookieverf[2]; + __be32 cookieverf[NFS_DIR_VERIFIER_SIZE]; atomic_long_t nrequests; struct nfs_mds_commit_info commit_info; From 762567b7c798afd08c22811ecfc66885a2b50f91 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 4 Nov 2020 08:32:19 -0500 Subject: [PATCH 38/90] NFS: Optimisations for monotonically increasing readdir cookies If the server is handing out monotonically increasing readdir cookie values, then we can optimise away searches through pages that contain cookies that lie outside our search range. Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski --- fs/nfs/dir.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 454377228167..b6c3501e8f61 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -140,7 +140,8 @@ struct nfs_cache_array { u64 last_cookie; unsigned int size; unsigned char page_full : 1, - page_is_eof : 1; + page_is_eof : 1, + cookies_are_ordered : 1; struct nfs_cache_array_entry array[]; }; @@ -178,6 +179,7 @@ static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie) array = kmap_atomic(page); nfs_readdir_array_init(array); array->last_cookie = last_cookie; + array->cookies_are_ordered = 1; kunmap_atomic(array); } @@ -269,6 +271,8 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) cache_entry->name_len = entry->len; cache_entry->name = name; array->last_cookie = entry->cookie; + if (array->last_cookie <= cache_entry->cookie) + array->cookies_are_ordered = 0; array->size++; if (entry->eof != 0) nfs_readdir_array_set_eof(array); @@ -395,6 +399,19 @@ nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi) return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags); } +static bool nfs_readdir_array_cookie_in_range(struct nfs_cache_array *array, + u64 cookie) +{ + if (!array->cookies_are_ordered) + return true; + /* Optimisation for monotonically increasing cookies */ + if (cookie >= array->last_cookie) + return false; + if (array->size && cookie < array->array[0].cookie) + return false; + return true; +} + static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, struct nfs_readdir_descriptor *desc) { @@ -402,6 +419,9 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, loff_t new_pos; int status = -EAGAIN; + if (!nfs_readdir_array_cookie_in_range(array, desc->dir_cookie)) + goto check_eof; + for (i = 0; i < array->size; i++) { if (array->array[i].cookie == desc->dir_cookie) { struct nfs_inode *nfsi = NFS_I(file_inode(desc->file)); @@ -435,6 +455,7 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, return 0; } } +check_eof: if (array->page_is_eof) { status = -EBADCOOKIE; if (desc->dir_cookie == array->last_cookie) From 35df59d3ef693292840a61cdb04b39d8c9412f4e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 6 Nov 2020 20:38:47 -0500 Subject: [PATCH 39/90] NFS: Reduce number of RPC calls when doing uncached readdir If we're doing uncached readdir, allocate multiple pages in order to try to avoid duplicate RPC calls for the same getdents() call. Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski --- fs/nfs/dir.c | 105 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 69 insertions(+), 36 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index b6c3501e8f61..93c6f22cdc62 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -199,6 +199,23 @@ void nfs_readdir_clear_array(struct page *page) kunmap_atomic(array); } +static struct page * +nfs_readdir_page_array_alloc(u64 last_cookie, gfp_t gfp_flags) +{ + struct page *page = alloc_page(gfp_flags); + if (page) + nfs_readdir_page_init_array(page, last_cookie); + return page; +} + +static void nfs_readdir_page_array_free(struct page *page) +{ + if (page) { + nfs_readdir_clear_array(page); + put_page(page); + } +} + static void nfs_readdir_array_set_eof(struct nfs_cache_array *array) { array->page_is_eof = 1; @@ -694,12 +711,14 @@ out: static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, struct nfs_entry *entry, struct page **xdr_pages, - struct page *fillme, unsigned int buflen) + unsigned int buflen, + struct page **arrays, + size_t narrays) { struct address_space *mapping = desc->file->f_mapping; struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch, *new, *page = fillme; + struct page *scratch, *new, *page = *arrays; int status; scratch = alloc_page(GFP_KERNEL); @@ -725,15 +744,25 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, if (status != -ENOSPC) continue; - if (page->mapping != mapping) - break; - new = nfs_readdir_page_get_next(mapping, page->index + 1, - entry->prev_cookie); - if (!new) - break; - if (page != fillme) - nfs_readdir_page_unlock_and_put(page); - page = new; + if (page->mapping != mapping) { + if (!--narrays) + break; + new = nfs_readdir_page_array_alloc(entry->prev_cookie, + GFP_KERNEL); + if (!new) + break; + arrays++; + *arrays = page = new; + } else { + new = nfs_readdir_page_get_next(mapping, + page->index + 1, + entry->prev_cookie); + if (!new) + break; + if (page != *arrays) + nfs_readdir_page_unlock_and_put(page); + page = new; + } status = nfs_readdir_add_to_array(entry, page); } while (!status && !entry->eof); @@ -750,7 +779,7 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, break; } - if (page != fillme) + if (page != *arrays) nfs_readdir_page_unlock_and_put(page); put_page(scratch); @@ -790,10 +819,11 @@ out_freepages: } static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, - struct page *page, __be32 *verf_arg, - __be32 *verf_res) + __be32 *verf_arg, __be32 *verf_res, + struct page **arrays, size_t narrays) { struct page **pages; + struct page *page = *arrays; struct nfs_entry *entry; size_t array_size; struct inode *inode = file_inode(desc->file); @@ -835,7 +865,8 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, break; } - status = nfs_readdir_page_filler(desc, entry, pages, page, pglen); + status = nfs_readdir_page_filler(desc, entry, pages, pglen, + arrays, narrays); } while (!status && nfs_readdir_page_needs_filling(page)); nfs_readdir_free_pages(pages, array_size); @@ -884,8 +915,8 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) if (!desc->page) return -ENOMEM; if (nfs_readdir_page_needs_filling(desc->page)) { - res = nfs_readdir_xdr_to_array(desc, desc->page, - nfsi->cookieverf, verf); + res = nfs_readdir_xdr_to_array(desc, nfsi->cookieverf, verf, + &desc->page, 1); if (res < 0) { nfs_readdir_page_unlock_and_put_cached(desc); if (res == -EBADCOOKIE || res == -ENOTSYNC) { @@ -976,37 +1007,39 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc) */ static int uncached_readdir(struct nfs_readdir_descriptor *desc) { - struct page *page = NULL; + struct page **arrays; + size_t i, sz = 512; __be32 verf[NFS_DIR_VERIFIER_SIZE]; - int status; + int status = -ENOMEM; - dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", + dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %llu\n", (unsigned long long)desc->dir_cookie); - page = alloc_page(GFP_HIGHUSER); - if (!page) { - status = -ENOMEM; + arrays = kcalloc(sz, sizeof(*arrays), GFP_KERNEL); + if (!arrays) + goto out; + arrays[0] = nfs_readdir_page_array_alloc(desc->dir_cookie, GFP_KERNEL); + if (!arrays[0]) goto out; - } desc->page_index = 0; desc->last_cookie = desc->dir_cookie; - desc->page = page; desc->duped = 0; - nfs_readdir_page_init_array(page, desc->dir_cookie); - status = nfs_readdir_xdr_to_array(desc, page, desc->verf, verf); - if (status < 0) - goto out_release; + status = nfs_readdir_xdr_to_array(desc, desc->verf, verf, arrays, sz); - nfs_do_filldir(desc); + for (i = 0; !desc->eof && i < sz && arrays[i]; i++) { + desc->page = arrays[i]; + nfs_do_filldir(desc); + } + desc->page = NULL; - out_release: - nfs_readdir_clear_array(desc->page); - nfs_readdir_page_put(desc); - out: - dfprintk(DIRCACHE, "NFS: %s: returns %d\n", - __func__, status); + + for (i = 0; i < sz && arrays[i]; i++) + nfs_readdir_page_array_free(arrays[i]); +out: + kfree(arrays); + dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status); return status; } From 794092c57f89c2c833da00f82f38a0afcb4033bc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 6 Nov 2020 20:47:05 -0500 Subject: [PATCH 40/90] NFS: Do uncached readdir when we're seeking a cookie in an empty page cache If the directory is changing, causing the page cache to get invalidated while we are listing the contents, then the NFS client is currently forced to read in the entire directory contents from scratch, because it needs to perform a linear search for the readdir cookie. While this is not an issue for small directories, it does not scale to directories with millions of entries. In order to be able to deal with large directories that are changing, add a heuristic to ensure that if the page cache is empty, and we are searching for a cookie that is not the zero cookie, we just default to performing uncached readdir. Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski --- fs/nfs/dir.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 93c6f22cdc62..3f70697729d8 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -937,11 +937,28 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) return res; } +static bool nfs_readdir_dont_search_cache(struct nfs_readdir_descriptor *desc) +{ + struct address_space *mapping = desc->file->f_mapping; + struct inode *dir = file_inode(desc->file); + unsigned int dtsize = NFS_SERVER(dir)->dtsize; + loff_t size = i_size_read(dir); + + /* + * Default to uncached readdir if the page cache is empty, and + * we're looking for a non-zero cookie in a large directory. + */ + return desc->dir_cookie != 0 && mapping->nrpages == 0 && size > dtsize; +} + /* Search for desc->dir_cookie from the beginning of the page cache */ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc) { int res; + if (nfs_readdir_dont_search_cache(desc)) + return -EBADCOOKIE; + do { if (desc->page_index == 0) { desc->current_index = 0; From d5aa6b22e2258f05317313ecc02efbb988ed6d38 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 6 Nov 2020 16:33:38 -0500 Subject: [PATCH 41/90] SUNRPC: xprt_load_transport() needs to support the netid "rdma6" According to RFC5666, the correct netid for an IPv6 addressed RDMA transport is "rdma6", which we've supported as a mount option since Linux-4.7. The problem is when we try to load the module "xprtrdma6", that will fail, since there is no modulealias of that name. Fixes: 181342c5ebe8 ("xprtrdma: Add rdma6 option to support NFS/RDMA IPv6") Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/xprt.c | 65 +++++++++++++++++++++++++-------- net/sunrpc/xprtrdma/module.c | 1 + net/sunrpc/xprtrdma/transport.c | 1 + net/sunrpc/xprtsock.c | 4 ++ 5 files changed, 56 insertions(+), 16 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index a603d48d2b2c..3ac5037d1c3d 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -330,6 +330,7 @@ struct xprt_class { struct rpc_xprt * (*setup)(struct xprt_create *); struct module *owner; char name[32]; + const char * netid[]; }; /* diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index f6c17e75f20e..57f09ea3ef2a 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -151,31 +151,64 @@ out: } EXPORT_SYMBOL_GPL(xprt_unregister_transport); +static void +xprt_class_release(const struct xprt_class *t) +{ + module_put(t->owner); +} + +static const struct xprt_class * +xprt_class_find_by_netid_locked(const char *netid) +{ + const struct xprt_class *t; + unsigned int i; + + list_for_each_entry(t, &xprt_list, list) { + for (i = 0; t->netid[i][0] != '\0'; i++) { + if (strcmp(t->netid[i], netid) != 0) + continue; + if (!try_module_get(t->owner)) + continue; + return t; + } + } + return NULL; +} + +static const struct xprt_class * +xprt_class_find_by_netid(const char *netid) +{ + const struct xprt_class *t; + + spin_lock(&xprt_list_lock); + t = xprt_class_find_by_netid_locked(netid); + if (!t) { + spin_unlock(&xprt_list_lock); + request_module("rpc%s", netid); + spin_lock(&xprt_list_lock); + t = xprt_class_find_by_netid_locked(netid); + } + spin_unlock(&xprt_list_lock); + return t; +} + /** * xprt_load_transport - load a transport implementation - * @transport_name: transport to load + * @netid: transport to load * * Returns: * 0: transport successfully loaded * -ENOENT: transport module not available */ -int xprt_load_transport(const char *transport_name) +int xprt_load_transport(const char *netid) { - struct xprt_class *t; - int result; + const struct xprt_class *t; - result = 0; - spin_lock(&xprt_list_lock); - list_for_each_entry(t, &xprt_list, list) { - if (strcmp(t->name, transport_name) == 0) { - spin_unlock(&xprt_list_lock); - goto out; - } - } - spin_unlock(&xprt_list_lock); - result = request_module("xprt%s", transport_name); -out: - return result; + t = xprt_class_find_by_netid(netid); + if (!t) + return -ENOENT; + xprt_class_release(t); + return 0; } EXPORT_SYMBOL_GPL(xprt_load_transport); diff --git a/net/sunrpc/xprtrdma/module.c b/net/sunrpc/xprtrdma/module.c index 620327c01302..45c5b41ac8dc 100644 --- a/net/sunrpc/xprtrdma/module.c +++ b/net/sunrpc/xprtrdma/module.c @@ -24,6 +24,7 @@ MODULE_DESCRIPTION("RPC/RDMA Transport"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS("svcrdma"); MODULE_ALIAS("xprtrdma"); +MODULE_ALIAS("rpcrdma6"); static void __exit rpc_rdma_cleanup(void) { diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 8915e42240d3..035060c05fd5 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -768,6 +768,7 @@ static struct xprt_class xprt_rdma = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_RDMA, .setup = xprt_setup_rdma, + .netid = { "rdma", "rdma6", "" }, }; void xprt_rdma_cleanup(void) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 7090bbee0ec5..c93ff70da3f9 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -3059,6 +3059,7 @@ static struct xprt_class xs_local_transport = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_LOCAL, .setup = xs_setup_local, + .netid = { "" }, }; static struct xprt_class xs_udp_transport = { @@ -3067,6 +3068,7 @@ static struct xprt_class xs_udp_transport = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_UDP, .setup = xs_setup_udp, + .netid = { "udp", "udp6", "" }, }; static struct xprt_class xs_tcp_transport = { @@ -3075,6 +3077,7 @@ static struct xprt_class xs_tcp_transport = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_TCP, .setup = xs_setup_tcp, + .netid = { "tcp", "tcp6", "" }, }; static struct xprt_class xs_bc_tcp_transport = { @@ -3083,6 +3086,7 @@ static struct xprt_class xs_bc_tcp_transport = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_BC_TCP, .setup = xs_setup_bc_tcp, + .netid = { "" }, }; /** From 9bccd264611b5345d85138dc7fd55bdeb9e6942e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Nov 2020 12:58:22 -0500 Subject: [PATCH 42/90] SUNRPC: Close a race with transport setup and module put After we've looked up the transport module, we need to ensure it can't go away until we've finished running the transport setup code. Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 57f09ea3ef2a..bf490d0c98c6 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -157,6 +157,32 @@ xprt_class_release(const struct xprt_class *t) module_put(t->owner); } +static const struct xprt_class * +xprt_class_find_by_ident_locked(int ident) +{ + const struct xprt_class *t; + + list_for_each_entry(t, &xprt_list, list) { + if (t->ident != ident) + continue; + if (!try_module_get(t->owner)) + continue; + return t; + } + return NULL; +} + +static const struct xprt_class * +xprt_class_find_by_ident(int ident) +{ + const struct xprt_class *t; + + spin_lock(&xprt_list_lock); + t = xprt_class_find_by_ident_locked(ident); + spin_unlock(&xprt_list_lock); + return t; +} + static const struct xprt_class * xprt_class_find_by_netid_locked(const char *netid) { @@ -1929,21 +1955,17 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net) struct rpc_xprt *xprt_create_transport(struct xprt_create *args) { struct rpc_xprt *xprt; - struct xprt_class *t; + const struct xprt_class *t; - spin_lock(&xprt_list_lock); - list_for_each_entry(t, &xprt_list, list) { - if (t->ident == args->ident) { - spin_unlock(&xprt_list_lock); - goto found; - } + t = xprt_class_find_by_ident(args->ident); + if (!t) { + dprintk("RPC: transport (%d) not supported\n", args->ident); + return ERR_PTR(-EIO); } - spin_unlock(&xprt_list_lock); - dprintk("RPC: transport (%d) not supported\n", args->ident); - return ERR_PTR(-EIO); -found: xprt = t->setup(args); + xprt_class_release(t); + if (IS_ERR(xprt)) goto out; if (args->flags & XPRT_CREATE_NO_IDLE_TIMEOUT) From 1fc5f13186440973e1aa1d85aa263326756af431 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Nov 2020 09:41:21 -0500 Subject: [PATCH 43/90] SUNRPC: Add a helper to return the transport identifier given a netid Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/xprt.c | 31 ++++++++++++++++++++++++------- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 3ac5037d1c3d..f7b75c72f80e 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -386,6 +386,7 @@ xprt_disable_swap(struct rpc_xprt *xprt) int xprt_register_transport(struct xprt_class *type); int xprt_unregister_transport(struct xprt_class *type); int xprt_load_transport(const char *); +int xprt_find_transport_ident(const char *); void xprt_wait_for_reply_request_def(struct rpc_task *task); void xprt_wait_for_reply_request_rtt(struct rpc_task *task); void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index bf490d0c98c6..23452f57d369 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -218,6 +218,28 @@ xprt_class_find_by_netid(const char *netid) return t; } +/** + * xprt_find_transport_ident - convert a netid into a transport identifier + * @netid: transport to load + * + * Returns: + * > 0: transport identifier + * -ENOENT: transport module not available + */ +int xprt_find_transport_ident(const char *netid) +{ + const struct xprt_class *t; + int ret; + + t = xprt_class_find_by_netid(netid); + if (!t) + return -ENOENT; + ret = t->ident; + xprt_class_release(t); + return ret; +} +EXPORT_SYMBOL_GPL(xprt_find_transport_ident); + /** * xprt_load_transport - load a transport implementation * @netid: transport to load @@ -228,13 +250,8 @@ xprt_class_find_by_netid(const char *netid) */ int xprt_load_transport(const char *netid) { - const struct xprt_class *t; - - t = xprt_class_find_by_netid(netid); - if (!t) - return -ENOENT; - xprt_class_release(t); - return 0; + int ret = xprt_find_transport_ident(netid); + return ret < 0 ? ret : 0; } EXPORT_SYMBOL_GPL(xprt_load_transport); From 1c3695d0bb3869fedcde4538d831003519576ece Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Nov 2020 10:30:35 -0500 Subject: [PATCH 44/90] NFS: Switch mount code to use xprt_find_transport_ident() Switch the mount code to use xprt_find_transport_ident() and to check the results before allowing the mount to proceed. Signed-off-by: Trond Myklebust --- fs/nfs/fs_context.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 29ec8b09a52d..06894bcdea2d 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -510,13 +510,12 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, ctx->nfs_server.protocol = XPRT_TRANSPORT_UDP; break; case Opt_tcp: - ctx->flags |= NFS_MOUNT_TCP; - ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP; - break; case Opt_rdma: ctx->flags |= NFS_MOUNT_TCP; /* for side protocols */ - ctx->nfs_server.protocol = XPRT_TRANSPORT_RDMA; - xprt_load_transport(param->key); + ret = xprt_find_transport_ident(param->key); + if (ret < 0) + goto out_bad_transport; + ctx->nfs_server.protocol = ret; break; case Opt_acl: if (result.negated) @@ -670,11 +669,13 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, case Opt_xprt_rdma: /* vector side protocols to TCP */ ctx->flags |= NFS_MOUNT_TCP; - ctx->nfs_server.protocol = XPRT_TRANSPORT_RDMA; - xprt_load_transport(param->string); + ret = xprt_find_transport_ident(param->string); + if (ret < 0) + goto out_bad_transport; + ctx->nfs_server.protocol = ret; break; default: - return nfs_invalf(fc, "NFS: Unrecognized transport protocol"); + goto out_bad_transport; } ctx->protofamily = protofamily; @@ -697,7 +698,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, break; case Opt_xprt_rdma: /* not used for side protocols */ default: - return nfs_invalf(fc, "NFS: Unrecognized transport protocol"); + goto out_bad_transport; } ctx->mountfamily = mountfamily; break; @@ -787,6 +788,8 @@ out_invalid_address: return nfs_invalf(fc, "NFS: Bad IP address specified"); out_of_bounds: return nfs_invalf(fc, "NFS: Value for '%s' out of range", param->key); +out_bad_transport: + return nfs_invalf(fc, "NFS: Unrecognized transport protocol"); } /* From c87b056e58e71ba7a3f603700618f8da9742aa29 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Nov 2020 10:32:14 -0500 Subject: [PATCH 45/90] SUNRPC: Remove unused function xprt_load_transport() Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 - net/sunrpc/xprt.c | 15 --------------- 2 files changed, 16 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index f7b75c72f80e..d2e97ee802af 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -385,7 +385,6 @@ xprt_disable_swap(struct rpc_xprt *xprt) */ int xprt_register_transport(struct xprt_class *type); int xprt_unregister_transport(struct xprt_class *type); -int xprt_load_transport(const char *); int xprt_find_transport_ident(const char *); void xprt_wait_for_reply_request_def(struct rpc_task *task); void xprt_wait_for_reply_request_rtt(struct rpc_task *task); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 23452f57d369..691ccf8049a4 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -240,21 +240,6 @@ int xprt_find_transport_ident(const char *netid) } EXPORT_SYMBOL_GPL(xprt_find_transport_ident); -/** - * xprt_load_transport - load a transport implementation - * @netid: transport to load - * - * Returns: - * 0: transport successfully loaded - * -ENOENT: transport module not available - */ -int xprt_load_transport(const char *netid) -{ - int ret = xprt_find_transport_ident(netid); - return ret < 0 ? ret : 0; -} -EXPORT_SYMBOL_GPL(xprt_load_transport); - static void xprt_clear_locked(struct rpc_xprt *xprt) { xprt->snd_task = NULL; From a12f996d3413ae41b6c0952013cd7a11396e14eb Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 6 Nov 2020 16:58:53 -0500 Subject: [PATCH 46/90] NFSv4/pNFS: Use connections to a DS that are all of the same protocol family If the pNFS metadata server advertises multiple addresses for the same data server, we should try to connect to just one protocol family and transport type on the assumption that homogeneity will improve performance. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs_nfs.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 679767ac258d..ee6d003db844 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -860,6 +860,9 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, .addrlen = da->da_addrlen, .servername = clp->cl_hostname, }; + + if (da->da_addr.ss_family != clp->cl_addr.ss_family) + continue; /* Add this address as an alias */ rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, rpc_clnt_test_and_add_xprt, NULL); @@ -913,17 +916,19 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, }; struct nfs4_add_xprt_data xprtdata = { .clp = clp, - .cred = nfs4_get_clid_cred(clp), }; struct rpc_add_xprt_test rpcdata = { .add_xprt_test = clp->cl_mvops->session_trunk, .data = &xprtdata, }; + if (da->da_addr.ss_family != clp->cl_addr.ss_family) + continue; /** * Test this address for session trunking and * add as an alias */ + xprtdata.cred = nfs4_get_clid_cred(clp), rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, rpc_clnt_setup_test_and_add_xprt, &rpcdata); From 190c75a31fe65e311f3628bd97bd58cff50d221f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Nov 2020 09:21:48 -0500 Subject: [PATCH 47/90] pNFS: Add helpers for allocation/free of struct nfs4_pnfs_ds_addr Signed-off-by: Trond Myklebust --- fs/nfs/pnfs_nfs.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index ee6d003db844..c3c04a763985 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -661,6 +661,20 @@ _data_server_lookup_locked(const struct list_head *dsaddrs) return NULL; } +static struct nfs4_pnfs_ds_addr *nfs4_pnfs_ds_addr_alloc(gfp_t gfp_flags) +{ + struct nfs4_pnfs_ds_addr *da = kzalloc(sizeof(*da), gfp_flags); + if (da) + INIT_LIST_HEAD(&da->da_node); + return da; +} + +static void nfs4_pnfs_ds_addr_free(struct nfs4_pnfs_ds_addr *da) +{ + kfree(da->da_remotestr); + kfree(da); +} + static void destroy_ds(struct nfs4_pnfs_ds *ds) { struct nfs4_pnfs_ds_addr *da; @@ -676,8 +690,7 @@ static void destroy_ds(struct nfs4_pnfs_ds *ds) struct nfs4_pnfs_ds_addr, da_node); list_del_init(&da->da_node); - kfree(da->da_remotestr); - kfree(da); + nfs4_pnfs_ds_addr_free(da); } kfree(ds->ds_remotestr); @@ -1094,12 +1107,10 @@ nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags) } *portstr = '\0'; - da = kzalloc(sizeof(*da), gfp_flags); + da = nfs4_pnfs_ds_addr_alloc(gfp_flags); if (unlikely(!da)) goto out_free_buf; - INIT_LIST_HEAD(&da->da_node); - if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr, sizeof(da->da_addr))) { dprintk("%s: error parsing address %s\n", __func__, buf); From 4be78d26810bc506769773ca2f81b4de65f43fd5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 6 Nov 2020 16:10:52 -0500 Subject: [PATCH 48/90] NFSv4/pNFS: Store the transport type in struct nfs4_pnfs_ds_addr We want to enable RDMA and UDP as valid transport methods if a GETDEVICEINFO call specifies it. Do so by adding a parser for the netid that translates it to an appropriate argument for the RPC transport layer. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.h | 2 ++ fs/nfs/pnfs_nfs.c | 34 +++++++++++++++++++--------------- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 2661c44c62db..f618c49697bb 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -51,6 +51,8 @@ struct nfs4_pnfs_ds_addr { size_t da_addrlen; struct list_head da_node; /* nfs4_pnfs_dev_hlist dev_dslist */ char *da_remotestr; /* human readable addr+port */ + const char *da_netid; + int da_transport; }; struct nfs4_pnfs_ds { diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index c3c04a763985..7a97643acf3f 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -672,6 +672,7 @@ static struct nfs4_pnfs_ds_addr *nfs4_pnfs_ds_addr_alloc(gfp_t gfp_flags) static void nfs4_pnfs_ds_addr_free(struct nfs4_pnfs_ds_addr *da) { kfree(da->da_remotestr); + kfree(da->da_netid); kfree(da); } @@ -867,13 +868,15 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, if (!IS_ERR(clp)) { struct xprt_create xprt_args = { - .ident = XPRT_TRANSPORT_TCP, + .ident = da->da_transport, .net = clp->cl_net, .dstaddr = (struct sockaddr *)&da->da_addr, .addrlen = da->da_addrlen, .servername = clp->cl_hostname, }; + if (da->da_transport != clp->cl_proto) + continue; if (da->da_addr.ss_family != clp->cl_addr.ss_family) continue; /* Add this address as an alias */ @@ -883,7 +886,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, } clp = get_v3_ds_connect(mds_srv, (struct sockaddr *)&da->da_addr, - da->da_addrlen, IPPROTO_TCP, + da->da_addrlen, da->da_transport, timeo, retrans); if (IS_ERR(clp)) continue; @@ -921,7 +924,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, if (!IS_ERR(clp) && clp->cl_mvops->session_trunk) { struct xprt_create xprt_args = { - .ident = XPRT_TRANSPORT_TCP, + .ident = da->da_transport, .net = clp->cl_net, .dstaddr = (struct sockaddr *)&da->da_addr, .addrlen = da->da_addrlen, @@ -935,6 +938,8 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, .data = &xprtdata, }; + if (da->da_transport != clp->cl_proto) + continue; if (da->da_addr.ss_family != clp->cl_addr.ss_family) continue; /** @@ -950,8 +955,9 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, } else { clp = nfs4_set_ds_client(mds_srv, (struct sockaddr *)&da->da_addr, - da->da_addrlen, IPPROTO_TCP, - timeo, retrans, minor_version); + da->da_addrlen, + da->da_transport, timeo, + retrans, minor_version); if (IS_ERR(clp)) continue; @@ -1042,8 +1048,8 @@ nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags) int nlen, rlen; int tmp[2]; __be32 *p; - char *netid, *match_netid; - size_t len, match_netid_len; + char *netid; + size_t len; char *startsep = ""; char *endsep = ""; @@ -1125,15 +1131,11 @@ nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags) case AF_INET: ((struct sockaddr_in *)&da->da_addr)->sin_port = port; da->da_addrlen = sizeof(struct sockaddr_in); - match_netid = "tcp"; - match_netid_len = 3; break; case AF_INET6: ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port; da->da_addrlen = sizeof(struct sockaddr_in6); - match_netid = "tcp6"; - match_netid_len = 4; startsep = "["; endsep = "]"; break; @@ -1144,12 +1146,15 @@ nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags) goto out_free_da; } - if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) { - dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n", - __func__, netid, match_netid); + da->da_transport = xprt_find_transport_ident(netid); + if (da->da_transport < 0) { + dprintk("%s: ERROR: unknown r_netid \"%s\"\n", + __func__, netid); goto out_free_da; } + da->da_netid = netid; + /* save human readable address */ len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7; da->da_remotestr = kzalloc(len, gfp_flags); @@ -1161,7 +1166,6 @@ nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags) dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr); kfree(buf); - kfree(netid); return da; out_free_da: From 9a7016319e1e7c6348a960931182f5f71b95f24e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 9 Nov 2020 15:42:03 -0500 Subject: [PATCH 49/90] pNFS/flexfiles: Fix up layoutstats reporting for non-TCP transports Ensure that we report the correct netid when using UDP or RDMA transports to the DSes. Signed-off-by: Trond Myklebust --- fs/nfs/flexfilelayout/flexfilelayout.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 24bf5797f88a..b7c26836b2cb 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -2284,7 +2284,6 @@ ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da) struct sockaddr *sap = (struct sockaddr *)&da->da_addr; char portbuf[RPCBIND_MAXUADDRPLEN]; char addrbuf[RPCBIND_MAXUADDRLEN]; - char *netid; unsigned short port; int len, netid_len; __be32 *p; @@ -2294,18 +2293,13 @@ ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da) if (ff_layout_ntop4(sap, addrbuf, sizeof(addrbuf)) == 0) return; port = ntohs(((struct sockaddr_in *)sap)->sin_port); - netid = "tcp"; - netid_len = 3; break; case AF_INET6: if (ff_layout_ntop6_noscopeid(sap, addrbuf, sizeof(addrbuf)) == 0) return; port = ntohs(((struct sockaddr_in6 *)sap)->sin6_port); - netid = "tcp6"; - netid_len = 4; break; default: - /* we only support tcp and tcp6 */ WARN_ON_ONCE(1); return; } @@ -2313,8 +2307,9 @@ ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da) snprintf(portbuf, sizeof(portbuf), ".%u.%u", port >> 8, port & 0xff); len = strlcat(addrbuf, portbuf, sizeof(addrbuf)); + netid_len = strlen(da->da_netid); p = xdr_reserve_space(xdr, 4 + netid_len); - xdr_encode_opaque(p, netid, netid_len); + xdr_encode_opaque(p, da->da_netid, netid_len); p = xdr_reserve_space(xdr, 4 + len); xdr_encode_opaque(p, addrbuf, len); From 4aceaaea5eccd32bc40c6c76b262489b2f53ca8d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Nov 2020 10:56:53 -0500 Subject: [PATCH 50/90] SUNRPC: Fix up open coded kmemdup_nul() Signed-off-by: Trond Myklebust --- net/sunrpc/xdr.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 71e03b930b70..b1cda6d85ded 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1942,10 +1942,8 @@ ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str, ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen); if (ret > 0) { - char *s = kmalloc(ret + 1, gfp_flags); + char *s = kmemdup_nul(p, ret, gfp_flags); if (s != NULL) { - memcpy(s, p, ret); - s[ret] = '\0'; *str = s; return strlen(s); } From 988998134996a397a47cf758627def5f20dc1e88 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 9 Nov 2020 16:06:15 -0500 Subject: [PATCH 51/90] pNFS: Clean up open coded xdr string decoding Use the existing xdr_stream_decode_string_dup() to safely decode into kmalloced strings. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs_nfs.c | 43 +++++++------------------------------------ 1 file changed, 7 insertions(+), 36 deletions(-) diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 7a97643acf3f..2efcfdd348a1 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -1045,9 +1045,8 @@ nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags) struct nfs4_pnfs_ds_addr *da = NULL; char *buf, *portstr; __be16 port; - int nlen, rlen; + ssize_t nlen, rlen; int tmp[2]; - __be32 *p; char *netid; size_t len; char *startsep = ""; @@ -1055,45 +1054,17 @@ nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags) /* r_netid */ - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) + nlen = xdr_stream_decode_string_dup(xdr, &netid, XDR_MAX_NETOBJ, + gfp_flags); + if (unlikely(nlen < 0)) goto out_err; - nlen = be32_to_cpup(p++); - - p = xdr_inline_decode(xdr, nlen); - if (unlikely(!p)) - goto out_err; - - netid = kmalloc(nlen+1, gfp_flags); - if (unlikely(!netid)) - goto out_err; - - netid[nlen] = '\0'; - memcpy(netid, p, nlen); /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */ - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_free_netid; - rlen = be32_to_cpup(p); - - p = xdr_inline_decode(xdr, rlen); - if (unlikely(!p)) - goto out_free_netid; - /* port is ".ABC.DEF", 8 chars max */ - if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) { - dprintk("%s: Invalid address, length %d\n", __func__, - rlen); + rlen = xdr_stream_decode_string_dup(xdr, &buf, INET6_ADDRSTRLEN + + IPV6_SCOPE_ID_LEN + 8, gfp_flags); + if (unlikely(rlen < 0)) goto out_free_netid; - } - buf = kmalloc(rlen + 1, gfp_flags); - if (!buf) { - dprintk("%s: Not enough memory\n", __func__); - goto out_free_netid; - } - buf[rlen] = '\0'; - memcpy(buf, p, rlen); /* replace port '.' with '-' */ portstr = strrchr(buf, '.'); From 046e5ccb4198b990190e11fb52fd9cfd264402eb Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 13 Nov 2020 21:42:16 -0500 Subject: [PATCH 52/90] NFSv4: Fix the alignment of page data in the getdeviceinfo reply We can fit the device_addr4 opaque data padding in the pages. Fixes: cf500bac8fd4 ("SUNRPC: Introduce rpc_prepare_reply_pages()") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index c6dbfcae7517..c16b93df1bc1 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3009,15 +3009,19 @@ static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; + uint32_t replen; encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); + + replen = hdr.replen + op_decode_hdr_maxsz; + encode_getdeviceinfo(xdr, args, &hdr); - /* set up reply kvec. Subtract notification bitmap max size (2) - * so that notification bitmap is put in xdr_buf tail */ + /* set up reply kvec. device_addr4 opaque data is read into the + * pages */ rpc_prepare_reply_pages(req, args->pdev->pages, args->pdev->pgbase, - args->pdev->pglen, hdr.replen - 2); + args->pdev->pglen, replen + 2 + 1); encode_nops(&hdr); } From 2b1f83d108bd35d12d8a833298d2a033f9121aac Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 21 Nov 2020 14:01:50 -0500 Subject: [PATCH 53/90] SUNRPC: Fix up typo in xdr_init_decode() We already know that the head buffer and page are empty, so if there is any data, it is in the tail. Signed-off-by: Trond Myklebust --- net/sunrpc/xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index b1cda6d85ded..bc7a622016ee 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1060,7 +1060,7 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, else if (buf->page_len != 0) xdr_set_page_base(xdr, 0, buf->len); else - xdr_set_iov(xdr, buf->head, buf->len); + xdr_set_iov(xdr, buf->tail, buf->len); if (p != NULL && p > xdr->p && xdr->end >= p) { xdr->nwords -= p - xdr->p; xdr->p = p; From 8d86e373b0ef52d091ced9583ffbb33ad2771576 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 21 Nov 2020 14:50:43 -0500 Subject: [PATCH 54/90] SUNRPC: Clean up helpers xdr_set_iov() and xdr_set_page_base() Allow xdr_set_iov() to set a base so that we can use it to set the cursor to a specific position in the kvec buffer. If the new base overflows the kvec/pages buffer in either xdr_set_iov() or xdr_set_page_base(), then truncate it so that we point to the end of the buffer. Finally, change both function to return the number of bytes remaining to read in their buffers. Signed-off-by: Trond Myklebust --- net/sunrpc/xdr.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index bc7a622016ee..394297ec1cb9 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -970,19 +970,22 @@ void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int b } EXPORT_SYMBOL_GPL(xdr_write_pages); -static void xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov, - unsigned int len) +static unsigned int xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov, + unsigned int base, unsigned int len) { if (len > iov->iov_len) len = iov->iov_len; - xdr->p = (__be32*)iov->iov_base; + if (unlikely(base > len)) + base = len; + xdr->p = (__be32*)(iov->iov_base + base); xdr->end = (__be32*)(iov->iov_base + len); xdr->iov = iov; xdr->page_ptr = NULL; + return len - base; } -static int xdr_set_page_base(struct xdr_stream *xdr, - unsigned int base, unsigned int len) +static unsigned int xdr_set_page_base(struct xdr_stream *xdr, + unsigned int base, unsigned int len) { unsigned int pgnr; unsigned int maxlen; @@ -991,9 +994,11 @@ static int xdr_set_page_base(struct xdr_stream *xdr, void *kaddr; maxlen = xdr->buf->page_len; - if (base >= maxlen) - return -EINVAL; - maxlen -= base; + if (base >= maxlen) { + base = maxlen; + maxlen = 0; + } else + maxlen -= base; if (len > maxlen) len = maxlen; @@ -1011,14 +1016,14 @@ static int xdr_set_page_base(struct xdr_stream *xdr, pgend = PAGE_SIZE; xdr->end = (__be32*)(kaddr + pgend); xdr->iov = NULL; - return 0; + return len; } static void xdr_set_page(struct xdr_stream *xdr, unsigned int base, unsigned int len) { - if (xdr_set_page_base(xdr, base, len) < 0) - xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2); + if (xdr_set_page_base(xdr, base, len) == 0) + xdr_set_iov(xdr, xdr->buf->tail, 0, xdr_stream_remaining(xdr)); } static void xdr_set_next_page(struct xdr_stream *xdr) @@ -1055,12 +1060,9 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, xdr->scratch.iov_base = NULL; xdr->scratch.iov_len = 0; xdr->nwords = XDR_QUADLEN(buf->len); - if (buf->head[0].iov_len != 0) - xdr_set_iov(xdr, buf->head, buf->len); - else if (buf->page_len != 0) - xdr_set_page_base(xdr, 0, buf->len); - else - xdr_set_iov(xdr, buf->tail, buf->len); + if (xdr_set_iov(xdr, buf->head, 0, buf->len) == 0 && + xdr_set_page_base(xdr, 0, buf->len) == 0) + xdr_set_iov(xdr, buf->tail, 0, buf->len); if (p != NULL && p > xdr->p && xdr->end >= p) { xdr->nwords -= p - xdr->p; xdr->p = p; From 1d97316692f708de755655ac1cfd704d7a55843f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 20 Nov 2020 16:31:03 -0500 Subject: [PATCH 55/90] SUNRPC: Fix up xdr_read_pages() to take arbitrary object lengths Fix up xdr_read_pages() so that it can handle object lengths that are larger than the page length, by simply aligning to the next object in the buffer tail. The function will continue to return the length of the truncate object data that actually fit into the pages. Signed-off-by: Trond Myklebust --- net/sunrpc/xdr.c | 41 +++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 26 deletions(-) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 394297ec1cb9..3ce0a5daa9eb 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1219,44 +1219,33 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) } /** - * xdr_read_pages - Ensure page-based XDR data to decode is aligned at current pointer position + * xdr_read_pages - align page-based XDR data to current pointer position * @xdr: pointer to xdr_stream struct * @len: number of bytes of page data * * Moves data beyond the current pointer position from the XDR head[] buffer - * into the page list. Any data that lies beyond current position + "len" - * bytes is moved into the XDR tail[]. + * into the page list. Any data that lies beyond current position + @len + * bytes is moved into the XDR tail[]. The xdr_stream current position is + * then advanced past that data to align to the next XDR object in the tail. * * Returns the number of XDR encoded bytes now contained in the pages */ unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len) { - struct xdr_buf *buf = xdr->buf; - struct kvec *iov; - unsigned int nwords; - unsigned int end; - unsigned int padding; + unsigned int nwords = XDR_QUADLEN(len); + unsigned int base, end, pglen; - len = xdr_align_pages(xdr, len); - if (len == 0) + pglen = xdr_align_pages(xdr, nwords << 2); + if (pglen == 0) return 0; - nwords = XDR_QUADLEN(len); - padding = (nwords << 2) - len; - xdr->iov = iov = buf->tail; - /* Compute remaining message length. */ - end = ((xdr->nwords - nwords) << 2) + padding; - if (end > iov->iov_len) - end = iov->iov_len; - /* - * Position current pointer at beginning of tail, and - * set remaining message length. - */ - xdr->p = (__be32 *)((char *)iov->iov_base + padding); - xdr->end = (__be32 *)((char *)iov->iov_base + end); - xdr->page_ptr = NULL; - xdr->nwords = XDR_QUADLEN(end - padding); - return len; + xdr->nwords -= nwords; + base = (nwords << 2) - pglen; + end = xdr_stream_remaining(xdr) - pglen; + + if (xdr_set_iov(xdr, xdr->buf->tail, base, end) == 0) + xdr->nwords = 0; + return len <= pglen ? len : pglen; } EXPORT_SYMBOL_GPL(xdr_read_pages); From 9ed5af268e88f6e5b65376be98d652b37cb20d7b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 21 Nov 2020 20:46:18 -0500 Subject: [PATCH 56/90] SUNRPC: Clean up the handling of page padding in rpc_prepare_reply_pages() rpc_prepare_reply_pages() currently expects the 'hdrsize' argument to contain the length of the data that we expect to want placed in the head kvec plus a count of 1 word of padding that is placed after the page data. This is very confusing when trying to read the code, and sometimes leads to callers adding an arbitrary value of '1' just in order to satisfy the requirement (whether or not the page data actually needs such padding). This patch aims to clarify the code by changing the 'hdrsize' argument to remove that 1 word of padding. This means we need to subtract the padding from all the existing callers. Fixes: 02ef04e432ba ("NFS: Account for XDR pad of buf->pages") Signed-off-by: Trond Myklebust --- fs/nfs/nfs2xdr.c | 19 ++++++++++--------- fs/nfs/nfs3xdr.c | 29 ++++++++++++++++------------- fs/nfs/nfs4xdr.c | 36 +++++++++++++++++++----------------- net/sunrpc/clnt.c | 5 +---- net/sunrpc/xdr.c | 3 --- 5 files changed, 46 insertions(+), 46 deletions(-) diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index f6676af37d5d..7fba7711e6b3 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -34,6 +34,7 @@ * Declare the space requirements for NFS arguments and replies as * number of 32bit-words */ +#define NFS_pagepad_sz (1) /* Page padding */ #define NFS_fhandle_sz (8) #define NFS_sattr_sz (8) #define NFS_filename_sz (1+(NFS2_MAXNAMLEN>>2)) @@ -56,11 +57,11 @@ #define NFS_attrstat_sz (1+NFS_fattr_sz) #define NFS_diropres_sz (1+NFS_fhandle_sz+NFS_fattr_sz) -#define NFS_readlinkres_sz (2+1) -#define NFS_readres_sz (1+NFS_fattr_sz+1+1) +#define NFS_readlinkres_sz (2+NFS_pagepad_sz) +#define NFS_readres_sz (1+NFS_fattr_sz+1+NFS_pagepad_sz) #define NFS_writeres_sz (NFS_attrstat_sz) #define NFS_stat_sz (1) -#define NFS_readdirres_sz (1+1) +#define NFS_readdirres_sz (1+NFS_pagepad_sz) #define NFS_statfsres_sz (1+NFS_info_sz) static int nfs_stat_to_errno(enum nfs_stat); @@ -592,8 +593,8 @@ static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req, const struct nfs_readlinkargs *args = data; encode_fhandle(xdr, args->fh); - rpc_prepare_reply_pages(req, args->pages, args->pgbase, - args->pglen, NFS_readlinkres_sz); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, args->pglen, + NFS_readlinkres_sz - NFS_pagepad_sz); } /* @@ -628,8 +629,8 @@ static void nfs2_xdr_enc_readargs(struct rpc_rqst *req, const struct nfs_pgio_args *args = data; encode_readargs(xdr, args); - rpc_prepare_reply_pages(req, args->pages, args->pgbase, - args->count, NFS_readres_sz); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, args->count, + NFS_readres_sz - NFS_pagepad_sz); req->rq_rcv_buf.flags |= XDRBUF_READ; } @@ -786,8 +787,8 @@ static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req, const struct nfs_readdirargs *args = data; encode_readdirargs(xdr, args); - rpc_prepare_reply_pages(req, args->pages, 0, - args->count, NFS_readdirres_sz); + rpc_prepare_reply_pages(req, args->pages, 0, args->count, + NFS_readdirres_sz - NFS_pagepad_sz); } /* diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 69971f6c840d..ca10072644ff 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -33,6 +33,7 @@ * Declare the space requirements for NFS arguments and replies as * number of 32bit-words */ +#define NFS3_pagepad_sz (1) /* Page padding */ #define NFS3_fhandle_sz (1+16) #define NFS3_fh_sz (NFS3_fhandle_sz) /* shorthand */ #define NFS3_sattr_sz (15) @@ -69,13 +70,13 @@ #define NFS3_removeres_sz (NFS3_setattrres_sz) #define NFS3_lookupres_sz (1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz)) #define NFS3_accessres_sz (1+NFS3_post_op_attr_sz+1) -#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1+1) -#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3+1) +#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1+NFS3_pagepad_sz) +#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3+NFS3_pagepad_sz) #define NFS3_writeres_sz (1+NFS3_wcc_data_sz+4) #define NFS3_createres_sz (1+NFS3_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz) #define NFS3_renameres_sz (1+(2 * NFS3_wcc_data_sz)) #define NFS3_linkres_sz (1+NFS3_post_op_attr_sz+NFS3_wcc_data_sz) -#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2+1) +#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2+NFS3_pagepad_sz) #define NFS3_fsstatres_sz (1+NFS3_post_op_attr_sz+13) #define NFS3_fsinfores_sz (1+NFS3_post_op_attr_sz+12) #define NFS3_pathconfres_sz (1+NFS3_post_op_attr_sz+6) @@ -85,7 +86,8 @@ #define ACL3_setaclargs_sz (NFS3_fh_sz+1+ \ XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) #define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+ \ - XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)+1) + XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)+\ + NFS3_pagepad_sz) #define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz) static int nfs3_stat_to_errno(enum nfs_stat); @@ -909,8 +911,8 @@ static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req, const struct nfs3_readlinkargs *args = data; encode_nfs_fh3(xdr, args->fh); - rpc_prepare_reply_pages(req, args->pages, args->pgbase, - args->pglen, NFS3_readlinkres_sz); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, args->pglen, + NFS3_readlinkres_sz - NFS3_pagepad_sz); } /* @@ -939,7 +941,8 @@ static void nfs3_xdr_enc_read3args(struct rpc_rqst *req, const void *data) { const struct nfs_pgio_args *args = data; - unsigned int replen = args->replen ? args->replen : NFS3_readres_sz; + unsigned int replen = args->replen ? args->replen : + NFS3_readres_sz - NFS3_pagepad_sz; encode_read3args(xdr, args); rpc_prepare_reply_pages(req, args->pages, args->pgbase, @@ -1239,8 +1242,8 @@ static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req, const struct nfs3_readdirargs *args = data; encode_readdir3args(xdr, args); - rpc_prepare_reply_pages(req, args->pages, 0, - args->count, NFS3_readdirres_sz); + rpc_prepare_reply_pages(req, args->pages, 0, args->count, + NFS3_readdirres_sz - NFS3_pagepad_sz); } /* @@ -1281,8 +1284,8 @@ static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req, const struct nfs3_readdirargs *args = data; encode_readdirplus3args(xdr, args); - rpc_prepare_reply_pages(req, args->pages, 0, - args->count, NFS3_readdirres_sz); + rpc_prepare_reply_pages(req, args->pages, 0, args->count, + NFS3_readdirres_sz - NFS3_pagepad_sz); } /* @@ -1328,7 +1331,7 @@ static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req, if (args->mask & (NFS_ACL | NFS_DFACL)) { rpc_prepare_reply_pages(req, args->pages, 0, NFSACL_MAXPAGES << PAGE_SHIFT, - ACL3_getaclres_sz); + ACL3_getaclres_sz - NFS3_pagepad_sz); req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; } } @@ -1648,7 +1651,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, result->op_status = status; if (status != NFS3_OK) goto out_status; - result->replen = 4 + ((xdr_stream_pos(xdr) - pos) >> 2); + result->replen = 3 + ((xdr_stream_pos(xdr) - pos) >> 2); error = decode_read3resok(xdr, result); out: return error; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index c16b93df1bc1..3899ef3047f4 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -84,6 +84,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, /* lock,open owner id: * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2) */ +#define pagepad_maxsz (1) #define open_owner_id_maxsz (1 + 2 + 1 + 1 + 2) #define lock_owner_id_maxsz (1 + 1 + 4) #define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) @@ -215,14 +216,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, nfs4_fattr_bitmap_maxsz) #define encode_read_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + 3) -#define decode_read_maxsz (op_decode_hdr_maxsz + 2 + 1) +#define decode_read_maxsz (op_decode_hdr_maxsz + 2 + pagepad_maxsz) #define encode_readdir_maxsz (op_encode_hdr_maxsz + \ 2 + encode_verifier_maxsz + 5 + \ nfs4_label_maxsz) #define decode_readdir_maxsz (op_decode_hdr_maxsz + \ - decode_verifier_maxsz + 1) + decode_verifier_maxsz + pagepad_maxsz) #define encode_readlink_maxsz (op_encode_hdr_maxsz) -#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1 + 1) +#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1 + pagepad_maxsz) #define encode_write_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + 4) #define decode_write_maxsz (op_decode_hdr_maxsz + \ @@ -284,14 +285,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, #define decode_delegreturn_maxsz (op_decode_hdr_maxsz) #define encode_getacl_maxsz (encode_getattr_maxsz) #define decode_getacl_maxsz (op_decode_hdr_maxsz + \ - nfs4_fattr_bitmap_maxsz + 1 + 1) + nfs4_fattr_bitmap_maxsz + 1 + pagepad_maxsz) #define encode_setacl_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + 3) #define decode_setacl_maxsz (decode_setattr_maxsz) #define encode_fs_locations_maxsz \ (encode_getattr_maxsz) #define decode_fs_locations_maxsz \ - (1) + (pagepad_maxsz) #define encode_secinfo_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz) #define decode_secinfo_maxsz (op_decode_hdr_maxsz + 1 + ((NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)) / 4)) @@ -393,12 +394,13 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, /* devaddr4 payload is read into page */ \ 1 /* notification bitmap length */ + \ 1 /* notification bitmap, word 0 */ + \ - 1 /* possible XDR padding */) + pagepad_maxsz /* possible XDR padding */) #define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \ encode_stateid_maxsz) #define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ decode_stateid_maxsz + \ - XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + 1) + XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + \ + pagepad_maxsz) #define encode_layoutcommit_maxsz (op_encode_hdr_maxsz + \ 2 /* offset */ + \ 2 /* length */ + \ @@ -2342,7 +2344,7 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr, encode_layoutget(xdr, args->lg_args, &hdr); rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0, args->lg_args->layout.pglen, - hdr.replen); + hdr.replen - pagepad_maxsz); } encode_nops(&hdr); } @@ -2388,7 +2390,7 @@ static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, encode_layoutget(xdr, args->lg_args, &hdr); rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0, args->lg_args->layout.pglen, - hdr.replen); + hdr.replen - pagepad_maxsz); } encode_nops(&hdr); } @@ -2499,7 +2501,7 @@ static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr, encode_readlink(xdr, args, req, &hdr); rpc_prepare_reply_pages(req, args->pages, args->pgbase, - args->pglen, hdr.replen); + args->pglen, hdr.replen - pagepad_maxsz); encode_nops(&hdr); } @@ -2520,7 +2522,7 @@ static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr, encode_readdir(xdr, args, req, &hdr); rpc_prepare_reply_pages(req, args->pages, args->pgbase, - args->count, hdr.replen); + args->count, hdr.replen - pagepad_maxsz); encode_nops(&hdr); } @@ -2541,7 +2543,7 @@ static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr, encode_read(xdr, args, &hdr); rpc_prepare_reply_pages(req, args->pages, args->pgbase, - args->count, hdr.replen); + args->count, hdr.replen - pagepad_maxsz); req->rq_rcv_buf.flags |= XDRBUF_READ; encode_nops(&hdr); } @@ -2588,7 +2590,7 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, ARRAY_SIZE(nfs4_acl_bitmap), &hdr); rpc_prepare_reply_pages(req, args->acl_pages, 0, - args->acl_len, replen + 1); + args->acl_len, replen); encode_nops(&hdr); } @@ -2810,7 +2812,7 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, } rpc_prepare_reply_pages(req, (struct page **)&args->page, 0, - PAGE_SIZE, replen + 1); + PAGE_SIZE, replen); encode_nops(&hdr); } @@ -3014,14 +3016,14 @@ static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); - replen = hdr.replen + op_decode_hdr_maxsz; + replen = hdr.replen + op_decode_hdr_maxsz + 2; encode_getdeviceinfo(xdr, args, &hdr); /* set up reply kvec. device_addr4 opaque data is read into the * pages */ rpc_prepare_reply_pages(req, args->pdev->pages, args->pdev->pgbase, - args->pdev->pglen, replen + 2 + 1); + args->pdev->pglen, replen); encode_nops(&hdr); } @@ -3043,7 +3045,7 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req, encode_layoutget(xdr, args, &hdr); rpc_prepare_reply_pages(req, args->layout.pages, 0, - args->layout.pglen, hdr.replen); + args->layout.pglen, hdr.replen - pagepad_maxsz); encode_nops(&hdr); } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 3259120462ed..612f0a641f4c 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1251,10 +1251,7 @@ void rpc_prepare_reply_pages(struct rpc_rqst *req, struct page **pages, unsigned int base, unsigned int len, unsigned int hdrsize) { - /* Subtract one to force an extra word of buffer space for the - * payload's XDR pad to fall into the rcv_buf's tail iovec. - */ - hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign - 1; + hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign; xdr_inline_pages(&req->rq_rcv_buf, hdrsize << 2, pages, base, len); trace_rpc_xdr_reply_pages(req->rq_task, &req->rq_rcv_buf); diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 3ce0a5daa9eb..5a450055469f 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -193,9 +193,6 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset, tail->iov_base = buf + offset; tail->iov_len = buflen - offset; - if ((xdr->page_len & 3) == 0) - tail->iov_len -= sizeof(__be32); - xdr->buflen += len; } EXPORT_SYMBOL_GPL(xdr_inline_pages); From 0279024f22705128c7139bd55af6981afe90e876 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 21 Nov 2020 21:21:11 -0500 Subject: [PATCH 57/90] SUNRPC: Fix up xdr_set_page() While we always want to align to the next page and/or the beginning of the tail buffer when we call xdr_set_next_page(), the functions xdr_align_data() and xdr_expand_hole() really want to align to the next object in that next page or tail. Signed-off-by: Trond Myklebust --- net/sunrpc/xdr.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 5a450055469f..ddd5cc2281ab 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1019,8 +1019,10 @@ static unsigned int xdr_set_page_base(struct xdr_stream *xdr, static void xdr_set_page(struct xdr_stream *xdr, unsigned int base, unsigned int len) { - if (xdr_set_page_base(xdr, base, len) == 0) - xdr_set_iov(xdr, xdr->buf->tail, 0, xdr_stream_remaining(xdr)); + if (xdr_set_page_base(xdr, base, len) == 0) { + base -= xdr->buf->page_len; + xdr_set_iov(xdr, xdr->buf->tail, base, len); + } } static void xdr_set_next_page(struct xdr_stream *xdr) @@ -1029,17 +1031,18 @@ static void xdr_set_next_page(struct xdr_stream *xdr) newbase = (1 + xdr->page_ptr - xdr->buf->pages) << PAGE_SHIFT; newbase -= xdr->buf->page_base; - - xdr_set_page(xdr, newbase, PAGE_SIZE); + if (newbase < xdr->buf->page_len) + xdr_set_page_base(xdr, newbase, xdr_stream_remaining(xdr)); + else + xdr_set_iov(xdr, xdr->buf->tail, 0, xdr_stream_remaining(xdr)); } static bool xdr_set_next_buffer(struct xdr_stream *xdr) { if (xdr->page_ptr != NULL) xdr_set_next_page(xdr); - else if (xdr->iov == xdr->buf->head) { - xdr_set_page(xdr, 0, PAGE_SIZE); - } + else if (xdr->iov == xdr->buf->head) + xdr_set_page(xdr, 0, xdr_stream_remaining(xdr)); return xdr->p != xdr->end; } @@ -1277,7 +1280,7 @@ uint64_t xdr_align_data(struct xdr_stream *xdr, uint64_t offset, uint32_t length } xdr->nwords -= XDR_QUADLEN(length); - xdr_set_page(xdr, from + length, PAGE_SIZE); + xdr_set_page(xdr, from + length, xdr_stream_remaining(xdr)); return length; } EXPORT_SYMBOL_GPL(xdr_align_data); @@ -1314,7 +1317,7 @@ uint64_t xdr_expand_hole(struct xdr_stream *xdr, uint64_t offset, uint64_t lengt _zero_pages(buf->pages, buf->page_base + offset, length); buf->len += length - (from - offset) - truncated; - xdr_set_page(xdr, offset + length, PAGE_SIZE); + xdr_set_page(xdr, offset + length, xdr_stream_remaining(xdr)); return length; } EXPORT_SYMBOL_GPL(xdr_expand_hole); From eee1f54964fe868da425fe52a03666377335de01 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 21 Nov 2020 21:39:02 -0500 Subject: [PATCH 58/90] SUNRPC: Fix open coded xdr_stream_remaining() Signed-off-by: Trond Myklebust --- net/sunrpc/xdr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index ddd5cc2281ab..c852d199c789 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1261,7 +1261,7 @@ uint64_t xdr_align_data(struct xdr_stream *xdr, uint64_t offset, uint32_t length xdr_realign_pages(xdr); from = xdr_page_pos(xdr); - bytes = xdr->nwords << 2; + bytes = xdr_stream_remaining(xdr); if (length < bytes) bytes = length; @@ -1298,7 +1298,7 @@ uint64_t xdr_expand_hole(struct xdr_stream *xdr, uint64_t offset, uint64_t lengt xdr_realign_pages(xdr); from = xdr_page_pos(xdr); - bytes = xdr->nwords << 2; + bytes = xdr_stream_remaining(xdr); if (offset + length + bytes > buf->page_len) { unsigned int shift = (offset + length + bytes) - buf->page_len; From 17068466ad02d3ec07ab1b8f3f97928598affc9a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 21 Nov 2020 21:41:08 -0500 Subject: [PATCH 59/90] NFSv4: Fix open coded xdr_stream_remaining() Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 3899ef3047f4..de69276cfd22 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -5337,11 +5337,11 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, res->acl_len = attrlen; /* Check for receive buffer overflow */ - if (res->acl_len > (xdr->nwords << 2) || + if (res->acl_len > xdr_stream_remaining(xdr) || res->acl_len + res->acl_data_offset > xdr->buf->page_len) { res->acl_flags |= NFS4_ACL_TRUNC; - dprintk("NFS: acl reply: attrlen %u > page_len %u\n", - attrlen, xdr->nwords << 2); + dprintk("NFS: acl reply: attrlen %u > page_len %zu\n", + attrlen, xdr_stream_remaining(xdr)); } } else status = -EOPNOTSUPP; From b6d49ecd1081740b6e632366428b960461f8158b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 25 Nov 2020 12:06:14 -0500 Subject: [PATCH 60/90] NFSv4: Fix a pNFS layout related use-after-free race when freeing the inode When returning the layout in nfs4_evict_inode(), we need to ensure that the layout is actually done being freed before we can proceed to free the inode itself. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4super.c | 2 +- fs/nfs/pnfs.c | 33 +++++++++++++++++++++++++++++++-- fs/nfs/pnfs.h | 5 +++++ 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 93f5c1678ec2..984cc42ee54d 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -67,7 +67,7 @@ static void nfs4_evict_inode(struct inode *inode) nfs_inode_evict_delegation(inode); /* Note that above delegreturn would trigger pnfs return-on-close */ pnfs_return_layout(inode); - pnfs_destroy_layout(NFS_I(inode)); + pnfs_destroy_layout_final(NFS_I(inode)); /* First call standard NFS clear_inode() code */ nfs_clear_inode(inode); nfs4_xattr_cache_zap(inode); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 0e50b9d45c32..07f59dc8cb2e 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -294,6 +294,7 @@ void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) { struct inode *inode; + unsigned long i_state; if (!lo) return; @@ -304,8 +305,12 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) if (!list_empty(&lo->plh_segs)) WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n"); pnfs_detach_layout_hdr(lo); + i_state = inode->i_state; spin_unlock(&inode->i_lock); pnfs_free_layout_hdr(lo); + /* Notify pnfs_destroy_layout_final() that we're done */ + if (i_state & (I_FREEING | I_CLEAR)) + wake_up_var(lo); } } @@ -734,8 +739,7 @@ pnfs_free_lseg_list(struct list_head *free_me) } } -void -pnfs_destroy_layout(struct nfs_inode *nfsi) +static struct pnfs_layout_hdr *__pnfs_destroy_layout(struct nfs_inode *nfsi) { struct pnfs_layout_hdr *lo; LIST_HEAD(tmp_list); @@ -753,9 +757,34 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) pnfs_put_layout_hdr(lo); } else spin_unlock(&nfsi->vfs_inode.i_lock); + return lo; +} + +void pnfs_destroy_layout(struct nfs_inode *nfsi) +{ + __pnfs_destroy_layout(nfsi); } EXPORT_SYMBOL_GPL(pnfs_destroy_layout); +static bool pnfs_layout_removed(struct nfs_inode *nfsi, + struct pnfs_layout_hdr *lo) +{ + bool ret; + + spin_lock(&nfsi->vfs_inode.i_lock); + ret = nfsi->layout != lo; + spin_unlock(&nfsi->vfs_inode.i_lock); + return ret; +} + +void pnfs_destroy_layout_final(struct nfs_inode *nfsi) +{ + struct pnfs_layout_hdr *lo = __pnfs_destroy_layout(nfsi); + + if (lo) + wait_var_event(lo, pnfs_layout_removed(nfsi, lo)); +} + static bool pnfs_layout_add_bulk_destroy_list(struct inode *inode, struct list_head *layout_list) diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index f618c49697bb..bbd3de1025f2 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -268,6 +268,7 @@ struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_layoutget_free(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_destroy_layout(struct nfs_inode *); +void pnfs_destroy_layout_final(struct nfs_inode *); void pnfs_destroy_all_layouts(struct nfs_client *); int pnfs_destroy_layouts_byfsid(struct nfs_client *clp, struct nfs_fsid *fsid, @@ -712,6 +713,10 @@ static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) { } +static inline void pnfs_destroy_layout_final(struct nfs_inode *nfsi) +{ +} + static inline struct pnfs_layout_segment * pnfs_get_lseg(struct pnfs_layout_segment *lseg) { From d18a9d3fa0f27a47706fb67f1ee0f4d971587c4e Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Thu, 12 Nov 2020 02:09:51 -0800 Subject: [PATCH 61/90] NFS: NFSv2/NFSv3: Use cred from fs_context during mount There was refactoring done to use the fs_context for mounting done in: 62a55d088cd87: NFS: Additional refactoring for fs_context conversion This made it so that the net_ns is fetched from the fs_context (the netns that fsopen is called in). This change also makes it so that the credential fetched during fsopen is used as well as the net_ns. NFS has already had a number of changes to prepare it for user namespaces: 1a58e8a0e5c1: NFS: Store the credential of the mount process in the nfs_server 264d948ce7d0: NFS: Convert NFSv3 to use the container user namespace c207db2f5da5: NFS: Convert NFSv2 to use the container user namespace Previously, different credentials could be used for creation of the fs_context versus creation of the nfs_server, as FSCONFIG_CMD_CREATE did the actual credential check, and that's where current_creds() were fetched. This meant that the user namespace which fsopen was called in could be a non-init user namespace. This still requires that the user that calls FSCONFIG_CMD_CREATE has CAP_SYS_ADMIN in the init user ns. This roughly allows a privileged user to mount on behalf of an unprivileged usernamespace, by forking off and calling fsopen in the unprivileged user namespace. It can then pass back that fsfd to the privileged process which can configure the NFS mount, and then it can call FSCONFIG_CMD_CREATE before switching back into the mount namespace of the container, and finish up the mounting process and call fsmount and move_mount. Signed-off-by: Sargun Dhillon Tested-by: Alban Crequy Fixes: 62a55d088cd8 ("NFS: Additional refactoring for fs_context conversion") Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index f6454ba53d05..ff5c4d0d6d13 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -571,7 +571,7 @@ static int nfs_start_lockd(struct nfs_server *server) 1 : 0, .net = clp->cl_net, .nlmclnt_ops = clp->cl_nfs_mod->rpc_ops->nlmclnt_ops, - .cred = current_cred(), + .cred = server->cred, }; if (nlm_init.nfs_version > 3) @@ -985,7 +985,7 @@ struct nfs_server *nfs_create_server(struct fs_context *fc) if (!server) return ERR_PTR(-ENOMEM); - server->cred = get_cred(current_cred()); + server->cred = get_cred(fc->cred); error = -ENOMEM; fattr = nfs_alloc_fattr(); From d3ff46fe693683cb9660e9b93e8c932cc8e0c1f8 Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Thu, 12 Nov 2020 02:09:52 -0800 Subject: [PATCH 62/90] NFSv4: Refactor to use user namespaces for nfs4idmap In several patches work has been done to enable NFSv4 to use user namespaces: 58002399da65: NFSv4: Convert the NFS client idmapper to use the container user namespace 3b7eb5e35d0f: NFS: When mounting, don't share filesystems between different user namespaces Unfortunately, the userspace APIs were only such that the userspace facing side of the filesystem (superblock s_user_ns) could be set to a non init user namespace. This furthers the fs_context related refactoring, and piggybacks on top of that logic, so the superblock user namespace, and the NFS user namespace are the same. Users can still use rpc.idmapd if they choose to, but there are complexities with user namespaces and request-key that have yet to be addresssed. Eventually, we will need to at least: * Come up with an upcall mechanism that can be triggered inside of the container, or safely triggered outside, with the requisite context to do the right mapping. * Handle whatever refactoring needs to be done in net/sunrpc. Signed-off-by: Sargun Dhillon Tested-by: Alban Crequy Fixes: 62a55d088cd8 ("NFS: Additional refactoring for fs_context conversion") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index be7915c861ce..86acffe7335c 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -1153,7 +1153,7 @@ struct nfs_server *nfs4_create_server(struct fs_context *fc) if (!server) return ERR_PTR(-ENOMEM); - server->cred = get_cred(current_cred()); + server->cred = get_cred(fc->cred); auth_probe = ctx->auth_info.flavor_len < 1; From 35a6d396721e28ba161595b0fc9e8896c00399bb Mon Sep 17 00:00:00 2001 From: Fedor Tokarev Date: Thu, 15 Oct 2020 16:59:08 +0300 Subject: [PATCH 63/90] net: sunrpc: Fix 'snprintf' return value check in 'do_xprt_debugfs' 'snprintf' returns the number of characters which would have been written if enough space had been available, excluding the terminating null byte. Thus, the return value of 'sizeof(buf)' means that the last character has been dropped. Signed-off-by: Fedor Tokarev Fixes: 2f34b8bfae19 ("SUNRPC: add links for all client xprts to debugfs") Signed-off-by: Trond Myklebust --- net/sunrpc/debugfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index fd9bca242724..56029e3af6ff 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -128,13 +128,13 @@ static int do_xprt_debugfs(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *n return 0; len = snprintf(name, sizeof(name), "../../rpc_xprt/%s", xprt->debugfs->d_name.name); - if (len > sizeof(name)) + if (len >= sizeof(name)) return -1; if (*nump == 0) strcpy(link, "xprt"); else { len = snprintf(link, sizeof(link), "xprt%d", *nump); - if (len > sizeof(link)) + if (len >= sizeof(link)) return -1; } debugfs_create_symlink(link, clnt->cl_debugfs, name); From 9b82d88d5976e5f2b8015d58913654856576ace5 Mon Sep 17 00:00:00 2001 From: Calum Mackay Date: Wed, 28 Oct 2020 20:16:27 +0000 Subject: [PATCH 64/90] lockd: don't use interval-based rebinding over TCP NLM uses an interval-based rebinding, i.e. it clears the transport's binding under certain conditions if more than 60 seconds have elapsed since the connection was last bound. This rebinding is not necessary for an autobind RPC client over a connection-oriented protocol like TCP. It can also cause problems: it is possible for nlm_bind_host() to clear XPRT_BOUND whilst a connection worker is in the middle of trying to reconnect, after it had already been checked in xprt_connect(). When the connection worker notices that XPRT_BOUND has been cleared under it, in xs_tcp_finish_connecting(), that results in: xs_tcp_setup_socket: connect returned unhandled error -107 Worse, it's possible that the two can get into lockstep, resulting in the same behaviour repeated indefinitely, with the above error every 300 seconds, without ever recovering, and the connection never being established. This has been seen in practice, with a large number of NLM client tasks, following a server restart. The existing callers of nlm_bind_host & nlm_rebind_host should not need to force the rebind, for TCP, so restrict the interval-based rebinding to UDP only. For TCP, we will still rebind when needed, e.g. on timeout, and connection error (including closure), since connection-related errors on an existing connection, ECONNREFUSED when trying to connect, and rpc_check_timeout(), already unconditionally clear XPRT_BOUND. To avoid having to add the fix, and explanation, to both nlm_bind_host() and nlm_rebind_host(), remove the duplicate code from the former, and have it call the latter. Drop the dprintk, which adds no value over a trace. Signed-off-by: Calum Mackay Fixes: 35f5a422ce1a ("SUNRPC: new interface to force an RPC rebind") Signed-off-by: Trond Myklebust --- fs/lockd/host.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 0afb6d59bad0..771c289f6df7 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -439,12 +439,7 @@ nlm_bind_host(struct nlm_host *host) * RPC rebind is required */ if ((clnt = host->h_rpcclnt) != NULL) { - if (time_after_eq(jiffies, host->h_nextrebind)) { - rpc_force_rebind(clnt); - host->h_nextrebind = jiffies + NLM_HOST_REBIND; - dprintk("lockd: next rebind in %lu jiffies\n", - host->h_nextrebind - jiffies); - } + nlm_rebind_host(host); } else { unsigned long increment = nlmsvc_timeout; struct rpc_timeout timeparms = { @@ -494,13 +489,20 @@ nlm_bind_host(struct nlm_host *host) return clnt; } -/* - * Force a portmap lookup of the remote lockd port +/** + * nlm_rebind_host - If needed, force a portmap lookup of the peer's lockd port + * @host: NLM host handle for peer + * + * This is not needed when using a connection-oriented protocol, such as TCP. + * The existing autobind mechanism is sufficient to force a rebind when + * required, e.g. on connection state transitions. */ void nlm_rebind_host(struct nlm_host *host) { - dprintk("lockd: rebind host %s\n", host->h_name); + if (host->h_proto != IPPROTO_UDP) + return; + if (host->h_rpcclnt && time_after_eq(jiffies, host->h_nextrebind)) { rpc_force_rebind(host->h_rpcclnt); host->h_nextrebind = jiffies + NLM_HOST_REBIND; From bf701b765eaa82dd164d65edc5747ec7288bb5c3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 27 Nov 2020 11:24:33 +1100 Subject: [PATCH 65/90] NFS: switch nfsiod to be an UNBOUND workqueue. nfsiod is currently a concurrency-managed workqueue (CMWQ). This means that workitems scheduled to nfsiod on a given CPU are queued behind all other work items queued on any CMWQ on the same CPU. This can introduce unexpected latency. Occaionally nfsiod can even cause excessive latency. If the work item to complete a CLOSE request calls the final iput() on an inode, the address_space of that inode will be dismantled. This takes time proportional to the number of in-memory pages, which on a large host working on large files (e.g.. 5TB), can be a large number of pages resulting in a noticable number of seconds. We can avoid these latency problems by switching nfsiod to WQ_UNBOUND. This causes each concurrent work item to gets a dedicated thread which can be scheduled to an idle CPU. There is precedent for this as several other filesystems use WQ_UNBOUND workqueue for handling various async events. Signed-off-by: NeilBrown Fixes: ada609ee2ac2 ("workqueue: use WQ_MEM_RECLAIM instead of WQ_RESCUER") Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 9b765a900b28..522aa10a1a3e 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -2173,7 +2173,7 @@ static int nfsiod_start(void) { struct workqueue_struct *wq; dprintk("RPC: creating workqueue nfsiod\n"); - wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM, 0); + wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); if (wq == NULL) return -ENOMEM; nfsiod_workqueue = wq; From fa94a951bf3553fee66e2c879b97da97bcf00589 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 3 Dec 2020 12:04:51 -0500 Subject: [PATCH 66/90] NFSv4.2: Fix up the get/listxattr calls to rpc_prepare_reply_pages() Ensure that both getxattr and listxattr page array are correctly aligned, and that getxattr correctly accounts for the page padding word. Signed-off-by: Trond Myklebust --- fs/nfs/nfs42xdr.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 8432bd6b95f0..103978ff76c9 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -191,7 +191,7 @@ #define encode_getxattr_maxsz (op_encode_hdr_maxsz + 1 + \ nfs4_xattr_name_maxsz) -#define decode_getxattr_maxsz (op_decode_hdr_maxsz + 1 + 1) +#define decode_getxattr_maxsz (op_decode_hdr_maxsz + 1 + pagepad_maxsz) #define encode_setxattr_maxsz (op_encode_hdr_maxsz + \ 1 + nfs4_xattr_name_maxsz + 1) #define decode_setxattr_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz) @@ -1476,17 +1476,18 @@ static void nfs4_xdr_enc_getxattr(struct rpc_rqst *req, struct xdr_stream *xdr, struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; + uint32_t replen; size_t plen; encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); + replen = hdr.replen + op_decode_hdr_maxsz + 1; encode_getxattr(xdr, args->xattr_name, &hdr); plen = args->xattr_len ? args->xattr_len : XATTR_SIZE_MAX; - rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, - hdr.replen); + rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, replen); req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; encode_nops(&hdr); @@ -1520,14 +1521,15 @@ static void nfs4_xdr_enc_listxattrs(struct rpc_rqst *req, struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; + uint32_t replen; encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); + replen = hdr.replen + op_decode_hdr_maxsz + 2 + 1; encode_listxattrs(xdr, args, &hdr); - rpc_prepare_reply_pages(req, args->xattr_pages, 0, args->count, - hdr.replen); + rpc_prepare_reply_pages(req, args->xattr_pages, 0, args->count, replen); encode_nops(&hdr); } From ac9645c87380e39a8fa87a1b51721efcdea89dbf Mon Sep 17 00:00:00 2001 From: Dan Aloni Date: Sat, 5 Dec 2020 11:28:35 +0200 Subject: [PATCH 67/90] sunrpc: fix xs_read_xdr_buf for partial pages receive When receiving pages data, return value 'ret' when positive includes `buf->page_base`, so we should subtract that before it is used for changing `offset` and comparing against `want`. This was discovered on the very rare cases where the server returned a chunk of bytes that when added to the already received amount of bytes for the pages happened to match the current `recv.len`, for example on this case: buf->page_base : 258356 actually received from socket: 1740 ret : 260096 want : 260096 In this case neither of the two 'if ... goto out' trigger, and we continue to tail parsing. Worth to mention that the ensuing EMSGSIZE from the continued execution of `xs_read_xdr_buf` may be observed by an application due to 4 superfluous bytes being added to the pages data. Fixes: 277e4ab7d530 ("SUNRPC: Simplify TCP receive code by switching to using iterators") Signed-off-by: Dan Aloni Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c93ff70da3f9..c56a66cdf4ac 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -433,7 +433,8 @@ xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags, if (ret <= 0) goto sock_err; xs_flush_bvec(buf->bvec, ret, seek + buf->page_base); - offset += ret - buf->page_base; + ret -= buf->page_base; + offset += ret; if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) goto out; if (ret != want) From a1f26739ccdcc6967617998bd200dd907f7ff80a Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Wed, 2 Dec 2020 00:34:11 +0000 Subject: [PATCH 68/90] NFSv4.2: improve page handling for GETXATTR XDRBUF_SPARSE_PAGES can cause problems for the RDMA transport, and it's easy enough to allocate enough pages for the request up front, so do that. Also, since we've allocated the pages anyway, use the full page aligned length for the receive buffer. This will allow caching of valid replies that are too large for the caller, but that still fit in the allocated pages. Signed-off-by: Frank van der Linden Signed-off-by: Trond Myklebust --- fs/nfs/nfs42proc.c | 47 +++++++++++++++++++++++++++++++++++----------- fs/nfs/nfs42xdr.c | 13 ++++++++----- 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 4fc61e3d098d..b9836e2ce4a2 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -1173,14 +1173,12 @@ static int _nfs42_proc_setxattr(struct inode *inode, const char *name, } static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, - void *buf, size_t buflen) + void *buf, size_t buflen, struct page **pages, + size_t plen) { struct nfs_server *server = NFS_SERVER(inode); - struct page *pages[NFS4XATTR_MAXPAGES] = {}; struct nfs42_getxattrargs arg = { .fh = NFS_FH(inode), - .xattr_pages = pages, - .xattr_len = buflen, .xattr_name = name, }; struct nfs42_getxattrres res; @@ -1189,7 +1187,10 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, .rpc_argp = &arg, .rpc_resp = &res, }; - int ret, np; + ssize_t ret; + + arg.xattr_len = plen; + arg.xattr_pages = pages; ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 0); @@ -1214,10 +1215,6 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, _copy_from_pages(buf, pages, 0, res.xattr_len); } - np = DIV_ROUND_UP(res.xattr_len, PAGE_SIZE); - while (--np >= 0) - __free_page(pages[np]); - return res.xattr_len; } @@ -1292,16 +1289,44 @@ ssize_t nfs42_proc_getxattr(struct inode *inode, const char *name, void *buf, size_t buflen) { struct nfs4_exception exception = { }; - ssize_t err; + ssize_t err, np, i; + struct page **pages; + np = nfs_page_array_len(0, buflen ?: XATTR_SIZE_MAX); + pages = kmalloc_array(np, sizeof(*pages), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + for (i = 0; i < np; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (!pages[i]) { + np = i + 1; + goto out; + } + } + + /* + * The GETXATTR op has no length field in the call, and the + * xattr data is at the end of the reply. + * + * There is no downside in using the page-aligned length. It will + * allow receiving and caching xattrs that are too large for the + * caller but still fit in the page-rounded value. + */ do { - err = _nfs42_proc_getxattr(inode, name, buf, buflen); + err = _nfs42_proc_getxattr(inode, name, buf, buflen, + pages, np * PAGE_SIZE); if (err >= 0) break; err = nfs4_handle_exception(NFS_SERVER(inode), err, &exception); } while (exception.retry); +out: + while (--np >= 0) + __free_page(pages[np]); + kfree(pages); + return err; } diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 103978ff76c9..c0b8fcd266c9 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -489,6 +489,12 @@ static int decode_getxattr(struct xdr_stream *xdr, return -EIO; len = be32_to_cpup(p); + + /* + * Only check against the page length here. The actual + * requested length may be smaller, but that is only + * checked against after possibly caching a valid reply. + */ if (len > req->rq_rcv_buf.page_len) return -ERANGE; @@ -1477,7 +1483,6 @@ static void nfs4_xdr_enc_getxattr(struct rpc_rqst *req, struct xdr_stream *xdr, .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; uint32_t replen; - size_t plen; encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); @@ -1485,10 +1490,8 @@ static void nfs4_xdr_enc_getxattr(struct rpc_rqst *req, struct xdr_stream *xdr, replen = hdr.replen + op_decode_hdr_maxsz + 1; encode_getxattr(xdr, args->xattr_name, &hdr); - plen = args->xattr_len ? args->xattr_len : XATTR_SIZE_MAX; - - rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, replen); - req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; + rpc_prepare_reply_pages(req, args->xattr_pages, 0, args->xattr_len, + replen); encode_nops(&hdr); } From 15261b9126cd5bb2ad8521da49d8f5c042d904c7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 8 Dec 2020 18:29:02 -0500 Subject: [PATCH 69/90] xprtrdma: Fix XDRBUF_SPARSE_PAGES support Olga K. observed that rpcrdma_marsh_req() allocates sparse pages only when it has determined that a Reply chunk is necessary. There are plenty of cases where no Reply chunk is needed, but the XDRBUF_SPARSE_PAGES flag is set. The result would be a crash in rpcrdma_inline_fixup() when it tries to copy parts of the received Reply into a missing page. To avoid crashing, handle sparse page allocation up front. Until XATTR support was added, this issue did not appear often because the only SPARSE_PAGES consumer always expected a reply large enough to always require a Reply chunk. Reported-by: Olga Kornievskaia Signed-off-by: Chuck Lever Cc: Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/rpc_rdma.c | 40 ++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 0f5120c7668f..c48536f2121f 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -179,6 +179,31 @@ rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt, r_xprt->rx_ep->re_max_inline_recv; } +/* ACL likes to be lazy in allocating pages. For TCP, these + * pages can be allocated during receive processing. Not true + * for RDMA, which must always provision receive buffers + * up front. + */ +static noinline int +rpcrdma_alloc_sparse_pages(struct xdr_buf *buf) +{ + struct page **ppages; + int len; + + len = buf->page_len; + ppages = buf->pages + (buf->page_base >> PAGE_SHIFT); + while (len > 0) { + if (!*ppages) + *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN); + if (!*ppages) + return -ENOBUFS; + ppages++; + len -= PAGE_SIZE; + } + + return 0; +} + /* Split @vec on page boundaries into SGEs. FMR registers pages, not * a byte range. Other modes coalesce these SGEs into a single MR * when they can. @@ -233,15 +258,6 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); page_base = offset_in_page(xdrbuf->page_base); while (len) { - /* ACL likes to be lazy in allocating pages - ACLs - * are small by default but can get huge. - */ - if (unlikely(xdrbuf->flags & XDRBUF_SPARSE_PAGES)) { - if (!*ppages) - *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN); - if (!*ppages) - return -ENOBUFS; - } seg->mr_page = *ppages; seg->mr_offset = (char *)page_base; seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len); @@ -867,6 +883,12 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) __be32 *p; int ret; + if (unlikely(rqst->rq_rcv_buf.flags & XDRBUF_SPARSE_PAGES)) { + ret = rpcrdma_alloc_sparse_pages(&rqst->rq_rcv_buf); + if (ret) + return ret; + } + rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); xdr_init_encode(xdr, &req->rl_hdrbuf, rdmab_data(req->rl_rdmabuf), rqst); From 1f70ea700909d77d5658c33b6bf13e9123416ff1 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 7 Dec 2020 17:03:52 +0800 Subject: [PATCH 70/90] NFSv4.1: use BITS_PER_LONG macro in nfs4session.h Use the existing BITS_PER_LONG macro instead of calculating the value. Signed-off-by: Geliang Tang Signed-off-by: Trond Myklebust --- fs/nfs/nfs4session.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index b996ee23f1ba..3de425f59b3a 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -34,7 +34,7 @@ enum nfs4_slot_tbl_state { NFS4_SLOT_TBL_DRAINING, }; -#define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, 8*sizeof(long)) +#define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, BITS_PER_LONG) struct nfs4_slot_table { struct nfs4_session *session; /* Parent session */ struct nfs4_slot *slots; /* seqid per slot */ From c54e959b36cbdb0cb2f2805e3e945dd83476a5c7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 7 Dec 2020 13:30:46 -0500 Subject: [PATCH 71/90] SUNRPC: _shift_data_left/right_pages should check the shift length Exit early if the shift is zero. Signed-off-by: Trond Myklebust --- net/sunrpc/xdr.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index c852d199c789..5833329c132c 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -225,6 +225,9 @@ _shift_data_left_pages(struct page **pages, size_t pgto_base, BUG_ON(pgfrom_base <= pgto_base); + if (!len) + return; + pgto = pages + (pgto_base >> PAGE_SHIFT); pgfrom = pages + (pgfrom_base >> PAGE_SHIFT); @@ -307,6 +310,9 @@ _shift_data_right_pages(struct page **pages, size_t pgto_base, BUG_ON(pgto_base <= pgfrom_base); + if (!len) + return; + pgto_base += len; pgfrom_base += len; @@ -405,6 +411,9 @@ _copy_to_pages(struct page **pages, size_t pgbase, const char *p, size_t len) char *vto; size_t copy; + if (!len) + return; + pgto = pages + (pgbase >> PAGE_SHIFT); pgbase &= ~PAGE_MASK; @@ -449,6 +458,9 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len) char *vfrom; size_t copy; + if (!len) + return; + pgfrom = pages + (pgbase >> PAGE_SHIFT); pgbase &= ~PAGE_MASK; From 9a20f6f4e6ba9713605fbf7e7426ca22f1181545 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 4 Dec 2020 15:16:46 -0500 Subject: [PATCH 72/90] SUNRPC: Fixes for xdr_align_data() The main use case right now for xdr_align_data() is to shift the page data to the left, and in practice shrink the total XDR data buffer. This patch ensures that we fix up the accounting for the buffer length as we shift that data around. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xdr.h | 2 +- net/sunrpc/xdr.c | 174 ++++++++++++++++++++++++++++--------- 2 files changed, 133 insertions(+), 43 deletions(-) diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 9548d075e06d..2b4e44bb0654 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -252,7 +252,7 @@ extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes); extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len); extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); -extern uint64_t xdr_align_data(struct xdr_stream *, uint64_t, uint32_t); +extern unsigned int xdr_align_data(struct xdr_stream *, unsigned int offset, unsigned int length); extern uint64_t xdr_expand_hole(struct xdr_stream *, uint64_t, uint64_t); /** diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 5833329c132c..c474339ba9ac 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -266,26 +266,6 @@ _shift_data_left_pages(struct page **pages, size_t pgto_base, } while ((len -= copy) != 0); } -static void -_shift_data_left_tail(struct xdr_buf *buf, unsigned int pgto, size_t len) -{ - struct kvec *tail = buf->tail; - - if (len > tail->iov_len) - len = tail->iov_len; - - _copy_to_pages(buf->pages, - buf->page_base + pgto, - (char *)tail->iov_base, - len); - tail->iov_len -= len; - - if (tail->iov_len > 0) - memmove((char *)tail->iov_base, - tail->iov_base + len, - tail->iov_len); -} - /** * _shift_data_right_pages * @pages: vector of pages containing both the source and dest memory area. @@ -516,6 +496,109 @@ _zero_pages(struct page **pages, size_t pgbase, size_t len) } while ((len -= zero) != 0); } +static void xdr_buf_tail_copy_left(const struct xdr_buf *buf, unsigned int base, + unsigned int len, unsigned int shift) +{ + const struct kvec *tail = buf->tail; + + if (base >= tail->iov_len) + return; + if (len > tail->iov_len - base) + len = tail->iov_len - base; + /* Shift data into head */ + if (shift > buf->page_len + base) { + const struct kvec *head = buf->head; + unsigned int hdto = + head->iov_len + buf->page_len + base - shift; + unsigned int hdlen = len; + + if (WARN_ONCE(shift > head->iov_len + buf->page_len + base, + "SUNRPC: Misaligned data.\n")) + return; + if (hdto + hdlen > head->iov_len) + hdlen = head->iov_len - hdto; + memcpy(head->iov_base + hdto, tail->iov_base + base, hdlen); + base += hdlen; + len -= hdlen; + if (!len) + return; + } + /* Shift data into pages */ + if (shift > base) { + unsigned int pgto = buf->page_len + base - shift; + unsigned int pglen = len; + + if (pgto + pglen > buf->page_len) + pglen = buf->page_len - pgto; + _copy_to_pages(buf->pages, buf->page_base + pgto, + tail->iov_base + base, pglen); + base += pglen; + len -= pglen; + if (!len) + return; + } + memmove(tail->iov_base + base - shift, tail->iov_base + base, len); +} + +static void xdr_buf_pages_copy_left(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + unsigned int pgto; + + if (base >= buf->page_len) + return; + if (len > buf->page_len - base) + len = buf->page_len - base; + /* Shift data into head */ + if (shift > base) { + const struct kvec *head = buf->head; + unsigned int hdto = head->iov_len + base - shift; + unsigned int hdlen = len; + + if (WARN_ONCE(shift > head->iov_len + base, + "SUNRPC: Misaligned data.\n")) + return; + if (hdto + hdlen > head->iov_len) + hdlen = head->iov_len - hdto; + _copy_from_pages(head->iov_base + hdto, buf->pages, + buf->page_base + base, hdlen); + base += hdlen; + len -= hdlen; + if (!len) + return; + } + pgto = base - shift; + _shift_data_left_pages(buf->pages, buf->page_base + pgto, + buf->page_base + base, len); +} + +static void xdr_buf_tail_shift_left(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + if (!shift || !len) + return; + xdr_buf_tail_copy_left(buf, base, len, shift); +} + +static void xdr_buf_pages_shift_left(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + if (!shift || !len) + return; + if (base >= buf->page_len) { + xdr_buf_tail_shift_left(buf, base - buf->page_len, len, shift); + return; + } + xdr_buf_pages_copy_left(buf, base, len, shift); + len += base; + if (len <= buf->page_len) + return; + xdr_buf_tail_copy_left(buf, 0, len - buf->page_len, shift); +} + /** * xdr_shrink_bufhead * @buf: xdr_buf @@ -1261,38 +1344,45 @@ unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len) } EXPORT_SYMBOL_GPL(xdr_read_pages); -uint64_t xdr_align_data(struct xdr_stream *xdr, uint64_t offset, uint32_t length) +unsigned int xdr_align_data(struct xdr_stream *xdr, unsigned int offset, + unsigned int length) { struct xdr_buf *buf = xdr->buf; - unsigned int from, bytes; - unsigned int shift = 0; - - if ((offset + length) < offset || - (offset + length) > buf->page_len) - length = buf->page_len - offset; + unsigned int from, bytes, len; + unsigned int shift; xdr_realign_pages(xdr); from = xdr_page_pos(xdr); - bytes = xdr_stream_remaining(xdr); - if (length < bytes) - bytes = length; + + if (from >= buf->page_len + buf->tail->iov_len) + return 0; + if (from + buf->head->iov_len >= buf->len) + return 0; + + len = buf->len - buf->head->iov_len; + + /* We only shift data left! */ + if (WARN_ONCE(from < offset, "SUNRPC: misaligned data src=%u dst=%u\n", + from, offset)) + return 0; + if (WARN_ONCE(offset > buf->page_len, + "SUNRPC: buffer overflow. offset=%u, page_len=%u\n", + offset, buf->page_len)) + return 0; /* Move page data to the left */ - if (from > offset) { - shift = min_t(unsigned int, bytes, buf->page_len - from); - _shift_data_left_pages(buf->pages, - buf->page_base + offset, - buf->page_base + from, - shift); - bytes -= shift; + shift = from - offset; + xdr_buf_pages_shift_left(buf, from, len, shift); + xdr->buf->len -= shift; + xdr->nwords -= XDR_QUADLEN(shift); - /* Move tail data into the pages, if necessary */ - if (bytes > 0) - _shift_data_left_tail(buf, offset + shift, bytes); - } + bytes = xdr_stream_remaining(xdr); + if (length > bytes) + length = bytes; + bytes -= length; xdr->nwords -= XDR_QUADLEN(length); - xdr_set_page(xdr, from + length, xdr_stream_remaining(xdr)); + xdr_set_page(xdr, offset + length, bytes); return length; } EXPORT_SYMBOL_GPL(xdr_align_data); From c4f2f591f02c392ea7de018d2733748bf4c7b5f5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 4 Dec 2020 17:15:09 -0500 Subject: [PATCH 73/90] SUNRPC: Fix xdr_expand_hole() We do want to try to grow the buffer if possible, but if that attempt fails, we still want to move the data and truncate the XDR message. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xdr.h | 2 +- net/sunrpc/xdr.c | 270 ++++++++++++++++++++++++------------- 2 files changed, 178 insertions(+), 94 deletions(-) diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 2b4e44bb0654..178f499e2283 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -253,7 +253,7 @@ extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len); extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); extern unsigned int xdr_align_data(struct xdr_stream *, unsigned int offset, unsigned int length); -extern uint64_t xdr_expand_hole(struct xdr_stream *, uint64_t, uint64_t); +extern unsigned int xdr_expand_hole(struct xdr_stream *, unsigned int offset, unsigned int length); /** * xdr_stream_remaining - Return the number of bytes remaining in the stream diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index c474339ba9ac..e0906ed24374 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -334,46 +334,6 @@ _shift_data_right_pages(struct page **pages, size_t pgto_base, } while ((len -= copy) != 0); } -static unsigned int -_shift_data_right_tail(struct xdr_buf *buf, unsigned int pgfrom, size_t len) -{ - struct kvec *tail = buf->tail; - unsigned int tailbuf_len; - unsigned int result = 0; - size_t copy; - - tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len; - - /* Shift the tail first */ - if (tailbuf_len != 0) { - unsigned int free_space = tailbuf_len - tail->iov_len; - - if (len < free_space) - free_space = len; - if (len > free_space) - len = free_space; - - tail->iov_len += free_space; - copy = len; - - if (tail->iov_len > len) { - char *p = (char *)tail->iov_base + len; - memmove(p, tail->iov_base, tail->iov_len - free_space); - result += tail->iov_len - free_space; - } else - copy = tail->iov_len; - - /* Copy from the inlined pages into the tail */ - _copy_from_pages((char *)tail->iov_base, - buf->pages, - buf->page_base + pgfrom, - copy); - result += copy; - } - - return result; -} - /** * _copy_to_pages * @pages: array of pages @@ -464,18 +424,42 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len) } EXPORT_SYMBOL_GPL(_copy_from_pages); +static void xdr_buf_iov_zero(const struct kvec *iov, unsigned int base, + unsigned int len) +{ + if (base >= iov->iov_len) + return; + if (len > iov->iov_len - base) + len = iov->iov_len - base; + memset(iov->iov_base + base, 0, len); +} + /** - * _zero_pages - * @pages: array of pages - * @pgbase: beginning page vector address + * xdr_buf_pages_zero + * @buf: xdr_buf + * @pgbase: beginning offset * @len: length */ -static void -_zero_pages(struct page **pages, size_t pgbase, size_t len) +static void xdr_buf_pages_zero(const struct xdr_buf *buf, unsigned int pgbase, + unsigned int len) { + struct page **pages = buf->pages; struct page **page; char *vpage; - size_t zero; + unsigned int zero; + + if (!len) + return; + if (pgbase >= buf->page_len) { + xdr_buf_iov_zero(buf->tail, pgbase - buf->page_len, len); + return; + } + if (pgbase + len > buf->page_len) { + xdr_buf_iov_zero(buf->tail, 0, pgbase + len - buf->page_len); + len = buf->page_len - pgbase; + } + + pgbase += buf->page_base; page = pages + (pgbase >> PAGE_SHIFT); pgbase &= ~PAGE_MASK; @@ -496,6 +480,103 @@ _zero_pages(struct page **pages, size_t pgbase, size_t len) } while ((len -= zero) != 0); } +static void xdr_buf_try_expand(struct xdr_buf *buf, unsigned int len) +{ + struct kvec *head = buf->head; + struct kvec *tail = buf->tail; + unsigned int sum = head->iov_len + buf->page_len + tail->iov_len; + unsigned int free_space; + + if (sum > buf->len) { + free_space = min_t(unsigned int, sum - buf->len, len); + buf->len += free_space; + len -= free_space; + if (!len) + return; + } + + if (buf->buflen > sum) { + /* Expand the tail buffer */ + free_space = min_t(unsigned int, buf->buflen - sum, len); + tail->iov_len += free_space; + buf->len += free_space; + } +} + +static void xdr_buf_tail_copy_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *tail = buf->tail; + unsigned int to = base + shift; + + if (to >= tail->iov_len) + return; + if (len + to > tail->iov_len) + len = tail->iov_len - to; + memmove(tail->iov_base + to, tail->iov_base + base, len); +} + +static void xdr_buf_pages_copy_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *tail = buf->tail; + unsigned int to = base + shift; + unsigned int pglen = 0; + unsigned int talen = 0, tato = 0; + + if (base >= buf->page_len) + return; + if (len > buf->page_len - base) + len = buf->page_len - base; + if (to >= buf->page_len) { + tato = to - buf->page_len; + if (tail->iov_len >= len + tato) + talen = len; + else if (tail->iov_len > tato) + talen = tail->iov_len - tato; + } else if (len + to >= buf->page_len) { + pglen = buf->page_len - to; + talen = len - pglen; + if (talen > tail->iov_len) + talen = tail->iov_len; + } else + pglen = len; + + _copy_from_pages(tail->iov_base + tato, buf->pages, + buf->page_base + base + pglen, talen); + _shift_data_right_pages(buf->pages, buf->page_base + to, + buf->page_base + base, pglen); +} + +static void xdr_buf_tail_shift_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *tail = buf->tail; + + if (base >= tail->iov_len || !shift || !len) + return; + xdr_buf_tail_copy_right(buf, base, len, shift); +} + +static void xdr_buf_pages_shift_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + if (!shift || !len) + return; + if (base >= buf->page_len) { + xdr_buf_tail_shift_right(buf, base - buf->page_len, len, shift); + return; + } + if (base + len > buf->page_len) + xdr_buf_tail_shift_right(buf, 0, base + len - buf->page_len, + shift); + xdr_buf_pages_copy_right(buf, base, len, shift); +} + static void xdr_buf_tail_copy_left(const struct xdr_buf *buf, unsigned int base, unsigned int len, unsigned int shift) { @@ -685,30 +766,33 @@ xdr_shrink_bufhead(struct xdr_buf *buf, size_t len) } /** - * xdr_shrink_pagelen - shrinks buf->pages by up to @len bytes + * xdr_shrink_pagelen - shrinks buf->pages to @len bytes * @buf: xdr_buf - * @len: bytes to remove from buf->pages + * @len: new page buffer length * * The extra data is not lost, but is instead moved into buf->tail. * Returns the actual number of bytes moved. */ -static unsigned int -xdr_shrink_pagelen(struct xdr_buf *buf, size_t len) +static unsigned int xdr_shrink_pagelen(struct xdr_buf *buf, unsigned int len) { - unsigned int pglen = buf->page_len; - unsigned int result; + unsigned int shift, buflen = buf->len - buf->head->iov_len; - if (len > buf->page_len) - len = buf-> page_len; - - result = _shift_data_right_tail(buf, pglen - len, len); - buf->page_len -= len; - buf->buflen -= len; - /* Have we truncated the message? */ - if (buf->len > buf->buflen) - buf->len = buf->buflen; - - return result; + WARN_ON_ONCE(len > buf->page_len); + if (buf->head->iov_len >= buf->len || len > buflen) + buflen = len; + if (buf->page_len > buflen) { + buf->buflen -= buf->page_len - buflen; + buf->page_len = buflen; + } + if (len >= buf->page_len) + return 0; + shift = buf->page_len - len; + xdr_buf_try_expand(buf, shift); + xdr_buf_pages_shift_right(buf, len, buflen - len, shift); + buf->page_len = len; + buf->len -= shift; + buf->buflen -= shift; + return shift; } void @@ -728,6 +812,18 @@ unsigned int xdr_stream_pos(const struct xdr_stream *xdr) } EXPORT_SYMBOL_GPL(xdr_stream_pos); +static void xdr_stream_set_pos(struct xdr_stream *xdr, unsigned int pos) +{ + unsigned int blen = xdr->buf->len; + + xdr->nwords = blen > pos ? XDR_QUADLEN(blen) - XDR_QUADLEN(pos) : 0; +} + +static void xdr_stream_page_set_pos(struct xdr_stream *xdr, unsigned int pos) +{ + xdr_stream_set_pos(xdr, pos + xdr->buf->head[0].iov_len); +} + /** * xdr_page_pos - Return the current offset from the start of the xdr pages * @xdr: pointer to struct xdr_stream @@ -1291,7 +1387,7 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) struct xdr_buf *buf = xdr->buf; unsigned int nwords = XDR_QUADLEN(len); unsigned int cur = xdr_stream_pos(xdr); - unsigned int copied, offset; + unsigned int copied; if (xdr->nwords == 0) return 0; @@ -1305,9 +1401,8 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) len = buf->page_len; else if (nwords < xdr->nwords) { /* Truncate page data and move it into the tail */ - offset = buf->page_len - len; - copied = xdr_shrink_pagelen(buf, offset); - trace_rpc_xdr_alignment(xdr, offset, copied); + copied = xdr_shrink_pagelen(buf, len); + trace_rpc_xdr_alignment(xdr, len, copied); xdr->nwords = XDR_QUADLEN(buf->len - cur); } return len; @@ -1387,39 +1482,28 @@ unsigned int xdr_align_data(struct xdr_stream *xdr, unsigned int offset, } EXPORT_SYMBOL_GPL(xdr_align_data); -uint64_t xdr_expand_hole(struct xdr_stream *xdr, uint64_t offset, uint64_t length) +unsigned int xdr_expand_hole(struct xdr_stream *xdr, unsigned int offset, + unsigned int length) { struct xdr_buf *buf = xdr->buf; - unsigned int bytes; - unsigned int from; - unsigned int truncated = 0; - - if ((offset + length) < offset || - (offset + length) > buf->page_len) - length = buf->page_len - offset; + unsigned int from, to, shift; xdr_realign_pages(xdr); from = xdr_page_pos(xdr); - bytes = xdr_stream_remaining(xdr); + to = xdr_align_size(offset + length); - if (offset + length + bytes > buf->page_len) { - unsigned int shift = (offset + length + bytes) - buf->page_len; - unsigned int res = _shift_data_right_tail(buf, from + bytes - shift, shift); - truncated = shift - res; - xdr->nwords -= XDR_QUADLEN(truncated); - bytes -= shift; - } + /* Could the hole be behind us? */ + if (to > from) { + unsigned int buflen = buf->len - buf->head->iov_len; + shift = to - from; + xdr_buf_try_expand(buf, shift); + xdr_buf_pages_shift_right(buf, from, buflen, shift); + xdr_stream_page_set_pos(xdr, to); + } else if (to != from) + xdr_align_data(xdr, to, 0); + xdr_buf_pages_zero(buf, offset, length); - /* Now move the page data over and zero pages */ - if (bytes > 0) - _shift_data_right_pages(buf->pages, - buf->page_base + offset + length, - buf->page_base + from, - bytes); - _zero_pages(buf->pages, buf->page_base + offset, length); - - buf->len += length - (from - offset) - truncated; - xdr_set_page(xdr, offset + length, xdr_stream_remaining(xdr)); + xdr_set_page(xdr, to, xdr_stream_remaining(xdr)); return length; } EXPORT_SYMBOL_GPL(xdr_expand_hole); From 6707fbd7d3be72da4ebde7f56b46814befd2db19 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 6 Dec 2020 10:15:04 -0500 Subject: [PATCH 74/90] SUNRPC: Cleanup xdr_shrink_bufhead() Clean up xdr_shrink_bufhead() to use the new helpers instead of doing its own thing. Signed-off-by: Trond Myklebust --- net/sunrpc/xdr.c | 164 +++++++++++++++++++++++++---------------------- 1 file changed, 87 insertions(+), 77 deletions(-) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index e0906ed24374..19eaa38f7d16 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -550,6 +550,53 @@ static void xdr_buf_pages_copy_right(const struct xdr_buf *buf, buf->page_base + base, pglen); } +static void xdr_buf_head_copy_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *head = buf->head; + const struct kvec *tail = buf->tail; + unsigned int to = base + shift; + unsigned int pglen = 0, pgto = 0; + unsigned int talen = 0, tato = 0; + + if (base >= head->iov_len) + return; + if (len > head->iov_len - base) + len = head->iov_len - base; + if (to >= buf->page_len + head->iov_len) { + tato = to - buf->page_len - head->iov_len; + talen = len; + } else if (to >= head->iov_len) { + pgto = to - head->iov_len; + pglen = len; + if (pgto + pglen > buf->page_len) { + talen = pgto + pglen - buf->page_len; + pglen -= talen; + } + } else { + pglen = len - to; + if (pglen > buf->page_len) { + talen = pglen - buf->page_len; + pglen = buf->page_len; + } + } + + len -= talen; + base += len; + if (talen + tato > tail->iov_len) + talen = tail->iov_len > tato ? tail->iov_len - tato : 0; + memcpy(tail->iov_base + tato, head->iov_base + base, talen); + + len -= pglen; + base -= pglen; + _copy_to_pages(buf->pages, buf->page_base + pgto, head->iov_base + base, + pglen); + + base -= len; + memmove(head->iov_base + to, head->iov_base + base, len); +} + static void xdr_buf_tail_shift_right(const struct xdr_buf *buf, unsigned int base, unsigned int len, unsigned int shift) @@ -577,6 +624,25 @@ static void xdr_buf_pages_shift_right(const struct xdr_buf *buf, xdr_buf_pages_copy_right(buf, base, len, shift); } +static void xdr_buf_head_shift_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *head = buf->head; + + if (!shift) + return; + if (base >= head->iov_len) { + xdr_buf_pages_shift_right(buf, head->iov_len - base, len, + shift); + return; + } + if (base + len > head->iov_len) + xdr_buf_pages_shift_right(buf, 0, base + len - head->iov_len, + shift); + xdr_buf_head_copy_right(buf, base, len, shift); +} + static void xdr_buf_tail_copy_left(const struct xdr_buf *buf, unsigned int base, unsigned int len, unsigned int shift) { @@ -683,86 +749,31 @@ static void xdr_buf_pages_shift_left(const struct xdr_buf *buf, /** * xdr_shrink_bufhead * @buf: xdr_buf - * @len: bytes to remove from buf->head[0] + * @len: new length of buf->head[0] * - * Shrinks XDR buffer's header kvec buf->head[0] by + * Shrinks XDR buffer's header kvec buf->head[0], setting it to * 'len' bytes. The extra data is not lost, but is instead * moved into the inlined pages and/or the tail. */ -static unsigned int -xdr_shrink_bufhead(struct xdr_buf *buf, size_t len) +static unsigned int xdr_shrink_bufhead(struct xdr_buf *buf, unsigned int len) { - struct kvec *head, *tail; - size_t copy, offs; - unsigned int pglen = buf->page_len; - unsigned int result; - - result = 0; - tail = buf->tail; - head = buf->head; + struct kvec *head = buf->head; + unsigned int shift, buflen = max(buf->len, len); WARN_ON_ONCE(len > head->iov_len); - if (len > head->iov_len) - len = head->iov_len; - - /* Shift the tail first */ - if (tail->iov_len != 0) { - if (tail->iov_len > len) { - copy = tail->iov_len - len; - memmove((char *)tail->iov_base + len, - tail->iov_base, copy); - result += copy; - } - /* Copy from the inlined pages into the tail */ - copy = len; - if (copy > pglen) - copy = pglen; - offs = len - copy; - if (offs >= tail->iov_len) - copy = 0; - else if (copy > tail->iov_len - offs) - copy = tail->iov_len - offs; - if (copy != 0) { - _copy_from_pages((char *)tail->iov_base + offs, - buf->pages, - buf->page_base + pglen + offs - len, - copy); - result += copy; - } - /* Do we also need to copy data from the head into the tail ? */ - if (len > pglen) { - offs = copy = len - pglen; - if (copy > tail->iov_len) - copy = tail->iov_len; - memcpy(tail->iov_base, - (char *)head->iov_base + - head->iov_len - offs, - copy); - result += copy; - } + if (head->iov_len > buflen) { + buf->buflen -= head->iov_len - buflen; + head->iov_len = buflen; } - /* Now handle pages */ - if (pglen != 0) { - if (pglen > len) - _shift_data_right_pages(buf->pages, - buf->page_base + len, - buf->page_base, - pglen - len); - copy = len; - if (len > pglen) - copy = pglen; - _copy_to_pages(buf->pages, buf->page_base, - (char *)head->iov_base + head->iov_len - len, - copy); - result += copy; - } - head->iov_len -= len; - buf->buflen -= len; - /* Have we truncated the message? */ - if (buf->len > buf->buflen) - buf->len = buf->buflen; - - return result; + if (len >= head->iov_len) + return 0; + shift = head->iov_len - len; + xdr_buf_try_expand(buf, shift); + xdr_buf_head_shift_right(buf, len, buflen - len, shift); + head->iov_len = len; + buf->buflen -= shift; + buf->len -= shift; + return shift; } /** @@ -798,7 +809,7 @@ static unsigned int xdr_shrink_pagelen(struct xdr_buf *buf, unsigned int len) void xdr_shift_buf(struct xdr_buf *buf, size_t len) { - xdr_shrink_bufhead(buf, len); + xdr_shrink_bufhead(buf, buf->head->iov_len - len); } EXPORT_SYMBOL_GPL(xdr_shift_buf); @@ -1371,13 +1382,12 @@ static void xdr_realign_pages(struct xdr_stream *xdr) struct xdr_buf *buf = xdr->buf; struct kvec *iov = buf->head; unsigned int cur = xdr_stream_pos(xdr); - unsigned int copied, offset; + unsigned int copied; /* Realign pages to current pointer position */ if (iov->iov_len > cur) { - offset = iov->iov_len - cur; - copied = xdr_shrink_bufhead(buf, offset); - trace_rpc_xdr_alignment(xdr, offset, copied); + copied = xdr_shrink_bufhead(buf, cur); + trace_rpc_xdr_alignment(xdr, cur, copied); xdr->nwords = XDR_QUADLEN(buf->len - cur); } } From e43ac22b83921928479da0bad25aaee3d95c2b1a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 6 Dec 2020 12:41:41 -0500 Subject: [PATCH 75/90] SUNRPC: _copy_to/from_pages() now check for zero length Clean up callers of _copy_to/from_pages() that still check for a zero length. Signed-off-by: Trond Myklebust --- net/sunrpc/xdr.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 19eaa38f7d16..01918e60b67b 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1665,8 +1665,7 @@ static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigne len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->page_len); - if (this_len) - _copy_from_pages(obj, subbuf->pages, subbuf->page_base, this_len); + _copy_from_pages(obj, subbuf->pages, subbuf->page_base, this_len); len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->tail[0].iov_len); @@ -1696,8 +1695,7 @@ static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->page_len); - if (this_len) - _copy_to_pages(subbuf->pages, subbuf->page_base, obj, this_len); + _copy_to_pages(subbuf->pages, subbuf->page_base, obj, this_len); len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->tail[0].iov_len); From 5a5f1c2c2cbb6ddef637abb7c7e7cab20b9cc933 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 8 Dec 2020 15:02:14 -0500 Subject: [PATCH 76/90] SUNRPC: Clean up open coded setting of the xdr_stream 'nwords' field Move the setting of the xdr_stream 'nwords' field into the helpers that reset the xdr_stream cursor. Signed-off-by: Trond Myklebust --- net/sunrpc/xdr.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 01918e60b67b..f0444bf5617c 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1183,6 +1183,15 @@ static unsigned int xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov, return len - base; } +static unsigned int xdr_set_tail_base(struct xdr_stream *xdr, + unsigned int base, unsigned int len) +{ + struct xdr_buf *buf = xdr->buf; + + xdr_stream_set_pos(xdr, base + buf->page_len + buf->head->iov_len); + return xdr_set_iov(xdr, buf->tail, base, len); +} + static unsigned int xdr_set_page_base(struct xdr_stream *xdr, unsigned int base, unsigned int len) { @@ -1201,6 +1210,7 @@ static unsigned int xdr_set_page_base(struct xdr_stream *xdr, if (len > maxlen) len = maxlen; + xdr_stream_page_set_pos(xdr, base); base += xdr->buf->page_base; pgnr = base >> PAGE_SHIFT; @@ -1223,7 +1233,7 @@ static void xdr_set_page(struct xdr_stream *xdr, unsigned int base, { if (xdr_set_page_base(xdr, base, len) == 0) { base -= xdr->buf->page_len; - xdr_set_iov(xdr, xdr->buf->tail, base, len); + xdr_set_tail_base(xdr, base, len); } } @@ -1236,7 +1246,7 @@ static void xdr_set_next_page(struct xdr_stream *xdr) if (newbase < xdr->buf->page_len) xdr_set_page_base(xdr, newbase, xdr_stream_remaining(xdr)); else - xdr_set_iov(xdr, xdr->buf->tail, 0, xdr_stream_remaining(xdr)); + xdr_set_tail_base(xdr, 0, xdr_stream_remaining(xdr)); } static bool xdr_set_next_buffer(struct xdr_stream *xdr) @@ -1388,7 +1398,7 @@ static void xdr_realign_pages(struct xdr_stream *xdr) if (iov->iov_len > cur) { copied = xdr_shrink_bufhead(buf, cur); trace_rpc_xdr_alignment(xdr, cur, copied); - xdr->nwords = XDR_QUADLEN(buf->len - cur); + xdr_set_page(xdr, 0, buf->page_len); } } @@ -1396,7 +1406,6 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) { struct xdr_buf *buf = xdr->buf; unsigned int nwords = XDR_QUADLEN(len); - unsigned int cur = xdr_stream_pos(xdr); unsigned int copied; if (xdr->nwords == 0) @@ -1413,7 +1422,6 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) /* Truncate page data and move it into the tail */ copied = xdr_shrink_pagelen(buf, len); trace_rpc_xdr_alignment(xdr, len, copied); - xdr->nwords = XDR_QUADLEN(buf->len - cur); } return len; } @@ -1439,12 +1447,10 @@ unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len) if (pglen == 0) return 0; - xdr->nwords -= nwords; base = (nwords << 2) - pglen; end = xdr_stream_remaining(xdr) - pglen; - if (xdr_set_iov(xdr, xdr->buf->tail, base, end) == 0) - xdr->nwords = 0; + xdr_set_tail_base(xdr, base, end); return len <= pglen ? len : pglen; } EXPORT_SYMBOL_GPL(xdr_read_pages); @@ -1478,15 +1484,13 @@ unsigned int xdr_align_data(struct xdr_stream *xdr, unsigned int offset, /* Move page data to the left */ shift = from - offset; xdr_buf_pages_shift_left(buf, from, len, shift); - xdr->buf->len -= shift; - xdr->nwords -= XDR_QUADLEN(shift); bytes = xdr_stream_remaining(xdr); if (length > bytes) length = bytes; bytes -= length; - xdr->nwords -= XDR_QUADLEN(length); + xdr->buf->len -= shift; xdr_set_page(xdr, offset + length, bytes); return length; } @@ -1508,12 +1512,11 @@ unsigned int xdr_expand_hole(struct xdr_stream *xdr, unsigned int offset, shift = to - from; xdr_buf_try_expand(buf, shift); xdr_buf_pages_shift_right(buf, from, buflen, shift); - xdr_stream_page_set_pos(xdr, to); + xdr_set_page(xdr, to, xdr_stream_remaining(xdr)); } else if (to != from) xdr_align_data(xdr, to, 0); xdr_buf_pages_zero(buf, offset, length); - xdr_set_page(xdr, to, xdr_stream_remaining(xdr)); return length; } EXPORT_SYMBOL_GPL(xdr_expand_hole); From f8d0e60f1056687826abc1eded98f0ea067dfc4c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 8 Dec 2020 22:56:18 -0500 Subject: [PATCH 77/90] SUNRPC: Cleanup - constify a number of xdr_buf helpers There are a number of xdr helpers for struct xdr_buf that do not change the structure itself. Mark those as taking const pointers for documentation purposes. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xdr.h | 22 ++++++++-------- net/sunrpc/xdr.c | 53 +++++++++++++++++--------------------- 2 files changed, 35 insertions(+), 40 deletions(-) diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 178f499e2283..68d49fdc4ee9 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -128,8 +128,8 @@ __be32 *xdr_decode_netobj(__be32 *p, struct xdr_netobj *); void xdr_inline_pages(struct xdr_buf *, unsigned int, struct page **, unsigned int, unsigned int); -void xdr_terminate_string(struct xdr_buf *, const u32); -size_t xdr_buf_pagecount(struct xdr_buf *buf); +void xdr_terminate_string(const struct xdr_buf *, const u32); +size_t xdr_buf_pagecount(const struct xdr_buf *buf); int xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp); void xdr_free_bvec(struct xdr_buf *buf); @@ -182,14 +182,14 @@ xdr_adjust_iovec(struct kvec *iov, __be32 *p) * XDR buffer helper functions */ extern void xdr_shift_buf(struct xdr_buf *, size_t); -extern void xdr_buf_from_iov(struct kvec *, struct xdr_buf *); -extern int xdr_buf_subsegment(struct xdr_buf *, struct xdr_buf *, unsigned int, unsigned int); +extern void xdr_buf_from_iov(const struct kvec *, struct xdr_buf *); +extern int xdr_buf_subsegment(const struct xdr_buf *, struct xdr_buf *, unsigned int, unsigned int); extern void xdr_buf_trim(struct xdr_buf *, unsigned int); -extern int read_bytes_from_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int); -extern int write_bytes_to_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int); +extern int read_bytes_from_xdr_buf(const struct xdr_buf *, unsigned int, void *, unsigned int); +extern int write_bytes_to_xdr_buf(const struct xdr_buf *, unsigned int, void *, unsigned int); -extern int xdr_encode_word(struct xdr_buf *, unsigned int, u32); -extern int xdr_decode_word(struct xdr_buf *, unsigned int, u32 *); +extern int xdr_encode_word(const struct xdr_buf *, unsigned int, u32); +extern int xdr_decode_word(const struct xdr_buf *, unsigned int, u32 *); struct xdr_array2_desc; typedef int (*xdr_xcode_elem_t)(struct xdr_array2_desc *desc, void *elem); @@ -200,9 +200,9 @@ struct xdr_array2_desc { xdr_xcode_elem_t xcode; }; -extern int xdr_decode_array2(struct xdr_buf *buf, unsigned int base, +extern int xdr_decode_array2(const struct xdr_buf *buf, unsigned int base, struct xdr_array2_desc *desc); -extern int xdr_encode_array2(struct xdr_buf *buf, unsigned int base, +extern int xdr_encode_array2(const struct xdr_buf *buf, unsigned int base, struct xdr_array2_desc *desc); extern void _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len); @@ -251,7 +251,7 @@ extern void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buf extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes); extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len); extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); -extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); +extern int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); extern unsigned int xdr_align_data(struct xdr_stream *, unsigned int offset, unsigned int length); extern unsigned int xdr_expand_hole(struct xdr_stream *, unsigned int offset, unsigned int length); diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index f0444bf5617c..2e91fbd70f11 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -123,8 +123,7 @@ EXPORT_SYMBOL_GPL(xdr_decode_string_inplace); * @len: length of string, in bytes * */ -void -xdr_terminate_string(struct xdr_buf *buf, const u32 len) +void xdr_terminate_string(const struct xdr_buf *buf, const u32 len) { char *kaddr; @@ -134,8 +133,7 @@ xdr_terminate_string(struct xdr_buf *buf, const u32 len) } EXPORT_SYMBOL_GPL(xdr_terminate_string); -size_t -xdr_buf_pagecount(struct xdr_buf *buf) +size_t xdr_buf_pagecount(const struct xdr_buf *buf) { if (!buf->page_len) return 0; @@ -1545,8 +1543,7 @@ EXPORT_SYMBOL_GPL(xdr_enter_page); static const struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0}; -void -xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf) +void xdr_buf_from_iov(const struct kvec *iov, struct xdr_buf *buf) { buf->head[0] = *iov; buf->tail[0] = empty_iov; @@ -1569,9 +1566,8 @@ EXPORT_SYMBOL_GPL(xdr_buf_from_iov); * * Returns -1 if base of length are out of bounds. */ -int -xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, - unsigned int base, unsigned int len) +int xdr_buf_subsegment(const struct xdr_buf *buf, struct xdr_buf *subbuf, + unsigned int base, unsigned int len) { subbuf->buflen = subbuf->len = len; if (base < buf->head[0].iov_len) { @@ -1659,7 +1655,8 @@ fix_len: } EXPORT_SYMBOL_GPL(xdr_buf_trim); -static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len) +static void __read_bytes_from_xdr_buf(const struct xdr_buf *subbuf, + void *obj, unsigned int len) { unsigned int this_len; @@ -1676,7 +1673,8 @@ static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigne } /* obj is assumed to point to allocated memory of size at least len: */ -int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len) +int read_bytes_from_xdr_buf(const struct xdr_buf *buf, unsigned int base, + void *obj, unsigned int len) { struct xdr_buf subbuf; int status; @@ -1689,7 +1687,8 @@ int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, u } EXPORT_SYMBOL_GPL(read_bytes_from_xdr_buf); -static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len) +static void __write_bytes_to_xdr_buf(const struct xdr_buf *subbuf, + void *obj, unsigned int len) { unsigned int this_len; @@ -1706,7 +1705,8 @@ static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned } /* obj is assumed to point to allocated memory of size at least len: */ -int write_bytes_to_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len) +int write_bytes_to_xdr_buf(const struct xdr_buf *buf, unsigned int base, + void *obj, unsigned int len) { struct xdr_buf subbuf; int status; @@ -1719,8 +1719,7 @@ int write_bytes_to_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, un } EXPORT_SYMBOL_GPL(write_bytes_to_xdr_buf); -int -xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj) +int xdr_decode_word(const struct xdr_buf *buf, unsigned int base, u32 *obj) { __be32 raw; int status; @@ -1733,8 +1732,7 @@ xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj) } EXPORT_SYMBOL_GPL(xdr_decode_word); -int -xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) +int xdr_encode_word(const struct xdr_buf *buf, unsigned int base, u32 obj) { __be32 raw = cpu_to_be32(obj); @@ -1743,9 +1741,8 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) EXPORT_SYMBOL_GPL(xdr_encode_word); /* Returns 0 on success, or else a negative error code. */ -static int -xdr_xcode_array2(struct xdr_buf *buf, unsigned int base, - struct xdr_array2_desc *desc, int encode) +static int xdr_xcode_array2(const struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc, int encode) { char *elem = NULL, *c; unsigned int copied = 0, todo, avail_here; @@ -1937,9 +1934,8 @@ out: return err; } -int -xdr_decode_array2(struct xdr_buf *buf, unsigned int base, - struct xdr_array2_desc *desc) +int xdr_decode_array2(const struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc) { if (base >= buf->len) return -EINVAL; @@ -1948,9 +1944,8 @@ xdr_decode_array2(struct xdr_buf *buf, unsigned int base, } EXPORT_SYMBOL_GPL(xdr_decode_array2); -int -xdr_encode_array2(struct xdr_buf *buf, unsigned int base, - struct xdr_array2_desc *desc) +int xdr_encode_array2(const struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc) { if ((unsigned long) base + 4 + desc->array_len * desc->elem_size > buf->head->iov_len + buf->page_len + buf->tail->iov_len) @@ -1960,9 +1955,9 @@ xdr_encode_array2(struct xdr_buf *buf, unsigned int base, } EXPORT_SYMBOL_GPL(xdr_encode_array2); -int -xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, - int (*actor)(struct scatterlist *, void *), void *data) +int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset, + unsigned int len, + int (*actor)(struct scatterlist *, void *), void *data) { int i, ret = 0; unsigned int page_len, thislen, page_offset; From 5802f7c2a6b876b2810e3e9f26d719961f12e251 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 10 Dec 2020 08:55:35 -0500 Subject: [PATCH 78/90] SUNRPC: When expanding the buffer, we may need grow the sparse pages If we're shifting the page data to the right, and this happens to be a sparse page array, then we may need to allocate new pages in order to receive the data. Reported-by: "Mkrtchyan, Tigran" Signed-off-by: Trond Myklebust --- net/sunrpc/xdr.c | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 2e91fbd70f11..60d4442c5273 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -478,16 +478,47 @@ static void xdr_buf_pages_zero(const struct xdr_buf *buf, unsigned int pgbase, } while ((len -= zero) != 0); } +static unsigned int xdr_buf_pages_fill_sparse(const struct xdr_buf *buf, + unsigned int buflen, gfp_t gfp) +{ + unsigned int i, npages, pagelen; + + if (!(buf->flags & XDRBUF_SPARSE_PAGES)) + return buflen; + if (buflen <= buf->head->iov_len) + return buflen; + pagelen = buflen - buf->head->iov_len; + if (pagelen > buf->page_len) + pagelen = buf->page_len; + npages = (pagelen + buf->page_base + PAGE_SIZE - 1) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { + if (!buf->pages[i]) + continue; + buf->pages[i] = alloc_page(gfp); + if (likely(buf->pages[i])) + continue; + buflen -= pagelen; + pagelen = i << PAGE_SHIFT; + if (pagelen > buf->page_base) + buflen += pagelen - buf->page_base; + break; + } + return buflen; +} + static void xdr_buf_try_expand(struct xdr_buf *buf, unsigned int len) { struct kvec *head = buf->head; struct kvec *tail = buf->tail; unsigned int sum = head->iov_len + buf->page_len + tail->iov_len; - unsigned int free_space; + unsigned int free_space, newlen; if (sum > buf->len) { free_space = min_t(unsigned int, sum - buf->len, len); - buf->len += free_space; + newlen = xdr_buf_pages_fill_sparse(buf, buf->len + free_space, + GFP_KERNEL); + free_space = newlen - buf->len; + buf->len = newlen; len -= free_space; if (!len) return; From 1ee6310119a5b4460324111a8c4536054356b963 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 8 Dec 2020 13:11:11 -0500 Subject: [PATCH 79/90] NFSv4.2: Ensure we always reset the result->count in decode_read_plus() Signed-off-by: Trond Myklebust --- fs/nfs/nfs42xdr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index c0b8fcd266c9..1c21db640f4d 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -1087,6 +1087,7 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) if (unlikely(!p)) return -EIO; + res->count = 0; eof = be32_to_cpup(p++); segments = be32_to_cpup(p++); if (segments == 0) From 5c4afe2ab624cb8156e987ff929e00632fb56aeb Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 8 Dec 2020 07:40:04 -0500 Subject: [PATCH 80/90] NFSv4.2: decode_read_plus_data() must skip padding after data segment All XDR opaque object sizes are 32-bit aligned, and a data segment is no exception. Signed-off-by: Trond Myklebust --- fs/nfs/nfs42xdr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 1c21db640f4d..4c6bce3dbaeb 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -1038,7 +1038,9 @@ static int decode_read_plus_data(struct xdr_stream *xdr, struct nfs_pgio_res *re p = xdr_decode_hyper(p, &offset); count = be32_to_cpup(p); - recvd = xdr_align_data(xdr, res->count, count); + recvd = xdr_align_data(xdr, res->count, xdr_align_size(count)); + if (recvd > count) + recvd = count; res->count += recvd; if (count > recvd) { From 82f98c8b116bd769a47688ca5227f94826ae8a2a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 8 Dec 2020 08:14:55 -0500 Subject: [PATCH 81/90] NFSv4.2: decode_read_plus_hole() needs to check the extent offset The server is allowed to return a hole extent with an offset that starts before the offset supplied in the READ_PLUS argument. Ensure that we support that case too. Signed-off-by: Trond Myklebust --- fs/nfs/nfs42xdr.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 4c6bce3dbaeb..f9faa131a4f5 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -1053,8 +1053,9 @@ static int decode_read_plus_data(struct xdr_stream *xdr, struct nfs_pgio_res *re return 0; } -static int decode_read_plus_hole(struct xdr_stream *xdr, struct nfs_pgio_res *res, - uint32_t *eof) +static int decode_read_plus_hole(struct xdr_stream *xdr, + struct nfs_pgio_args *args, + struct nfs_pgio_res *res, uint32_t *eof) { uint64_t offset, length, recvd; __be32 *p; @@ -1065,6 +1066,20 @@ static int decode_read_plus_hole(struct xdr_stream *xdr, struct nfs_pgio_res *re p = xdr_decode_hyper(p, &offset); p = xdr_decode_hyper(p, &length); + if (offset != args->offset + res->count) { + /* Server returned an out-of-sequence extent */ + if (offset > args->offset + res->count || + offset + length < args->offset + res->count) { + dprintk("NFS: server returned out of sequence extent: " + "offset/size = %llu/%llu != expected %llu\n", + (unsigned long long)offset, + (unsigned long long)length, + (unsigned long long)(args->offset + + res->count)); + return 1; + } + length -= args->offset + res->count - offset; + } recvd = xdr_expand_hole(xdr, res->count, length); res->count += recvd; @@ -1077,6 +1092,9 @@ static int decode_read_plus_hole(struct xdr_stream *xdr, struct nfs_pgio_res *re static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) { + struct nfs_pgio_header *hdr = + container_of(res, struct nfs_pgio_header, res); + struct nfs_pgio_args *args = &hdr->args; uint32_t eof, segments, type; int status, i; __be32 *p; @@ -1104,7 +1122,7 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) if (type == NFS4_CONTENT_DATA) status = decode_read_plus_data(xdr, res, &eof); else if (type == NFS4_CONTENT_HOLE) - status = decode_read_plus_hole(xdr, res, &eof); + status = decode_read_plus_hole(xdr, args, res, &eof); else return -EINVAL; From dac3b1059b499c570f02cd94f3172d8c8df3a9dd Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 8 Dec 2020 08:41:01 -0500 Subject: [PATCH 82/90] NFSv4.2: Handle hole lengths that exceed the READ_PLUS read buffer If a hole extends beyond the READ_PLUS read buffer, then we want to fill just the remaining buffer with zeros. Also ignore eof... Signed-off-by: Trond Myklebust --- fs/nfs/nfs42xdr.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index f9faa131a4f5..6ba2a28e7e03 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -1080,6 +1080,12 @@ static int decode_read_plus_hole(struct xdr_stream *xdr, } length -= args->offset + res->count - offset; } + if (length + res->count > args->count) { + *eof = 0; + if (unlikely(res->count >= args->count)) + return 1; + length = args->count - res->count; + } recvd = xdr_expand_hole(xdr, res->count, length); res->count += recvd; From 503b934a752f7e789a5f33217520e0a79f3096ac Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 8 Dec 2020 07:51:29 -0500 Subject: [PATCH 83/90] NFSv4.2: Don't error when exiting early on a READ_PLUS buffer overflow Expanding the READ_PLUS extents can cause the read buffer to overflow. If it does, then don't error, but just exit early. Signed-off-by: Trond Myklebust --- fs/nfs/nfs42xdr.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 6ba2a28e7e03..9ef5261a1a70 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -1025,16 +1025,16 @@ static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *re return decode_op_hdr(xdr, OP_DEALLOCATE); } -static int decode_read_plus_data(struct xdr_stream *xdr, struct nfs_pgio_res *res, - uint32_t *eof) +static int decode_read_plus_data(struct xdr_stream *xdr, + struct nfs_pgio_res *res) { uint32_t count, recvd; uint64_t offset; __be32 *p; p = xdr_inline_decode(xdr, 8 + 4); - if (unlikely(!p)) - return -EIO; + if (!p) + return 1; p = xdr_decode_hyper(p, &offset); count = be32_to_cpup(p); @@ -1043,13 +1043,8 @@ static int decode_read_plus_data(struct xdr_stream *xdr, struct nfs_pgio_res *re recvd = count; res->count += recvd; - if (count > recvd) { - dprintk("NFS: server cheating in read reply: " - "count %u > recvd %u\n", count, recvd); - *eof = 0; + if (count > recvd) return 1; - } - return 0; } @@ -1061,8 +1056,8 @@ static int decode_read_plus_hole(struct xdr_stream *xdr, __be32 *p; p = xdr_inline_decode(xdr, 8 + 8); - if (unlikely(!p)) - return -EIO; + if (!p) + return 1; p = xdr_decode_hyper(p, &offset); p = xdr_decode_hyper(p, &length); @@ -1089,10 +1084,8 @@ static int decode_read_plus_hole(struct xdr_stream *xdr, recvd = xdr_expand_hole(xdr, res->count, length); res->count += recvd; - if (recvd < length) { - *eof = 0; + if (recvd < length) return 1; - } return 0; } @@ -1121,12 +1114,12 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) for (i = 0; i < segments; i++) { p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - return -EIO; + if (!p) + goto early_out; type = be32_to_cpup(p++); if (type == NFS4_CONTENT_DATA) - status = decode_read_plus_data(xdr, res, &eof); + status = decode_read_plus_data(xdr, res); else if (type == NFS4_CONTENT_HOLE) status = decode_read_plus_hole(xdr, args, res, &eof); else @@ -1135,12 +1128,17 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) if (status < 0) return status; if (status > 0) - break; + goto early_out; } out: res->eof = eof; return 0; +early_out: + if (unlikely(!i)) + return -EIO; + res->eof = 0; + return 0; } static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res) From 7aedc687c9f62e0d22b3231a100030e02344be1a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 8 Dec 2020 09:03:51 -0500 Subject: [PATCH 84/90] NFSv4.2: Deal with potential READ_PLUS data extent buffer overflow If the server returns more data than we have buffer space for, then we need to truncate and exit early. Signed-off-by: Trond Myklebust --- fs/nfs/nfs42xdr.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 9ef5261a1a70..8386ca45a43f 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -1026,6 +1026,7 @@ static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *re } static int decode_read_plus_data(struct xdr_stream *xdr, + struct nfs_pgio_args *args, struct nfs_pgio_res *res) { uint32_t count, recvd; @@ -1041,8 +1042,12 @@ static int decode_read_plus_data(struct xdr_stream *xdr, recvd = xdr_align_data(xdr, res->count, xdr_align_size(count)); if (recvd > count) recvd = count; + if (res->count + recvd > args->count) { + if (args->count > res->count) + res->count += args->count - res->count; + return 1; + } res->count += recvd; - if (count > recvd) return 1; return 0; @@ -1119,7 +1124,7 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) type = be32_to_cpup(p++); if (type == NFS4_CONTENT_DATA) - status = decode_read_plus_data(xdr, res); + status = decode_read_plus_data(xdr, args, res); else if (type == NFS4_CONTENT_HOLE) status = decode_read_plus_hole(xdr, args, res, &eof); else From 5c3485bb12c90945f86d6b1c901bbe76aa8b45c9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 10 Dec 2020 09:34:34 -0500 Subject: [PATCH 85/90] NFSv4.2/pnfs: Don't use READ_PLUS with pNFS yet We have no way of tracking server READ_PLUS support in pNFS for now, so just disable it. Reported-by: "Mkrtchyan, Tigran" Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 7ab40d0e6a74..61a07dcb963d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5320,17 +5320,17 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) } #ifdef CONFIG_NFS_V4_2 -static void nfs42_read_plus_support(struct nfs_server *server, struct rpc_message *msg) +static void nfs42_read_plus_support(struct nfs_pgio_header *hdr, + struct rpc_message *msg) { - if (server->caps & NFS_CAP_READ_PLUS) + /* Note: We don't use READ_PLUS with pNFS yet */ + if (nfs_server_capable(hdr->inode, NFS_CAP_READ_PLUS) && !hdr->ds_clp) msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS]; - else - msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; } #else -static void nfs42_read_plus_support(struct nfs_server *server, struct rpc_message *msg) +static void nfs42_read_plus_support(struct nfs_pgio_header *hdr, + struct rpc_message *msg) { - msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; } #endif /* CONFIG_NFS_V4_2 */ @@ -5340,7 +5340,8 @@ static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr, hdr->timestamp = jiffies; if (!hdr->pgio_done_cb) hdr->pgio_done_cb = nfs4_read_done_cb; - nfs42_read_plus_support(NFS_SERVER(hdr->inode), msg); + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; + nfs42_read_plus_support(hdr, msg); nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0); } From 7be9b38afafbfcc58ede3be66bfc4ea415b3d5f1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 16 Dec 2020 12:25:13 +0000 Subject: [PATCH 86/90] NFSv4.2: fix error return on memory allocation failure Currently when an alloc_page fails the error return is not set in variable err and a garbage initialized value is returned. Fix this by setting err to -ENOMEM before taking the error return path. Addresses-Coverity: ("Uninitialized scalar variable") Fixes: a1f26739ccdc ("NFSv4.2: improve page handling for GETXATTR") Signed-off-by: Colin Ian King Signed-off-by: Trond Myklebust --- fs/nfs/nfs42proc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index b9836e2ce4a2..f3fd935620fc 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -1301,6 +1301,7 @@ ssize_t nfs42_proc_getxattr(struct inode *inode, const char *name, pages[i] = alloc_page(GFP_KERNEL); if (!pages[i]) { np = i + 1; + err = -ENOMEM; goto out; } } From 3316fb80a0b4c1fef03a3eb1a7f0651e2133c429 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Fri, 11 Dec 2020 16:41:58 +0800 Subject: [PATCH 87/90] fs/lockd: convert comma to semicolon Replace a comma between expression statements by a semicolon. Signed-off-by: Zheng Yongjun Signed-off-by: Trond Myklebust --- fs/lockd/host.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 771c289f6df7..f802223e71ab 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -163,7 +163,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni, host->h_nsmhandle = nsm; host->h_addrbuf = nsm->sm_addrbuf; host->net = ni->net; - host->h_cred = get_cred(ni->cred), + host->h_cred = get_cred(ni->cred); strlcpy(host->nodename, utsname()->nodename, sizeof(host->nodename)); out: From cac1d3a2b8f7f0817ac4feab76f5d3b12e4b02d7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 16 Dec 2020 16:31:26 -0500 Subject: [PATCH 88/90] NFSv4/pnfs: Add tracing for the deviceid cache Add tracepoints to allow debugging of the deviceid cache. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 ++ fs/nfs/nfs4trace.h | 75 ++++++++++++++++++++++++++++++++++++++++++++++ fs/nfs/pnfs_dev.c | 23 +++++++++----- 3 files changed, 92 insertions(+), 8 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 61a07dcb963d..8e03a647c61a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -9662,6 +9662,8 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, if (res.notification != args.notify_types) pdev->nocache = 1; + trace_nfs4_getdeviceinfo(server, &pdev->dev_id, status); + dprintk("<-- %s status=%d\n", __func__, status); return status; diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 484c1da96dea..48d761e593fb 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -2189,6 +2189,81 @@ DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_write_done); DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_read_pagelist); DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_write_pagelist); +DECLARE_EVENT_CLASS(nfs4_deviceid_event, + TP_PROTO( + const struct nfs_client *clp, + const struct nfs4_deviceid *deviceid + ), + + TP_ARGS(clp, deviceid), + + TP_STRUCT__entry( + __string(dstaddr, clp->cl_hostname) + __array(unsigned char, deviceid, NFS4_DEVICEID4_SIZE) + ), + + TP_fast_assign( + __assign_str(dstaddr, clp->cl_hostname); + memcpy(__entry->deviceid, deviceid->data, + NFS4_DEVICEID4_SIZE); + ), + + TP_printk( + "deviceid=%s, dstaddr=%s", + __print_hex(__entry->deviceid, NFS4_DEVICEID4_SIZE), + __get_str(dstaddr) + ) +); +#define DEFINE_PNFS_DEVICEID_EVENT(name) \ + DEFINE_EVENT(nfs4_deviceid_event, name, \ + TP_PROTO(const struct nfs_client *clp, \ + const struct nfs4_deviceid *deviceid \ + ), \ + TP_ARGS(clp, deviceid)) +DEFINE_PNFS_DEVICEID_EVENT(nfs4_deviceid_free); + +DECLARE_EVENT_CLASS(nfs4_deviceid_status, + TP_PROTO( + const struct nfs_server *server, + const struct nfs4_deviceid *deviceid, + int status + ), + + TP_ARGS(server, deviceid, status), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, status) + __string(dstaddr, server->nfs_client->cl_hostname) + __array(unsigned char, deviceid, NFS4_DEVICEID4_SIZE) + ), + + TP_fast_assign( + __entry->dev = server->s_dev; + __entry->status = status; + __assign_str(dstaddr, server->nfs_client->cl_hostname); + memcpy(__entry->deviceid, deviceid->data, + NFS4_DEVICEID4_SIZE); + ), + + TP_printk( + "dev=%02x:%02x: deviceid=%s, dstaddr=%s, status=%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_hex(__entry->deviceid, NFS4_DEVICEID4_SIZE), + __get_str(dstaddr), + __entry->status + ) +); +#define DEFINE_PNFS_DEVICEID_STATUS(name) \ + DEFINE_EVENT(nfs4_deviceid_status, name, \ + TP_PROTO(const struct nfs_server *server, \ + const struct nfs4_deviceid *deviceid, \ + int status \ + ), \ + TP_ARGS(server, deviceid, status)) +DEFINE_PNFS_DEVICEID_STATUS(nfs4_getdeviceinfo); +DEFINE_PNFS_DEVICEID_STATUS(nfs4_find_deviceid); + DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event, TP_PROTO( const struct nfs_pgio_header *hdr diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index 537b80d693f1..ddbbf4fcda86 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -34,6 +34,8 @@ #include "internal.h" #include "pnfs.h" +#include "nfs4trace.h" + #define NFSDBG_FACILITY NFSDBG_PNFS /* @@ -192,24 +194,28 @@ nfs4_find_get_deviceid(struct nfs_server *server, d = __nfs4_find_get_deviceid(server, id, hash); if (d) - return d; + goto found; new = nfs4_get_device_info(server, id, cred, gfp_mask); - if (!new) + if (!new) { + trace_nfs4_find_deviceid(server, id, -ENOENT); return new; + } spin_lock(&nfs4_deviceid_lock); d = __nfs4_find_get_deviceid(server, id, hash); if (d) { spin_unlock(&nfs4_deviceid_lock); server->pnfs_curr_ld->free_deviceid_node(new); - return d; + } else { + atomic_inc(&new->ref); + hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]); + spin_unlock(&nfs4_deviceid_lock); + d = new; } - hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]); - atomic_inc(&new->ref); - spin_unlock(&nfs4_deviceid_lock); - - return new; +found: + trace_nfs4_find_deviceid(server, id, 0); + return d; } EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid); @@ -278,6 +284,7 @@ nfs4_put_deviceid_node(struct nfs4_deviceid_node *d) } if (!atomic_dec_and_test(&d->ref)) return false; + trace_nfs4_deviceid_free(d->nfs_client, &d->deviceid); d->ld->free_deviceid_node(d); return true; } From 9bfffea3524b49d0268d01f8e7967f06c4d0a942 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 16 Dec 2020 14:01:07 -0500 Subject: [PATCH 89/90] pNFS/flexfiles: Avoid spurious layout returns in ff_layout_choose_ds_for_read The callers of ff_layout_choose_ds_for_read() should decide whether or not they want to return the layout on error. Sometimes, we may just want to retry from the beginning. Signed-off-by: Trond Myklebust --- fs/nfs/flexfilelayout/flexfilelayout.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index b7c26836b2cb..439169301d56 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -740,16 +740,12 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); struct nfs4_ff_layout_mirror *mirror; struct nfs4_pnfs_ds *ds; - bool fail_return = false; u32 idx; /* mirrors are initially sorted by efficiency */ for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) { - if (idx+1 == fls->mirror_array_cnt) - fail_return = !check_device; - mirror = FF_LAYOUT_COMP(lseg, idx); - ds = nfs4_ff_layout_prepare_ds(lseg, mirror, fail_return); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false); if (!ds) continue; From 52104f274e2d7f134d34bab11cada8913d4544e2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 16 Dec 2020 17:17:45 -0500 Subject: [PATCH 90/90] NFS/pNFS: Fix a typo in ff_layout_resend_pnfs_read() Don't bump the index twice. Fixes: 563c53e73b8b ("NFS: Fix flexfiles read failover") Signed-off-by: Trond Myklebust --- fs/nfs/flexfilelayout/flexfilelayout.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 439169301d56..43ed743d10ea 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1052,7 +1052,7 @@ static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr) u32 idx = hdr->pgio_mirror_idx + 1; u32 new_idx = 0; - if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx + 1, &new_idx)) + if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx)) ff_layout_send_layouterror(hdr->lseg); else pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg);