From c7e0b781b73c2e26e442ed71397cc2bc5945a732 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 28 Jun 2021 16:34:20 -0400 Subject: [PATCH 01/31] NFSD: Clean up splice actor A few useful observations: - The value in @size is never modified. - splice_desc.len is an unsigned int, and so is xdr_buf.page_len. An implicit cast to size_t is unnecessary. - The computation of .page_len is the same in all three arms of the "if" statement, so hoist it out to make it clear that the operation is an unconditional invariant. The resulting function is 18 bytes shorter on my system (-Os). Signed-off-by: Chuck Lever Reviewed-by: NeilBrown --- fs/nfsd/vfs.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index a224a5e23cc1..46a6d9fce3d2 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -847,26 +847,21 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct svc_rqst *rqstp = sd->u.data; struct page **pp = rqstp->rq_next_page; struct page *page = buf->page; - size_t size; - - size = sd->len; if (rqstp->rq_res.page_len == 0) { get_page(page); put_page(*rqstp->rq_next_page); *(rqstp->rq_next_page++) = page; rqstp->rq_res.page_base = buf->offset; - rqstp->rq_res.page_len = size; } else if (page != pp[-1]) { get_page(page); if (*rqstp->rq_next_page) put_page(*rqstp->rq_next_page); *(rqstp->rq_next_page++) = page; - rqstp->rq_res.page_len += size; - } else - rqstp->rq_res.page_len += size; + } + rqstp->rq_res.page_len += sd->len; - return size; + return sd->len; } static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, From 2f0f88f42f2eab0421ed37d7494de9124fdf0d34 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 1 Jul 2021 10:03:10 -0400 Subject: [PATCH 02/31] SUNRPC: Add svc_rqst_replace_page() API Replacing a page in rq_pages[] requires a get_page(), which is a bus-locked operation, and a put_page(), which can be even more costly. To reduce the cost of replacing a page in rq_pages[], batch the put_page() operations by collecting "freed" pages in a pagevec, and then release those pages when the pagevec is full. This pagevec is also emptied when each RPC completes. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 4 ++++ net/sunrpc/svc.c | 21 +++++++++++++++++++++ net/sunrpc/svc_xprt.c | 3 +++ 3 files changed, 28 insertions(+) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index e91d51ea028b..ab9afbf0a0d8 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -19,6 +19,7 @@ #include #include #include +#include /* statistics for svc_pool structures */ struct svc_pool_stats { @@ -256,6 +257,7 @@ struct svc_rqst { struct page * *rq_next_page; /* next reply page to use */ struct page * *rq_page_end; /* one past the last page */ + struct pagevec rq_pvec; struct kvec rq_vec[RPCSVC_MAXPAGES]; /* generally useful.. */ struct bio_vec rq_bvec[RPCSVC_MAXPAGES]; @@ -502,6 +504,8 @@ struct svc_rqst *svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node); struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node); +void svc_rqst_replace_page(struct svc_rqst *rqstp, + struct page *page); void svc_rqst_free(struct svc_rqst *); void svc_exit_thread(struct svc_rqst *); unsigned int svc_pool_map_get(void); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 0de918cb3d90..d2d412d43827 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -838,6 +838,27 @@ svc_set_num_threads_sync(struct svc_serv *serv, struct svc_pool *pool, int nrser } EXPORT_SYMBOL_GPL(svc_set_num_threads_sync); +/** + * svc_rqst_replace_page - Replace one page in rq_pages[] + * @rqstp: svc_rqst with pages to replace + * @page: replacement page + * + * When replacing a page in rq_pages, batch the release of the + * replaced pages to avoid hammering the page allocator. + */ +void svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page) +{ + if (*rqstp->rq_next_page) { + if (!pagevec_space(&rqstp->rq_pvec)) + __pagevec_release(&rqstp->rq_pvec); + pagevec_add(&rqstp->rq_pvec, *rqstp->rq_next_page); + } + + get_page(page); + *(rqstp->rq_next_page++) = page; +} +EXPORT_SYMBOL_GPL(svc_rqst_replace_page); + /* * Called from a server thread as it's exiting. Caller must hold the "service * mutex" for the service. diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index d66a8e44a1ae..682058a5ec13 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -539,6 +539,7 @@ static void svc_xprt_release(struct svc_rqst *rqstp) kfree(rqstp->rq_deferred); rqstp->rq_deferred = NULL; + pagevec_release(&rqstp->rq_pvec); svc_free_res_pages(rqstp); rqstp->rq_res.page_len = 0; rqstp->rq_res.page_base = 0; @@ -664,6 +665,8 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) struct xdr_buf *arg = &rqstp->rq_arg; unsigned long pages, filled; + pagevec_init(&rqstp->rq_pvec); + pages = (serv->sv_max_mesg + 2 * PAGE_SIZE) >> PAGE_SHIFT; if (pages > RPCSVC_MAXPAGES) { pr_warn_once("svc: warning: pages=%lu > RPCSVC_MAXPAGES=%lu\n", From 496d83cf0f2fa70cfe256c2499e2d3523d3868f3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 28 Jun 2021 17:24:27 -0400 Subject: [PATCH 03/31] NFSD: Batch release pages during splice read Large splice reads call put_page() repeatedly. put_page() is relatively expensive to call, so replace it with the new svc_rqst_replace_page() helper to help amortize that cost. Signed-off-by: Chuck Lever Reviewed-by: NeilBrown --- fs/nfsd/vfs.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 46a6d9fce3d2..7732a384f949 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -849,15 +849,10 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct page *page = buf->page; if (rqstp->rq_res.page_len == 0) { - get_page(page); - put_page(*rqstp->rq_next_page); - *(rqstp->rq_next_page++) = page; + svc_rqst_replace_page(rqstp, page); rqstp->rq_res.page_base = buf->offset; } else if (page != pp[-1]) { - get_page(page); - if (*rqstp->rq_next_page) - put_page(*rqstp->rq_next_page); - *(rqstp->rq_next_page++) = page; + svc_rqst_replace_page(rqstp, page); } rqstp->rq_res.page_len += sd->len; From 883b4aee4dec64bc807a7dda4651c6a5efe9a74d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 16 Jul 2021 20:55:10 -0400 Subject: [PATCH 04/31] tracing: Add trace_event helper macros __string_len() and __assign_str_len() There's a few cases that a string that is to be recorded in a trace event, does not have a terminating 'nul' character, and instead, the tracepoint passes in the length of the string to record. Add two helper macros to the trace event code that lets this work easier, than tricks with "%.*s" logic. __string_len() which is similar to __string() for declaration, but takes a length argument. __assign_str_len() which is similar to __assign_str() for assiging the string, but it too takes a length argument. Note, the TRACE_EVENT() macro will allocate the location on the ring buffer to 'len + 1', that will be used to store the string into. It is a requirement that the 'len' used for this is a most the length of the string being recorded. This string can still use __get_str() just like strings created with __string() can use to retrieve the string. Link: https://lore.kernel.org/linux-nfs/20210513105018.7539996a@gandalf.local.home/ Tested-by: Chuck Lever Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Chuck Lever --- include/trace/trace_events.h | 22 ++++++++++++++++++ samples/trace_events/trace-events-sample.h | 27 ++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index acc17194c160..08810a463880 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -102,6 +102,9 @@ TRACE_MAKE_SYSTEM_STR(); #undef __string #define __string(item, src) __dynamic_array(char, item, -1) +#undef __string_len +#define __string_len(item, src, len) __dynamic_array(char, item, -1) + #undef __bitmask #define __bitmask(item, nr_bits) __dynamic_array(char, item, -1) @@ -197,6 +200,9 @@ TRACE_MAKE_SYSTEM_STR(); #undef __string #define __string(item, src) __dynamic_array(char, item, -1) +#undef __string_len +#define __string_len(item, src, len) __dynamic_array(char, item, -1) + #undef __bitmask #define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1) @@ -459,6 +465,9 @@ static struct trace_event_functions trace_event_type_funcs_##call = { \ #undef __string #define __string(item, src) __dynamic_array(char, item, -1) +#undef __string_len +#define __string_len(item, src, len) __dynamic_array(char, item, -1) + #undef __bitmask #define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1) @@ -507,6 +516,9 @@ static struct trace_event_fields trace_event_fields_##call[] = { \ #define __string(item, src) __dynamic_array(char, item, \ strlen((src) ? (const char *)(src) : "(null)") + 1) +#undef __string_len +#define __string_len(item, src, len) __dynamic_array(char, item, (len) + 1) + /* * __bitmask_size_in_bytes_raw is the number of bytes needed to hold * num_possible_cpus(). @@ -670,10 +682,20 @@ static inline notrace int trace_event_get_offsets_##call( \ #undef __string #define __string(item, src) __dynamic_array(char, item, -1) +#undef __string_len +#define __string_len(item, src, len) __dynamic_array(char, item, -1) + #undef __assign_str #define __assign_str(dst, src) \ strcpy(__get_str(dst), (src) ? (const char *)(src) : "(null)"); +#undef __assign_str_len +#define __assign_str_len(dst, src, len) \ + do { \ + memcpy(__get_str(dst), (src), (len)); \ + __get_str(dst)[len] = '\0'; \ + } while(0) + #undef __bitmask #define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1) diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index 13a35f7cbe66..e61471ab7d14 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -141,6 +141,33 @@ * In most cases, the __assign_str() macro will take the same * parameters as the __string() macro had to declare the string. * + * __string_len: This is a helper to a __dynamic_array, but it understands + * that the array has characters in it, and with the combined + * use of __assign_str_len(), it will allocate 'len' + 1 bytes + * in the ring buffer and add a '\0' to the string. This is + * useful if the string being saved has no terminating '\0' byte. + * It requires that the length of the string is known as it acts + * like a memcpy(). + * + * Declared with: + * + * __string_len(foo, bar, len) + * + * To assign this string, use the helper macro __assign_str_len(). + * + * __assign_str(foo, bar, len); + * + * Then len + 1 is allocated to the ring buffer, and a nul terminating + * byte is added. This is similar to: + * + * memcpy(__get_str(foo), bar, len); + * __get_str(foo)[len] = 0; + * + * The advantage of using this over __dynamic_array, is that it + * takes care of allocating the extra byte on the ring buffer + * for the '\0' terminating byte, and __get_str(foo) can be used + * in the TP_printk(). + * * __bitmask: This is another kind of __dynamic_array, but it expects * an array of longs, and the number of bits to parse. It takes * two parameters (name, nr_bits), where name is the name of the From 408c0de706186bb11aaed87cf86d96d7776d3b6f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 12 May 2021 09:39:06 -0400 Subject: [PATCH 05/31] NFSD: Use new __string_len C macros for the nfs_dirent tracepoint Clean up. Signed-off-by: Chuck Lever --- fs/nfsd/trace.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index adaec43548d1..52a43acd546c 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -400,18 +400,16 @@ TRACE_EVENT(nfsd_dirent, TP_STRUCT__entry( __field(u32, fh_hash) __field(u64, ino) - __field(int, len) - __dynamic_array(unsigned char, name, namlen) + __string_len(name, name, namlen) ), TP_fast_assign( __entry->fh_hash = fhp ? knfsd_fh_hash(&fhp->fh_handle) : 0; __entry->ino = ino; - __entry->len = namlen; - memcpy(__get_str(name), name, namlen); + __assign_str_len(name, name, namlen) ), - TP_printk("fh_hash=0x%08x ino=%llu name=%.*s", - __entry->fh_hash, __entry->ino, - __entry->len, __get_str(name)) + TP_printk("fh_hash=0x%08x ino=%llu name=%s", + __entry->fh_hash, __entry->ino, __get_str(name) + ) ) #include "state.h" From d27b74a8675ca34dfd54c4bc4b3a11b7aa87e1a3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 May 2021 15:34:57 -0400 Subject: [PATCH 06/31] NFSD: Use new __string_len C macros for nfsd_clid_class Clean up. Signed-off-by: Chuck Lever --- fs/nfsd/trace.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 52a43acd546c..538520957a81 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -606,7 +606,7 @@ DECLARE_EVENT_CLASS(nfsd_clid_class, __array(unsigned char, addr, sizeof(struct sockaddr_in6)) __field(unsigned long, flavor) __array(unsigned char, verifier, NFS4_VERIFIER_SIZE) - __dynamic_array(char, name, clp->cl_name.len + 1) + __string_len(name, name, clp->cl_name.len) ), TP_fast_assign( __entry->cl_boot = clp->cl_clientid.cl_boot; @@ -616,8 +616,7 @@ DECLARE_EVENT_CLASS(nfsd_clid_class, __entry->flavor = clp->cl_cred.cr_flavor; memcpy(__entry->verifier, (void *)&clp->cl_verifier, NFS4_VERIFIER_SIZE); - memcpy(__get_str(name), clp->cl_name.data, clp->cl_name.len); - __get_str(name)[clp->cl_name.len] = '\0'; + __assign_str_len(name, clp->cl_name.data, clp->cl_name.len); ), TP_printk("addr=%pISpc name='%s' verifier=0x%s flavor=%s client=%08x:%08x", __entry->addr, __get_str(name), From cd2d644ddba183ec7b451b7c20d5c7cc06fcf0d7 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Mon, 26 Jul 2021 09:33:28 -0400 Subject: [PATCH 07/31] lockd: Fix invalid lockowner cast after vfs_test_lock After calling vfs_test_lock() the pointer to a conflicting lock can be returned, and that lock is not guarunteed to be owned by nlm. In that case, we cannot cast it to struct nlm_lockowner. Instead return the pid of that conflicting lock. Fixes: 646d73e91b42 ("lockd: Show pid of lockd for remote locks") Signed-off-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/lockd/svclock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 61d3cc2283dc..498cb70c2c0d 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -634,7 +634,7 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, conflock->caller = "somehost"; /* FIXME */ conflock->len = strlen(conflock->caller); conflock->oh.len = 0; /* don't return OH info */ - conflock->svid = ((struct nlm_lockowner *)lock->fl.fl_owner)->pid; + conflock->svid = lock->fl.fl_pid; conflock->fl.fl_type = lock->fl.fl_type; conflock->fl.fl_start = lock->fl.fl_start; conflock->fl.fl_end = lock->fl.fl_end; From 6c8c84f525100a1cade5698320b4abe43062e159 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 7 Jul 2021 14:57:28 -0400 Subject: [PATCH 08/31] svcrdma: Fewer calls to wake_up() in Send completion handler Because wake_up() takes an IRQ-safe lock, it can be expensive, especially to call inside of a single-threaded completion handler. What's more, the Send wait queue almost never has waiters, so most of the time, this is an expensive no-op. As always, the goal is to reduce the average overhead of each completion, because a transport's completion handlers are single- threaded on one CPU core. This change reduces CPU utilization of the Send completion thread by 2-3% on my server. Signed-off-by: Chuck Lever Reviewed-By: Tom Talpey --- include/linux/sunrpc/svc_rdma.h | 1 + net/sunrpc/xprtrdma/svc_rdma_rw.c | 7 ++----- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 18 +++++++++++++++--- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 3184465de3a0..57c60ffe76dd 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -207,6 +207,7 @@ extern void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, struct svc_rdma_recv_ctxt *rctxt, int status); +extern void svc_rdma_wake_send_waiters(struct svcxprt_rdma *rdma, int avail); extern int svc_rdma_sendto(struct svc_rqst *); extern int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length); diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 1e651447dc4e..3d1b119f6e3e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -248,8 +248,7 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) trace_svcrdma_wc_write(wc, &cc->cc_cid); - atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); - wake_up(&rdma->sc_send_wait); + svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); if (unlikely(wc->status != IB_WC_SUCCESS)) svc_xprt_deferred_close(&rdma->sc_xprt); @@ -304,9 +303,7 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) trace_svcrdma_wc_read(wc, &cc->cc_cid); - atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); - wake_up(&rdma->sc_send_wait); - + svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); cc->cc_status = wc->status; complete(&cc->cc_done); return; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index d6bbafb773e1..fba2ee4eb607 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -258,6 +258,20 @@ void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, spin_unlock(&rdma->sc_send_lock); } +/** + * svc_rdma_wake_send_waiters - manage Send Queue accounting + * @rdma: controlling transport + * @avail: Number of additional SQEs that are now available + * + */ +void svc_rdma_wake_send_waiters(struct svcxprt_rdma *rdma, int avail) +{ + atomic_add(avail, &rdma->sc_sq_avail); + smp_mb__after_atomic(); + if (unlikely(waitqueue_active(&rdma->sc_send_wait))) + wake_up(&rdma->sc_send_wait); +} + /** * svc_rdma_wc_send - Invoked by RDMA provider for each polled Send WC * @cq: Completion Queue context @@ -275,11 +289,9 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) trace_svcrdma_wc_send(wc, &ctxt->sc_cid); + svc_rdma_wake_send_waiters(rdma, 1); complete(&ctxt->sc_done); - atomic_inc(&rdma->sc_sq_avail); - wake_up(&rdma->sc_send_wait); - if (unlikely(wc->status != IB_WC_SUCCESS)) svc_xprt_deferred_close(&rdma->sc_xprt); } From b6c2bfea096ba22583f1071c10ce0745804b9b95 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 9 Feb 2021 10:32:20 -0500 Subject: [PATCH 09/31] svcrdma: Relieve contention on sc_send_lock. /proc/lock_stat indicates the the sc_send_lock is heavily contended when the server is under load from a single client. To address this, convert the send_ctxt free list to an llist. Returning an item to the send_ctxt cache is now waitless, which reduces the instruction path length in the single-threaded Send handler (svc_rdma_wc_send). The goal is to enable the ib_comp_wq worker to handle a higher RPC/RDMA Send completion rate given the same CPU resources. This change reduces CPU utilization of Send completion by 2-3% on my server. Signed-off-by: Chuck Lever Reviewed-By: Tom Talpey --- include/linux/sunrpc/svc_rdma.h | 4 ++-- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 23 ++++++++--------------- net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 +- 3 files changed, 11 insertions(+), 18 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 57c60ffe76dd..5f8d5af6556c 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -90,7 +90,7 @@ struct svcxprt_rdma { struct ib_pd *sc_pd; spinlock_t sc_send_lock; - struct list_head sc_send_ctxts; + struct llist_head sc_send_ctxts; spinlock_t sc_rw_ctxt_lock; struct list_head sc_rw_ctxts; @@ -150,7 +150,7 @@ struct svc_rdma_recv_ctxt { }; struct svc_rdma_send_ctxt { - struct list_head sc_list; + struct llist_node sc_node; struct rpc_rdma_cid sc_cid; struct ib_send_wr sc_send_wr; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index fba2ee4eb607..599021b2391d 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -113,13 +113,6 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc); -static inline struct svc_rdma_send_ctxt * -svc_rdma_next_send_ctxt(struct list_head *list) -{ - return list_first_entry_or_null(list, struct svc_rdma_send_ctxt, - sc_list); -} - static void svc_rdma_send_cid_init(struct svcxprt_rdma *rdma, struct rpc_rdma_cid *cid) { @@ -182,9 +175,10 @@ fail0: void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma) { struct svc_rdma_send_ctxt *ctxt; + struct llist_node *node; - while ((ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts))) { - list_del(&ctxt->sc_list); + while ((node = llist_del_first(&rdma->sc_send_ctxts)) != NULL) { + ctxt = llist_entry(node, struct svc_rdma_send_ctxt, sc_node); ib_dma_unmap_single(rdma->sc_pd->device, ctxt->sc_sges[0].addr, rdma->sc_max_req_size, @@ -204,12 +198,13 @@ void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma) struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma) { struct svc_rdma_send_ctxt *ctxt; + struct llist_node *node; spin_lock(&rdma->sc_send_lock); - ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts); - if (!ctxt) + node = llist_del_first(&rdma->sc_send_ctxts); + if (!node) goto out_empty; - list_del(&ctxt->sc_list); + ctxt = llist_entry(node, struct svc_rdma_send_ctxt, sc_node); spin_unlock(&rdma->sc_send_lock); out: @@ -253,9 +248,7 @@ void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, ctxt->sc_sges[i].length); } - spin_lock(&rdma->sc_send_lock); - list_add(&ctxt->sc_list, &rdma->sc_send_ctxts); - spin_unlock(&rdma->sc_send_lock); + llist_add(&ctxt->sc_node, &rdma->sc_send_ctxts); } /** diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index d94b7759ada1..99474078c304 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -136,7 +136,7 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, svc_xprt_init(net, &svc_rdma_class, &cma_xprt->sc_xprt, serv); INIT_LIST_HEAD(&cma_xprt->sc_accept_q); INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); - INIT_LIST_HEAD(&cma_xprt->sc_send_ctxts); + init_llist_head(&cma_xprt->sc_send_ctxts); init_llist_head(&cma_xprt->sc_recv_ctxts); INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts); init_waitqueue_head(&cma_xprt->sc_send_wait); From 07a92d009f0b1557d3d58905ce18821a483be2e1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 8 Feb 2021 15:33:16 -0500 Subject: [PATCH 10/31] svcrdma: Convert rdma->sc_rw_ctxts to llist Relieve contention on sc_rw_ctxt_lock by converting rdma->sc_rw_ctxts to an llist. The goal is to reduce the average overhead of Send completions, because a transport's completion handlers are single-threaded on one CPU core. This change reduces CPU utilization of each Send completion by 2-3% on my server. Signed-off-by: Chuck Lever Reviewed-By: Tom Talpey --- include/linux/sunrpc/svc_rdma.h | 2 +- net/sunrpc/xprtrdma/svc_rdma_rw.c | 49 +++++++++++++++++------- net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 +- 3 files changed, 37 insertions(+), 16 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 5f8d5af6556c..24aa159d29a7 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -92,7 +92,7 @@ struct svcxprt_rdma { spinlock_t sc_send_lock; struct llist_head sc_send_ctxts; spinlock_t sc_rw_ctxt_lock; - struct list_head sc_rw_ctxts; + struct llist_head sc_rw_ctxts; u32 sc_pending_recvs; u32 sc_recv_batch; diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 3d1b119f6e3e..e27433f08ca7 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -35,6 +35,7 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc); * controlling svcxprt_rdma is destroyed. */ struct svc_rdma_rw_ctxt { + struct llist_node rw_node; struct list_head rw_list; struct rdma_rw_ctx rw_ctx; unsigned int rw_nents; @@ -53,19 +54,19 @@ static struct svc_rdma_rw_ctxt * svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) { struct svc_rdma_rw_ctxt *ctxt; + struct llist_node *node; spin_lock(&rdma->sc_rw_ctxt_lock); - - ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts); - if (ctxt) { - list_del(&ctxt->rw_list); - spin_unlock(&rdma->sc_rw_ctxt_lock); + node = llist_del_first(&rdma->sc_rw_ctxts); + spin_unlock(&rdma->sc_rw_ctxt_lock); + if (node) { + ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node); } else { - spin_unlock(&rdma->sc_rw_ctxt_lock); ctxt = kmalloc(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE), GFP_KERNEL); if (!ctxt) goto out_noctx; + INIT_LIST_HEAD(&ctxt->rw_list); } @@ -83,14 +84,18 @@ out_noctx: return NULL; } +static void __svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, + struct svc_rdma_rw_ctxt *ctxt, + struct llist_head *list) +{ + sg_free_table_chained(&ctxt->rw_sg_table, SG_CHUNK_SIZE); + llist_add(&ctxt->rw_node, list); +} + static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, struct svc_rdma_rw_ctxt *ctxt) { - sg_free_table_chained(&ctxt->rw_sg_table, SG_CHUNK_SIZE); - - spin_lock(&rdma->sc_rw_ctxt_lock); - list_add(&ctxt->rw_list, &rdma->sc_rw_ctxts); - spin_unlock(&rdma->sc_rw_ctxt_lock); + __svc_rdma_put_rw_ctxt(rdma, ctxt, &rdma->sc_rw_ctxts); } /** @@ -101,9 +106,10 @@ static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma) { struct svc_rdma_rw_ctxt *ctxt; + struct llist_node *node; - while ((ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts)) != NULL) { - list_del(&ctxt->rw_list); + while ((node = llist_del_first(&rdma->sc_rw_ctxts)) != NULL) { + ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node); kfree(ctxt); } } @@ -171,20 +177,35 @@ static void svc_rdma_cc_init(struct svcxprt_rdma *rdma, cc->cc_sqecount = 0; } +/* + * The consumed rw_ctx's are cleaned and placed on a local llist so + * that only one atomic llist operation is needed to put them all + * back on the free list. + */ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, enum dma_data_direction dir) { struct svcxprt_rdma *rdma = cc->cc_rdma; + struct llist_node *first, *last; struct svc_rdma_rw_ctxt *ctxt; + LLIST_HEAD(free); + first = last = NULL; while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) { list_del(&ctxt->rw_list); rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num, ctxt->rw_sg_table.sgl, ctxt->rw_nents, dir); - svc_rdma_put_rw_ctxt(rdma, ctxt); + __svc_rdma_put_rw_ctxt(rdma, ctxt, &free); + + ctxt->rw_node.next = first; + first = &ctxt->rw_node; + if (!last) + last = first; } + if (first) + llist_add_batch(first, last, &rdma->sc_rw_ctxts); } /* State for sending a Write or Reply chunk. diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 99474078c304..d1faa522c3dd 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -138,7 +138,7 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); init_llist_head(&cma_xprt->sc_send_ctxts); init_llist_head(&cma_xprt->sc_recv_ctxts); - INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts); + init_llist_head(&cma_xprt->sc_rw_ctxts); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); From ea49dc79002c416a9003f3204bc14f846a0dbcae Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 28 Jul 2021 08:56:09 +1000 Subject: [PATCH 11/31] NFSD: remove vanity comments Including one's name in copyright claims is appropriate. Including it in random comments is just vanity. After 2 decades, it is time for these to be gone. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/vfs.c | 1 - include/uapi/linux/nfsd/nfsfh.h | 1 - 2 files changed, 2 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 7732a384f949..7851cf30a75d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -244,7 +244,6 @@ out_nfserr: * returned. Otherwise the covered directory is returned. * NOTE: this mountpoint crossing is not supported properly by all * clients and is explicitly disallowed for NFSv3 - * NeilBrown */ __be32 nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, diff --git a/include/uapi/linux/nfsd/nfsfh.h b/include/uapi/linux/nfsd/nfsfh.h index 427294dd56a1..e29e8accc4f4 100644 --- a/include/uapi/linux/nfsd/nfsfh.h +++ b/include/uapi/linux/nfsd/nfsfh.h @@ -33,7 +33,6 @@ struct nfs_fhbase_old { /* * This is the new flexible, extensible style NFSv2/v3/v4 file handle. - * by Neil Brown - March 2000 * * The file handle starts with a sequence of four-byte words. * The first word contains a version number (1) and three descriptor bytes From 5c11720767f70d34357d00a15ba5a0ad052c40fe Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 5 Aug 2021 15:11:24 -0400 Subject: [PATCH 12/31] SUNRPC: Fix a NULL pointer deref in trace_svc_stats_latency() Some paths through svc_process() leave rqst->rq_procinfo set to NULL, which triggers a crash if tracing happens to be enabled. Fixes: 89ff87494c6e ("SUNRPC: Display RPC procedure names instead of proc numbers") Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 1 + include/trace/events/sunrpc.h | 8 ++++---- net/sunrpc/svc.c | 15 +++++++++++++++ 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index ab9afbf0a0d8..f0f846fa396e 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -527,6 +527,7 @@ void svc_wake_up(struct svc_serv *); void svc_reserve(struct svc_rqst *rqstp, int space); struct svc_pool * svc_pool_for_cpu(struct svc_serv *serv, int cpu); char * svc_print_addr(struct svc_rqst *, char *, size_t); +const char * svc_proc_name(const struct svc_rqst *rqstp); int svc_encode_result_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length); diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 861f199896c6..d323f5a049c8 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1642,7 +1642,7 @@ TRACE_EVENT(svc_process, __field(u32, vers) __field(u32, proc) __string(service, name) - __string(procedure, rqst->rq_procinfo->pc_name) + __string(procedure, svc_proc_name(rqst)) __string(addr, rqst->rq_xprt ? rqst->rq_xprt->xpt_remotebuf : "(null)") ), @@ -1652,7 +1652,7 @@ TRACE_EVENT(svc_process, __entry->vers = rqst->rq_vers; __entry->proc = rqst->rq_proc; __assign_str(service, name); - __assign_str(procedure, rqst->rq_procinfo->pc_name); + __assign_str(procedure, svc_proc_name(rqst)); __assign_str(addr, rqst->rq_xprt ? rqst->rq_xprt->xpt_remotebuf : "(null)"); ), @@ -1918,7 +1918,7 @@ TRACE_EVENT(svc_stats_latency, TP_STRUCT__entry( __field(u32, xid) __field(unsigned long, execute) - __string(procedure, rqst->rq_procinfo->pc_name) + __string(procedure, svc_proc_name(rqst)) __string(addr, rqst->rq_xprt->xpt_remotebuf) ), @@ -1926,7 +1926,7 @@ TRACE_EVENT(svc_stats_latency, __entry->xid = be32_to_cpu(rqst->rq_xid); __entry->execute = ktime_to_us(ktime_sub(ktime_get(), rqst->rq_stime)); - __assign_str(procedure, rqst->rq_procinfo->pc_name); + __assign_str(procedure, svc_proc_name(rqst)); __assign_str(addr, rqst->rq_xprt->xpt_remotebuf); ), diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index d2d412d43827..5aa263326b6a 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1650,6 +1650,21 @@ u32 svc_max_payload(const struct svc_rqst *rqstp) } EXPORT_SYMBOL_GPL(svc_max_payload); +/** + * svc_proc_name - Return RPC procedure name in string form + * @rqstp: svc_rqst to operate on + * + * Return value: + * Pointer to a NUL-terminated string + */ +const char *svc_proc_name(const struct svc_rqst *rqstp) +{ + if (rqstp && rqstp->rq_procinfo) + return rqstp->rq_procinfo->pc_name; + return "unknown"; +} + + /** * svc_encode_result_payload - mark a range of bytes as a result payload * @rqstp: svc_rqst to operate on From a2071573d6346819cc4e5787b4206f2184985160 Mon Sep 17 00:00:00 2001 From: Jia He Date: Tue, 3 Aug 2021 12:59:36 +0200 Subject: [PATCH 13/31] sysctl: introduce new proc handler proc_dobool This is to let bool variable could be correctly displayed in big/little endian sysctl procfs. sizeof(bool) is arch dependent, proc_dobool should work in all arches. Suggested-by: Pan Xinhui Signed-off-by: Jia He [thuth: rebased the patch to the current kernel version] Signed-off-by: Thomas Huth Signed-off-by: Chuck Lever --- include/linux/sysctl.h | 2 ++ kernel/sysctl.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index d99ca99837de..1fa2b69c6fc3 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -48,6 +48,8 @@ typedef int proc_handler(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_dostring(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dobool(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); int proc_dointvec(struct ctl_table *, int, void *, size_t *, loff_t *); int proc_douintvec(struct ctl_table *, int, void *, size_t *, loff_t *); int proc_dointvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 272f4a272f8c..25e49b4d8049 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -536,6 +536,21 @@ static void proc_put_char(void **buf, size_t *size, char c) } } +static int do_proc_dobool_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + *(bool *)valp = *lvalp; + } else { + int val = *(bool *)valp; + + *lvalp = (unsigned long)val; + *negp = false; + } + return 0; +} + static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) @@ -798,6 +813,26 @@ static int do_proc_douintvec(struct ctl_table *table, int write, buffer, lenp, ppos, conv, data); } +/** + * proc_dobool - read/write a bool + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * + * Returns 0 on success. + */ +int proc_dobool(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_dobool_conv, NULL); +} + /** * proc_dointvec - read a vector of integers * @table: the sysctl table @@ -1630,6 +1665,12 @@ int proc_dostring(struct ctl_table *table, int write, return -ENOSYS; } +int proc_dobool(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -3425,6 +3466,7 @@ int __init sysctl_init(void) * No sense putting this after each symbol definition, twice, * exception granted :-) */ +EXPORT_SYMBOL(proc_dobool); EXPORT_SYMBOL(proc_dointvec); EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); From d02a3a2cb25d384005a6e3446a445013342024b7 Mon Sep 17 00:00:00 2001 From: Jia He Date: Tue, 3 Aug 2021 12:59:37 +0200 Subject: [PATCH 14/31] lockd: change the proc_handler for nsm_use_hostnames nsm_use_hostnames is a module parameter and it will be exported to sysctl procfs. This is to let user sometimes change it from userspace. But the minimal unit for sysctl procfs read/write it sizeof(int). In big endian system, the converting from/to bool to/from int will cause error for proc items. This patch use a new proc_handler proc_dobool to fix it. Signed-off-by: Jia He Reviewed-by: Pan Xinhui [thuth: Fix typo in commit message] Signed-off-by: Thomas Huth Signed-off-by: Chuck Lever --- fs/lockd/svc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 2de048f80eb8..0ab9756ed235 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -584,7 +584,7 @@ static struct ctl_table nlm_sysctls[] = { .data = &nsm_use_hostnames, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dobool, }, { .procname = "nsm_local_state", From b4ab2fea7c797b0b8b92332c7e315703c12d37d8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 30 Jul 2021 16:07:36 -0400 Subject: [PATCH 15/31] SUNRPC: Add RPC_AUTH_TLS protocol numbers Shared by client and server. See: https://www.iana.org/assignments/rpc-authentication-numbers/rpc-authentication-numbers.xhtml Signed-off-by: Chuck Lever --- include/linux/sunrpc/msg_prot.h | 1 + include/linux/sunrpc/xdr.h | 1 + 2 files changed, 2 insertions(+) diff --git a/include/linux/sunrpc/msg_prot.h b/include/linux/sunrpc/msg_prot.h index 938c2bf29db8..02117ed0fa2e 100644 --- a/include/linux/sunrpc/msg_prot.h +++ b/include/linux/sunrpc/msg_prot.h @@ -20,6 +20,7 @@ enum rpc_auth_flavors { RPC_AUTH_DES = 3, RPC_AUTH_KRB = 4, RPC_AUTH_GSS = 6, + RPC_AUTH_TLS = 7, RPC_AUTH_MAXFLAVOR = 8, /* pseudoflavors: */ RPC_AUTH_GSS_KRB5 = 390003, diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index a965cbc136ad..b519609af1d0 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -95,6 +95,7 @@ xdr_buf_init(struct xdr_buf *buf, void *start, size_t len) #define rpc_auth_unix cpu_to_be32(RPC_AUTH_UNIX) #define rpc_auth_short cpu_to_be32(RPC_AUTH_SHORT) #define rpc_auth_gss cpu_to_be32(RPC_AUTH_GSS) +#define rpc_auth_tls cpu_to_be32(RPC_AUTH_TLS) #define rpc_call cpu_to_be32(RPC_CALL) #define rpc_reply cpu_to_be32(RPC_REPLY) From 5a4753446253a427c0ff1e433b9c4933e5af207c Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 12 Aug 2021 16:41:42 -0400 Subject: [PATCH 16/31] rpc: fix gss_svc_init cleanup on failure The failure case here should be rare, but it's obviously wrong. Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- net/sunrpc/auth_gss/svcauth_gss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index a81be45f40d9..3d685fe328fa 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1980,7 +1980,7 @@ gss_svc_init_net(struct net *net) goto out2; return 0; out2: - destroy_use_gss_proxy_proc_entry(net); + rsi_cache_destroy_net(net); out1: rsc_cache_destroy_net(net); return rv; From f7104cc1a9159cd0d3e8526cb638ae0301de4b61 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 12 Aug 2021 16:41:43 -0400 Subject: [PATCH 17/31] nfsd4: Fix forced-expiry locking This should use the network-namespace-wide client_lock, not the per-client cl_lock. You shouldn't see any bugs unless you're actually using the forced-expiry interface introduced by 89c905beccbb. Fixes: 89c905beccbb "nfsd: allow forced expiration of NFSv4 clients" Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index fa67ecd5fe63..2bedc7839ec5 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2687,9 +2687,9 @@ static void force_expire_client(struct nfs4_client *clp) trace_nfsd_clid_admin_expired(&clp->cl_clientid); - spin_lock(&clp->cl_lock); + spin_lock(&nn->client_lock); clp->cl_time = 0; - spin_unlock(&clp->cl_lock); + spin_unlock(&nn->client_lock); wait_event(expiry_wq, atomic_read(&clp->cl_rpc_users) == 0); spin_lock(&nn->client_lock); From 729580ddc53efd8093371788721487024c9b2f71 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 18 Aug 2021 17:12:26 -0400 Subject: [PATCH 18/31] svcrdma: xpt_bc_xprt is already clear in __svc_rdma_free() svc_xprt_free() already "puts" the bc_xprt before calling the transport's "free" method. No need to do it twice. Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index d1faa522c3dd..94b20fb47135 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -545,7 +545,6 @@ static void __svc_rdma_free(struct work_struct *work) { struct svcxprt_rdma *rdma = container_of(work, struct svcxprt_rdma, sc_work); - struct svc_xprt *xprt = &rdma->sc_xprt; /* This blocks until the Completion Queues are empty */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) @@ -553,12 +552,6 @@ static void __svc_rdma_free(struct work_struct *work) svc_rdma_flush_recv_queues(rdma); - /* Final put of backchannel client transport */ - if (xprt->xpt_bc_xprt) { - xprt_put(xprt->xpt_bc_xprt); - xprt->xpt_bc_xprt = NULL; - } - svc_rdma_destroy_rw_ctxts(rdma); svc_rdma_send_ctxts_destroy(rdma); svc_rdma_recv_ctxts_destroy(rdma); From c782af250083f69ba810e79b60a552252e777416 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 3 Aug 2021 15:45:18 -0400 Subject: [PATCH 19/31] SUNRPC: Add a /sys/kernel/debug/fail_sunrpc/ directory This directory will contain a set of administrative controls for enabling error injection for kernel RPC consumers. Signed-off-by: Chuck Lever --- lib/Kconfig.debug | 7 +++++++ net/sunrpc/debugfs.c | 14 ++++++++++++++ net/sunrpc/fail.h | 21 +++++++++++++++++++++ 3 files changed, 42 insertions(+) create mode 100644 net/sunrpc/fail.h diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 5ddd575159fb..cd78bb0a7dd9 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1971,6 +1971,13 @@ config FAIL_MMC_REQUEST and to test how the mmc host driver handles retries from the block device. +config FAIL_SUNRPC + bool "Fault-injection capability for SunRPC" + depends on FAULT_INJECTION_DEBUG_FS && SUNRPC_DEBUG + help + Provide fault-injection capability for SunRPC and + its consumers. + config FAULT_INJECTION_STACKTRACE_FILTER bool "stacktrace filter for fault-injection capabilities" depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index 56029e3af6ff..eaeb51f83abd 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -8,7 +8,9 @@ #include #include #include + #include "netns.h" +#include "fail.h" static struct dentry *topdir; static struct dentry *rpc_clnt_dir; @@ -297,6 +299,13 @@ static const struct file_operations fault_disconnect_fops = { .release = fault_release, }; +#if IS_ENABLED(CONFIG_FAIL_SUNRPC) +struct fail_sunrpc_attr fail_sunrpc = { + .attr = FAULT_ATTR_INITIALIZER, +}; +EXPORT_SYMBOL_GPL(fail_sunrpc); +#endif + void __exit sunrpc_debugfs_exit(void) { @@ -321,4 +330,9 @@ sunrpc_debugfs_init(void) debugfs_create_file("disconnect", S_IFREG | 0400, rpc_fault_dir, NULL, &fault_disconnect_fops); + +#if IS_ENABLED(CONFIG_FAIL_SUNRPC) + fault_create_debugfs_attr("fail_sunrpc", NULL, + &fail_sunrpc.attr); +#endif } diff --git a/net/sunrpc/fail.h b/net/sunrpc/fail.h new file mode 100644 index 000000000000..1d402b0d3453 --- /dev/null +++ b/net/sunrpc/fail.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021, Oracle. All rights reserved. + */ + +#ifndef _NET_SUNRPC_FAIL_H_ +#define _NET_SUNRPC_FAIL_H_ + +#include + +#if IS_ENABLED(CONFIG_FAULT_INJECTION) + +struct fail_sunrpc_attr { + struct fault_attr attr; +}; + +extern struct fail_sunrpc_attr fail_sunrpc; + +#endif /* CONFIG_FAULT_INJECTION */ + +#endif /* _NET_SUNRPC_FAIL_H_ */ From a4ae308143961bf688e1c8a62f6604e62b491120 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 5 Aug 2021 10:25:49 -0400 Subject: [PATCH 20/31] SUNRPC: Move client-side disconnect injection Disconnect injection stress-tests the ability for both client and server implementations to behave resiliently in the face of network instability. Convert the existing client-side disconnect injection infrastructure to use the kernel's generic error injection facility. The generic facility has a richer set of injection criteria. Signed-off-by: Chuck Lever --- include/linux/sunrpc/xprt.h | 18 -------- net/sunrpc/debugfs.c | 82 ++++++++----------------------------- net/sunrpc/fail.h | 2 + net/sunrpc/xprt.c | 14 +++++++ 4 files changed, 32 insertions(+), 84 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index c8c39f22d3b1..b15c1f07162d 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -288,7 +288,6 @@ struct rpc_xprt { const char *address_strings[RPC_DISPLAY_MAX]; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) struct dentry *debugfs; /* debugfs directory */ - atomic_t inject_disconnect; #endif struct rcu_head rcu; const struct xprt_class *xprt_class; @@ -502,21 +501,4 @@ static inline int xprt_test_and_set_binding(struct rpc_xprt *xprt) return test_and_set_bit(XPRT_BINDING, &xprt->state); } -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -extern unsigned int rpc_inject_disconnect; -static inline void xprt_inject_disconnect(struct rpc_xprt *xprt) -{ - if (!rpc_inject_disconnect) - return; - if (atomic_dec_return(&xprt->inject_disconnect)) - return; - atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect); - xprt->ops->inject_disconnect(xprt); -} -#else -static inline void xprt_inject_disconnect(struct rpc_xprt *xprt) -{ -} -#endif - #endif /* _LINUX_SUNRPC_XPRT_H */ diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index eaeb51f83abd..04e453ad3508 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -16,8 +16,6 @@ static struct dentry *topdir; static struct dentry *rpc_clnt_dir; static struct dentry *rpc_xprt_dir; -unsigned int rpc_inject_disconnect; - static int tasks_show(struct seq_file *f, void *v) { @@ -237,8 +235,6 @@ rpc_xprt_debugfs_register(struct rpc_xprt *xprt) /* make tasks file */ debugfs_create_file("info", S_IFREG | 0400, xprt->debugfs, xprt, &xprt_info_fops); - - atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect); } void @@ -248,62 +244,26 @@ rpc_xprt_debugfs_unregister(struct rpc_xprt *xprt) xprt->debugfs = NULL; } -static int -fault_open(struct inode *inode, struct file *filp) -{ - filp->private_data = kmalloc(128, GFP_KERNEL); - if (!filp->private_data) - return -ENOMEM; - return 0; -} - -static int -fault_release(struct inode *inode, struct file *filp) -{ - kfree(filp->private_data); - return 0; -} - -static ssize_t -fault_disconnect_read(struct file *filp, char __user *user_buf, - size_t len, loff_t *offset) -{ - char *buffer = (char *)filp->private_data; - size_t size; - - size = sprintf(buffer, "%u\n", rpc_inject_disconnect); - return simple_read_from_buffer(user_buf, len, offset, buffer, size); -} - -static ssize_t -fault_disconnect_write(struct file *filp, const char __user *user_buf, - size_t len, loff_t *offset) -{ - char buffer[16]; - - if (len >= sizeof(buffer)) - len = sizeof(buffer) - 1; - if (copy_from_user(buffer, user_buf, len)) - return -EFAULT; - buffer[len] = '\0'; - if (kstrtouint(buffer, 10, &rpc_inject_disconnect)) - return -EINVAL; - return len; -} - -static const struct file_operations fault_disconnect_fops = { - .owner = THIS_MODULE, - .open = fault_open, - .read = fault_disconnect_read, - .write = fault_disconnect_write, - .release = fault_release, -}; - #if IS_ENABLED(CONFIG_FAIL_SUNRPC) struct fail_sunrpc_attr fail_sunrpc = { .attr = FAULT_ATTR_INITIALIZER, }; EXPORT_SYMBOL_GPL(fail_sunrpc); + +static void fail_sunrpc_init(void) +{ + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_sunrpc", NULL, + &fail_sunrpc.attr); + + debugfs_create_bool("ignore-client-disconnect", S_IFREG | 0600, dir, + &fail_sunrpc.ignore_client_disconnect); +} +#else +static void fail_sunrpc_init(void) +{ +} #endif void __exit @@ -318,21 +278,11 @@ sunrpc_debugfs_exit(void) void __init sunrpc_debugfs_init(void) { - struct dentry *rpc_fault_dir; - topdir = debugfs_create_dir("sunrpc", NULL); rpc_clnt_dir = debugfs_create_dir("rpc_clnt", topdir); rpc_xprt_dir = debugfs_create_dir("rpc_xprt", topdir); - rpc_fault_dir = debugfs_create_dir("inject_fault", topdir); - - debugfs_create_file("disconnect", S_IFREG | 0400, rpc_fault_dir, NULL, - &fault_disconnect_fops); - -#if IS_ENABLED(CONFIG_FAIL_SUNRPC) - fault_create_debugfs_attr("fail_sunrpc", NULL, - &fail_sunrpc.attr); -#endif + fail_sunrpc_init(); } diff --git a/net/sunrpc/fail.h b/net/sunrpc/fail.h index 1d402b0d3453..62c1b9fd59e2 100644 --- a/net/sunrpc/fail.h +++ b/net/sunrpc/fail.h @@ -12,6 +12,8 @@ struct fail_sunrpc_attr { struct fault_attr attr; + + bool ignore_client_disconnect; }; extern struct fail_sunrpc_attr fail_sunrpc; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index fb6db09725c7..05abe344a269 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -56,6 +56,7 @@ #include "sunrpc.h" #include "sysfs.h" +#include "fail.h" /* * Local variables @@ -855,6 +856,19 @@ xprt_init_autodisconnect(struct timer_list *t) queue_work(xprtiod_workqueue, &xprt->task_cleanup); } +#if IS_ENABLED(CONFIG_FAIL_SUNRPC) +static void xprt_inject_disconnect(struct rpc_xprt *xprt) +{ + if (!fail_sunrpc.ignore_client_disconnect && + should_fail(&fail_sunrpc.attr, 1)) + xprt->ops->inject_disconnect(xprt); +} +#else +static inline void xprt_inject_disconnect(struct rpc_xprt *xprt) +{ +} +#endif + bool xprt_lock_connect(struct rpc_xprt *xprt, struct rpc_task *task, void *cookie) From 3a1261805940d0ff1dbbb9c705dddbc018c0423f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 3 Aug 2021 15:55:58 -0400 Subject: [PATCH 21/31] SUNRPC: Server-side disconnect injection Disconnect injection stress-tests the ability for both client and server implementations to behave resiliently in the face of network instability. A file called /sys/kernel/debug/fail_sunrpc/ignore-server-disconnect enables administrators to turn off server-side disconnect injection while allowing other types of sunrpc errors to be injected. The default setting is that server-side disconnect injection is enabled (ignore=false). Signed-off-by: Chuck Lever --- net/sunrpc/debugfs.c | 3 +++ net/sunrpc/fail.h | 2 ++ net/sunrpc/svc.c | 8 ++++++++ 3 files changed, 13 insertions(+) diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index 04e453ad3508..827bf3a28178 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -259,6 +259,9 @@ static void fail_sunrpc_init(void) debugfs_create_bool("ignore-client-disconnect", S_IFREG | 0600, dir, &fail_sunrpc.ignore_client_disconnect); + + debugfs_create_bool("ignore-server-disconnect", S_IFREG | 0600, dir, + &fail_sunrpc.ignore_server_disconnect); } #else static void fail_sunrpc_init(void) diff --git a/net/sunrpc/fail.h b/net/sunrpc/fail.h index 62c1b9fd59e2..69dc30cc44b8 100644 --- a/net/sunrpc/fail.h +++ b/net/sunrpc/fail.h @@ -14,6 +14,8 @@ struct fail_sunrpc_attr { struct fault_attr attr; bool ignore_client_disconnect; + + bool ignore_server_disconnect; }; extern struct fail_sunrpc_attr fail_sunrpc; diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 5aa263326b6a..bfcbaf7b3822 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -31,6 +31,8 @@ #include +#include "fail.h" + #define RPCDBG_FACILITY RPCDBG_SVCDSP static void svc_unregister(const struct svc_serv *serv, struct net *net); @@ -1524,6 +1526,12 @@ svc_process(struct svc_rqst *rqstp) struct svc_serv *serv = rqstp->rq_server; u32 dir; +#if IS_ENABLED(CONFIG_FAIL_SUNRPC) + if (!fail_sunrpc.ignore_server_disconnect && + should_fail(&fail_sunrpc.attr, 1)) + svc_xprt_deferred_close(rqstp->rq_xprt); +#endif + /* * Setup response xdr_buf. * Initially it has just one page From 400edd8c0455b9de91d079a4141ff20ba2d221f2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 9 Aug 2021 14:01:50 -0400 Subject: [PATCH 22/31] SUNRPC: Add documentation for the fail_sunrpc/ directory Signed-off-by: Chuck Lever --- .../fault-injection/fault-injection.rst | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Documentation/fault-injection/fault-injection.rst b/Documentation/fault-injection/fault-injection.rst index f47d05ed0d94..4a25c5eb6f07 100644 --- a/Documentation/fault-injection/fault-injection.rst +++ b/Documentation/fault-injection/fault-injection.rst @@ -24,6 +24,10 @@ Available fault injection capabilities injects futex deadlock and uaddr fault errors. +- fail_sunrpc + + injects kernel RPC client and server failures. + - fail_make_request injects disk IO errors on devices permitted by setting @@ -151,6 +155,20 @@ configuration of fault-injection capabilities. default is 'N', setting it to 'Y' will disable failure injections when dealing with private (address space) futexes. +- /sys/kernel/debug/fail_sunrpc/ignore-client-disconnect: + + Format: { 'Y' | 'N' } + + default is 'N', setting it to 'Y' will disable disconnect + injection on the RPC client. + +- /sys/kernel/debug/fail_sunrpc/ignore-server-disconnect: + + Format: { 'Y' | 'N' } + + default is 'N', setting it to 'Y' will disable disconnect + injection on the RPC server. + - /sys/kernel/debug/fail_function/inject: Format: { 'function-name' | '!function-name' | '' } From 7de875b231edb807387a81cde288aa9e1015ef9e Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 20 Aug 2021 17:01:59 -0400 Subject: [PATCH 23/31] lockd: lockd server-side shouldn't set fl_ops Locks have two sets of op arrays, fl_lmops for the lock manager (lockd or nfsd), fl_ops for the filesystem. The server-side lockd code has been setting its own fl_ops, which leads to confusion (and crashes) in the reexport case, where the filesystem expects to be the only one setting fl_ops. And there's no reason for it that I can see-the lm_get/put_owner ops do the same job. Reported-by: Daire Byrne Tested-by: Daire Byrne Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- fs/lockd/svclock.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 498cb70c2c0d..273a81971ed5 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -395,28 +395,10 @@ nlmsvc_release_lockowner(struct nlm_lock *lock) nlmsvc_put_lockowner(lock->fl.fl_owner); } -static void nlmsvc_locks_copy_lock(struct file_lock *new, struct file_lock *fl) -{ - struct nlm_lockowner *nlm_lo = (struct nlm_lockowner *)fl->fl_owner; - new->fl_owner = nlmsvc_get_lockowner(nlm_lo); -} - -static void nlmsvc_locks_release_private(struct file_lock *fl) -{ - nlmsvc_put_lockowner((struct nlm_lockowner *)fl->fl_owner); -} - -static const struct file_lock_operations nlmsvc_lock_ops = { - .fl_copy_lock = nlmsvc_locks_copy_lock, - .fl_release_private = nlmsvc_locks_release_private, -}; - void nlmsvc_locks_init_private(struct file_lock *fl, struct nlm_host *host, pid_t pid) { fl->fl_owner = nlmsvc_find_lockowner(host, pid); - if (fl->fl_owner != NULL) - fl->fl_ops = &nlmsvc_lock_ops; } /* @@ -788,9 +770,21 @@ nlmsvc_notify_blocked(struct file_lock *fl) printk(KERN_WARNING "lockd: notification for unknown block!\n"); } +static fl_owner_t nlmsvc_get_owner(fl_owner_t owner) +{ + return nlmsvc_get_lockowner(owner); +} + +static void nlmsvc_put_owner(fl_owner_t owner) +{ + nlmsvc_put_lockowner(owner); +} + const struct lock_manager_operations nlmsvc_lock_operations = { .lm_notify = nlmsvc_notify_blocked, .lm_grant = nlmsvc_grant_deferred, + .lm_get_owner = nlmsvc_get_owner, + .lm_put_owner = nlmsvc_put_owner, }; /* From 2dc6f19e4f438d4c14987cb17aee38aaf7304e7f Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 23 Aug 2021 12:01:18 -0400 Subject: [PATCH 24/31] nlm: minor nlm_lookup_file argument change It'll come in handy to get the whole nlm_lock. Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 3 ++- fs/lockd/svcproc.c | 2 +- fs/lockd/svcsubs.c | 15 ++++++++------- include/linux/lockd/lockd.h | 2 +- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4c10fb5138f1..bc496bbd696b 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -40,7 +40,8 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, /* Obtain file pointer. Not used by FREE_ALL call. */ if (filp != NULL) { - if ((error = nlm_lookup_file(rqstp, &file, &lock->fh)) != 0) + error = nlm_lookup_file(rqstp, &file, lock); + if (error) goto no_locks; *filp = file; diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 4ae4b63b5392..f4e5e0eb30fd 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -69,7 +69,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, /* Obtain file pointer. Not used by FREE_ALL call. */ if (filp != NULL) { - error = cast_status(nlm_lookup_file(rqstp, &file, &lock->fh)); + error = cast_status(nlm_lookup_file(rqstp, &file, lock)); if (error != 0) goto no_locks; *filp = file; diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 028fc152da22..2d62633b39e5 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -82,31 +82,31 @@ static inline unsigned int file_hash(struct nfs_fh *f) */ __be32 nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result, - struct nfs_fh *f) + struct nlm_lock *lock) { struct nlm_file *file; unsigned int hash; __be32 nfserr; - nlm_debug_print_fh("nlm_lookup_file", f); + nlm_debug_print_fh("nlm_lookup_file", &lock->fh); - hash = file_hash(f); + hash = file_hash(&lock->fh); /* Lock file table */ mutex_lock(&nlm_file_mutex); hlist_for_each_entry(file, &nlm_files[hash], f_list) - if (!nfs_compare_fh(&file->f_handle, f)) + if (!nfs_compare_fh(&file->f_handle, &lock->fh)) goto found; - nlm_debug_print_fh("creating file for", f); + nlm_debug_print_fh("creating file for", &lock->fh); nfserr = nlm_lck_denied_nolocks; file = kzalloc(sizeof(*file), GFP_KERNEL); if (!file) goto out_unlock; - memcpy(&file->f_handle, f, sizeof(struct nfs_fh)); + memcpy(&file->f_handle, &lock->fh, sizeof(struct nfs_fh)); mutex_init(&file->f_mutex); INIT_HLIST_NODE(&file->f_list); INIT_LIST_HEAD(&file->f_blocks); @@ -117,7 +117,8 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result, * We have to make sure we have the right credential to open * the file. */ - if ((nfserr = nlmsvc_ops->fopen(rqstp, f, &file->f_file)) != 0) { + nfserr = nlmsvc_ops->fopen(rqstp, &lock->fh, &file->f_file); + if (nfserr) { dprintk("lockd: open failed (error %d)\n", nfserr); goto out_free; } diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 666f5f310a04..81b71ad2040a 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -286,7 +286,7 @@ void nlmsvc_locks_init_private(struct file_lock *, struct nlm_host *, pid_t); * File handling for the server personality */ __be32 nlm_lookup_file(struct svc_rqst *, struct nlm_file **, - struct nfs_fh *); + struct nlm_lock *); void nlm_release_file(struct nlm_file *); void nlmsvc_release_lockowner(struct nlm_lock *); void nlmsvc_mark_resources(struct net *); From a81041b7d8f08c4e1014173c5483a0f18724a576 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 23 Aug 2021 11:26:39 -0400 Subject: [PATCH 25/31] nlm: minor refactoring Make this lookup slightly more concise, and prepare for changing how we look this up in a following patch. Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- fs/lockd/svclock.c | 16 ++++++++-------- fs/lockd/svcsubs.c | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 273a81971ed5..bcd180ba9957 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -474,8 +474,8 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, __be32 ret; dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n", - locks_inode(file->f_file)->i_sb->s_id, - locks_inode(file->f_file)->i_ino, + nlmsvc_file_inode(file)->i_sb->s_id, + nlmsvc_file_inode(file)->i_ino, lock->fl.fl_type, lock->fl.fl_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end, @@ -581,8 +581,8 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_lockowner *test_owner; dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n", - locks_inode(file->f_file)->i_sb->s_id, - locks_inode(file->f_file)->i_ino, + nlmsvc_file_inode(file)->i_sb->s_id, + nlmsvc_file_inode(file)->i_ino, lock->fl.fl_type, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); @@ -644,8 +644,8 @@ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock) int error; dprintk("lockd: nlmsvc_unlock(%s/%ld, pi=%d, %Ld-%Ld)\n", - locks_inode(file->f_file)->i_sb->s_id, - locks_inode(file->f_file)->i_ino, + nlmsvc_file_inode(file)->i_sb->s_id, + nlmsvc_file_inode(file)->i_ino, lock->fl.fl_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); @@ -673,8 +673,8 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l int status = 0; dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n", - locks_inode(file->f_file)->i_sb->s_id, - locks_inode(file->f_file)->i_ino, + nlmsvc_file_inode(file)->i_sb->s_id, + nlmsvc_file_inode(file)->i_ino, lock->fl.fl_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 2d62633b39e5..e02951f8a28f 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -45,7 +45,7 @@ static inline void nlm_debug_print_fh(char *msg, struct nfs_fh *f) static inline void nlm_debug_print_file(char *msg, struct nlm_file *file) { - struct inode *inode = locks_inode(file->f_file); + struct inode *inode = nlmsvc_file_inode(file); dprintk("lockd: %s %s/%ld\n", msg, inode->i_sb->s_id, inode->i_ino); @@ -416,7 +416,7 @@ nlmsvc_match_sb(void *datap, struct nlm_file *file) { struct super_block *sb = datap; - return sb == locks_inode(file->f_file)->i_sb; + return sb == nlmsvc_file_inode(file)->i_sb; } /** From b661601a9fdf1af8516e1100de8bba84bd41cca4 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 20 Aug 2021 17:02:02 -0400 Subject: [PATCH 26/31] lockd: update nlm_lookup_file reexport comment Update comment to reflect that we *do* allow reexport, whether it's a good idea or not.... Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- fs/lockd/svcsubs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index e02951f8a28f..13e6ffc219ec 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -111,8 +111,9 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result, INIT_HLIST_NODE(&file->f_list); INIT_LIST_HEAD(&file->f_blocks); - /* Open the file. Note that this must not sleep for too long, else - * we would lock up lockd:-) So no NFS re-exports, folks. + /* + * Open the file. Note that if we're reexporting, for example, + * this could block the lockd thread for a while. * * We have to make sure we have the right credential to open * the file. From 7f024fcd5c97dc70bb9121c80407cf3cf9be7159 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 23 Aug 2021 16:44:00 -0400 Subject: [PATCH 27/31] Keep read and write fds with each nlm_file We shouldn't really be using a read-only file descriptor to take a write lock. Most filesystems will put up with it. But NFS, for example, won't. Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 4 +- fs/lockd/svclock.c | 25 ++++++--- fs/lockd/svcproc.c | 4 +- fs/lockd/svcsubs.c | 102 +++++++++++++++++++++++++----------- fs/nfsd/lockd.c | 8 ++- include/linux/lockd/bind.h | 3 +- include/linux/lockd/lockd.h | 9 +++- 7 files changed, 111 insertions(+), 44 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index bc496bbd696b..e10ae2c41279 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -40,13 +40,15 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, /* Obtain file pointer. Not used by FREE_ALL call. */ if (filp != NULL) { + int mode = lock_to_openmode(&lock->fl); + error = nlm_lookup_file(rqstp, &file, lock); if (error) goto no_locks; *filp = file; /* Set up the missing parts of the file_lock structure */ - lock->fl.fl_file = file->f_file; + lock->fl.fl_file = file->f_file[mode]; lock->fl.fl_pid = current->tgid; lock->fl.fl_lmops = &nlmsvc_lock_operations; nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid); diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index bcd180ba9957..a7b4c51667ad 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -471,6 +471,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, { struct nlm_block *block = NULL; int error; + int mode; __be32 ret; dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n", @@ -524,7 +525,8 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, if (!wait) lock->fl.fl_flags &= ~FL_SLEEP; - error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); + mode = lock_to_openmode(&lock->fl); + error = vfs_lock_file(file->f_file[mode], F_SETLK, &lock->fl, NULL); lock->fl.fl_flags &= ~FL_SLEEP; dprintk("lockd: vfs_lock_file returned %d\n", error); @@ -577,6 +579,7 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_lock *conflock, struct nlm_cookie *cookie) { int error; + int mode; __be32 ret; struct nlm_lockowner *test_owner; @@ -595,7 +598,8 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, /* If there's a conflicting lock, remember to clean up the test lock */ test_owner = (struct nlm_lockowner *)lock->fl.fl_owner; - error = vfs_test_lock(file->f_file, &lock->fl); + mode = lock_to_openmode(&lock->fl); + error = vfs_test_lock(file->f_file[mode], &lock->fl); if (error) { /* We can't currently deal with deferred test requests */ if (error == FILE_LOCK_DEFERRED) @@ -641,7 +645,7 @@ out: __be32 nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock) { - int error; + int error = 0; dprintk("lockd: nlmsvc_unlock(%s/%ld, pi=%d, %Ld-%Ld)\n", nlmsvc_file_inode(file)->i_sb->s_id, @@ -654,7 +658,12 @@ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock) nlmsvc_cancel_blocked(net, file, lock); lock->fl.fl_type = F_UNLCK; - error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); + if (file->f_file[O_RDONLY]) + error = vfs_lock_file(file->f_file[O_RDONLY], F_SETLK, + &lock->fl, NULL); + if (file->f_file[O_WRONLY]) + error = vfs_lock_file(file->f_file[O_WRONLY], F_SETLK, + &lock->fl, NULL); return (error < 0)? nlm_lck_denied_nolocks : nlm_granted; } @@ -671,6 +680,7 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l { struct nlm_block *block; int status = 0; + int mode; dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n", nlmsvc_file_inode(file)->i_sb->s_id, @@ -686,7 +696,8 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l block = nlmsvc_lookup_block(file, lock); mutex_unlock(&file->f_mutex); if (block != NULL) { - vfs_cancel_lock(block->b_file->f_file, + mode = lock_to_openmode(&lock->fl); + vfs_cancel_lock(block->b_file->f_file[mode], &block->b_call->a_args.lock.fl); status = nlmsvc_unlink_block(block); nlmsvc_release_block(block); @@ -803,6 +814,7 @@ nlmsvc_grant_blocked(struct nlm_block *block) { struct nlm_file *file = block->b_file; struct nlm_lock *lock = &block->b_call->a_args.lock; + int mode; int error; loff_t fl_start, fl_end; @@ -828,7 +840,8 @@ nlmsvc_grant_blocked(struct nlm_block *block) lock->fl.fl_flags |= FL_SLEEP; fl_start = lock->fl.fl_start; fl_end = lock->fl.fl_end; - error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); + mode = lock_to_openmode(&lock->fl); + error = vfs_lock_file(file->f_file[mode], F_SETLK, &lock->fl, NULL); lock->fl.fl_flags &= ~FL_SLEEP; lock->fl.fl_start = fl_start; lock->fl.fl_end = fl_end; diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index f4e5e0eb30fd..99696d3f6dd6 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -55,6 +55,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, struct nlm_host *host = NULL; struct nlm_file *file = NULL; struct nlm_lock *lock = &argp->lock; + int mode; __be32 error = 0; /* nfsd callbacks must have been installed for this procedure */ @@ -75,7 +76,8 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, *filp = file; /* Set up the missing parts of the file_lock structure */ - lock->fl.fl_file = file->f_file; + mode = lock_to_openmode(&lock->fl); + lock->fl.fl_file = file->f_file[mode]; lock->fl.fl_pid = current->tgid; lock->fl.fl_lmops = &nlmsvc_lock_operations; nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid); diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 13e6ffc219ec..cb3a7512c33e 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -71,14 +71,35 @@ static inline unsigned int file_hash(struct nfs_fh *f) return tmp & (FILE_NRHASH - 1); } +int lock_to_openmode(struct file_lock *lock) +{ + return (lock->fl_type == F_WRLCK) ? O_WRONLY : O_RDONLY; +} + +/* + * Open the file. Note that if we're reexporting, for example, + * this could block the lockd thread for a while. + * + * We have to make sure we have the right credential to open + * the file. + */ +static __be32 nlm_do_fopen(struct svc_rqst *rqstp, + struct nlm_file *file, int mode) +{ + struct file **fp = &file->f_file[mode]; + __be32 nfserr; + + if (*fp) + return 0; + nfserr = nlmsvc_ops->fopen(rqstp, &file->f_handle, fp, mode); + if (nfserr) + dprintk("lockd: open failed (error %d)\n", nfserr); + return nfserr; +} + /* * Lookup file info. If it doesn't exist, create a file info struct * and open a (VFS) file for the given inode. - * - * FIXME: - * Note that we open the file O_RDONLY even when creating write locks. - * This is not quite right, but for now, we assume the client performs - * the proper R/W checking. */ __be32 nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result, @@ -87,42 +108,38 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result, struct nlm_file *file; unsigned int hash; __be32 nfserr; + int mode; nlm_debug_print_fh("nlm_lookup_file", &lock->fh); hash = file_hash(&lock->fh); + mode = lock_to_openmode(&lock->fl); /* Lock file table */ mutex_lock(&nlm_file_mutex); hlist_for_each_entry(file, &nlm_files[hash], f_list) - if (!nfs_compare_fh(&file->f_handle, &lock->fh)) + if (!nfs_compare_fh(&file->f_handle, &lock->fh)) { + mutex_lock(&file->f_mutex); + nfserr = nlm_do_fopen(rqstp, file, mode); + mutex_unlock(&file->f_mutex); goto found; - + } nlm_debug_print_fh("creating file for", &lock->fh); nfserr = nlm_lck_denied_nolocks; file = kzalloc(sizeof(*file), GFP_KERNEL); if (!file) - goto out_unlock; + goto out_free; memcpy(&file->f_handle, &lock->fh, sizeof(struct nfs_fh)); mutex_init(&file->f_mutex); INIT_HLIST_NODE(&file->f_list); INIT_LIST_HEAD(&file->f_blocks); - /* - * Open the file. Note that if we're reexporting, for example, - * this could block the lockd thread for a while. - * - * We have to make sure we have the right credential to open - * the file. - */ - nfserr = nlmsvc_ops->fopen(rqstp, &lock->fh, &file->f_file); - if (nfserr) { - dprintk("lockd: open failed (error %d)\n", nfserr); - goto out_free; - } + nfserr = nlm_do_fopen(rqstp, file, mode); + if (nfserr) + goto out_unlock; hlist_add_head(&file->f_list, &nlm_files[hash]); @@ -130,7 +147,6 @@ found: dprintk("lockd: found file %p (count %d)\n", file, file->f_count); *result = file; file->f_count++; - nfserr = 0; out_unlock: mutex_unlock(&nlm_file_mutex); @@ -150,13 +166,34 @@ nlm_delete_file(struct nlm_file *file) nlm_debug_print_file("closing file", file); if (!hlist_unhashed(&file->f_list)) { hlist_del(&file->f_list); - nlmsvc_ops->fclose(file->f_file); + if (file->f_file[O_RDONLY]) + nlmsvc_ops->fclose(file->f_file[O_RDONLY]); + if (file->f_file[O_WRONLY]) + nlmsvc_ops->fclose(file->f_file[O_WRONLY]); kfree(file); } else { printk(KERN_WARNING "lockd: attempt to release unknown file!\n"); } } +static int nlm_unlock_files(struct nlm_file *file) +{ + struct file_lock lock; + struct file *f; + + lock.fl_type = F_UNLCK; + lock.fl_start = 0; + lock.fl_end = OFFSET_MAX; + for (f = file->f_file[0]; f <= file->f_file[1]; f++) { + if (f && vfs_lock_file(f, F_SETLK, &lock, NULL) < 0) { + pr_warn("lockd: unlock failure in %s:%d\n", + __FILE__, __LINE__); + return 1; + } + } + return 0; +} + /* * Loop over all locks on the given file and perform the specified * action. @@ -184,17 +221,10 @@ again: lockhost = ((struct nlm_lockowner *)fl->fl_owner)->host; if (match(lockhost, host)) { - struct file_lock lock = *fl; spin_unlock(&flctx->flc_lock); - lock.fl_type = F_UNLCK; - lock.fl_start = 0; - lock.fl_end = OFFSET_MAX; - if (vfs_lock_file(file->f_file, F_SETLK, &lock, NULL) < 0) { - printk("lockd: unlock failure in %s:%d\n", - __FILE__, __LINE__); + if (nlm_unlock_files(file)) return 1; - } goto again; } } @@ -248,6 +278,15 @@ nlm_file_inuse(struct nlm_file *file) return 0; } +static void nlm_close_files(struct nlm_file *file) +{ + struct file *f; + + for (f = file->f_file[0]; f <= file->f_file[1]; f++) + if (f) + nlmsvc_ops->fclose(f); +} + /* * Loop over all files in the file table. */ @@ -278,7 +317,7 @@ nlm_traverse_files(void *data, nlm_host_match_fn_t match, if (list_empty(&file->f_blocks) && !file->f_locks && !file->f_shares && !file->f_count) { hlist_del(&file->f_list); - nlmsvc_ops->fclose(file->f_file); + nlm_close_files(file); kfree(file); } } @@ -412,6 +451,7 @@ nlmsvc_invalidate_all(void) nlm_traverse_files(NULL, nlmsvc_is_client, NULL); } + static int nlmsvc_match_sb(void *datap, struct nlm_file *file) { diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 3f5b3d7b62b7..606fa155c28a 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -25,9 +25,11 @@ * Note: we hold the dentry use count while the file is open. */ static __be32 -nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp) +nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp, + int mode) { __be32 nfserr; + int access; struct svc_fh fh; /* must initialize before using! but maxsize doesn't matter */ @@ -36,7 +38,9 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp) memcpy((char*)&fh.fh_handle.fh_base, f->data, f->size); fh.fh_export = NULL; - nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp); + access = (mode == O_WRONLY) ? NFSD_MAY_WRITE : NFSD_MAY_READ; + access |= NFSD_MAY_LOCK; + nfserr = nfsd_open(rqstp, &fh, S_IFREG, access, filp); fh_put(&fh); /* We return nlm error codes as nlm doesn't know * about nfsd, but nfsd does know about nlm.. diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index 0520c0cd73f4..3bc9f7410e21 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -27,7 +27,8 @@ struct rpc_task; struct nlmsvc_binding { __be32 (*fopen)(struct svc_rqst *, struct nfs_fh *, - struct file **); + struct file **, + int mode); void (*fclose)(struct file *); }; diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 81b71ad2040a..c4ae6506b8b3 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -10,6 +10,8 @@ #ifndef LINUX_LOCKD_LOCKD_H #define LINUX_LOCKD_LOCKD_H +/* XXX: a lot of this should really be under fs/lockd. */ + #include #include #include @@ -154,7 +156,8 @@ struct nlm_rqst { struct nlm_file { struct hlist_node f_list; /* linked list */ struct nfs_fh f_handle; /* NFS file handle */ - struct file * f_file; /* VFS file pointer */ + struct file * f_file[2]; /* VFS file pointers, + indexed by O_ flags */ struct nlm_share * f_shares; /* DOS shares */ struct list_head f_blocks; /* blocked locks */ unsigned int f_locks; /* guesstimate # of locks */ @@ -267,6 +270,7 @@ typedef int (*nlm_host_match_fn_t)(void *cur, struct nlm_host *ref); /* * Server-side lock handling */ +int lock_to_openmode(struct file_lock *); __be32 nlmsvc_lock(struct svc_rqst *, struct nlm_file *, struct nlm_host *, struct nlm_lock *, int, struct nlm_cookie *, int); @@ -301,7 +305,8 @@ int nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr); static inline struct inode *nlmsvc_file_inode(struct nlm_file *file) { - return locks_inode(file->f_file); + return locks_inode(file->f_file[O_RDONLY] ? + file->f_file[O_RDONLY] : file->f_file[O_WRONLY]); } static inline int __nlm_privileged_request4(const struct sockaddr *sap) From f657f8eef3ff870552c9fd2839e0061046f44618 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 20 Aug 2021 17:02:04 -0400 Subject: [PATCH 28/31] nfs: don't atempt blocking locks on nfs reexports NFS implements blocking locks by blocking inside its lock method. In the reexport case, this blocks the nfs server thread, which could lead to deadlocks since an nfs server thread might be required to unlock the conflicting lock. It also causes a crash, since the nfs server thread assumes it can free the lock when its lm_notify lock callback is called. Ideal would be to make the nfs lock method return without blocking in this case, but for now it works just not to attempt blocking locks. The difference is just that the original client will have to poll (as it does in the v4.0 case) instead of getting a callback when the lock's available. Signed-off-by: J. Bruce Fields Acked-by: Anna Schumaker Signed-off-by: Chuck Lever --- fs/nfs/export.c | 2 +- fs/nfsd/nfs4state.c | 8 ++++++-- include/linux/exportfs.h | 2 ++ 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 37a1a88df771..d772c20bbfd1 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -180,5 +180,5 @@ const struct export_operations nfs_export_ops = { .fetch_iversion = nfs_fetch_iversion, .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK| EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS| - EXPORT_OP_NOATOMIC_ATTR, + EXPORT_OP_NOATOMIC_ATTR|EXPORT_OP_SYNC_LOCKS, }; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 2bedc7839ec5..d0b2041c4d75 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6835,6 +6835,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_blocked_lock *nbl = NULL; struct file_lock *file_lock = NULL; struct file_lock *conflock = NULL; + struct super_block *sb; __be32 status = 0; int lkflg; int err; @@ -6856,6 +6857,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("NFSD: nfsd4_lock: permission denied!\n"); return status; } + sb = cstate->current_fh.fh_dentry->d_sb; if (lock->lk_is_new) { if (nfsd4_has_session(cstate)) @@ -6904,7 +6906,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fp = lock_stp->st_stid.sc_file; switch (lock->lk_type) { case NFS4_READW_LT: - if (nfsd4_has_session(cstate)) + if (nfsd4_has_session(cstate) && + !(sb->s_export_op->flags & EXPORT_OP_SYNC_LOCKS)) fl_flags |= FL_SLEEP; fallthrough; case NFS4_READ_LT: @@ -6916,7 +6919,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fl_type = F_RDLCK; break; case NFS4_WRITEW_LT: - if (nfsd4_has_session(cstate)) + if (nfsd4_has_session(cstate) && + !(sb->s_export_op->flags & EXPORT_OP_SYNC_LOCKS)) fl_flags |= FL_SLEEP; fallthrough; case NFS4_WRITE_LT: diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index fe848901fcc3..3260fe714846 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -221,6 +221,8 @@ struct export_operations { #define EXPORT_OP_NOATOMIC_ATTR (0x10) /* Filesystem cannot supply atomic attribute updates */ +#define EXPORT_OP_SYNC_LOCKS (0x20) /* Filesystem can't do + asychronous blocking locks */ unsigned long flags; }; From b840be2f00c0bc00d993f8f76e251052b83e4382 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 20 Aug 2021 17:02:05 -0400 Subject: [PATCH 29/31] lockd: don't attempt blocking locks on nfs reexports As in the v4 case, it doesn't work well to block waiting for a lock on an nfs filesystem. As in the v4 case, that means we're depending on the client to poll. It's probably incorrect to depend on that, but I *think* clients do poll in practice. In any case, it's an improvement over hanging the lockd thread indefinitely as we currently are. Signed-off-by: J. Bruce Fields Acked-by: Anna Schumaker Signed-off-by: Chuck Lever --- fs/lockd/svclock.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index a7b4c51667ad..e9b85d8fd5fe 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -31,6 +31,7 @@ #include #include #include +#include #define NLMDBG_FACILITY NLMDBG_SVCLOCK @@ -470,18 +471,24 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_cookie *cookie, int reclaim) { struct nlm_block *block = NULL; + struct inode *inode = nlmsvc_file_inode(file); int error; int mode; + int async_block = 0; __be32 ret; dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n", - nlmsvc_file_inode(file)->i_sb->s_id, - nlmsvc_file_inode(file)->i_ino, + inode->i_sb->s_id, inode->i_ino, lock->fl.fl_type, lock->fl.fl_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end, wait); + if (inode->i_sb->s_export_op->flags & EXPORT_OP_SYNC_LOCKS) { + async_block = wait; + wait = 0; + } + /* Lock file against concurrent access */ mutex_lock(&file->f_mutex); /* Get existing block (in case client is busy-waiting) @@ -542,7 +549,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, */ if (wait) break; - ret = nlm_lck_denied; + ret = async_block ? nlm_lck_blocked : nlm_lck_denied; goto out; case FILE_LOCK_DEFERRED: if (wait) From bb0a55bb7148a49e549ee992200860e7a040d3a5 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 20 Aug 2021 17:02:06 -0400 Subject: [PATCH 30/31] nfs: don't allow reexport reclaims In the reexport case, nfsd is currently passing along locks with the reclaim bit set. The client sends a new lock request, which is granted if there's currently no conflict--even if it's possible a conflicting lock could have been briefly held in the interim. We don't currently have any way to safely grant reclaim, so for now let's just deny them all. I'm doing this by passing the reclaim bit to nfs and letting it fail the call, with the idea that eventually the client might be able to do something more forgiving here. Signed-off-by: J. Bruce Fields Acked-by: Anna Schumaker Signed-off-by: Chuck Lever --- fs/nfs/file.c | 3 +++ fs/nfsd/nfs4state.c | 3 +++ fs/nfsd/nfsproc.c | 1 + include/linux/errno.h | 1 + include/linux/fs.h | 1 + 5 files changed, 9 insertions(+) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 1fef107961bc..7411658f8b05 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -806,6 +806,9 @@ int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) nfs_inc_stats(inode, NFSIOS_VFSLOCK); + if (fl->fl_flags & FL_RECLAIM) + return -ENOGRACE; + /* No mandatory locks over NFS */ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) goto out_err; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d0b2041c4d75..1b6a7f48982e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6903,6 +6903,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!locks_in_grace(net) && lock->lk_reclaim) goto out; + if (lock->lk_reclaim) + fl_flags |= FL_RECLAIM; + fp = lock_stp->st_stid.sc_file; switch (lock->lk_type) { case NFS4_READW_LT: diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 60d7c59e7935..90fcd6178823 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -881,6 +881,7 @@ nfserrno (int errno) { nfserr_serverfault, -ENFILE }, { nfserr_io, -EUCLEAN }, { nfserr_perm, -ENOKEY }, + { nfserr_no_grace, -ENOGRACE}, }; int i; diff --git a/include/linux/errno.h b/include/linux/errno.h index d73f597a2484..8b0c754bab02 100644 --- a/include/linux/errno.h +++ b/include/linux/errno.h @@ -31,5 +31,6 @@ #define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */ #define EIOCBQUEUED 529 /* iocb queued, will get completion event */ #define ERECALLCONFLICT 530 /* conflict with recalled state */ +#define ENOGRACE 531 /* NFS file lock reclaim refused */ #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 640574294216..1f5c3dbce1da 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -997,6 +997,7 @@ static inline struct file *get_file(struct file *f) #define FL_UNLOCK_PENDING 512 /* Lease is being broken */ #define FL_OFDLCK 1024 /* lock is "owned" by struct file */ #define FL_LAYOUT 2048 /* outstanding pNFS layout */ +#define FL_RECLAIM 4096 /* reclaiming from a reboot server */ #define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE) From 0bcc7ca40bd823193224e9f38bafbd8325aaf566 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 24 Aug 2021 22:36:03 -0400 Subject: [PATCH 31/31] nfsd: fix crash on LOCKT on reexported NFSv3 Unlike other filesystems, NFSv3 tries to use fl_file in the GETLK case. Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1b6a7f48982e..4b6d60b46b0a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -7043,8 +7043,7 @@ out: /* * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN, * so we do a temporary open here just to get an open file to pass to - * vfs_test_lock. (Arguably perhaps test_lock should be done with an - * inode operation.) + * vfs_test_lock. */ static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock) { @@ -7059,7 +7058,9 @@ static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct NFSD_MAY_READ)); if (err) goto out; + lock->fl_file = nf->nf_file; err = nfserrno(vfs_test_lock(nf->nf_file, lock)); + lock->fl_file = NULL; out: fh_unlock(fhp); nfsd_file_put(nf);