SUNRPC: Handle TCP socket sends with kernel_sendpage() again

Daire Byrne reports a ~50% aggregrate throughput regression on his
Linux NFS server after commit da1661b93b ("SUNRPC: Teach server to
use xprt_sock_sendmsg for socket sends"), which replaced
kernel_send_page() calls in NFSD's socket send path with calls to
sock_sendmsg() using iov_iter.

Investigation showed that tcp_sendmsg() was not using zero-copy to
send the xdr_buf's bvec pages, but instead was relying on memcpy.
This means copying every byte of a large NFS READ payload.

It looks like TLS sockets do indeed support a ->sendpage method,
so it's really not necessary to use xprt_sock_sendmsg() to support
TLS fully on the server. A mechanical reversion of da1661b93b is
not possible at this point, but we can re-implement the server's
TCP socket sendmsg path using kernel_sendpage().

Reported-by: Daire Byrne <daire@dneg.com>
BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=209439
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
This commit is contained in:
Chuck Lever 2020-12-18 12:28:41 -05:00
parent d6c9e4368c
commit 4a85a6a332

View File

@ -1062,6 +1062,90 @@ err_noclose:
return 0; /* record not complete */
}
static int svc_tcp_send_kvec(struct socket *sock, const struct kvec *vec,
int flags)
{
return kernel_sendpage(sock, virt_to_page(vec->iov_base),
offset_in_page(vec->iov_base),
vec->iov_len, flags);
}
/*
* kernel_sendpage() is used exclusively to reduce the number of
* copy operations in this path. Therefore the caller must ensure
* that the pages backing @xdr are unchanging.
*
* In addition, the logic assumes that * .bv_len is never larger
* than PAGE_SIZE.
*/
static int svc_tcp_sendmsg(struct socket *sock, struct msghdr *msg,
struct xdr_buf *xdr, rpc_fraghdr marker,
unsigned int *sentp)
{
const struct kvec *head = xdr->head;
const struct kvec *tail = xdr->tail;
struct kvec rm = {
.iov_base = &marker,
.iov_len = sizeof(marker),
};
int flags, ret;
*sentp = 0;
xdr_alloc_bvec(xdr, GFP_KERNEL);
msg->msg_flags = MSG_MORE;
ret = kernel_sendmsg(sock, msg, &rm, 1, rm.iov_len);
if (ret < 0)
return ret;
*sentp += ret;
if (ret != rm.iov_len)
return -EAGAIN;
flags = head->iov_len < xdr->len ? MSG_MORE | MSG_SENDPAGE_NOTLAST : 0;
ret = svc_tcp_send_kvec(sock, head, flags);
if (ret < 0)
return ret;
*sentp += ret;
if (ret != head->iov_len)
goto out;
if (xdr->page_len) {
unsigned int offset, len, remaining;
struct bio_vec *bvec;
bvec = xdr->bvec;
offset = xdr->page_base;
remaining = xdr->page_len;
flags = MSG_MORE | MSG_SENDPAGE_NOTLAST;
while (remaining > 0) {
if (remaining <= PAGE_SIZE && tail->iov_len == 0)
flags = 0;
len = min(remaining, bvec->bv_len);
ret = kernel_sendpage(sock, bvec->bv_page,
bvec->bv_offset + offset,
len, flags);
if (ret < 0)
return ret;
*sentp += ret;
if (ret != len)
goto out;
remaining -= len;
offset = 0;
bvec++;
}
}
if (tail->iov_len) {
ret = svc_tcp_send_kvec(sock, tail, 0);
if (ret < 0)
return ret;
*sentp += ret;
}
out:
return 0;
}
/**
* svc_tcp_sendto - Send out a reply on a TCP socket
* @rqstp: completed svc_rqst
@ -1089,7 +1173,7 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp)
mutex_lock(&xprt->xpt_mutex);
if (svc_xprt_is_dead(xprt))
goto out_notconn;
err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, marker, &sent);
err = svc_tcp_sendmsg(svsk->sk_sock, &msg, xdr, marker, &sent);
xdr_free_bvec(xdr);
trace_svcsock_tcp_send(xprt, err < 0 ? err : sent);
if (err < 0 || sent != (xdr->len + sizeof(marker)))