Merge branch 'vsock-virtio-vhost-zerocopy'

Arseniy Krasnov says:

====================
vsock/virtio/vhost: MSG_ZEROCOPY preparations

This patchset is the first of three parts of a larger patchset for
MSG_ZEROCOPY flag support:
https://lore.kernel.org/netdev/20230701063947.3422088-1-AVKrasnov@sberdevices.ru/

During review of this series, Stefano Garzarella <sgarzare@redhat.com>
suggested splitting it into three parts to simplify review and merging:

1) virtio and vhost updates (for fragged skbs) <--- this patchset
2) AF_VSOCK updates (allows to enable MSG_ZEROCOPY mode and read
   tx completions) and update for Documentation/.
3) Updates for tests and utils.

This series enables handling of fragged skbs in the virtio and vhost parts.
The new logic won't be triggered yet, because the SO_ZEROCOPY option still
cannot be enabled at this moment (the next batch of patches from the big
set above will enable it).

I've included changelog to some patches anyway, because there were some
comments during review of last big patchset from the link above.

Head for this patchset is:
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit/?id=f2fa1c812c91e99d0317d1fc7d845e1e05f39716

Link to v1:
https://lore.kernel.org/netdev/20230717210051.856388-1-AVKrasnov@sberdevices.ru/
Link to v2:
https://lore.kernel.org/netdev/20230718180237.3248179-1-AVKrasnov@sberdevices.ru/
Link to v3:
https://lore.kernel.org/netdev/20230720214245.457298-1-AVKrasnov@sberdevices.ru/
Link to v4:
https://lore.kernel.org/netdev/20230727222627.1895355-1-AVKrasnov@sberdevices.ru/
Link to v5:
https://lore.kernel.org/netdev/20230730085905.3420811-1-AVKrasnov@sberdevices.ru/
Link to v6:
https://lore.kernel.org/netdev/20230814212720.3679058-1-AVKrasnov@sberdevices.ru/
Link to v7:
https://lore.kernel.org/netdev/20230827085436.941183-1-avkrasnov@salutedevices.com/
Link to v8:
https://lore.kernel.org/netdev/20230911202234.1932024-1-avkrasnov@salutedevices.com/

Changelog:
 v3 -> v4:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 v4 -> v5:
 * See per-patch changelog after ---.
 v5 -> v6:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * See per-patch changelog after ---.
 v6 -> v7:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * See per-patch changelog after ---.
 v7 -> v8:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * See per-patch changelog after ---.
 v8 -> v9:
 * Patchset rebased and tested on new HEAD of net-next (see hash above).
 * See per-patch changelog after ---.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2023-10-15 13:19:43 +01:00
commit 4b714fd1a0
19 changed files with 1170 additions and 17 deletions

View File

@ -7,7 +7,8 @@ Intro
=====
The MSG_ZEROCOPY flag enables copy avoidance for socket send calls.
The feature is currently implemented for TCP and UDP sockets.
The feature is currently implemented for TCP, UDP and VSOCK (with
virtio transport) sockets.
Opportunity and Caveats
@ -174,7 +175,9 @@ read_notification() call in the previous snippet. A notification
is encoded in the standard error format, sock_extended_err.
The level and type fields in the control data are protocol family
specific, IP_RECVERR or IPV6_RECVERR.
specific, IP_RECVERR or IPV6_RECVERR (for TCP or UDP socket).
For VSOCK socket, cmsg_level will be SOL_VSOCK and cmsg_type will be
VSOCK_RECVERR.
Error origin is the new type SO_EE_ORIGIN_ZEROCOPY. ee_errno is zero,
as explained before, to avoid blocking read and write system calls on
@ -235,12 +238,15 @@ Implementation
Loopback
--------
For TCP and UDP:
Data sent to local sockets can be queued indefinitely if the receive
process does not read its socket. Unbound notification latency is not
acceptable. For this reason all packets generated with MSG_ZEROCOPY
that are looped to a local socket will incur a deferred copy. This
includes looping onto packet sockets (e.g., tcpdump) and tun devices.
For VSOCK:
The data path for data sent to local sockets is the same as for non-local sockets.
Testing
=======
@ -254,3 +260,6 @@ instance when run with msg_zerocopy.sh between a veth pair across
namespaces, the test will not show any improvement. For testing, the
loopback restriction can be temporarily relaxed by making
skb_orphan_frags_rx identical to skb_orphan_frags.
For VSOCK sockets, an example can be found in
tools/testing/vsock/vsock_test_zerocopy.c.

View File

@ -398,6 +398,11 @@ static bool vhost_vsock_more_replies(struct vhost_vsock *vsock)
return val < vq->num;
}
/* 'msgzerocopy_allow' callback of the vhost transport: MSG_ZEROCOPY
 * is unconditionally reported as allowed.
 */
static bool vhost_transport_msgzerocopy_allow(void)
{
	return true;
}
static bool vhost_transport_seqpacket_allow(u32 remote_cid);
static struct virtio_transport vhost_transport = {
@ -431,6 +436,8 @@ static struct virtio_transport vhost_transport = {
.seqpacket_allow = vhost_transport_seqpacket_allow,
.seqpacket_has_data = virtio_transport_seqpacket_has_data,
.msgzerocopy_allow = vhost_transport_msgzerocopy_allow,
.notify_poll_in = virtio_transport_notify_poll_in,
.notify_poll_out = virtio_transport_notify_poll_out,
.notify_recv_init = virtio_transport_notify_recv_init,

View File

@ -383,6 +383,7 @@ struct ucred {
#define SOL_MPTCP 284
#define SOL_MCTP 285
#define SOL_SMC 286
#define SOL_VSOCK 287
/* IPX options */
#define IPX_TYPE 1

View File

@ -177,6 +177,9 @@ struct vsock_transport {
/* Read a single skb */
int (*read_skb)(struct vsock_sock *, skb_read_actor_t);
/* Zero-copy. */
bool (*msgzerocopy_allow)(void);
};
/**** CORE ****/
@ -241,4 +244,8 @@ static inline void __init vsock_bpf_build_proto(void)
{}
#endif
/* Tell whether transport 't' allows MSG_ZEROCOPY transmissions.
 *
 * A transport that does not implement the optional 'msgzerocopy_allow'
 * callback is treated as not supporting zerocopy; otherwise the
 * callback's answer is returned.
 */
static inline bool vsock_msgzerocopy_allow(const struct vsock_transport *t)
{
	if (!t->msgzerocopy_allow)
		return false;

	return t->msgzerocopy_allow();
}
#endif /* __AF_VSOCK_H__ */

View File

@ -191,4 +191,21 @@ struct sockaddr_vm {
#define IOCTL_VM_SOCKETS_GET_LOCAL_CID _IO(7, 0xb9)
/* MSG_ZEROCOPY notifications are encoded in the standard error format,
* sock_extended_err. See Documentation/networking/msg_zerocopy.rst in
* kernel source tree for more details.
*/
/* 'cmsg_level' field value of 'struct cmsghdr' for notification parsing
* when MSG_ZEROCOPY flag is used on transmissions.
*/
#define SOL_VSOCK 287
/* 'cmsg_type' field value of 'struct cmsghdr' for notification parsing
* when MSG_ZEROCOPY flag is used on transmissions.
*/
#define VSOCK_RECVERR 1
#endif /* _UAPI_VM_SOCKETS_H */

View File

@ -89,6 +89,7 @@
#include <linux/types.h>
#include <linux/bitops.h>
#include <linux/cred.h>
#include <linux/errqueue.h>
#include <linux/init.h>
#include <linux/io.h>
#include <linux/kernel.h>
@ -110,6 +111,7 @@
#include <linux/workqueue.h>
#include <net/sock.h>
#include <net/af_vsock.h>
#include <uapi/linux/vm_sockets.h>
static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr);
static void vsock_sk_destruct(struct sock *sk);
@ -1030,7 +1032,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
poll_wait(file, sk_sleep(sk), wait);
mask = 0;
if (sk->sk_err)
if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
/* Signify that there has been an error on this socket. */
mask |= EPOLLERR;
@ -1404,6 +1406,17 @@ static int vsock_connect(struct socket *sock, struct sockaddr *addr,
goto out;
}
if (vsock_msgzerocopy_allow(transport)) {
set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
/* If this option was set before 'connect()',
* when transport was unknown, check that this
* feature is supported here.
*/
err = -EOPNOTSUPP;
goto out;
}
err = vsock_auto_bind(vsk);
if (err)
goto out;
@ -1558,6 +1571,9 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags,
} else {
newsock->state = SS_CONNECTED;
sock_graft(connected, newsock);
if (vsock_msgzerocopy_allow(vconnected->transport))
set_bit(SOCK_SUPPORT_ZC,
&connected->sk_socket->flags);
}
release_sock(connected);
@ -1635,7 +1651,7 @@ static int vsock_connectible_setsockopt(struct socket *sock,
const struct vsock_transport *transport;
u64 val;
if (level != AF_VSOCK)
if (level != AF_VSOCK && level != SOL_SOCKET)
return -ENOPROTOOPT;
#define COPY_IN(_v) \
@ -1658,6 +1674,33 @@ static int vsock_connectible_setsockopt(struct socket *sock,
transport = vsk->transport;
if (level == SOL_SOCKET) {
int zerocopy;
if (optname != SO_ZEROCOPY) {
release_sock(sk);
return sock_setsockopt(sock, level, optname, optval, optlen);
}
/* Use 'int' type here, because variable to
* set this option usually has this type.
*/
COPY_IN(zerocopy);
if (zerocopy < 0 || zerocopy > 1) {
err = -EINVAL;
goto exit;
}
if (transport && !vsock_msgzerocopy_allow(transport)) {
err = -EOPNOTSUPP;
goto exit;
}
sock_valbool_flag(sk, SOCK_ZEROCOPY, zerocopy);
goto exit;
}
switch (optname) {
case SO_VM_SOCKETS_BUFFER_SIZE:
COPY_IN(val);
@ -1822,6 +1865,12 @@ static int vsock_connectible_sendmsg(struct socket *sock, struct msghdr *msg,
goto out;
}
if (msg->msg_flags & MSG_ZEROCOPY &&
!vsock_msgzerocopy_allow(transport)) {
err = -EOPNOTSUPP;
goto out;
}
/* Wait for room in the produce queue to enqueue our user's data. */
timeout = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
@ -2137,6 +2186,10 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
int err;
sk = sock->sk;
if (unlikely(flags & MSG_ERRQUEUE))
return sock_recv_errqueue(sk, msg, len, SOL_VSOCK, VSOCK_RECVERR);
vsk = vsock_sk(sk);
err = 0;
@ -2304,6 +2357,12 @@ static int vsock_create(struct net *net, struct socket *sock,
}
}
/* SOCK_DGRAM doesn't have 'setsockopt' callback set in its
* proto_ops, so there is no handler for custom logic.
*/
if (sock_type_connectible(sock->type))
set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags);
vsock_insert_unbound(vsk);
return 0;

View File

@ -486,6 +486,11 @@ static bool virtio_transport_can_msgzerocopy(int bufs_num)
return res;
}
/* 'msgzerocopy_allow' callback of the virtio transport: MSG_ZEROCOPY
 * is unconditionally reported as allowed.
 */
static bool virtio_transport_msgzerocopy_allow(void)
{
	return true;
}
static bool virtio_transport_seqpacket_allow(u32 remote_cid);
static struct virtio_transport virtio_transport = {
@ -519,6 +524,8 @@ static struct virtio_transport virtio_transport = {
.seqpacket_allow = virtio_transport_seqpacket_allow,
.seqpacket_has_data = virtio_transport_seqpacket_has_data,
.msgzerocopy_allow = virtio_transport_msgzerocopy_allow,
.notify_poll_in = virtio_transport_notify_poll_in,
.notify_poll_out = virtio_transport_notify_poll_out,
.notify_recv_init = virtio_transport_notify_recv_init,

View File

@ -47,6 +47,10 @@ static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk)
}
static bool vsock_loopback_seqpacket_allow(u32 remote_cid);
/* 'msgzerocopy_allow' callback of the loopback transport: MSG_ZEROCOPY
 * is unconditionally reported as allowed.
 */
static bool vsock_loopback_msgzerocopy_allow(void)
{
	return true;
}
static struct virtio_transport loopback_transport = {
.transport = {
@ -79,6 +83,8 @@ static struct virtio_transport loopback_transport = {
.seqpacket_allow = vsock_loopback_seqpacket_allow,
.seqpacket_has_data = virtio_transport_seqpacket_has_data,
.msgzerocopy_allow = vsock_loopback_msgzerocopy_allow,
.notify_poll_in = virtio_transport_notify_poll_in,
.notify_poll_out = virtio_transport_notify_poll_out,
.notify_recv_init = virtio_transport_notify_recv_init,

View File

@ -3,3 +3,4 @@
vsock_test
vsock_diag_test
vsock_perf
vsock_uring_test

View File

@ -1,12 +1,15 @@
# SPDX-License-Identifier: GPL-2.0-only
all: test vsock_perf
test: vsock_test vsock_diag_test
vsock_test: vsock_test.o timeout.o control.o util.o
test: vsock_test vsock_diag_test vsock_uring_test
vsock_test: vsock_test.o vsock_test_zerocopy.o timeout.o control.o util.o msg_zerocopy_common.o
vsock_diag_test: vsock_diag_test.o timeout.o control.o util.o
vsock_perf: vsock_perf.o
vsock_perf: vsock_perf.o msg_zerocopy_common.o
vsock_uring_test: LDLIBS = -luring
vsock_uring_test: control.o util.o vsock_uring_test.o timeout.o msg_zerocopy_common.o
CFLAGS += -g -O2 -Werror -Wall -I. -I../../include -I../../../usr/include -Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD -U_FORTIFY_SOURCE -D_GNU_SOURCE
.PHONY: all test clean
clean:
${RM} *.o *.d vsock_test vsock_diag_test vsock_perf
${RM} *.o *.d vsock_test vsock_diag_test vsock_perf vsock_uring_test
-include *.d

View File

@ -0,0 +1,87 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Some common code for MSG_ZEROCOPY logic
*
* Copyright (C) 2023 SberDevices.
*
* Author: Arseniy Krasnov <avkrasnov@salutedevices.com>
*/
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <linux/errqueue.h>
#include "msg_zerocopy_common.h"
/* Turn the SO_ZEROCOPY socket option on for 'fd'.
 *
 * On failure the 'setsockopt()' error is printed and the process
 * exits with EXIT_FAILURE.
 */
void enable_so_zerocopy(int fd)
{
	int enable = 1;
	int rc;

	rc = setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &enable, sizeof(enable));
	if (rc == 0)
		return;

	perror("setsockopt");
	exit(EXIT_FAILURE);
}
void vsock_recv_completion(int fd, const bool *zerocopied)
{
struct sock_extended_err *serr;
struct msghdr msg = { 0 };
char cmsg_data[128];
struct cmsghdr *cm;
ssize_t res;
msg.msg_control = cmsg_data;
msg.msg_controllen = sizeof(cmsg_data);
res = recvmsg(fd, &msg, MSG_ERRQUEUE);
if (res) {
fprintf(stderr, "failed to read error queue: %zi\n", res);
exit(EXIT_FAILURE);
}
cm = CMSG_FIRSTHDR(&msg);
if (!cm) {
fprintf(stderr, "cmsg: no cmsg\n");
exit(EXIT_FAILURE);
}
if (cm->cmsg_level != SOL_VSOCK) {
fprintf(stderr, "cmsg: unexpected 'cmsg_level'\n");
exit(EXIT_FAILURE);
}
if (cm->cmsg_type != VSOCK_RECVERR) {
fprintf(stderr, "cmsg: unexpected 'cmsg_type'\n");
exit(EXIT_FAILURE);
}
serr = (void *)CMSG_DATA(cm);
if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) {
fprintf(stderr, "serr: wrong origin: %u\n", serr->ee_origin);
exit(EXIT_FAILURE);
}
if (serr->ee_errno) {
fprintf(stderr, "serr: wrong error code: %u\n", serr->ee_errno);
exit(EXIT_FAILURE);
}
/* This flag is used for tests, to check that transmission was
* performed as expected: zerocopy or fallback to copy. If NULL
* - don't care.
*/
if (!zerocopied)
return;
if (*zerocopied && (serr->ee_code & SO_EE_CODE_ZEROCOPY_COPIED)) {
fprintf(stderr, "serr: was copy instead of zerocopy\n");
exit(EXIT_FAILURE);
}
if (!*zerocopied && !(serr->ee_code & SO_EE_CODE_ZEROCOPY_COPIED)) {
fprintf(stderr, "serr: was zerocopy instead of copy\n");
exit(EXIT_FAILURE);
}
}

View File

@ -0,0 +1,18 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/* Helpers shared by the vsock MSG_ZEROCOPY tests and tools. */
#ifndef MSG_ZEROCOPY_COMMON_H
#define MSG_ZEROCOPY_COMMON_H
#include <stdbool.h>
/* Fallback for headers that do not define SOL_VSOCK: 'cmsg_level' of
 * a vsock MSG_ZEROCOPY notification.
 */
#ifndef SOL_VSOCK
#define SOL_VSOCK 287
#endif
/* Fallback for headers that do not define VSOCK_RECVERR: 'cmsg_type'
 * of a vsock MSG_ZEROCOPY notification.
 */
#ifndef VSOCK_RECVERR
#define VSOCK_RECVERR 1
#endif
/* Enable SO_ZEROCOPY on 'fd'; exits the process on failure. */
void enable_so_zerocopy(int fd);
/* Read and validate one zerocopy completion from the error queue of
 * 'fd'. 'zerocopied' may be NULL (don't care) or point to the expected
 * transmission mode: true - zerocopy, false - fallback to copy.
 * Exits the process on any mismatch.
 */
void vsock_recv_completion(int fd, const bool *zerocopied);
#endif /* MSG_ZEROCOPY_COMMON_H */

View File

@ -11,10 +11,12 @@
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include <assert.h>
#include <sys/epoll.h>
#include <sys/mman.h>
#include "timeout.h"
#include "control.h"
@ -444,3 +446,134 @@ unsigned long hash_djb2(const void *data, size_t len)
return hash;
}
/* Returns the total number of bytes described by the first 'iovnum'
 * elements of 'iov', i.e. the sum of their 'iov_len' fields.
 * Returns 0 when 'iovnum' is 0 ('iov' is not dereferenced then).
 */
size_t iovec_bytes(const struct iovec *iov, size_t iovnum)
{
	size_t bytes = 0;
	size_t i;

	/* Index has the same type as the bound: no signed/unsigned
	 * comparison and no 'int' overflow for large 'iovnum'.
	 */
	for (i = 0; i < iovnum; i++)
		bytes += iov[i].iov_len;

	return bytes;
}
/* Returns the 'hash_djb2()' hash of the data described by the first
 * 'iovnum' elements of 'iov', as if all buffers were concatenated in
 * order into one contiguous buffer.
 *
 * A temporary buffer of 'iovec_bytes(iov, iovnum)' bytes is allocated
 * for the concatenation; on allocation failure the process exits.
 */
unsigned long iovec_hash_djb2(const struct iovec *iov, size_t iovnum)
{
	unsigned long hash;
	unsigned char *tmp;
	size_t iov_bytes;
	size_t offs;
	size_t i;

	iov_bytes = iovec_bytes(iov, iovnum);

	/* 'malloc(0)' is allowed to return NULL, which must not be
	 * mistaken for an allocation failure: always ask for >= 1 byte.
	 */
	tmp = malloc(iov_bytes ? iov_bytes : 1);
	if (!tmp) {
		perror("malloc");
		exit(EXIT_FAILURE);
	}

	for (offs = 0, i = 0; i < iovnum; i++) {
		/* Nothing to copy; also keeps a NULL 'iov_base' of an
		 * empty element away from 'memcpy()'.
		 */
		if (!iov[i].iov_len)
			continue;

		memcpy(tmp + offs, iov[i].iov_base, iov[i].iov_len);
		offs += iov[i].iov_len;
	}

	hash = hash_djb2(tmp, iov_bytes);
	free(tmp);

	return hash;
}
/* Allocates and returns new 'struct iovec *' according to the pattern
 * in 'test_iovec'. For each element in 'test_iovec' it allocates a
 * new element in the resulting 'iovec'. 'iov_len' of the new element
 * is copied from 'test_iovec'. 'iov_base' is allocated depending on
 * the 'iov_base' of 'test_iovec':
 *
 * 'iov_base' == NULL -> valid buf: mmap('iov_len').
 *
 * 'iov_base' == MAP_FAILED -> invalid buf:
 *               mmap('iov_len'), then munmap('iov_len').
 *               'iov_base' still contains result of
 *               mmap().
 *
 * 'iov_base' == number -> unaligned valid buf:
 *               mmap('iov_len') + number.
 *
 * Valid buffers are filled with 'rand()' bytes.
 *
 * NOTE: for the unaligned case the mapping itself is still created
 * with length 'iov_len', while the 'iov_len' bytes of the buffer
 * start 'number' bytes into it. So 'number' + 'iov_len' must not
 * exceed the size of the mapping (which the kernel rounds up to
 * whole pages).
 *
 * 'iovnum' is number of elements in 'test_iovec'.
 *
 * Returns new 'iovec' or calls 'exit()' on error. Release it with
 * 'free_test_iovec()'.
 */
struct iovec *alloc_test_iovec(const struct iovec *test_iovec, int iovnum)
{
	struct iovec *iovec;
	int i;

	iovec = malloc(sizeof(*iovec) * iovnum);
	if (!iovec) {
		perror("malloc");
		exit(EXIT_FAILURE);
	}

	for (i = 0; i < iovnum; i++) {
		iovec[i].iov_len = test_iovec[i].iov_len;

		iovec[i].iov_base = mmap(NULL, iovec[i].iov_len,
					 PROT_READ | PROT_WRITE,
					 MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE,
					 -1, 0);
		if (iovec[i].iov_base == MAP_FAILED) {
			perror("mmap");
			exit(EXIT_FAILURE);
		}

		/* Pattern value is a byte offset (0 for NULL). Byte
		 * pointer is used for the arithmetic, as arithmetic on
		 * 'void *' is a compiler extension.
		 */
		if (test_iovec[i].iov_base != MAP_FAILED)
			iovec[i].iov_base = (uint8_t *)iovec[i].iov_base +
					    (uintptr_t)test_iovec[i].iov_base;
	}

	/* Unmap "invalid" elements. */
	for (i = 0; i < iovnum; i++) {
		if (test_iovec[i].iov_base == MAP_FAILED) {
			if (munmap(iovec[i].iov_base, iovec[i].iov_len)) {
				perror("munmap");
				exit(EXIT_FAILURE);
			}
		}
	}

	/* Fill valid elements with random data. */
	for (i = 0; i < iovnum; i++) {
		uint8_t *buf = iovec[i].iov_base;
		size_t j;

		if (test_iovec[i].iov_base == MAP_FAILED)
			continue;

		/* 'j' is 'size_t' like 'iov_len': no signed/unsigned
		 * comparison and no overflow for big buffers.
		 */
		for (j = 0; j < iovec[i].iov_len; j++)
			buf[j] = rand() & 0xff;
	}

	return iovec;
}
/* Releases an 'iovec' array built by 'alloc_test_iovec()' from the
 * same 'test_iovec' pattern: unmaps every buffer that is still mapped
 * and frees the array itself. Calls 'exit()' on error.
 */
void free_test_iovec(const struct iovec *test_iovec,
		     struct iovec *iovec, int iovnum)
{
	int idx;

	for (idx = 0; idx < iovnum; idx++) {
		const void *pattern = test_iovec[idx].iov_base;

		/* "Invalid" buffers are not unmapped here. */
		if (pattern == MAP_FAILED)
			continue;

		/* Step back by the misalignment offset taken from the
		 * pattern (zero for NULL) to get the address to unmap.
		 */
		iovec[idx].iov_base = (char *)iovec[idx].iov_base -
				      (uintptr_t)pattern;

		if (munmap(iovec[idx].iov_base, iovec[idx].iov_len) != 0) {
			perror("munmap");
			exit(EXIT_FAILURE);
		}
	}

	free(iovec);
}

View File

@ -53,4 +53,9 @@ void list_tests(const struct test_case *test_cases);
void skip_test(struct test_case *test_cases, size_t test_cases_len,
const char *test_id_str);
unsigned long hash_djb2(const void *data, size_t len);
size_t iovec_bytes(const struct iovec *iov, size_t iovnum);
unsigned long iovec_hash_djb2(const struct iovec *iov, size_t iovnum);
struct iovec *alloc_test_iovec(const struct iovec *test_iovec, int iovnum);
void free_test_iovec(const struct iovec *test_iovec,
struct iovec *iovec, int iovnum);
#endif /* UTIL_H */

View File

@ -18,6 +18,9 @@
#include <poll.h>
#include <sys/socket.h>
#include <linux/vm_sockets.h>
#include <sys/mman.h>
#include "msg_zerocopy_common.h"
#define DEFAULT_BUF_SIZE_BYTES (128 * 1024)
#define DEFAULT_TO_SEND_BYTES (64 * 1024)
@ -31,6 +34,7 @@
static unsigned int port = DEFAULT_PORT;
static unsigned long buf_size_bytes = DEFAULT_BUF_SIZE_BYTES;
static unsigned long vsock_buf_bytes = DEFAULT_VSOCK_BUF_BYTES;
static bool zerocopy;
static void error(const char *s)
{
@ -252,10 +256,15 @@ static void run_sender(int peer_cid, unsigned long to_send_bytes)
time_t tx_begin_ns;
time_t tx_total_ns;
size_t total_send;
time_t time_in_send;
void *data;
int fd;
printf("Run as sender\n");
if (zerocopy)
printf("Run as sender MSG_ZEROCOPY\n");
else
printf("Run as sender\n");
printf("Connect to %i:%u\n", peer_cid, port);
printf("Send %lu bytes\n", to_send_bytes);
printf("TX buffer %lu bytes\n", buf_size_bytes);
@ -265,38 +274,82 @@ static void run_sender(int peer_cid, unsigned long to_send_bytes)
if (fd < 0)
exit(EXIT_FAILURE);
data = malloc(buf_size_bytes);
if (zerocopy) {
enable_so_zerocopy(fd);
if (!data) {
fprintf(stderr, "'malloc()' failed\n");
exit(EXIT_FAILURE);
data = mmap(NULL, buf_size_bytes, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (data == MAP_FAILED) {
perror("mmap");
exit(EXIT_FAILURE);
}
} else {
data = malloc(buf_size_bytes);
if (!data) {
fprintf(stderr, "'malloc()' failed\n");
exit(EXIT_FAILURE);
}
}
memset(data, 0, buf_size_bytes);
total_send = 0;
time_in_send = 0;
tx_begin_ns = current_nsec();
while (total_send < to_send_bytes) {
ssize_t sent;
size_t rest_bytes;
time_t before;
sent = write(fd, data, buf_size_bytes);
rest_bytes = to_send_bytes - total_send;
before = current_nsec();
sent = send(fd, data, (rest_bytes > buf_size_bytes) ?
buf_size_bytes : rest_bytes,
zerocopy ? MSG_ZEROCOPY : 0);
time_in_send += (current_nsec() - before);
if (sent <= 0)
error("write");
total_send += sent;
if (zerocopy) {
struct pollfd fds = { 0 };
fds.fd = fd;
if (poll(&fds, 1, -1) < 0) {
perror("poll");
exit(EXIT_FAILURE);
}
if (!(fds.revents & POLLERR)) {
fprintf(stderr, "POLLERR expected\n");
exit(EXIT_FAILURE);
}
vsock_recv_completion(fd, NULL);
}
}
tx_total_ns = current_nsec() - tx_begin_ns;
printf("total bytes sent: %zu\n", total_send);
printf("tx performance: %f Gbits/s\n",
get_gbps(total_send * 8, tx_total_ns));
printf("total time in 'write()': %f sec\n",
get_gbps(total_send * 8, time_in_send));
printf("total time in tx loop: %f sec\n",
(float)tx_total_ns / NSEC_PER_SEC);
printf("time in 'send()': %f sec\n",
(float)time_in_send / NSEC_PER_SEC);
close(fd);
free(data);
if (zerocopy)
munmap(data, buf_size_bytes);
else
free(data);
}
static const char optstring[] = "";
@ -336,6 +389,11 @@ static const struct option longopts[] = {
.has_arg = required_argument,
.val = 'R',
},
{
.name = "zerocopy",
.has_arg = no_argument,
.val = 'Z',
},
{},
};
@ -351,6 +409,7 @@ static void usage(void)
" --help This message\n"
" --sender <cid> Sender mode (receiver default)\n"
" <cid> of the receiver to connect to\n"
" --zerocopy Enable zerocopy (for sender mode only)\n"
" --port <port> Port (default %d)\n"
" --bytes <bytes>KMG Bytes to send (default %d)\n"
" --buf-size <bytes>KMG Data buffer size (default %d). In sender mode\n"
@ -413,6 +472,9 @@ int main(int argc, char **argv)
case 'H': /* Help. */
usage();
break;
case 'Z': /* Zerocopy. */
zerocopy = true;
break;
default:
usage();
}

View File

@ -21,6 +21,7 @@
#include <poll.h>
#include <signal.h>
#include "vsock_test_zerocopy.h"
#include "timeout.h"
#include "control.h"
#include "util.h"
@ -1269,6 +1270,21 @@ static struct test_case test_cases[] = {
.run_client = test_stream_shutrd_client,
.run_server = test_stream_shutrd_server,
},
{
.name = "SOCK_STREAM MSG_ZEROCOPY",
.run_client = test_stream_msgzcopy_client,
.run_server = test_stream_msgzcopy_server,
},
{
.name = "SOCK_SEQPACKET MSG_ZEROCOPY",
.run_client = test_seqpacket_msgzcopy_client,
.run_server = test_seqpacket_msgzcopy_server,
},
{
.name = "SOCK_STREAM MSG_ZEROCOPY empty MSG_ERRQUEUE",
.run_client = test_stream_msgzcopy_empty_errq_client,
.run_server = test_stream_msgzcopy_empty_errq_server,
},
{},
};

View File

@ -0,0 +1,358 @@
// SPDX-License-Identifier: GPL-2.0-only
/* MSG_ZEROCOPY feature tests for vsock
*
* Copyright (C) 2023 SberDevices.
*
* Author: Arseniy Krasnov <avkrasnov@salutedevices.com>
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
#include <poll.h>
#include <linux/errqueue.h>
#include <linux/kernel.h>
#include <errno.h>
#include "control.h"
#include "vsock_test_zerocopy.h"
#include "msg_zerocopy_common.h"
#ifndef PAGE_SIZE
#define PAGE_SIZE 4096
#endif
#define VSOCK_TEST_DATA_MAX_IOV 3
/* Description of one MSG_ZEROCOPY test scenario. Field order matters:
 * the initializers of 'test_data_array' fill 'vecs' positionally.
 */
struct vsock_test_data {
/* This test case is for SOCK_STREAM only. */
bool stream_only;
/* Data must be zerocopied. This field is checked against
* field 'ee_code' of the 'struct sock_extended_err', which
* contains a bit to detect that zerocopy transmission
* fell back to copy mode.
*/
bool zerocopied;
/* Enable SO_ZEROCOPY option on the socket. Without enabled
* SO_ZEROCOPY, every MSG_ZEROCOPY transmission will behave
* as if the MSG_ZEROCOPY flag was not set.
*/
bool so_zerocopy;
/* Expected 'errno' after 'sendmsg()' call. */
int sendmsg_errno;
/* Number of valid elements in 'vecs'. */
int vecs_cnt;
/* Pattern passed to 'alloc_test_iovec()' to build the real iovec. */
struct iovec vecs[VSOCK_TEST_DATA_MAX_IOV];
};
/* Scenarios run by both the client and the server side, in this order.
 * Each 'vecs' entry is a pattern for 'alloc_test_iovec()': NULL base
 * is a valid buffer, MAP_FAILED base is an unmapped (invalid) buffer,
 * any other base value is a byte offset making the buffer unaligned.
 */
static struct vsock_test_data test_data_array[] = {
/* Last element has non-page aligned size. */
{
.zerocopied = true,
.so_zerocopy = true,
.sendmsg_errno = 0,
.vecs_cnt = 3,
{
{ NULL, PAGE_SIZE },
{ NULL, PAGE_SIZE },
{ NULL, 200 }
}
},
/* All elements have page aligned base and size. */
{
.zerocopied = true,
.so_zerocopy = true,
.sendmsg_errno = 0,
.vecs_cnt = 3,
{
{ NULL, PAGE_SIZE },
{ NULL, PAGE_SIZE * 2 },
{ NULL, PAGE_SIZE * 3 }
}
},
/* All elements have page aligned base and size. But
* data length is bigger than 64Kb.
*/
{
.zerocopied = true,
.so_zerocopy = true,
.sendmsg_errno = 0,
.vecs_cnt = 3,
{
{ NULL, PAGE_SIZE * 16 },
{ NULL, PAGE_SIZE * 16 },
{ NULL, PAGE_SIZE * 16 }
}
},
/* Middle element has both non-page aligned base and size. */
{
.zerocopied = true,
.so_zerocopy = true,
.sendmsg_errno = 0,
.vecs_cnt = 3,
{
{ NULL, PAGE_SIZE },
{ (void *)1, 100 },
{ NULL, PAGE_SIZE }
}
},
/* Middle element is unmapped: 'sendmsg()' is expected to fail. */
{
.zerocopied = false,
.so_zerocopy = true,
.sendmsg_errno = ENOMEM,
.vecs_cnt = 3,
{
{ NULL, PAGE_SIZE },
{ MAP_FAILED, PAGE_SIZE },
{ NULL, PAGE_SIZE }
}
},
/* Valid data, but SO_ZEROCOPY is off. This
* will trigger fallback to copy.
*/
{
.zerocopied = false,
.so_zerocopy = false,
.sendmsg_errno = 0,
.vecs_cnt = 1,
{
{ NULL, PAGE_SIZE }
}
},
/* Valid data, but message is bigger than peer's
* buffer, so this will trigger fallback to copy.
* This test is for SOCK_STREAM only, because
* for SOCK_SEQPACKET, 'sendmsg()' returns EMSGSIZE.
*/
{
.stream_only = true,
.zerocopied = false,
.so_zerocopy = true,
.sendmsg_errno = 0,
.vecs_cnt = 1,
{
{ NULL, 100 * PAGE_SIZE }
}
},
};
#define POLL_TIMEOUT_MS 100
/* Client side of one MSG_ZEROCOPY scenario.
 *
 * Connects to the peer (SOCK_SEQPACKET when 'sock_seqpacket' is set,
 * SOCK_STREAM otherwise), optionally enables SO_ZEROCOPY, sends the
 * iovec built from 'test_data->vecs' with MSG_ZEROCOPY and checks:
 *  - 'errno' after 'sendmsg()' equals 'test_data->sendmsg_errno';
 *  - on success, the whole iovec was sent;
 *  - a completion shows up in the error queue when it has to, and it
 *    reports the expected mode (zerocopy or fallback to copy).
 * Then the hash of the sent data (0 if 'sendmsg()' was expected to
 * fail) and "DONE" are passed to the server over the control channel.
 *
 * Any failed check terminates the process with EXIT_FAILURE.
 */
static void test_client(const struct test_opts *opts,
			const struct vsock_test_data *test_data,
			bool sock_seqpacket)
{
	struct pollfd fds = { 0 };
	struct msghdr msg = { 0 };
	ssize_t sendmsg_res;
	struct iovec *iovec;
	int fd;

	if (sock_seqpacket)
		fd = vsock_seqpacket_connect(opts->peer_cid, 1234);
	else
		fd = vsock_stream_connect(opts->peer_cid, 1234);

	if (fd < 0) {
		perror("connect");
		exit(EXIT_FAILURE);
	}

	if (test_data->so_zerocopy)
		enable_so_zerocopy(fd);

	iovec = alloc_test_iovec(test_data->vecs, test_data->vecs_cnt);

	msg.msg_iov = iovec;
	msg.msg_iovlen = test_data->vecs_cnt;

	/* 'errno' is compared even when 'sendmsg()' succeeds, so it
	 * must not contain a stale value.
	 */
	errno = 0;

	sendmsg_res = sendmsg(fd, &msg, MSG_ZEROCOPY);
	if (errno != test_data->sendmsg_errno) {
		fprintf(stderr, "expected 'errno' == %i, got %i\n",
			test_data->sendmsg_errno, errno);
		exit(EXIT_FAILURE);
	}

	if (!errno) {
		size_t expected = iovec_bytes(iovec, test_data->vecs_cnt);

		if (sendmsg_res < 0 || (size_t)sendmsg_res != expected) {
			/* '%zu'/'%zi' match 'size_t'/'ssize_t' on every
			 * architecture, unlike '%li'.
			 */
			fprintf(stderr, "expected 'sendmsg()' == %zu, got %zi\n",
				expected, sendmsg_res);
			exit(EXIT_FAILURE);
		}
	}

	/* No events requested: only POLLERR can be reported. */
	fds.fd = fd;
	fds.events = 0;

	if (poll(&fds, 1, POLL_TIMEOUT_MS) < 0) {
		perror("poll");
		exit(EXIT_FAILURE);
	}

	if (fds.revents & POLLERR) {
		vsock_recv_completion(fd, &test_data->zerocopied);
	} else if (test_data->so_zerocopy && !test_data->sendmsg_errno) {
		/* If we don't have data in the error queue, but
		 * SO_ZEROCOPY was enabled and 'sendmsg()' was
		 * successful - this is an error.
		 */
		fprintf(stderr, "POLLERR expected\n");
		exit(EXIT_FAILURE);
	}

	if (!test_data->sendmsg_errno)
		control_writeulong(iovec_hash_djb2(iovec, test_data->vecs_cnt));
	else
		control_writeulong(0);

	control_writeln("DONE");
	free_test_iovec(test_data->vecs, iovec, test_data->vecs_cnt);
	close(fd);
}
void test_stream_msgzcopy_client(const struct test_opts *opts)
{
int i;
for (i = 0; i < ARRAY_SIZE(test_data_array); i++)
test_client(opts, &test_data_array[i], false);
}
void test_seqpacket_msgzcopy_client(const struct test_opts *opts)
{
int i;
for (i = 0; i < ARRAY_SIZE(test_data_array); i++) {
if (test_data_array[i].stream_only)
continue;
test_client(opts, &test_data_array[i], true);
}
}
/* Server side of one MSG_ZEROCOPY scenario.
 *
 * Accepts a connection (SOCK_SEQPACKET when 'sock_seqpacket' is set,
 * SOCK_STREAM otherwise) and reads until the whole amount of data
 * described by 'test_data->vecs' has arrived or 'read()' returns
 * <= 0. The hash of the receive buffer (0 when the client's
 * 'sendmsg()' is expected to fail) must match the hash received from
 * the client over the control channel, followed by "DONE".
 *
 * Any failed check terminates the process with EXIT_FAILURE.
 */
static void test_server(const struct test_opts *opts,
			const struct vsock_test_data *test_data,
			bool sock_seqpacket)
{
	unsigned long peer_hash;
	unsigned long own_hash = 0;
	ssize_t received = 0;
	unsigned char *buf;
	size_t expected;
	int fd;

	fd = sock_seqpacket ?
		vsock_seqpacket_accept(VMADDR_CID_ANY, 1234, NULL) :
		vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
	if (fd < 0) {
		perror("accept");
		exit(EXIT_FAILURE);
	}

	expected = iovec_bytes(test_data->vecs, test_data->vecs_cnt);

	buf = malloc(expected);
	if (buf == NULL) {
		perror("malloc");
		exit(EXIT_FAILURE);
	}

	while (received != expected) {
		ssize_t chunk = read(fd, buf + received, expected - received);

		/* Error or end of stream: stop with what we have. */
		if (chunk <= 0)
			break;

		received += chunk;
	}

	/* Nothing valid to hash if the sender was expected to fail. */
	if (test_data->sendmsg_errno == 0)
		own_hash = hash_djb2(buf, expected);

	free(buf);

	/* Waiting for some result. */
	peer_hash = control_readulong();
	if (peer_hash != own_hash) {
		fprintf(stderr, "hash mismatch\n");
		exit(EXIT_FAILURE);
	}

	control_expectln("DONE");
	close(fd);
}
void test_stream_msgzcopy_server(const struct test_opts *opts)
{
int i;
for (i = 0; i < ARRAY_SIZE(test_data_array); i++)
test_server(opts, &test_data_array[i], false);
}
void test_seqpacket_msgzcopy_server(const struct test_opts *opts)
{
int i;
for (i = 0; i < ARRAY_SIZE(test_data_array); i++) {
if (test_data_array[i].stream_only)
continue;
test_server(opts, &test_data_array[i], true);
}
}
/* Client: checks that reading the error queue of a freshly connected
 * stream socket, on which nothing was sent, fails ('recvmsg()' with
 * MSG_ERRQUEUE must return -1). Reports "DONE" to the server after
 * that.
 */
void test_stream_msgzcopy_empty_errq_client(const struct test_opts *opts)
{
	char control[128];
	struct msghdr hdr = {
		.msg_control = control,
		.msg_controllen = sizeof(control),
	};
	ssize_t ret;
	int fd;

	fd = vsock_stream_connect(opts->peer_cid, 1234);
	if (fd < 0) {
		perror("connect");
		exit(EXIT_FAILURE);
	}

	ret = recvmsg(fd, &hdr, MSG_ERRQUEUE);
	if (ret != -1) {
		fprintf(stderr, "expected 'recvmsg(2)' failure, got %zi\n",
			ret);
		exit(EXIT_FAILURE);
	}

	control_writeln("DONE");
	close(fd);
}
/* Server: accepts the client's connection and keeps it open until the
 * client reports "DONE" over the control channel.
 */
void test_stream_msgzcopy_empty_errq_server(const struct test_opts *opts)
{
	int fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);

	if (fd < 0) {
		perror("accept");
		exit(EXIT_FAILURE);
	}

	control_expectln("DONE");
	close(fd);
}

View File

@ -0,0 +1,15 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/* Entry points of the vsock MSG_ZEROCOPY tests. Each test consists of
 * a client and a server part.
 */
#ifndef VSOCK_TEST_ZEROCOPY_H
#define VSOCK_TEST_ZEROCOPY_H
#include "util.h"
/* MSG_ZEROCOPY transmission over SOCK_STREAM. */
void test_stream_msgzcopy_client(const struct test_opts *opts);
void test_stream_msgzcopy_server(const struct test_opts *opts);
/* MSG_ZEROCOPY transmission over SOCK_SEQPACKET. */
void test_seqpacket_msgzcopy_client(const struct test_opts *opts);
void test_seqpacket_msgzcopy_server(const struct test_opts *opts);
/* Reading MSG_ERRQUEUE of a SOCK_STREAM socket when it is empty. */
void test_stream_msgzcopy_empty_errq_client(const struct test_opts *opts);
void test_stream_msgzcopy_empty_errq_server(const struct test_opts *opts);
#endif /* VSOCK_TEST_ZEROCOPY_H */

View File

@ -0,0 +1,342 @@
// SPDX-License-Identifier: GPL-2.0-only
/* io_uring tests for vsock
*
* Copyright (C) 2023 SberDevices.
*
* Author: Arseniy Krasnov <avkrasnov@salutedevices.com>
*/
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <liburing.h>
#include <unistd.h>
#include <sys/mman.h>
#include <linux/kernel.h>
#include <error.h>
#include "util.h"
#include "control.h"
#include "msg_zerocopy_common.h"
#ifndef PAGE_SIZE
#define PAGE_SIZE 4096
#endif
#define RING_ENTRIES_NUM 4
#define VSOCK_TEST_DATA_MAX_IOV 3
/* Description of one io_uring test scenario. Field order matters:
 * the initializers of 'test_data_array' fill 'vecs' positionally.
 */
struct vsock_io_uring_test {
/* Number of valid elements in 'vecs'. */
int vecs_cnt;
/* Pattern passed to 'alloc_test_iovec()' to build the real iovec. */
struct iovec vecs[VSOCK_TEST_DATA_MAX_IOV];
};
/* Scenarios run by both the client and the server side, in this order.
 * In each 'vecs' entry a NULL base requests a page aligned buffer, a
 * non-NULL base is a byte offset making the buffer unaligned.
 */
static struct vsock_io_uring_test test_data_array[] = {
/* All elements have page aligned base and size. */
{
.vecs_cnt = 3,
{
{ NULL, PAGE_SIZE },
{ NULL, 2 * PAGE_SIZE },
{ NULL, 3 * PAGE_SIZE },
}
},
/* Middle element has both non-page aligned base and size. */
{
.vecs_cnt = 3,
{
{ NULL, PAGE_SIZE },
{ (void *)1, 200 },
{ NULL, 3 * PAGE_SIZE },
}
}
};
static void vsock_io_uring_client(const struct test_opts *opts,
const struct vsock_io_uring_test *test_data,
bool msg_zerocopy)
{
struct io_uring_sqe *sqe;
struct io_uring_cqe *cqe;
struct io_uring ring;
struct iovec *iovec;
struct msghdr msg;
int fd;
fd = vsock_stream_connect(opts->peer_cid, 1234);
if (fd < 0) {
perror("connect");
exit(EXIT_FAILURE);
}
if (msg_zerocopy)
enable_so_zerocopy(fd);
iovec = alloc_test_iovec(test_data->vecs, test_data->vecs_cnt);
if (io_uring_queue_init(RING_ENTRIES_NUM, &ring, 0))
error(1, errno, "io_uring_queue_init");
if (io_uring_register_buffers(&ring, iovec, test_data->vecs_cnt))
error(1, errno, "io_uring_register_buffers");
memset(&msg, 0, sizeof(msg));
msg.msg_iov = iovec;
msg.msg_iovlen = test_data->vecs_cnt;
sqe = io_uring_get_sqe(&ring);
if (msg_zerocopy)
io_uring_prep_sendmsg_zc(sqe, fd, &msg, 0);
else
io_uring_prep_sendmsg(sqe, fd, &msg, 0);
if (io_uring_submit(&ring) != 1)
error(1, errno, "io_uring_submit");
if (io_uring_wait_cqe(&ring, &cqe))
error(1, errno, "io_uring_wait_cqe");
io_uring_cqe_seen(&ring, cqe);
control_writeulong(iovec_hash_djb2(iovec, test_data->vecs_cnt));
control_writeln("DONE");
io_uring_queue_exit(&ring);
free_test_iovec(test_data->vecs, iovec, test_data->vecs_cnt);
close(fd);
}
/* Server side of one test case.
 *
 * Accepts a connection on port 1234, reads as many bytes as 'test_data'
 * describes using io_uring readv requests, then compares the hash of the
 * received data with the hash sent by the client over the control channel
 * and waits for the client's "DONE".
 *
 * 'opts' is not used here. Any failure terminates the process with a
 * non-zero exit status.
 */
static void vsock_io_uring_server(const struct test_opts *opts,
				  const struct vsock_io_uring_test *test_data)
{
	unsigned long remote_hash;
	unsigned long local_hash;
	struct io_uring ring;
	size_t data_len;
	size_t recv_len;
	void *data;
	int ret;
	int fd;

	fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
	if (fd < 0) {
		perror("accept");
		exit(EXIT_FAILURE);
	}

	data_len = iovec_bytes(test_data->vecs, test_data->vecs_cnt);

	data = malloc(data_len);
	if (!data) {
		perror("malloc");
		exit(EXIT_FAILURE);
	}

	/* liburing reports failures as a negative errno in the return value
	 * and does not set 'errno', so the return value is what gets printed.
	 */
	ret = io_uring_queue_init(RING_ENTRIES_NUM, &ring, 0);
	if (ret)
		error(1, -ret, "io_uring_queue_init");

	recv_len = 0;

	while (recv_len < data_len) {
		struct io_uring_sqe *sqe;
		struct io_uring_cqe *cqe;
		struct iovec iovec;
		int res;

		sqe = io_uring_get_sqe(&ring);
		if (!sqe) {
			fprintf(stderr, "io_uring_get_sqe: no free submission entry\n");
			exit(EXIT_FAILURE);
		}

		/* Only the space still free in 'data' may be offered to the
		 * kernel, otherwise a read that follows a partial one could
		 * write past the end of the buffer.
		 */
		iovec.iov_base = (unsigned char *)data + recv_len;
		iovec.iov_len = data_len - recv_len;

		io_uring_prep_readv(sqe, fd, &iovec, 1, 0);

		ret = io_uring_submit(&ring);
		if (ret != 1)
			error(1, ret < 0 ? -ret : 0, "io_uring_submit");

		ret = io_uring_wait_cqe(&ring, &cqe);
		if (ret)
			error(1, -ret, "io_uring_wait_cqe");

		res = cqe->res;
		io_uring_cqe_seen(&ring, cqe);

		/* A negative completion result is the negated errno of the read. */
		if (res < 0)
			error(1, -res, "readv");

		/* Zero means end of stream: stop and let the length check
		 * below report the short read instead of spinning forever.
		 */
		if (res == 0)
			break;

		recv_len += res;
	}

	if (recv_len != data_len) {
		fprintf(stderr, "expected %zu, got %zu\n", data_len,
			recv_len);
		exit(EXIT_FAILURE);
	}

	local_hash = hash_djb2(data, data_len);

	remote_hash = control_readulong();
	if (remote_hash != local_hash) {
		fprintf(stderr, "hash mismatch\n");
		exit(EXIT_FAILURE);
	}

	control_expectln("DONE");
	io_uring_queue_exit(&ring);
	free(data);
	close(fd);
}
void test_stream_uring_server(const struct test_opts *opts)
{
int i;
for (i = 0; i < ARRAY_SIZE(test_data_array); i++)
vsock_io_uring_server(opts, &test_data_array[i]);
}
void test_stream_uring_client(const struct test_opts *opts)
{
int i;
for (i = 0; i < ARRAY_SIZE(test_data_array); i++)
vsock_io_uring_client(opts, &test_data_array[i], false);
}
/* Server half of the MSG_ZEROCOPY io_uring test. The receive path is the
 * same as in the plain test: one receiver run per test_data_array entry.
 */
void test_stream_uring_msg_zc_server(const struct test_opts *opts)
{
	size_t idx;

	for (idx = 0; idx < ARRAY_SIZE(test_data_array); idx++) {
		const struct vsock_io_uring_test *entry = &test_data_array[idx];

		vsock_io_uring_server(opts, entry);
	}
}
/* Client half of the MSG_ZEROCOPY io_uring test: sends every entry of
 * test_data_array, in order, with zerocopy enabled.
 */
void test_stream_uring_msg_zc_client(const struct test_opts *opts)
{
	size_t idx;

	for (idx = 0; idx < ARRAY_SIZE(test_data_array); idx++) {
		const struct vsock_io_uring_test *entry = &test_data_array[idx];

		vsock_io_uring_client(opts, entry, true);
	}
}
/* Test cases handed to run_tests() by main(). Each entry pairs a server
 * routine with the matching client routine; the empty entry ends the table.
 */
static struct test_case test_cases[] = {
{
.name = "SOCK_STREAM io_uring test",
.run_server = test_stream_uring_server,
.run_client = test_stream_uring_client,
},
{
.name = "SOCK_STREAM io_uring MSG_ZEROCOPY test",
.run_server = test_stream_uring_msg_zc_server,
.run_client = test_stream_uring_msg_zc_client,
},
/* Terminating empty entry. */
{},
};
/* No short options are accepted; everything is passed as a long option. */
static const char optstring[] = "";
/* Long options for getopt_long(). Each '.val' is the value main() matches
 * in its option switch; the empty entry ends the table.
 */
static const struct option longopts[] = {
{
.name = "control-host",
.has_arg = required_argument,
.val = 'H',
},
{
.name = "control-port",
.has_arg = required_argument,
.val = 'P',
},
{
.name = "mode",
.has_arg = required_argument,
.val = 'm',
},
{
.name = "peer-cid",
.has_arg = required_argument,
.val = 'p',
},
{
.name = "help",
.has_arg = no_argument,
.val = '?',
},
/* Terminating empty entry. */
{},
};
/* Print the command line help to stderr and terminate the process with
 * EXIT_FAILURE. Never returns.
 */
static void usage(void)
{
	/* The text contains no conversion specifiers, so it is written as is. */
	static const char help_text[] =
		"Usage: vsock_uring_test [--help] [--control-host=<host>] --control-port=<port> --mode=client|server --peer-cid=<cid>\n"
		"\n"
		" Server: vsock_uring_test --control-port=1234 --mode=server --peer-cid=3\n"
		" Client: vsock_uring_test --control-host=192.168.0.1 --control-port=1234 --mode=client --peer-cid=2\n"
		"\n"
		"Run transmission tests using io_uring. Usage is the same as\n"
		"in ./vsock_test\n"
		"\n"
		"Options:\n"
		" --help This help message\n"
		" --control-host <host> Server IP address to connect to\n"
		" --control-port <port> Server port to listen on/connect to\n"
		" --mode client|server Server or client mode\n"
		" --peer-cid <cid> CID of the other side\n";

	fputs(help_text, stderr);
	exit(EXIT_FAILURE);
}
/* Entry point: parse the command line, set up the control channel and run
 * every entry of test_cases in the selected mode.
 *
 * --control-port, --mode and --peer-cid are mandatory. --control-host may
 * be omitted only in server mode, where it defaults to "0.0.0.0".
 *
 * Returns 0 after the tests have run, or EXIT_FAILURE for an invalid
 * --mode value; other command line errors end in usage(), which exits.
 */
int main(int argc, char **argv)
{
	struct test_opts opts = {
		.mode = TEST_MODE_UNSET,
		.peer_cid = VMADDR_CID_ANY,
	};
	const char *control_host = NULL;
	const char *control_port = NULL;
	int opt;

	init_signals();

	while ((opt = getopt_long(argc, argv, optstring, longopts, NULL)) != -1) {
		switch (opt) {
		case 'H':
			control_host = optarg;
			break;
		case 'P':
			control_port = optarg;
			break;
		case 'p':
			opts.peer_cid = parse_cid(optarg);
			break;
		case 'm':
			if (!strcmp(optarg, "client")) {
				opts.mode = TEST_MODE_CLIENT;
			} else if (!strcmp(optarg, "server")) {
				opts.mode = TEST_MODE_SERVER;
			} else {
				fprintf(stderr, "--mode must be \"client\" or \"server\"\n");
				return EXIT_FAILURE;
			}
			break;
		default:
			/* Covers '?' (--help or an unknown option) as well. */
			usage();
		}
	}

	/* usage() does not return, so the mandatory checks can be merged. */
	if (!control_port ||
	    opts.mode == TEST_MODE_UNSET ||
	    opts.peer_cid == VMADDR_CID_ANY)
		usage();

	if (!control_host) {
		/* Only the server may omit the control host. */
		if (opts.mode != TEST_MODE_SERVER)
			usage();

		control_host = "0.0.0.0";
	}

	control_init(control_host, control_port,
		     opts.mode == TEST_MODE_SERVER);

	run_tests(test_cases, &opts);

	control_cleanup();

	return 0;
}