qemu/hw/net/spapr_llan.c

822 lines
25 KiB
C
Raw Normal View History

/*
* QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
*
* PAPR Inter-VM Logical Lan, aka ibmveth
*
* Copyright (c) 2010,2011 David Gibson, IBM Corporation.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*/
#include "qemu/osdep.h"
#include "qemu-common.h"
#include "cpu.h"
#include "hw/hw.h"
#include "qemu/log.h"
#include "net/net.h"
#include "hw/qdev.h"
#include "hw/ppc/spapr.h"
#include "hw/ppc/spapr_vio.h"
#include "sysemu/sysemu.h"
#include <libfdt.h>
#define ETH_ALEN 6
#define MAX_PACKET_SIZE 65536
/*#define DEBUG*/
#ifdef DEBUG
#define DPRINTF(fmt...) do { fprintf(stderr, fmt); } while (0)
#else
#define DPRINTF(fmt...)
#endif
hw/net/spapr_llan: Fix receive buffer handling for better performance tl;dr: This patch introduces an alternate way of handling the receive buffers of the spapr-vlan device, resulting in much better receive performance for the guest. Full story: One of our testers recently discovered that the performance of the spapr-vlan device is very poor compared to other NICs, and that a simple "ping -i 0.2 -s 65507 someip" in the guest can result in more than 50% lost ping packets (especially with older guest kernels < 3.17). After doing some analysis, it was clear that there is a problem with the way we handle the receive buffers in spapr_llan.c: The ibmveth driver of the guest Linux kernel tries to add a lot of buffers into several buffer pools (with 512, 2048 and 65536 byte sizes by default, but it can be changed via the entries in the /sys/devices/vio/1000/pool* directories of the guest). However, the spapr-vlan device of QEMU only tries to squeeze all receive buffer descriptors into one single page which has been supplied by the guest during the H_REGISTER_LOGICAL_LAN call, without taking care of different buffer sizes. This has two bad effects: First, only a very limited number of buffer descriptors is accepted at all. Second, we also hand 64k buffers to the guest even if the 2k buffers would fit better - and this results in dropped packets in the IP layer of the guest since too much skbuf memory is used. Though it seems at a first glance like PAPR says that we should store the receive buffer descriptors in the page that is supplied during the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec declares that "the contents of these descriptors are architecturally opaque, none of these descriptors are manipulated by code above the architected interfaces". That means we don't have to store the RX buffer descriptors in this page, but can also manage the receive buffers at the hypervisor level only. This is now what we are doing here: Introducing proper RX buffer pools which are also sorted by size of the buffers, so we can hand out a buffer with the best fitting size when a packet has been received. To avoid problems with migration from/to older version of QEMU, the old behavior is also retained and enabled by default. The new buffer management has to be enabled via a new "use-rx-buffer-pools" property. Now with the new buffer pool management enabled, the problem with "ping -s 65507" is fixed for me, and the throughput of a simple test with wget increases from creeping 3MB/s up to 20MB/s! Signed-off-by: Thomas Huth <thuth@redhat.com> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-22 00:25:23 +08:00
/* Compatibility flags for migration */
#define SPAPRVLAN_FLAG_RX_BUF_POOLS_BIT 0
#define SPAPRVLAN_FLAG_RX_BUF_POOLS (1 << SPAPRVLAN_FLAG_RX_BUF_POOLS_BIT)
/*
* Virtual LAN device
*/
typedef uint64_t vlan_bd_t;
#define VLAN_BD_VALID 0x8000000000000000ULL
#define VLAN_BD_TOGGLE 0x4000000000000000ULL
#define VLAN_BD_NO_CSUM 0x0200000000000000ULL
#define VLAN_BD_CSUM_GOOD 0x0100000000000000ULL
#define VLAN_BD_LEN_MASK 0x00ffffff00000000ULL
#define VLAN_BD_LEN(bd) (((bd) & VLAN_BD_LEN_MASK) >> 32)
#define VLAN_BD_ADDR_MASK 0x00000000ffffffffULL
#define VLAN_BD_ADDR(bd) ((bd) & VLAN_BD_ADDR_MASK)
#define VLAN_VALID_BD(addr, len) (VLAN_BD_VALID | \
(((len) << 32) & VLAN_BD_LEN_MASK) | \
(addr & VLAN_BD_ADDR_MASK))
#define VLAN_RXQC_TOGGLE 0x80
#define VLAN_RXQC_VALID 0x40
#define VLAN_RXQC_NO_CSUM 0x02
#define VLAN_RXQC_CSUM_GOOD 0x01
#define VLAN_RQ_ALIGNMENT 16
#define VLAN_RXQ_BD_OFF 0
#define VLAN_FILTER_BD_OFF 8
#define VLAN_RX_BDS_OFF 16
/*
* The final 8 bytes of the buffer list is a counter of frames dropped
* because there was not a buffer in the buffer list capable of holding
* the frame. We must avoid it, or the operating system will report garbage
* for this statistic.
*/
#define VLAN_RX_BDS_LEN (SPAPR_TCE_PAGE_SIZE - VLAN_RX_BDS_OFF - 8)
#define VLAN_MAX_BUFS (VLAN_RX_BDS_LEN / 8)
#define TYPE_VIO_SPAPR_VLAN_DEVICE "spapr-vlan"
#define VIO_SPAPR_VLAN_DEVICE(obj) \
OBJECT_CHECK(VIOsPAPRVLANDevice, (obj), TYPE_VIO_SPAPR_VLAN_DEVICE)
hw/net/spapr_llan: Fix receive buffer handling for better performance tl;dr: This patch introduces an alternate way of handling the receive buffers of the spapr-vlan device, resulting in much better receive performance for the guest. Full story: One of our testers recently discovered that the performance of the spapr-vlan device is very poor compared to other NICs, and that a simple "ping -i 0.2 -s 65507 someip" in the guest can result in more than 50% lost ping packets (especially with older guest kernels < 3.17). After doing some analysis, it was clear that there is a problem with the way we handle the receive buffers in spapr_llan.c: The ibmveth driver of the guest Linux kernel tries to add a lot of buffers into several buffer pools (with 512, 2048 and 65536 byte sizes by default, but it can be changed via the entries in the /sys/devices/vio/1000/pool* directories of the guest). However, the spapr-vlan device of QEMU only tries to squeeze all receive buffer descriptors into one single page which has been supplied by the guest during the H_REGISTER_LOGICAL_LAN call, without taking care of different buffer sizes. This has two bad effects: First, only a very limited number of buffer descriptors is accepted at all. Second, we also hand 64k buffers to the guest even if the 2k buffers would fit better - and this results in dropped packets in the IP layer of the guest since too much skbuf memory is used. Though it seems at a first glance like PAPR says that we should store the receive buffer descriptors in the page that is supplied during the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec declares that "the contents of these descriptors are architecturally opaque, none of these descriptors are manipulated by code above the architected interfaces". That means we don't have to store the RX buffer descriptors in this page, but can also manage the receive buffers at the hypervisor level only. This is now what we are doing here: Introducing proper RX buffer pools which are also sorted by size of the buffers, so we can hand out a buffer with the best fitting size when a packet has been received. To avoid problems with migration from/to older version of QEMU, the old behavior is also retained and enabled by default. The new buffer management has to be enabled via a new "use-rx-buffer-pools" property. Now with the new buffer pool management enabled, the problem with "ping -s 65507" is fixed for me, and the throughput of a simple test with wget increases from creeping 3MB/s up to 20MB/s! Signed-off-by: Thomas Huth <thuth@redhat.com> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-22 00:25:23 +08:00
#define RX_POOL_MAX_BDS 4096
#define RX_MAX_POOLS 5
typedef struct {
int32_t bufsize;
int32_t count;
vlan_bd_t bds[RX_POOL_MAX_BDS];
} RxBufPool;
typedef struct VIOsPAPRVLANDevice {
VIOsPAPRDevice sdev;
NICConf nicconf;
NICState *nic;
bool isopen;
hwaddr buf_list;
uint32_t add_buf_ptr, use_buf_ptr, rx_bufs;
hwaddr rxq_ptr;
hw/net/spapr_llan: Fix receive buffer handling for better performance tl;dr: This patch introduces an alternate way of handling the receive buffers of the spapr-vlan device, resulting in much better receive performance for the guest. Full story: One of our testers recently discovered that the performance of the spapr-vlan device is very poor compared to other NICs, and that a simple "ping -i 0.2 -s 65507 someip" in the guest can result in more than 50% lost ping packets (especially with older guest kernels < 3.17). After doing some analysis, it was clear that there is a problem with the way we handle the receive buffers in spapr_llan.c: The ibmveth driver of the guest Linux kernel tries to add a lot of buffers into several buffer pools (with 512, 2048 and 65536 byte sizes by default, but it can be changed via the entries in the /sys/devices/vio/1000/pool* directories of the guest). However, the spapr-vlan device of QEMU only tries to squeeze all receive buffer descriptors into one single page which has been supplied by the guest during the H_REGISTER_LOGICAL_LAN call, without taking care of different buffer sizes. This has two bad effects: First, only a very limited number of buffer descriptors is accepted at all. Second, we also hand 64k buffers to the guest even if the 2k buffers would fit better - and this results in dropped packets in the IP layer of the guest since too much skbuf memory is used. Though it seems at a first glance like PAPR says that we should store the receive buffer descriptors in the page that is supplied during the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec declares that "the contents of these descriptors are architecturally opaque, none of these descriptors are manipulated by code above the architected interfaces". That means we don't have to store the RX buffer descriptors in this page, but can also manage the receive buffers at the hypervisor level only. This is now what we are doing here: Introducing proper RX buffer pools which are also sorted by size of the buffers, so we can hand out a buffer with the best fitting size when a packet has been received. To avoid problems with migration from/to older version of QEMU, the old behavior is also retained and enabled by default. The new buffer management has to be enabled via a new "use-rx-buffer-pools" property. Now with the new buffer pool management enabled, the problem with "ping -s 65507" is fixed for me, and the throughput of a simple test with wget increases from creeping 3MB/s up to 20MB/s! Signed-off-by: Thomas Huth <thuth@redhat.com> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-22 00:25:23 +08:00
uint32_t compat_flags; /* Compatability flags for migration */
RxBufPool *rx_pool[RX_MAX_POOLS]; /* Receive buffer descriptor pools */
} VIOsPAPRVLANDevice;
static int spapr_vlan_can_receive(NetClientState *nc)
{
VIOsPAPRVLANDevice *dev = qemu_get_nic_opaque(nc);
return (dev->isopen && dev->rx_bufs > 0);
}
hw/net/spapr_llan: Fix receive buffer handling for better performance tl;dr: This patch introduces an alternate way of handling the receive buffers of the spapr-vlan device, resulting in much better receive performance for the guest. Full story: One of our testers recently discovered that the performance of the spapr-vlan device is very poor compared to other NICs, and that a simple "ping -i 0.2 -s 65507 someip" in the guest can result in more than 50% lost ping packets (especially with older guest kernels < 3.17). After doing some analysis, it was clear that there is a problem with the way we handle the receive buffers in spapr_llan.c: The ibmveth driver of the guest Linux kernel tries to add a lot of buffers into several buffer pools (with 512, 2048 and 65536 byte sizes by default, but it can be changed via the entries in the /sys/devices/vio/1000/pool* directories of the guest). However, the spapr-vlan device of QEMU only tries to squeeze all receive buffer descriptors into one single page which has been supplied by the guest during the H_REGISTER_LOGICAL_LAN call, without taking care of different buffer sizes. This has two bad effects: First, only a very limited number of buffer descriptors is accepted at all. Second, we also hand 64k buffers to the guest even if the 2k buffers would fit better - and this results in dropped packets in the IP layer of the guest since too much skbuf memory is used. Though it seems at a first glance like PAPR says that we should store the receive buffer descriptors in the page that is supplied during the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec declares that "the contents of these descriptors are architecturally opaque, none of these descriptors are manipulated by code above the architected interfaces". That means we don't have to store the RX buffer descriptors in this page, but can also manage the receive buffers at the hypervisor level only. This is now what we are doing here: Introducing proper RX buffer pools which are also sorted by size of the buffers, so we can hand out a buffer with the best fitting size when a packet has been received. To avoid problems with migration from/to older version of QEMU, the old behavior is also retained and enabled by default. The new buffer management has to be enabled via a new "use-rx-buffer-pools" property. Now with the new buffer pool management enabled, the problem with "ping -s 65507" is fixed for me, and the throughput of a simple test with wget increases from creeping 3MB/s up to 20MB/s! Signed-off-by: Thomas Huth <thuth@redhat.com> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-22 00:25:23 +08:00
/**
* Get buffer descriptor from one of our receive buffer pools
*/
static vlan_bd_t spapr_vlan_get_rx_bd_from_pool(VIOsPAPRVLANDevice *dev,
size_t size)
{
vlan_bd_t bd;
int pool;
for (pool = 0; pool < RX_MAX_POOLS; pool++) {
if (dev->rx_pool[pool]->count > 0 &&
dev->rx_pool[pool]->bufsize >= size + 8) {
break;
}
}
if (pool == RX_MAX_POOLS) {
/* Failed to find a suitable buffer */
return 0;
}
DPRINTF("Found buffer: pool=%d count=%d rxbufs=%d\n", pool,
dev->rx_pool[pool]->count, dev->rx_bufs);
/* Remove the buffer from the pool */
dev->rx_pool[pool]->count--;
bd = dev->rx_pool[pool]->bds[dev->rx_pool[pool]->count];
dev->rx_pool[pool]->bds[dev->rx_pool[pool]->count] = 0;
return bd;
}
/**
* Get buffer descriptor from the receive buffer list page that has been
* supplied by the guest with the H_REGISTER_LOGICAL_LAN call
*/
static vlan_bd_t spapr_vlan_get_rx_bd_from_page(VIOsPAPRVLANDevice *dev,
size_t size)
{
int buf_ptr = dev->use_buf_ptr;
vlan_bd_t bd;
do {
buf_ptr += 8;
if (buf_ptr >= VLAN_RX_BDS_LEN + VLAN_RX_BDS_OFF) {
buf_ptr = VLAN_RX_BDS_OFF;
}
bd = vio_ldq(&dev->sdev, dev->buf_list + buf_ptr);
DPRINTF("use_buf_ptr=%d bd=0x%016llx\n",
buf_ptr, (unsigned long long)bd);
} while ((!(bd & VLAN_BD_VALID) || VLAN_BD_LEN(bd) < size + 8)
&& buf_ptr != dev->use_buf_ptr);
if (!(bd & VLAN_BD_VALID) || VLAN_BD_LEN(bd) < size + 8) {
/* Failed to find a suitable buffer */
return 0;
}
/* Remove the buffer from the pool */
dev->use_buf_ptr = buf_ptr;
vio_stq(&dev->sdev, dev->buf_list + dev->use_buf_ptr, 0);
DPRINTF("Found buffer: ptr=%d rxbufs=%d\n", dev->use_buf_ptr, dev->rx_bufs);
return bd;
}
static ssize_t spapr_vlan_receive(NetClientState *nc, const uint8_t *buf,
size_t size)
{
VIOsPAPRVLANDevice *dev = qemu_get_nic_opaque(nc);
VIOsPAPRDevice *sdev = VIO_SPAPR_DEVICE(dev);
vlan_bd_t rxq_bd = vio_ldq(sdev, dev->buf_list + VLAN_RXQ_BD_OFF);
vlan_bd_t bd;
uint64_t handle;
uint8_t control;
DPRINTF("spapr_vlan_receive() [%s] rx_bufs=%d\n", sdev->qdev.id,
dev->rx_bufs);
if (!dev->isopen) {
return -1;
}
if (!dev->rx_bufs) {
return -1;
}
hw/net/spapr_llan: Fix receive buffer handling for better performance tl;dr: This patch introduces an alternate way of handling the receive buffers of the spapr-vlan device, resulting in much better receive performance for the guest. Full story: One of our testers recently discovered that the performance of the spapr-vlan device is very poor compared to other NICs, and that a simple "ping -i 0.2 -s 65507 someip" in the guest can result in more than 50% lost ping packets (especially with older guest kernels < 3.17). After doing some analysis, it was clear that there is a problem with the way we handle the receive buffers in spapr_llan.c: The ibmveth driver of the guest Linux kernel tries to add a lot of buffers into several buffer pools (with 512, 2048 and 65536 byte sizes by default, but it can be changed via the entries in the /sys/devices/vio/1000/pool* directories of the guest). However, the spapr-vlan device of QEMU only tries to squeeze all receive buffer descriptors into one single page which has been supplied by the guest during the H_REGISTER_LOGICAL_LAN call, without taking care of different buffer sizes. This has two bad effects: First, only a very limited number of buffer descriptors is accepted at all. Second, we also hand 64k buffers to the guest even if the 2k buffers would fit better - and this results in dropped packets in the IP layer of the guest since too much skbuf memory is used. Though it seems at a first glance like PAPR says that we should store the receive buffer descriptors in the page that is supplied during the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec declares that "the contents of these descriptors are architecturally opaque, none of these descriptors are manipulated by code above the architected interfaces". That means we don't have to store the RX buffer descriptors in this page, but can also manage the receive buffers at the hypervisor level only. This is now what we are doing here: Introducing proper RX buffer pools which are also sorted by size of the buffers, so we can hand out a buffer with the best fitting size when a packet has been received. To avoid problems with migration from/to older version of QEMU, the old behavior is also retained and enabled by default. The new buffer management has to be enabled via a new "use-rx-buffer-pools" property. Now with the new buffer pool management enabled, the problem with "ping -s 65507" is fixed for me, and the throughput of a simple test with wget increases from creeping 3MB/s up to 20MB/s! Signed-off-by: Thomas Huth <thuth@redhat.com> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-22 00:25:23 +08:00
if (dev->compat_flags & SPAPRVLAN_FLAG_RX_BUF_POOLS) {
bd = spapr_vlan_get_rx_bd_from_pool(dev, size);
} else {
bd = spapr_vlan_get_rx_bd_from_page(dev, size);
}
if (!bd) {
return -1;
}
dev->rx_bufs--;
/* Transfer the packet data */
if (spapr_vio_dma_write(sdev, VLAN_BD_ADDR(bd) + 8, buf, size) < 0) {
return -1;
}
DPRINTF("spapr_vlan_receive: DMA write completed\n");
/* Update the receive queue */
control = VLAN_RXQC_TOGGLE | VLAN_RXQC_VALID;
if (rxq_bd & VLAN_BD_TOGGLE) {
control ^= VLAN_RXQC_TOGGLE;
}
handle = vio_ldq(sdev, VLAN_BD_ADDR(bd));
vio_stq(sdev, VLAN_BD_ADDR(rxq_bd) + dev->rxq_ptr + 8, handle);
vio_stl(sdev, VLAN_BD_ADDR(rxq_bd) + dev->rxq_ptr + 4, size);
vio_sth(sdev, VLAN_BD_ADDR(rxq_bd) + dev->rxq_ptr + 2, 8);
vio_stb(sdev, VLAN_BD_ADDR(rxq_bd) + dev->rxq_ptr, control);
DPRINTF("wrote rxq entry (ptr=0x%llx): 0x%016llx 0x%016llx\n",
(unsigned long long)dev->rxq_ptr,
(unsigned long long)vio_ldq(sdev, VLAN_BD_ADDR(rxq_bd) +
dev->rxq_ptr),
(unsigned long long)vio_ldq(sdev, VLAN_BD_ADDR(rxq_bd) +
dev->rxq_ptr + 8));
dev->rxq_ptr += 16;
if (dev->rxq_ptr >= VLAN_BD_LEN(rxq_bd)) {
dev->rxq_ptr = 0;
vio_stq(sdev, dev->buf_list + VLAN_RXQ_BD_OFF, rxq_bd ^ VLAN_BD_TOGGLE);
}
if (sdev->signal_state & 1) {
qemu_irq_pulse(spapr_vio_qirq(sdev));
}
return size;
}
static NetClientInfo net_spapr_vlan_info = {
.type = NET_CLIENT_OPTIONS_KIND_NIC,
.size = sizeof(NICState),
.can_receive = spapr_vlan_can_receive,
.receive = spapr_vlan_receive,
};
hw/net/spapr_llan: Fix receive buffer handling for better performance tl;dr: This patch introduces an alternate way of handling the receive buffers of the spapr-vlan device, resulting in much better receive performance for the guest. Full story: One of our testers recently discovered that the performance of the spapr-vlan device is very poor compared to other NICs, and that a simple "ping -i 0.2 -s 65507 someip" in the guest can result in more than 50% lost ping packets (especially with older guest kernels < 3.17). After doing some analysis, it was clear that there is a problem with the way we handle the receive buffers in spapr_llan.c: The ibmveth driver of the guest Linux kernel tries to add a lot of buffers into several buffer pools (with 512, 2048 and 65536 byte sizes by default, but it can be changed via the entries in the /sys/devices/vio/1000/pool* directories of the guest). However, the spapr-vlan device of QEMU only tries to squeeze all receive buffer descriptors into one single page which has been supplied by the guest during the H_REGISTER_LOGICAL_LAN call, without taking care of different buffer sizes. This has two bad effects: First, only a very limited number of buffer descriptors is accepted at all. Second, we also hand 64k buffers to the guest even if the 2k buffers would fit better - and this results in dropped packets in the IP layer of the guest since too much skbuf memory is used. Though it seems at a first glance like PAPR says that we should store the receive buffer descriptors in the page that is supplied during the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec declares that "the contents of these descriptors are architecturally opaque, none of these descriptors are manipulated by code above the architected interfaces". That means we don't have to store the RX buffer descriptors in this page, but can also manage the receive buffers at the hypervisor level only. This is now what we are doing here: Introducing proper RX buffer pools which are also sorted by size of the buffers, so we can hand out a buffer with the best fitting size when a packet has been received. To avoid problems with migration from/to older version of QEMU, the old behavior is also retained and enabled by default. The new buffer management has to be enabled via a new "use-rx-buffer-pools" property. Now with the new buffer pool management enabled, the problem with "ping -s 65507" is fixed for me, and the throughput of a simple test with wget increases from creeping 3MB/s up to 20MB/s! Signed-off-by: Thomas Huth <thuth@redhat.com> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-22 00:25:23 +08:00
static void spapr_vlan_reset_rx_pool(RxBufPool *rxp)
{
/*
* Use INT_MAX as bufsize so that unused buffers are moved to the end
* of the list during the qsort in spapr_vlan_add_rxbuf_to_pool() later.
*/
rxp->bufsize = INT_MAX;
rxp->count = 0;
memset(rxp->bds, 0, sizeof(rxp->bds));
}
static void spapr_vlan_reset(VIOsPAPRDevice *sdev)
{
VIOsPAPRVLANDevice *dev = VIO_SPAPR_VLAN_DEVICE(sdev);
hw/net/spapr_llan: Fix receive buffer handling for better performance tl;dr: This patch introduces an alternate way of handling the receive buffers of the spapr-vlan device, resulting in much better receive performance for the guest. Full story: One of our testers recently discovered that the performance of the spapr-vlan device is very poor compared to other NICs, and that a simple "ping -i 0.2 -s 65507 someip" in the guest can result in more than 50% lost ping packets (especially with older guest kernels < 3.17). After doing some analysis, it was clear that there is a problem with the way we handle the receive buffers in spapr_llan.c: The ibmveth driver of the guest Linux kernel tries to add a lot of buffers into several buffer pools (with 512, 2048 and 65536 byte sizes by default, but it can be changed via the entries in the /sys/devices/vio/1000/pool* directories of the guest). However, the spapr-vlan device of QEMU only tries to squeeze all receive buffer descriptors into one single page which has been supplied by the guest during the H_REGISTER_LOGICAL_LAN call, without taking care of different buffer sizes. This has two bad effects: First, only a very limited number of buffer descriptors is accepted at all. Second, we also hand 64k buffers to the guest even if the 2k buffers would fit better - and this results in dropped packets in the IP layer of the guest since too much skbuf memory is used. Though it seems at a first glance like PAPR says that we should store the receive buffer descriptors in the page that is supplied during the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec declares that "the contents of these descriptors are architecturally opaque, none of these descriptors are manipulated by code above the architected interfaces". That means we don't have to store the RX buffer descriptors in this page, but can also manage the receive buffers at the hypervisor level only. This is now what we are doing here: Introducing proper RX buffer pools which are also sorted by size of the buffers, so we can hand out a buffer with the best fitting size when a packet has been received. To avoid problems with migration from/to older version of QEMU, the old behavior is also retained and enabled by default. The new buffer management has to be enabled via a new "use-rx-buffer-pools" property. Now with the new buffer pool management enabled, the problem with "ping -s 65507" is fixed for me, and the throughput of a simple test with wget increases from creeping 3MB/s up to 20MB/s! Signed-off-by: Thomas Huth <thuth@redhat.com> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-22 00:25:23 +08:00
int i;
dev->buf_list = 0;
dev->rx_bufs = 0;
dev->isopen = 0;
hw/net/spapr_llan: Fix receive buffer handling for better performance tl;dr: This patch introduces an alternate way of handling the receive buffers of the spapr-vlan device, resulting in much better receive performance for the guest. Full story: One of our testers recently discovered that the performance of the spapr-vlan device is very poor compared to other NICs, and that a simple "ping -i 0.2 -s 65507 someip" in the guest can result in more than 50% lost ping packets (especially with older guest kernels < 3.17). After doing some analysis, it was clear that there is a problem with the way we handle the receive buffers in spapr_llan.c: The ibmveth driver of the guest Linux kernel tries to add a lot of buffers into several buffer pools (with 512, 2048 and 65536 byte sizes by default, but it can be changed via the entries in the /sys/devices/vio/1000/pool* directories of the guest). However, the spapr-vlan device of QEMU only tries to squeeze all receive buffer descriptors into one single page which has been supplied by the guest during the H_REGISTER_LOGICAL_LAN call, without taking care of different buffer sizes. This has two bad effects: First, only a very limited number of buffer descriptors is accepted at all. Second, we also hand 64k buffers to the guest even if the 2k buffers would fit better - and this results in dropped packets in the IP layer of the guest since too much skbuf memory is used. Though it seems at a first glance like PAPR says that we should store the receive buffer descriptors in the page that is supplied during the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec declares that "the contents of these descriptors are architecturally opaque, none of these descriptors are manipulated by code above the architected interfaces". That means we don't have to store the RX buffer descriptors in this page, but can also manage the receive buffers at the hypervisor level only. This is now what we are doing here: Introducing proper RX buffer pools which are also sorted by size of the buffers, so we can hand out a buffer with the best fitting size when a packet has been received. To avoid problems with migration from/to older version of QEMU, the old behavior is also retained and enabled by default. The new buffer management has to be enabled via a new "use-rx-buffer-pools" property. Now with the new buffer pool management enabled, the problem with "ping -s 65507" is fixed for me, and the throughput of a simple test with wget increases from creeping 3MB/s up to 20MB/s! Signed-off-by: Thomas Huth <thuth@redhat.com> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-22 00:25:23 +08:00
if (dev->compat_flags & SPAPRVLAN_FLAG_RX_BUF_POOLS) {
for (i = 0; i < RX_MAX_POOLS; i++) {
spapr_vlan_reset_rx_pool(dev->rx_pool[i]);
}
}
}
static void spapr_vlan_realize(VIOsPAPRDevice *sdev, Error **errp)
{
VIOsPAPRVLANDevice *dev = VIO_SPAPR_VLAN_DEVICE(sdev);
qemu_macaddr_default_if_unset(&dev->nicconf.macaddr);
dev->nic = qemu_new_nic(&net_spapr_vlan_info, &dev->nicconf,
object_get_typename(OBJECT(sdev)), sdev->qdev.id, dev);
qemu_format_nic_info_str(qemu_get_queue(dev->nic), dev->nicconf.macaddr.a);
}
static void spapr_vlan_instance_init(Object *obj)
{
VIOsPAPRVLANDevice *dev = VIO_SPAPR_VLAN_DEVICE(obj);
hw/net/spapr_llan: Fix receive buffer handling for better performance tl;dr: This patch introduces an alternate way of handling the receive buffers of the spapr-vlan device, resulting in much better receive performance for the guest. Full story: One of our testers recently discovered that the performance of the spapr-vlan device is very poor compared to other NICs, and that a simple "ping -i 0.2 -s 65507 someip" in the guest can result in more than 50% lost ping packets (especially with older guest kernels < 3.17). After doing some analysis, it was clear that there is a problem with the way we handle the receive buffers in spapr_llan.c: The ibmveth driver of the guest Linux kernel tries to add a lot of buffers into several buffer pools (with 512, 2048 and 65536 byte sizes by default, but it can be changed via the entries in the /sys/devices/vio/1000/pool* directories of the guest). However, the spapr-vlan device of QEMU only tries to squeeze all receive buffer descriptors into one single page which has been supplied by the guest during the H_REGISTER_LOGICAL_LAN call, without taking care of different buffer sizes. This has two bad effects: First, only a very limited number of buffer descriptors is accepted at all. Second, we also hand 64k buffers to the guest even if the 2k buffers would fit better - and this results in dropped packets in the IP layer of the guest since too much skbuf memory is used. Though it seems at a first glance like PAPR says that we should store the receive buffer descriptors in the page that is supplied during the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec declares that "the contents of these descriptors are architecturally opaque, none of these descriptors are manipulated by code above the architected interfaces". That means we don't have to store the RX buffer descriptors in this page, but can also manage the receive buffers at the hypervisor level only. This is now what we are doing here: Introducing proper RX buffer pools which are also sorted by size of the buffers, so we can hand out a buffer with the best fitting size when a packet has been received. To avoid problems with migration from/to older version of QEMU, the old behavior is also retained and enabled by default. The new buffer management has to be enabled via a new "use-rx-buffer-pools" property. Now with the new buffer pool management enabled, the problem with "ping -s 65507" is fixed for me, and the throughput of a simple test with wget increases from creeping 3MB/s up to 20MB/s! Signed-off-by: Thomas Huth <thuth@redhat.com> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-22 00:25:23 +08:00
int i;
device_add_bootindex_property(obj, &dev->nicconf.bootindex,
"bootindex", "",
DEVICE(dev), NULL);
hw/net/spapr_llan: Fix receive buffer handling for better performance tl;dr: This patch introduces an alternate way of handling the receive buffers of the spapr-vlan device, resulting in much better receive performance for the guest. Full story: One of our testers recently discovered that the performance of the spapr-vlan device is very poor compared to other NICs, and that a simple "ping -i 0.2 -s 65507 someip" in the guest can result in more than 50% lost ping packets (especially with older guest kernels < 3.17). After doing some analysis, it was clear that there is a problem with the way we handle the receive buffers in spapr_llan.c: The ibmveth driver of the guest Linux kernel tries to add a lot of buffers into several buffer pools (with 512, 2048 and 65536 byte sizes by default, but it can be changed via the entries in the /sys/devices/vio/1000/pool* directories of the guest). However, the spapr-vlan device of QEMU only tries to squeeze all receive buffer descriptors into one single page which has been supplied by the guest during the H_REGISTER_LOGICAL_LAN call, without taking care of different buffer sizes. This has two bad effects: First, only a very limited number of buffer descriptors is accepted at all. Second, we also hand 64k buffers to the guest even if the 2k buffers would fit better - and this results in dropped packets in the IP layer of the guest since too much skbuf memory is used. Though it seems at a first glance like PAPR says that we should store the receive buffer descriptors in the page that is supplied during the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec declares that "the contents of these descriptors are architecturally opaque, none of these descriptors are manipulated by code above the architected interfaces". That means we don't have to store the RX buffer descriptors in this page, but can also manage the receive buffers at the hypervisor level only. This is now what we are doing here: Introducing proper RX buffer pools which are also sorted by size of the buffers, so we can hand out a buffer with the best fitting size when a packet has been received. To avoid problems with migration from/to older version of QEMU, the old behavior is also retained and enabled by default. The new buffer management has to be enabled via a new "use-rx-buffer-pools" property. Now with the new buffer pool management enabled, the problem with "ping -s 65507" is fixed for me, and the throughput of a simple test with wget increases from creeping 3MB/s up to 20MB/s! Signed-off-by: Thomas Huth <thuth@redhat.com> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-22 00:25:23 +08:00
if (dev->compat_flags & SPAPRVLAN_FLAG_RX_BUF_POOLS) {
for (i = 0; i < RX_MAX_POOLS; i++) {
dev->rx_pool[i] = g_new(RxBufPool, 1);
spapr_vlan_reset_rx_pool(dev->rx_pool[i]);
}
}
}
static void spapr_vlan_instance_finalize(Object *obj)
{
VIOsPAPRVLANDevice *dev = VIO_SPAPR_VLAN_DEVICE(obj);
int i;
if (dev->compat_flags & SPAPRVLAN_FLAG_RX_BUF_POOLS) {
for (i = 0; i < RX_MAX_POOLS; i++) {
g_free(dev->rx_pool[i]);
dev->rx_pool[i] = NULL;
}
}
}
void spapr_vlan_create(VIOsPAPRBus *bus, NICInfo *nd)
{
DeviceState *dev;
dev = qdev_create(&bus->bus, "spapr-vlan");
qdev_set_nic_properties(dev, nd);
qdev_init_nofail(dev);
}
static int spapr_vlan_devnode(VIOsPAPRDevice *dev, void *fdt, int node_off)
{
VIOsPAPRVLANDevice *vdev = VIO_SPAPR_VLAN_DEVICE(dev);
uint8_t padded_mac[8] = {0, 0};
int ret;
/* Some old phyp versions give the mac address in an 8-byte
* property. The kernel driver has an insane workaround for this;
* rather than doing the obvious thing and checking the property
* length, it checks whether the first byte has 0b10 in the low
* bits. If a correct 6-byte property has a different first byte
* the kernel will get the wrong mac address, overrunning its
* buffer in the process (read only, thank goodness).
*
* Here we workaround the kernel workaround by always supplying an
* 8-byte property, with the mac address in the last six bytes */
memcpy(&padded_mac[2], &vdev->nicconf.macaddr, ETH_ALEN);
ret = fdt_setprop(fdt, node_off, "local-mac-address",
padded_mac, sizeof(padded_mac));
if (ret < 0) {
return ret;
}
ret = fdt_setprop_cell(fdt, node_off, "ibm,mac-address-filters", 0);
if (ret < 0) {
return ret;
}
return 0;
}
static int check_bd(VIOsPAPRVLANDevice *dev, vlan_bd_t bd,
target_ulong alignment)
{
if ((VLAN_BD_ADDR(bd) % alignment)
|| (VLAN_BD_LEN(bd) % alignment)) {
return -1;
}
if (!spapr_vio_dma_valid(&dev->sdev, VLAN_BD_ADDR(bd),
VLAN_BD_LEN(bd), DMA_DIRECTION_FROM_DEVICE)
|| !spapr_vio_dma_valid(&dev->sdev, VLAN_BD_ADDR(bd),
VLAN_BD_LEN(bd), DMA_DIRECTION_TO_DEVICE)) {
return -1;
}
return 0;
}
static target_ulong h_register_logical_lan(PowerPCCPU *cpu,
sPAPRMachineState *spapr,
target_ulong opcode,
target_ulong *args)
{
target_ulong reg = args[0];
target_ulong buf_list = args[1];
target_ulong rec_queue = args[2];
target_ulong filter_list = args[3];
VIOsPAPRDevice *sdev = spapr_vio_find_by_reg(spapr->vio_bus, reg);
VIOsPAPRVLANDevice *dev = VIO_SPAPR_VLAN_DEVICE(sdev);
vlan_bd_t filter_list_bd;
if (!dev) {
return H_PARAMETER;
}
if (dev->isopen) {
hcall_dprintf("H_REGISTER_LOGICAL_LAN called twice without "
"H_FREE_LOGICAL_LAN\n");
return H_RESOURCE;
}
if (check_bd(dev, VLAN_VALID_BD(buf_list, SPAPR_TCE_PAGE_SIZE),
SPAPR_TCE_PAGE_SIZE) < 0) {
hcall_dprintf("Bad buf_list 0x" TARGET_FMT_lx "\n", buf_list);
return H_PARAMETER;
}
filter_list_bd = VLAN_VALID_BD(filter_list, SPAPR_TCE_PAGE_SIZE);
if (check_bd(dev, filter_list_bd, SPAPR_TCE_PAGE_SIZE) < 0) {
hcall_dprintf("Bad filter_list 0x" TARGET_FMT_lx "\n", filter_list);
return H_PARAMETER;
}
if (!(rec_queue & VLAN_BD_VALID)
|| (check_bd(dev, rec_queue, VLAN_RQ_ALIGNMENT) < 0)) {
hcall_dprintf("Bad receive queue\n");
return H_PARAMETER;
}
dev->buf_list = buf_list;
sdev->signal_state = 0;
rec_queue &= ~VLAN_BD_TOGGLE;
/* Initialize the buffer list */
vio_stq(sdev, buf_list, rec_queue);
vio_stq(sdev, buf_list + 8, filter_list_bd);
spapr_vio_dma_set(sdev, buf_list + VLAN_RX_BDS_OFF, 0,
SPAPR_TCE_PAGE_SIZE - VLAN_RX_BDS_OFF);
dev->add_buf_ptr = VLAN_RX_BDS_OFF - 8;
dev->use_buf_ptr = VLAN_RX_BDS_OFF - 8;
dev->rx_bufs = 0;
dev->rxq_ptr = 0;
/* Initialize the receive queue */
spapr_vio_dma_set(sdev, VLAN_BD_ADDR(rec_queue), 0, VLAN_BD_LEN(rec_queue));
dev->isopen = 1;
qemu_flush_queued_packets(qemu_get_queue(dev->nic));
return H_SUCCESS;
}
static target_ulong h_free_logical_lan(PowerPCCPU *cpu,
sPAPRMachineState *spapr,
target_ulong opcode, target_ulong *args)
{
target_ulong reg = args[0];
VIOsPAPRDevice *sdev = spapr_vio_find_by_reg(spapr->vio_bus, reg);
VIOsPAPRVLANDevice *dev = VIO_SPAPR_VLAN_DEVICE(sdev);
if (!dev) {
return H_PARAMETER;
}
if (!dev->isopen) {
hcall_dprintf("H_FREE_LOGICAL_LAN called without "
"H_REGISTER_LOGICAL_LAN\n");
return H_RESOURCE;
}
spapr_vlan_reset(sdev);
return H_SUCCESS;
}
hw/net/spapr_llan: Fix receive buffer handling for better performance tl;dr: This patch introduces an alternate way of handling the receive buffers of the spapr-vlan device, resulting in much better receive performance for the guest. Full story: One of our testers recently discovered that the performance of the spapr-vlan device is very poor compared to other NICs, and that a simple "ping -i 0.2 -s 65507 someip" in the guest can result in more than 50% lost ping packets (especially with older guest kernels < 3.17). After doing some analysis, it was clear that there is a problem with the way we handle the receive buffers in spapr_llan.c: The ibmveth driver of the guest Linux kernel tries to add a lot of buffers into several buffer pools (with 512, 2048 and 65536 byte sizes by default, but it can be changed via the entries in the /sys/devices/vio/1000/pool* directories of the guest). However, the spapr-vlan device of QEMU only tries to squeeze all receive buffer descriptors into one single page which has been supplied by the guest during the H_REGISTER_LOGICAL_LAN call, without taking care of different buffer sizes. This has two bad effects: First, only a very limited number of buffer descriptors is accepted at all. Second, we also hand 64k buffers to the guest even if the 2k buffers would fit better - and this results in dropped packets in the IP layer of the guest since too much skbuf memory is used. Though it seems at a first glance like PAPR says that we should store the receive buffer descriptors in the page that is supplied during the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec declares that "the contents of these descriptors are architecturally opaque, none of these descriptors are manipulated by code above the architected interfaces". That means we don't have to store the RX buffer descriptors in this page, but can also manage the receive buffers at the hypervisor level only. This is now what we are doing here: Introducing proper RX buffer pools which are also sorted by size of the buffers, so we can hand out a buffer with the best fitting size when a packet has been received. To avoid problems with migration from/to older version of QEMU, the old behavior is also retained and enabled by default. The new buffer management has to be enabled via a new "use-rx-buffer-pools" property. Now with the new buffer pool management enabled, the problem with "ping -s 65507" is fixed for me, and the throughput of a simple test with wget increases from creeping 3MB/s up to 20MB/s! Signed-off-by: Thomas Huth <thuth@redhat.com> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-22 00:25:23 +08:00
/**
* Used for qsort, this function compares two RxBufPools by size.
*/
static int rx_pool_size_compare(const void *p1, const void *p2)
{
const RxBufPool *pool1 = *(RxBufPool **)p1;
const RxBufPool *pool2 = *(RxBufPool **)p2;
if (pool1->bufsize < pool2->bufsize) {
return -1;
}
return pool1->bufsize > pool2->bufsize;
}
/**
* Search for a matching buffer pool with exact matching size,
* or return -1 if no matching pool has been found.
*/
static int spapr_vlan_get_rx_pool_id(VIOsPAPRVLANDevice *dev, int size)
{
int pool;
for (pool = 0; pool < RX_MAX_POOLS; pool++) {
if (dev->rx_pool[pool]->bufsize == size) {
return pool;
}
}
return -1;
}
/**
* Enqueuing receive buffer by adding it to one of our receive buffer pools
*/
static target_long spapr_vlan_add_rxbuf_to_pool(VIOsPAPRVLANDevice *dev,
target_ulong buf)
{
int size = VLAN_BD_LEN(buf);
int pool;
pool = spapr_vlan_get_rx_pool_id(dev, size);
if (pool < 0) {
/*
* No matching pool found? Try to use a new one. If the guest used all
* pools before, but changed the size of one pool inbetween, we might
* need to recycle that pool here (if it's empty already). Thus scan
* all buffer pools now, starting with the last (likely empty) one.
*/
for (pool = RX_MAX_POOLS - 1; pool >= 0 ; pool--) {
if (dev->rx_pool[pool]->count == 0) {
dev->rx_pool[pool]->bufsize = size;
/*
* Sort pools by size so that spapr_vlan_receive()
* can later find the smallest buffer pool easily.
*/
qsort(dev->rx_pool, RX_MAX_POOLS, sizeof(dev->rx_pool[0]),
rx_pool_size_compare);
pool = spapr_vlan_get_rx_pool_id(dev, size);
DPRINTF("created RX pool %d for size %lld\n", pool,
VLAN_BD_LEN(buf));
break;
}
}
}
/* Still no usable pool? Give up */
if (pool < 0 || dev->rx_pool[pool]->count >= RX_POOL_MAX_BDS) {
return H_RESOURCE;
}
DPRINTF("h_add_llan_buf(): Add buf using pool %i (size %lli, count=%i)\n",
pool, VLAN_BD_LEN(buf), dev->rx_pool[pool]->count);
dev->rx_pool[pool]->bds[dev->rx_pool[pool]->count++] = buf;
return 0;
}
/**
* This is the old way of enqueuing receive buffers: Add it to the rx queue
* page that has been supplied by the guest (which is quite limited in size).
*/
static target_long spapr_vlan_add_rxbuf_to_page(VIOsPAPRVLANDevice *dev,
target_ulong buf)
{
vlan_bd_t bd;
if (dev->rx_bufs >= VLAN_MAX_BUFS) {
return H_RESOURCE;
}
do {
dev->add_buf_ptr += 8;
if (dev->add_buf_ptr >= VLAN_RX_BDS_LEN + VLAN_RX_BDS_OFF) {
dev->add_buf_ptr = VLAN_RX_BDS_OFF;
}
bd = vio_ldq(&dev->sdev, dev->buf_list + dev->add_buf_ptr);
} while (bd & VLAN_BD_VALID);
vio_stq(&dev->sdev, dev->buf_list + dev->add_buf_ptr, buf);
DPRINTF("h_add_llan_buf(): Added buf ptr=%d rx_bufs=%d bd=0x%016llx\n",
dev->add_buf_ptr, dev->rx_bufs, (unsigned long long)buf);
return 0;
}
static target_ulong h_add_logical_lan_buffer(PowerPCCPU *cpu,
sPAPRMachineState *spapr,
target_ulong opcode,
target_ulong *args)
{
target_ulong reg = args[0];
target_ulong buf = args[1];
VIOsPAPRDevice *sdev = spapr_vio_find_by_reg(spapr->vio_bus, reg);
VIOsPAPRVLANDevice *dev = VIO_SPAPR_VLAN_DEVICE(sdev);
target_long ret;
DPRINTF("H_ADD_LOGICAL_LAN_BUFFER(0x" TARGET_FMT_lx
", 0x" TARGET_FMT_lx ")\n", reg, buf);
if (!sdev) {
hcall_dprintf("Bad device\n");
return H_PARAMETER;
}
if ((check_bd(dev, buf, 4) < 0)
|| (VLAN_BD_LEN(buf) < 16)) {
hcall_dprintf("Bad buffer enqueued\n");
return H_PARAMETER;
}
if (!dev->isopen) {
return H_RESOURCE;
}
hw/net/spapr_llan: Fix receive buffer handling for better performance tl;dr: This patch introduces an alternate way of handling the receive buffers of the spapr-vlan device, resulting in much better receive performance for the guest. Full story: One of our testers recently discovered that the performance of the spapr-vlan device is very poor compared to other NICs, and that a simple "ping -i 0.2 -s 65507 someip" in the guest can result in more than 50% lost ping packets (especially with older guest kernels < 3.17). After doing some analysis, it was clear that there is a problem with the way we handle the receive buffers in spapr_llan.c: The ibmveth driver of the guest Linux kernel tries to add a lot of buffers into several buffer pools (with 512, 2048 and 65536 byte sizes by default, but it can be changed via the entries in the /sys/devices/vio/1000/pool* directories of the guest). However, the spapr-vlan device of QEMU only tries to squeeze all receive buffer descriptors into one single page which has been supplied by the guest during the H_REGISTER_LOGICAL_LAN call, without taking care of different buffer sizes. This has two bad effects: First, only a very limited number of buffer descriptors is accepted at all. Second, we also hand 64k buffers to the guest even if the 2k buffers would fit better - and this results in dropped packets in the IP layer of the guest since too much skbuf memory is used. Though it seems at a first glance like PAPR says that we should store the receive buffer descriptors in the page that is supplied during the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec declares that "the contents of these descriptors are architecturally opaque, none of these descriptors are manipulated by code above the architected interfaces". That means we don't have to store the RX buffer descriptors in this page, but can also manage the receive buffers at the hypervisor level only. This is now what we are doing here: Introducing proper RX buffer pools which are also sorted by size of the buffers, so we can hand out a buffer with the best fitting size when a packet has been received. To avoid problems with migration from/to older version of QEMU, the old behavior is also retained and enabled by default. The new buffer management has to be enabled via a new "use-rx-buffer-pools" property. Now with the new buffer pool management enabled, the problem with "ping -s 65507" is fixed for me, and the throughput of a simple test with wget increases from creeping 3MB/s up to 20MB/s! Signed-off-by: Thomas Huth <thuth@redhat.com> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-22 00:25:23 +08:00
if (dev->compat_flags & SPAPRVLAN_FLAG_RX_BUF_POOLS) {
ret = spapr_vlan_add_rxbuf_to_pool(dev, buf);
} else {
ret = spapr_vlan_add_rxbuf_to_page(dev, buf);
}
if (ret) {
return ret;
}
dev->rx_bufs++;
qemu_flush_queued_packets(qemu_get_queue(dev->nic));
return H_SUCCESS;
}
static target_ulong h_send_logical_lan(PowerPCCPU *cpu,
sPAPRMachineState *spapr,
target_ulong opcode, target_ulong *args)
{
target_ulong reg = args[0];
target_ulong *bufs = args + 1;
target_ulong continue_token = args[7];
VIOsPAPRDevice *sdev = spapr_vio_find_by_reg(spapr->vio_bus, reg);
VIOsPAPRVLANDevice *dev = VIO_SPAPR_VLAN_DEVICE(sdev);
unsigned total_len;
uint8_t *lbuf, *p;
int i, nbufs;
int ret;
DPRINTF("H_SEND_LOGICAL_LAN(0x" TARGET_FMT_lx ", <bufs>, 0x"
TARGET_FMT_lx ")\n", reg, continue_token);
if (!sdev) {
return H_PARAMETER;
}
DPRINTF("rxbufs = %d\n", dev->rx_bufs);
if (!dev->isopen) {
return H_DROPPED;
}
if (continue_token) {
return H_HARDWARE; /* FIXME actually handle this */
}
total_len = 0;
for (i = 0; i < 6; i++) {
DPRINTF(" buf desc: 0x" TARGET_FMT_lx "\n", bufs[i]);
if (!(bufs[i] & VLAN_BD_VALID)) {
break;
}
total_len += VLAN_BD_LEN(bufs[i]);
}
nbufs = i;
DPRINTF("h_send_logical_lan() %d buffers, total length 0x%x\n",
nbufs, total_len);
if (total_len == 0) {
return H_SUCCESS;
}
if (total_len > MAX_PACKET_SIZE) {
/* Don't let the guest force too large an allocation */
return H_RESOURCE;
}
lbuf = alloca(total_len);
p = lbuf;
for (i = 0; i < nbufs; i++) {
ret = spapr_vio_dma_read(sdev, VLAN_BD_ADDR(bufs[i]),
p, VLAN_BD_LEN(bufs[i]));
if (ret < 0) {
return ret;
}
p += VLAN_BD_LEN(bufs[i]);
}
qemu_send_packet(qemu_get_queue(dev->nic), lbuf, total_len);
return H_SUCCESS;
}
static target_ulong h_multicast_ctrl(PowerPCCPU *cpu, sPAPRMachineState *spapr,
target_ulong opcode, target_ulong *args)
{
target_ulong reg = args[0];
VIOsPAPRDevice *dev = spapr_vio_find_by_reg(spapr->vio_bus, reg);
if (!dev) {
return H_PARAMETER;
}
return H_SUCCESS;
}
static Property spapr_vlan_properties[] = {
DEFINE_SPAPR_PROPERTIES(VIOsPAPRVLANDevice, sdev),
DEFINE_NIC_PROPERTIES(VIOsPAPRVLANDevice, nicconf),
hw/net/spapr_llan: Fix receive buffer handling for better performance tl;dr: This patch introduces an alternate way of handling the receive buffers of the spapr-vlan device, resulting in much better receive performance for the guest. Full story: One of our testers recently discovered that the performance of the spapr-vlan device is very poor compared to other NICs, and that a simple "ping -i 0.2 -s 65507 someip" in the guest can result in more than 50% lost ping packets (especially with older guest kernels < 3.17). After doing some analysis, it was clear that there is a problem with the way we handle the receive buffers in spapr_llan.c: The ibmveth driver of the guest Linux kernel tries to add a lot of buffers into several buffer pools (with 512, 2048 and 65536 byte sizes by default, but it can be changed via the entries in the /sys/devices/vio/1000/pool* directories of the guest). However, the spapr-vlan device of QEMU only tries to squeeze all receive buffer descriptors into one single page which has been supplied by the guest during the H_REGISTER_LOGICAL_LAN call, without taking care of different buffer sizes. This has two bad effects: First, only a very limited number of buffer descriptors is accepted at all. Second, we also hand 64k buffers to the guest even if the 2k buffers would fit better - and this results in dropped packets in the IP layer of the guest since too much skbuf memory is used. Though it seems at a first glance like PAPR says that we should store the receive buffer descriptors in the page that is supplied during the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec declares that "the contents of these descriptors are architecturally opaque, none of these descriptors are manipulated by code above the architected interfaces". That means we don't have to store the RX buffer descriptors in this page, but can also manage the receive buffers at the hypervisor level only. This is now what we are doing here: Introducing proper RX buffer pools which are also sorted by size of the buffers, so we can hand out a buffer with the best fitting size when a packet has been received. To avoid problems with migration from/to older version of QEMU, the old behavior is also retained and enabled by default. The new buffer management has to be enabled via a new "use-rx-buffer-pools" property. Now with the new buffer pool management enabled, the problem with "ping -s 65507" is fixed for me, and the throughput of a simple test with wget increases from creeping 3MB/s up to 20MB/s! Signed-off-by: Thomas Huth <thuth@redhat.com> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-22 00:25:23 +08:00
DEFINE_PROP_BIT("use-rx-buffer-pools", VIOsPAPRVLANDevice,
compat_flags, SPAPRVLAN_FLAG_RX_BUF_POOLS_BIT, true),
DEFINE_PROP_END_OF_LIST(),
};
hw/net/spapr_llan: Fix receive buffer handling for better performance tl;dr: This patch introduces an alternate way of handling the receive buffers of the spapr-vlan device, resulting in much better receive performance for the guest. Full story: One of our testers recently discovered that the performance of the spapr-vlan device is very poor compared to other NICs, and that a simple "ping -i 0.2 -s 65507 someip" in the guest can result in more than 50% lost ping packets (especially with older guest kernels < 3.17). After doing some analysis, it was clear that there is a problem with the way we handle the receive buffers in spapr_llan.c: The ibmveth driver of the guest Linux kernel tries to add a lot of buffers into several buffer pools (with 512, 2048 and 65536 byte sizes by default, but it can be changed via the entries in the /sys/devices/vio/1000/pool* directories of the guest). However, the spapr-vlan device of QEMU only tries to squeeze all receive buffer descriptors into one single page which has been supplied by the guest during the H_REGISTER_LOGICAL_LAN call, without taking care of different buffer sizes. This has two bad effects: First, only a very limited number of buffer descriptors is accepted at all. Second, we also hand 64k buffers to the guest even if the 2k buffers would fit better - and this results in dropped packets in the IP layer of the guest since too much skbuf memory is used. Though it seems at a first glance like PAPR says that we should store the receive buffer descriptors in the page that is supplied during the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec declares that "the contents of these descriptors are architecturally opaque, none of these descriptors are manipulated by code above the architected interfaces". That means we don't have to store the RX buffer descriptors in this page, but can also manage the receive buffers at the hypervisor level only. This is now what we are doing here: Introducing proper RX buffer pools which are also sorted by size of the buffers, so we can hand out a buffer with the best fitting size when a packet has been received. To avoid problems with migration from/to older version of QEMU, the old behavior is also retained and enabled by default. The new buffer management has to be enabled via a new "use-rx-buffer-pools" property. Now with the new buffer pool management enabled, the problem with "ping -s 65507" is fixed for me, and the throughput of a simple test with wget increases from creeping 3MB/s up to 20MB/s! Signed-off-by: Thomas Huth <thuth@redhat.com> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-22 00:25:23 +08:00
static bool spapr_vlan_rx_buffer_pools_needed(void *opaque)
{
VIOsPAPRVLANDevice *dev = opaque;
return (dev->compat_flags & SPAPRVLAN_FLAG_RX_BUF_POOLS) != 0;
}
static const VMStateDescription vmstate_rx_buffer_pool = {
.name = "spapr_llan/rx_buffer_pool",
.version_id = 1,
.minimum_version_id = 1,
.needed = spapr_vlan_rx_buffer_pools_needed,
.fields = (VMStateField[]) {
VMSTATE_INT32(bufsize, RxBufPool),
VMSTATE_INT32(count, RxBufPool),
VMSTATE_UINT64_ARRAY(bds, RxBufPool, RX_POOL_MAX_BDS),
VMSTATE_END_OF_LIST()
}
};
static const VMStateDescription vmstate_rx_pools = {
.name = "spapr_llan/rx_pools",
.version_id = 1,
.minimum_version_id = 1,
.needed = spapr_vlan_rx_buffer_pools_needed,
.fields = (VMStateField[]) {
VMSTATE_ARRAY_OF_POINTER_TO_STRUCT(rx_pool, VIOsPAPRVLANDevice,
RX_MAX_POOLS, 1,
vmstate_rx_buffer_pool, RxBufPool),
VMSTATE_END_OF_LIST()
}
};
static const VMStateDescription vmstate_spapr_llan = {
.name = "spapr_llan",
.version_id = 1,
.minimum_version_id = 1,
.fields = (VMStateField[]) {
VMSTATE_SPAPR_VIO(sdev, VIOsPAPRVLANDevice),
/* LLAN state */
VMSTATE_BOOL(isopen, VIOsPAPRVLANDevice),
VMSTATE_UINT64(buf_list, VIOsPAPRVLANDevice),
VMSTATE_UINT32(add_buf_ptr, VIOsPAPRVLANDevice),
VMSTATE_UINT32(use_buf_ptr, VIOsPAPRVLANDevice),
VMSTATE_UINT32(rx_bufs, VIOsPAPRVLANDevice),
VMSTATE_UINT64(rxq_ptr, VIOsPAPRVLANDevice),
VMSTATE_END_OF_LIST()
},
hw/net/spapr_llan: Fix receive buffer handling for better performance tl;dr: This patch introduces an alternate way of handling the receive buffers of the spapr-vlan device, resulting in much better receive performance for the guest. Full story: One of our testers recently discovered that the performance of the spapr-vlan device is very poor compared to other NICs, and that a simple "ping -i 0.2 -s 65507 someip" in the guest can result in more than 50% lost ping packets (especially with older guest kernels < 3.17). After doing some analysis, it was clear that there is a problem with the way we handle the receive buffers in spapr_llan.c: The ibmveth driver of the guest Linux kernel tries to add a lot of buffers into several buffer pools (with 512, 2048 and 65536 byte sizes by default, but it can be changed via the entries in the /sys/devices/vio/1000/pool* directories of the guest). However, the spapr-vlan device of QEMU only tries to squeeze all receive buffer descriptors into one single page which has been supplied by the guest during the H_REGISTER_LOGICAL_LAN call, without taking care of different buffer sizes. This has two bad effects: First, only a very limited number of buffer descriptors is accepted at all. Second, we also hand 64k buffers to the guest even if the 2k buffers would fit better - and this results in dropped packets in the IP layer of the guest since too much skbuf memory is used. Though it seems at a first glance like PAPR says that we should store the receive buffer descriptors in the page that is supplied during the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec declares that "the contents of these descriptors are architecturally opaque, none of these descriptors are manipulated by code above the architected interfaces". That means we don't have to store the RX buffer descriptors in this page, but can also manage the receive buffers at the hypervisor level only. This is now what we are doing here: Introducing proper RX buffer pools which are also sorted by size of the buffers, so we can hand out a buffer with the best fitting size when a packet has been received. To avoid problems with migration from/to older version of QEMU, the old behavior is also retained and enabled by default. The new buffer management has to be enabled via a new "use-rx-buffer-pools" property. Now with the new buffer pool management enabled, the problem with "ping -s 65507" is fixed for me, and the throughput of a simple test with wget increases from creeping 3MB/s up to 20MB/s! Signed-off-by: Thomas Huth <thuth@redhat.com> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-22 00:25:23 +08:00
.subsections = (const VMStateDescription * []) {
&vmstate_rx_pools,
NULL
}
};
static void spapr_vlan_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
VIOsPAPRDeviceClass *k = VIO_SPAPR_DEVICE_CLASS(klass);
k->realize = spapr_vlan_realize;
k->reset = spapr_vlan_reset;
k->devnode = spapr_vlan_devnode;
k->dt_name = "l-lan";
k->dt_type = "network";
k->dt_compatible = "IBM,l-lan";
k->signal_mask = 0x1;
set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
dc->props = spapr_vlan_properties;
k->rtce_window_size = 0x10000000;
dc->vmsd = &vmstate_spapr_llan;
}
static const TypeInfo spapr_vlan_info = {
.name = TYPE_VIO_SPAPR_VLAN_DEVICE,
.parent = TYPE_VIO_SPAPR_DEVICE,
.instance_size = sizeof(VIOsPAPRVLANDevice),
.class_init = spapr_vlan_class_init,
.instance_init = spapr_vlan_instance_init,
hw/net/spapr_llan: Fix receive buffer handling for better performance tl;dr: This patch introduces an alternate way of handling the receive buffers of the spapr-vlan device, resulting in much better receive performance for the guest. Full story: One of our testers recently discovered that the performance of the spapr-vlan device is very poor compared to other NICs, and that a simple "ping -i 0.2 -s 65507 someip" in the guest can result in more than 50% lost ping packets (especially with older guest kernels < 3.17). After doing some analysis, it was clear that there is a problem with the way we handle the receive buffers in spapr_llan.c: The ibmveth driver of the guest Linux kernel tries to add a lot of buffers into several buffer pools (with 512, 2048 and 65536 byte sizes by default, but it can be changed via the entries in the /sys/devices/vio/1000/pool* directories of the guest). However, the spapr-vlan device of QEMU only tries to squeeze all receive buffer descriptors into one single page which has been supplied by the guest during the H_REGISTER_LOGICAL_LAN call, without taking care of different buffer sizes. This has two bad effects: First, only a very limited number of buffer descriptors is accepted at all. Second, we also hand 64k buffers to the guest even if the 2k buffers would fit better - and this results in dropped packets in the IP layer of the guest since too much skbuf memory is used. Though it seems at a first glance like PAPR says that we should store the receive buffer descriptors in the page that is supplied during the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec declares that "the contents of these descriptors are architecturally opaque, none of these descriptors are manipulated by code above the architected interfaces". That means we don't have to store the RX buffer descriptors in this page, but can also manage the receive buffers at the hypervisor level only. This is now what we are doing here: Introducing proper RX buffer pools which are also sorted by size of the buffers, so we can hand out a buffer with the best fitting size when a packet has been received. To avoid problems with migration from/to older version of QEMU, the old behavior is also retained and enabled by default. The new buffer management has to be enabled via a new "use-rx-buffer-pools" property. Now with the new buffer pool management enabled, the problem with "ping -s 65507" is fixed for me, and the throughput of a simple test with wget increases from creeping 3MB/s up to 20MB/s! Signed-off-by: Thomas Huth <thuth@redhat.com> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-22 00:25:23 +08:00
.instance_finalize = spapr_vlan_instance_finalize,
};
static void spapr_vlan_register_types(void)
{
spapr_register_hypercall(H_REGISTER_LOGICAL_LAN, h_register_logical_lan);
spapr_register_hypercall(H_FREE_LOGICAL_LAN, h_free_logical_lan);
spapr_register_hypercall(H_SEND_LOGICAL_LAN, h_send_logical_lan);
spapr_register_hypercall(H_ADD_LOGICAL_LAN_BUFFER,
h_add_logical_lan_buffer);
spapr_register_hypercall(H_MULTICAST_CTRL, h_multicast_ctrl);
type_register_static(&spapr_vlan_info);
}
type_init(spapr_vlan_register_types)