From 814cd3ac37be8e71c8ef76234d0da0bbfb2f2fb2 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 12 Feb 2012 15:11:53 +0200 Subject: [PATCH 01/12] e1000: move reset function earlier in file Make it easier to reuse this function. Signed-off-by: Michael S. Tsirkin --- hw/e1000.c | 80 +++++++++++++++++++++++++++--------------------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/hw/e1000.c b/hw/e1000.c index 9c764624f9..dd067680c9 100644 --- a/hw/e1000.c +++ b/hw/e1000.c @@ -152,6 +152,29 @@ static const char phy_regcap[0x20] = { [PHY_ID2] = PHY_R, [M88E1000_PHY_SPEC_STATUS] = PHY_R }; +static const uint16_t phy_reg_init[] = { + [PHY_CTRL] = 0x1140, [PHY_STATUS] = 0x796d, // link initially up + [PHY_ID1] = 0x141, [PHY_ID2] = PHY_ID2_INIT, + [PHY_1000T_CTRL] = 0x0e00, [M88E1000_PHY_SPEC_CTRL] = 0x360, + [M88E1000_EXT_PHY_SPEC_CTRL] = 0x0d60, [PHY_AUTONEG_ADV] = 0xde1, + [PHY_LP_ABILITY] = 0x1e0, [PHY_1000T_STATUS] = 0x3c00, + [M88E1000_PHY_SPEC_STATUS] = 0xac00, +}; + +static const uint32_t mac_reg_init[] = { + [PBA] = 0x00100030, + [LEDCTL] = 0x602, + [CTRL] = E1000_CTRL_SWDPIN2 | E1000_CTRL_SWDPIN0 | + E1000_CTRL_SPD_1000 | E1000_CTRL_SLU, + [STATUS] = 0x80000000 | E1000_STATUS_GIO_MASTER_ENABLE | + E1000_STATUS_ASDV | E1000_STATUS_MTXCKOK | + E1000_STATUS_SPEED_1000 | E1000_STATUS_FD | + E1000_STATUS_LU, + [MANC] = E1000_MANC_EN_MNG2HOST | E1000_MANC_RCV_TCO_EN | + E1000_MANC_ARP_EN | E1000_MANC_0298_EN | + E1000_MANC_RMCP_EN, +}; + static void set_interrupt_cause(E1000State *s, int index, uint32_t val) { @@ -193,6 +216,23 @@ rxbufsize(uint32_t v) return 2048; } +static void e1000_reset(void *opaque) +{ + E1000State *d = opaque; + + memset(d->phy_reg, 0, sizeof d->phy_reg); + memmove(d->phy_reg, phy_reg_init, sizeof phy_reg_init); + memset(d->mac_reg, 0, sizeof d->mac_reg); + memmove(d->mac_reg, mac_reg_init, sizeof mac_reg_init); + d->rxbuf_min_shift = 1; + memset(&d->tx, 0, sizeof d->tx); + + if (d->nic->nc.link_down) { + d->mac_reg[STATUS] &= ~E1000_STATUS_LU; + d->phy_reg[PHY_STATUS] &= ~MII_SR_LINK_STATUS; + } +} + static void set_ctrl(E1000State *s, int index, uint32_t val) { @@ -1061,29 +1101,6 @@ static const uint16_t e1000_eeprom_template[64] = { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, }; -static const uint16_t phy_reg_init[] = { - [PHY_CTRL] = 0x1140, [PHY_STATUS] = 0x796d, // link initially up - [PHY_ID1] = 0x141, [PHY_ID2] = PHY_ID2_INIT, - [PHY_1000T_CTRL] = 0x0e00, [M88E1000_PHY_SPEC_CTRL] = 0x360, - [M88E1000_EXT_PHY_SPEC_CTRL] = 0x0d60, [PHY_AUTONEG_ADV] = 0xde1, - [PHY_LP_ABILITY] = 0x1e0, [PHY_1000T_STATUS] = 0x3c00, - [M88E1000_PHY_SPEC_STATUS] = 0xac00, -}; - -static const uint32_t mac_reg_init[] = { - [PBA] = 0x00100030, - [LEDCTL] = 0x602, - [CTRL] = E1000_CTRL_SWDPIN2 | E1000_CTRL_SWDPIN0 | - E1000_CTRL_SPD_1000 | E1000_CTRL_SLU, - [STATUS] = 0x80000000 | E1000_STATUS_GIO_MASTER_ENABLE | - E1000_STATUS_ASDV | E1000_STATUS_MTXCKOK | - E1000_STATUS_SPEED_1000 | E1000_STATUS_FD | - E1000_STATUS_LU, - [MANC] = E1000_MANC_EN_MNG2HOST | E1000_MANC_RCV_TCO_EN | - E1000_MANC_ARP_EN | E1000_MANC_0298_EN | - E1000_MANC_RMCP_EN, -}; - /* PCI interface */ static void @@ -1123,23 +1140,6 @@ pci_e1000_uninit(PCIDevice *dev) return 0; } -static void e1000_reset(void *opaque) -{ - E1000State *d = opaque; - - memset(d->phy_reg, 0, sizeof d->phy_reg); - memmove(d->phy_reg, phy_reg_init, sizeof phy_reg_init); - memset(d->mac_reg, 0, sizeof d->mac_reg); - memmove(d->mac_reg, mac_reg_init, sizeof mac_reg_init); - d->rxbuf_min_shift = 1; - memset(&d->tx, 0, sizeof d->tx); - - if (d->nic->nc.link_down) { - d->mac_reg[STATUS] &= ~E1000_STATUS_LU; - d->phy_reg[PHY_STATUS] &= ~MII_SR_LINK_STATUS; - } -} - static NetClientInfo net_e1000_info = { .type = NET_CLIENT_TYPE_NIC, .size = sizeof(NICState), From a281ebc11a6917fbc27e1a93bb5772cd14e241fc Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 22 Apr 2012 16:45:53 +0300 Subject: [PATCH 02/12] virtio: add missing mb() on notification During normal operation, virtio first writes a used index and then checks whether it should interrupt the guest by reading guest avail index/flag values. Guest does the reverse: writes the index/flag, then checks the used ring. The ordering is important: if host avail flag read bypasses the used index write, we could in effect get this timing: host avail flag read guest enable interrupts: avail flag write guest check used ring: ring is empty host used index write which results in a lost interrupt: guest will never be notified about the used ring update. This actually can happen when using kvm with an io thread, such that the guest vcpu and qemu run on different host cpus, and this has actually been observed in the field (but only seems to trigger on very specific processor types) with userspace virtio: vhost has the necessary smp_mb() in place to prevent the regordering, so the same workload stalls forever waiting for an interrupt with vhost=off but works fine with vhost=on. Insert an smp_mb barrier operation in userspace virtio to ensure the correct ordering. Applying this patch fixed the race condition we have observed. Tested on x86_64. I checked the code generated by the new macro for i386 and ppc but didn't run virtio. Note: mb could in theory be implemented by __sync_synchronize, but this would make us hit old GCC bugs. Besides old GCC not implementing __sync_synchronize at all, there were bugs http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36793 in this functionality as recently as in 4.3. As we need asm for rmb,wmb anyway, it's just as well to use it for mb. Signed-off-by: Michael S. Tsirkin --- hw/virtio.c | 2 ++ qemu-barrier.h | 23 ++++++++++++++++++++--- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/hw/virtio.c b/hw/virtio.c index 314abf8a18..bb994c9b51 100644 --- a/hw/virtio.c +++ b/hw/virtio.c @@ -700,6 +700,8 @@ static bool vring_notify(VirtIODevice *vdev, VirtQueue *vq) { uint16_t old, new; bool v; + /* We need to expose used array entries before checking used event. */ + smp_mb(); /* Always notify when queue is empty (when feature acknowledge) */ if (((vdev->guest_features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY)) && !vq->inuse && vring_avail_idx(vq) == vq->last_avail_idx)) { diff --git a/qemu-barrier.h b/qemu-barrier.h index c11bb2b59f..f0b842e5b2 100644 --- a/qemu-barrier.h +++ b/qemu-barrier.h @@ -4,7 +4,7 @@ /* Compiler barrier */ #define barrier() asm volatile("" ::: "memory") -#if defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) /* * Because of the strongly ordered x86 storage model, wmb() is a nop @@ -13,15 +13,31 @@ * load/stores from C code. */ #define smp_wmb() barrier() +/* + * We use GCC builtin if it's available, as that can use + * mfence on 32 bit as well, e.g. if built with -march=pentium-m. + * However, on i386, there seem to be known bugs as recently as 4.3. + * */ +#if defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 4 +#define smp_mb() __sync_synchronize() +#else +#define smp_mb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory") +#endif + +#elif defined(__x86_64__) + +#define smp_wmb() barrier() +#define smp_mb() asm volatile("mfence" ::: "memory") #elif defined(_ARCH_PPC) /* - * We use an eieio() for a wmb() on powerpc. This assumes we don't + * We use an eieio() for wmb() on powerpc. This assumes we don't * need to order cacheable and non-cacheable stores with respect to * each other */ #define smp_wmb() asm volatile("eieio" ::: "memory") +#define smp_mb() asm volatile("sync" ::: "memory") #else @@ -29,9 +45,10 @@ * For (host) platforms we don't have explicit barrier definitions * for, we use the gcc __sync_synchronize() primitive to generate a * full barrier. This should be safe on all platforms, though it may - * be overkill. + * be overkill for wmb(). */ #define smp_wmb() __sync_synchronize() +#define smp_mb() __sync_synchronize() #endif From 92045d80badc43c9f95897aad675dc7ef17a3b3f Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 23 Apr 2012 14:11:14 +0300 Subject: [PATCH 03/12] virtio: add missing mb() on enable notification This fixes an issue dual to the one fixed by patch 'virtio: add missing mb() on notification' and applies on top. In this case, to enable vq kick to exit to host, qemu writes out used flag then reads the avail index. if these are reordered we get a race: host avail index read: ring is empty guest avail index write guest flag read: exit disabled host used flag write: enable exit which results in a lost exit: host will never be notified about the avail index update. Again, happens in the field but only seems to trigger on some specific hardware. Insert an smp_mb barrier operation to ensure the correct ordering. Signed-off-by: Michael S. Tsirkin --- hw/virtio.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hw/virtio.c b/hw/virtio.c index bb994c9b51..5615b59a6c 100644 --- a/hw/virtio.c +++ b/hw/virtio.c @@ -209,6 +209,10 @@ void virtio_queue_set_notification(VirtQueue *vq, int enable) } else { vring_used_flags_set_bit(vq, VRING_USED_F_NO_NOTIFY); } + if (enable) { + /* Expose avail event/used flags before caller checks the avail idx. */ + smp_mb(); + } } int virtio_queue_ready(VirtQueue *vq) From a821ce59338c79bb72dc844dd44ea53701965b2b Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 23 Apr 2012 15:46:22 +0300 Subject: [PATCH 04/12] virtio: order index/descriptor reads virtio has the equivalent of: if (vq->last_avail_index != vring_avail_idx(vq)) { read descriptor head at vq->last_avail_index; } In theory, processor can reorder descriptor head read to happen speculatively before the index read. this would trigger the following race: host descriptor head read <- reads invalid head from ring guest writes valid descriptor head guest writes avail index host avail index read <- observes valid index as a result host will use an invalid head value. This was not observed in the field by me but after the experience with the previous two races I think it is prudent to address this theoretical race condition. Signed-off-by: Michael S. Tsirkin --- hw/virtio.c | 5 +++++ qemu-barrier.h | 14 ++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/hw/virtio.c b/hw/virtio.c index 5615b59a6c..168abe4864 100644 --- a/hw/virtio.c +++ b/hw/virtio.c @@ -287,6 +287,11 @@ static int virtqueue_num_heads(VirtQueue *vq, unsigned int idx) idx, vring_avail_idx(vq)); exit(1); } + /* On success, callers read a descriptor at vq->last_avail_idx. + * Make sure descriptor read does not bypass avail index read. */ + if (num_heads) { + smp_rmb(); + } return num_heads; } diff --git a/qemu-barrier.h b/qemu-barrier.h index f0b842e5b2..7e11197814 100644 --- a/qemu-barrier.h +++ b/qemu-barrier.h @@ -7,12 +7,13 @@ #if defined(__i386__) /* - * Because of the strongly ordered x86 storage model, wmb() is a nop + * Because of the strongly ordered x86 storage model, wmb() and rmb() are nops * on x86(well, a compiler barrier only). Well, at least as long as * qemu doesn't do accesses to write-combining memory or non-temporal * load/stores from C code. */ #define smp_wmb() barrier() +#define smp_rmb() barrier() /* * We use GCC builtin if it's available, as that can use * mfence on 32 bit as well, e.g. if built with -march=pentium-m. @@ -27,6 +28,7 @@ #elif defined(__x86_64__) #define smp_wmb() barrier() +#define smp_rmb() barrier() #define smp_mb() asm volatile("mfence" ::: "memory") #elif defined(_ARCH_PPC) @@ -37,6 +39,13 @@ * each other */ #define smp_wmb() asm volatile("eieio" ::: "memory") + +#if defined(__powerpc64__) +#define smp_rmb() asm volatile("lwsync" ::: "memory") +#else +#define smp_rmb() asm volatile("sync" ::: "memory") +#endif + #define smp_mb() asm volatile("sync" ::: "memory") #else @@ -45,10 +54,11 @@ * For (host) platforms we don't have explicit barrier definitions * for, we use the gcc __sync_synchronize() primitive to generate a * full barrier. This should be safe on all platforms, though it may - * be overkill for wmb(). + * be overkill for wmb() and rmb(). */ #define smp_wmb() __sync_synchronize() #define smp_mb() __sync_synchronize() +#define smp_rmb() __sync_synchronize() #endif From 69f3ce78cc0b6941fc76dcbe2118269a16ddf043 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Tue, 10 Apr 2012 20:48:54 +0200 Subject: [PATCH 05/12] eepro100: Fix multicast regression Commit 7fc8d918b9674c3e9233d6d25da2457345d414a0 removed code from eepro100.c and replaced it by different code: the code in net.c returns bits 31...26, but eepro100 needs bits 7...2. This patch partially reverts 7fc8d918b9674c3e9233d6d25da2457345d414a0. To avoid future problems, I renamed the function and changed the comment. Signed-off-by: Stefan Weil Acked-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- hw/eepro100.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/hw/eepro100.c b/hw/eepro100.c index 02e6f7ef31..6279ae36ec 100644 --- a/hw/eepro100.c +++ b/hw/eepro100.c @@ -322,8 +322,32 @@ static const uint16_t eepro100_mdi_mask[] = { 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, }; +#define POLYNOMIAL 0x04c11db6 + static E100PCIDeviceInfo *eepro100_get_class(EEPRO100State *s); +/* From FreeBSD (locally modified). */ +static unsigned e100_compute_mcast_idx(const uint8_t *ep) +{ + uint32_t crc; + int carry, i, j; + uint8_t b; + + crc = 0xffffffff; + for (i = 0; i < 6; i++) { + b = *ep++; + for (j = 0; j < 8; j++) { + carry = ((crc & 0x80000000L) ? 1 : 0) ^ (b & 0x01); + crc <<= 1; + b >>= 1; + if (carry) { + crc = ((crc ^ POLYNOMIAL) | carry); + } + } + } + return (crc & BITS(7, 2)) >> 2; +} + /* Read a 16 bit control/status (CSR) register. */ static uint16_t e100_read_reg2(EEPRO100State *s, E100RegisterOffset addr) { @@ -823,7 +847,7 @@ static void set_multicast_list(EEPRO100State *s) uint8_t multicast_addr[6]; pci_dma_read(&s->dev, s->cb_address + 10 + i, multicast_addr, 6); TRACE(OTHER, logout("multicast entry %s\n", nic_dump(multicast_addr, 6))); - unsigned mcast_idx = compute_mcast_idx(multicast_addr); + unsigned mcast_idx = e100_compute_mcast_idx(multicast_addr); assert(mcast_idx < 64); s->mult[mcast_idx >> 3] |= (1 << (mcast_idx & 7)); } @@ -1650,7 +1674,7 @@ static ssize_t nic_receive(VLANClientState *nc, const uint8_t * buf, size_t size if (s->configuration[21] & BIT(3)) { /* Multicast all bit is set, receive all multicast frames. */ } else { - unsigned mcast_idx = compute_mcast_idx(buf); + unsigned mcast_idx = e100_compute_mcast_idx(buf); assert(mcast_idx < 64); if (s->mult[mcast_idx >> 3] & (1 << (mcast_idx & 7))) { /* Multicast frame is allowed in hash table. */ From 2e54cc21ad60219229df569a857a453285795565 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 22 Mar 2012 18:01:42 +0800 Subject: [PATCH 06/12] e1000: introduce bits of PHY control register This would be used be following patches. Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- hw/e1000_hw.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/hw/e1000_hw.h b/hw/e1000_hw.h index 9e29af8c82..c9cb79e64d 100644 --- a/hw/e1000_hw.h +++ b/hw/e1000_hw.h @@ -349,6 +349,18 @@ #define M88E1000_PHY_VCO_REG_BIT8 0x100 /* Bits 8 & 11 are adjusted for */ #define M88E1000_PHY_VCO_REG_BIT11 0x800 /* improved BER performance */ +/* PHY Control Register */ +#define MII_CR_SPEED_SELECT_MSB 0x0040 /* bits 6,13: 10=1000, 01=100, 00=10 */ +#define MII_CR_COLL_TEST_ENABLE 0x0080 /* Collision test enable */ +#define MII_CR_FULL_DUPLEX 0x0100 /* FDX =1, half duplex =0 */ +#define MII_CR_RESTART_AUTO_NEG 0x0200 /* Restart auto negotiation */ +#define MII_CR_ISOLATE 0x0400 /* Isolate PHY from MII */ +#define MII_CR_POWER_DOWN 0x0800 /* Power down */ +#define MII_CR_AUTO_NEG_EN 0x1000 /* Auto Neg Enable */ +#define MII_CR_SPEED_SELECT_LSB 0x2000 /* bits 6,13: 10=1000, 01=100, 00=10 */ +#define MII_CR_LOOPBACK 0x4000 /* 0 = normal, 1 = loopback */ +#define MII_CR_RESET 0x8000 /* 0 = normal, 1 = PHY reset */ + /* PHY Status Register */ #define MII_SR_EXTENDED_CAPS 0x0001 /* Extended register capabilities */ #define MII_SR_JABBER_DETECT 0x0002 /* Jabber Detected */ From 17fbbb0b3d0b09f07d288991248c81f441bf6941 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 22 Mar 2012 18:01:50 +0800 Subject: [PATCH 07/12] e1000: conditionally raise irq at the end of MDI cycle According to the spec: "When set to 1b by software, it causes an Interrupt to be asserted to indicate the end of an MDI cycle." We need check the Interrupt Enable bit and raise irq only when it is set. Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- hw/e1000.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/hw/e1000.c b/hw/e1000.c index dd067680c9..4e787bc2cf 100644 --- a/hw/e1000.c +++ b/hw/e1000.c @@ -274,7 +274,10 @@ set_mdic(E1000State *s, int index, uint32_t val) s->phy_reg[addr] = data; } s->mac_reg[MDIC] = val | E1000_MDIC_READY; - set_ics(s, 0, E1000_ICR_MDAC); + + if (val & E1000_MDIC_INT_EN) { + set_ics(s, 0, E1000_ICR_MDAC); + } } static uint32_t From 93e37d769074bebfd04c6704b7a7ae92736876e3 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 22 Mar 2012 18:01:59 +0800 Subject: [PATCH 08/12] e1000: PHY loopback mode support The missing of loopback mode prevent the running of self diagnosis program in guest. This patch adds this support. After this patch, loopback test of ethtool were passed in guest. Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- hw/e1000.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/hw/e1000.c b/hw/e1000.c index 4e787bc2cf..4a09e39975 100644 --- a/hw/e1000.c +++ b/hw/e1000.c @@ -391,6 +391,16 @@ fcs_len(E1000State *s) return (s->mac_reg[RCTL] & E1000_RCTL_SECRC) ? 0 : 4; } +static void +e1000_send_packet(E1000State *s, const uint8_t *buf, int size) +{ + if (s->phy_reg[PHY_CTRL] & MII_CR_LOOPBACK) { + s->nic->nc.info->receive(&s->nic->nc, buf, size); + } else { + qemu_send_packet(&s->nic->nc, buf, size); + } +} + static void xmit_seg(E1000State *s) { @@ -440,9 +450,9 @@ xmit_seg(E1000State *s) memmove(tp->vlan, tp->data, 4); memmove(tp->data, tp->data + 4, 8); memcpy(tp->data + 8, tp->vlan_header, 4); - qemu_send_packet(&s->nic->nc, tp->vlan, tp->size + 4); + e1000_send_packet(s, tp->vlan, tp->size + 4); } else - qemu_send_packet(&s->nic->nc, tp->data, tp->size); + e1000_send_packet(s, tp->data, tp->size); s->mac_reg[TPT]++; s->mac_reg[GPTC]++; n = s->mac_reg[TOTL]; From 71aadd3cd2b2f8bd3113a08a12d4a06d48b6c39c Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 22 Mar 2012 18:02:07 +0800 Subject: [PATCH 09/12] e1000: introduce helpers to manipulate link status This patch introduces helpers to change link status bit for phy/mac register. This would help to reduce code duplication and would be used by following patches. Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- hw/e1000.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/hw/e1000.c b/hw/e1000.c index 4a09e39975..6abec6083b 100644 --- a/hw/e1000.c +++ b/hw/e1000.c @@ -142,6 +142,20 @@ enum { defreg(VET), }; +static void +e1000_link_down(E1000State *s) +{ + s->mac_reg[STATUS] &= ~E1000_STATUS_LU; + s->phy_reg[PHY_STATUS] &= ~MII_SR_LINK_STATUS; +} + +static void +e1000_link_up(E1000State *s) +{ + s->mac_reg[STATUS] |= E1000_STATUS_LU; + s->phy_reg[PHY_STATUS] |= MII_SR_LINK_STATUS; +} + enum { PHY_R = 1, PHY_W = 2, PHY_RW = PHY_R | PHY_W }; static const char phy_regcap[0x20] = { [PHY_STATUS] = PHY_R, [M88E1000_EXT_PHY_SPEC_CTRL] = PHY_RW, @@ -228,8 +242,7 @@ static void e1000_reset(void *opaque) memset(&d->tx, 0, sizeof d->tx); if (d->nic->nc.link_down) { - d->mac_reg[STATUS] &= ~E1000_STATUS_LU; - d->phy_reg[PHY_STATUS] &= ~MII_SR_LINK_STATUS; + e1000_link_down(d); } } @@ -675,11 +688,9 @@ e1000_set_link_status(VLANClientState *nc) uint32_t old_status = s->mac_reg[STATUS]; if (nc->link_down) { - s->mac_reg[STATUS] &= ~E1000_STATUS_LU; - s->phy_reg[PHY_STATUS] &= ~MII_SR_LINK_STATUS; + e1000_link_down(s); } else { - s->mac_reg[STATUS] |= E1000_STATUS_LU; - s->phy_reg[PHY_STATUS] |= MII_SR_LINK_STATUS; + e1000_link_up(s); } if (s->mac_reg[STATUS] != old_status) From f9c1cdf4928e6c036a6373e683d1774129a10311 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 22 Mar 2012 18:02:16 +0800 Subject: [PATCH 10/12] e1000: introduce bit for debugging PHY emulation Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- hw/e1000.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/e1000.c b/hw/e1000.c index 6abec6083b..5f0c2f6d30 100644 --- a/hw/e1000.c +++ b/hw/e1000.c @@ -42,7 +42,7 @@ enum { DEBUG_GENERAL, DEBUG_IO, DEBUG_MMIO, DEBUG_INTERRUPT, DEBUG_RX, DEBUG_TX, DEBUG_MDIC, DEBUG_EEPROM, DEBUG_UNKNOWN, DEBUG_TXSUM, DEBUG_TXERR, DEBUG_RXERR, - DEBUG_RXFILTER, DEBUG_NOTYET, + DEBUG_RXFILTER, DEBUG_PHY, DEBUG_NOTYET, }; #define DBGBIT(x) (1< Date: Thu, 22 Mar 2012 18:02:24 +0800 Subject: [PATCH 11/12] e1000: link auto-negotiation emulation Indeed, there's nothing else except for the time spent on the negotiation needs to be emulated. This is needed for resuming windows guest from hibernation, as without a proper delay, qemu would send the packet too early ( guest even does not have a proper intr handler), which could lead windows guest hang. This patch first introduces an array of function pointers to make it possible to emulate per-register write behavior. Then traps the PHY_CTRL register write and when guest want to restart the link auto negotiation, we would down the link and mark the auto negotiation in progress in PHY_STATUS register. After time, a timer with 500 ms ( which is the minimum timeout of auto-negotation specified in 802.3 spec). The link would be up when timer expired. Test with resuming windows guest plus flood ping and linux ethtool linkstatus test. Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- hw/e1000.c | 45 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/hw/e1000.c b/hw/e1000.c index 5f0c2f6d30..b5c26e0dd8 100644 --- a/hw/e1000.c +++ b/hw/e1000.c @@ -125,6 +125,8 @@ typedef struct E1000State_st { uint16_t reading; uint32_t old_eecd; } eecd_state; + + QEMUTimer *autoneg_timer; } E1000State; #define defreg(x) x = (E1000_##x>>2) @@ -156,6 +158,34 @@ e1000_link_up(E1000State *s) s->phy_reg[PHY_STATUS] |= MII_SR_LINK_STATUS; } +static void +set_phy_ctrl(E1000State *s, int index, uint16_t val) +{ + if ((val & MII_CR_AUTO_NEG_EN) && (val & MII_CR_RESTART_AUTO_NEG)) { + s->nic->nc.link_down = true; + e1000_link_down(s); + s->phy_reg[PHY_STATUS] &= ~MII_SR_AUTONEG_COMPLETE; + DBGOUT(PHY, "Start link auto negotiation\n"); + qemu_mod_timer(s->autoneg_timer, qemu_get_clock_ms(vm_clock) + 500); + } +} + +static void +e1000_autoneg_timer(void *opaque) +{ + E1000State *s = opaque; + s->nic->nc.link_down = false; + e1000_link_up(s); + s->phy_reg[PHY_STATUS] |= MII_SR_AUTONEG_COMPLETE; + DBGOUT(PHY, "Auto negotiation is completed\n"); +} + +static void (*phyreg_writeops[])(E1000State *, int, uint16_t) = { + [PHY_CTRL] = set_phy_ctrl, +}; + +enum { NPHYWRITEOPS = ARRAY_SIZE(phyreg_writeops) }; + enum { PHY_R = 1, PHY_W = 2, PHY_RW = PHY_R | PHY_W }; static const char phy_regcap[0x20] = { [PHY_STATUS] = PHY_R, [M88E1000_EXT_PHY_SPEC_CTRL] = PHY_RW, @@ -167,7 +197,8 @@ static const char phy_regcap[0x20] = { }; static const uint16_t phy_reg_init[] = { - [PHY_CTRL] = 0x1140, [PHY_STATUS] = 0x796d, // link initially up + [PHY_CTRL] = 0x1140, + [PHY_STATUS] = 0x794d, /* link initially up with not completed autoneg */ [PHY_ID1] = 0x141, [PHY_ID2] = PHY_ID2_INIT, [PHY_1000T_CTRL] = 0x0e00, [M88E1000_PHY_SPEC_CTRL] = 0x360, [M88E1000_EXT_PHY_SPEC_CTRL] = 0x0d60, [PHY_AUTONEG_ADV] = 0xde1, @@ -234,6 +265,7 @@ static void e1000_reset(void *opaque) { E1000State *d = opaque; + qemu_del_timer(d->autoneg_timer); memset(d->phy_reg, 0, sizeof d->phy_reg); memmove(d->phy_reg, phy_reg_init, sizeof phy_reg_init); memset(d->mac_reg, 0, sizeof d->mac_reg); @@ -283,8 +315,12 @@ set_mdic(E1000State *s, int index, uint32_t val) if (!(phy_regcap[addr] & PHY_W)) { DBGOUT(MDIC, "MDIC write reg %x unhandled\n", addr); val |= E1000_MDIC_ERROR; - } else + } else { + if (addr < NPHYWRITEOPS && phyreg_writeops[addr]) { + phyreg_writeops[addr](s, index, data); + } s->phy_reg[addr] = data; + } } s->mac_reg[MDIC] = val | E1000_MDIC_READY; @@ -965,6 +1001,7 @@ static void (*macreg_writeops[])(E1000State *, int, uint32_t) = { [MTA ... MTA+127] = &mac_writereg, [VFTA ... VFTA+127] = &mac_writereg, }; + enum { NWRITEOPS = ARRAY_SIZE(macreg_writeops) }; static void @@ -1158,6 +1195,8 @@ pci_e1000_uninit(PCIDevice *dev) { E1000State *d = DO_UPCAST(E1000State, dev, dev); + qemu_del_timer(d->autoneg_timer); + qemu_free_timer(d->autoneg_timer); memory_region_destroy(&d->mmio); memory_region_destroy(&d->io); qemu_del_vlan_client(&d->nic->nc); @@ -1212,6 +1251,8 @@ static int pci_e1000_init(PCIDevice *pci_dev) add_boot_device_path(d->conf.bootindex, &pci_dev->qdev, "/ethernet-phy@0"); + d->autoneg_timer = qemu_new_timer_ms(vm_clock, e1000_autoneg_timer, d); + return 0; } From f1219091edd20e3b92544025c2b6dd5e4d98b61b Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 22 Mar 2012 18:02:34 +0800 Subject: [PATCH 12/12] e1000: set E1000_ICR_INT_ASSERTED only for 8257x E1000_ICR_INT_ASSERTED were introduced only for 8257x, so we need to check the E1000_DEVID before setting this bit in ICS. Signed-off-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- hw/e1000.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hw/e1000.c b/hw/e1000.c index b5c26e0dd8..4573f1301e 100644 --- a/hw/e1000.c +++ b/hw/e1000.c @@ -223,8 +223,10 @@ static const uint32_t mac_reg_init[] = { static void set_interrupt_cause(E1000State *s, int index, uint32_t val) { - if (val) + if (val && (E1000_DEVID >= E1000_DEV_ID_82547EI_MOBILE)) { + /* Only for 8257x */ val |= E1000_ICR_INT_ASSERTED; + } s->mac_reg[ICR] = val; s->mac_reg[ICS] = val; qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0);