From 814cd3ac37be8e71c8ef76234d0da0bbfb2f2fb2 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Sun, 12 Feb 2012 15:11:53 +0200
Subject: [PATCH 01/12] e1000: move reset function earlier in file

Make it easier to reuse this function.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/e1000.c | 80 +++++++++++++++++++++++++++---------------------------
 1 file changed, 40 insertions(+), 40 deletions(-)

diff --git a/hw/e1000.c b/hw/e1000.c
index 9c764624f9..dd067680c9 100644
--- a/hw/e1000.c
+++ b/hw/e1000.c
@@ -152,6 +152,29 @@ static const char phy_regcap[0x20] = {
     [PHY_ID2] = PHY_R,		[M88E1000_PHY_SPEC_STATUS] = PHY_R
 };
 
+static const uint16_t phy_reg_init[] = {
+    [PHY_CTRL] = 0x1140,			[PHY_STATUS] = 0x796d, // link initially up
+    [PHY_ID1] = 0x141,				[PHY_ID2] = PHY_ID2_INIT,
+    [PHY_1000T_CTRL] = 0x0e00,			[M88E1000_PHY_SPEC_CTRL] = 0x360,
+    [M88E1000_EXT_PHY_SPEC_CTRL] = 0x0d60,	[PHY_AUTONEG_ADV] = 0xde1,
+    [PHY_LP_ABILITY] = 0x1e0,			[PHY_1000T_STATUS] = 0x3c00,
+    [M88E1000_PHY_SPEC_STATUS] = 0xac00,
+};
+
+static const uint32_t mac_reg_init[] = {
+    [PBA] =     0x00100030,
+    [LEDCTL] =  0x602,
+    [CTRL] =    E1000_CTRL_SWDPIN2 | E1000_CTRL_SWDPIN0 |
+                E1000_CTRL_SPD_1000 | E1000_CTRL_SLU,
+    [STATUS] =  0x80000000 | E1000_STATUS_GIO_MASTER_ENABLE |
+                E1000_STATUS_ASDV | E1000_STATUS_MTXCKOK |
+                E1000_STATUS_SPEED_1000 | E1000_STATUS_FD |
+                E1000_STATUS_LU,
+    [MANC] =    E1000_MANC_EN_MNG2HOST | E1000_MANC_RCV_TCO_EN |
+                E1000_MANC_ARP_EN | E1000_MANC_0298_EN |
+                E1000_MANC_RMCP_EN,
+};
+
 static void
 set_interrupt_cause(E1000State *s, int index, uint32_t val)
 {
@@ -193,6 +216,23 @@ rxbufsize(uint32_t v)
     return 2048;
 }
 
+static void e1000_reset(void *opaque)
+{
+    E1000State *d = opaque;
+
+    memset(d->phy_reg, 0, sizeof d->phy_reg);
+    memmove(d->phy_reg, phy_reg_init, sizeof phy_reg_init);
+    memset(d->mac_reg, 0, sizeof d->mac_reg);
+    memmove(d->mac_reg, mac_reg_init, sizeof mac_reg_init);
+    d->rxbuf_min_shift = 1;
+    memset(&d->tx, 0, sizeof d->tx);
+
+    if (d->nic->nc.link_down) {
+        d->mac_reg[STATUS] &= ~E1000_STATUS_LU;
+        d->phy_reg[PHY_STATUS] &= ~MII_SR_LINK_STATUS;
+    }
+}
+
 static void
 set_ctrl(E1000State *s, int index, uint32_t val)
 {
@@ -1061,29 +1101,6 @@ static const uint16_t e1000_eeprom_template[64] = {
     0xffff, 0xffff, 0xffff, 0xffff,      0xffff, 0xffff,      0xffff, 0x0000,
 };
 
-static const uint16_t phy_reg_init[] = {
-    [PHY_CTRL] = 0x1140,			[PHY_STATUS] = 0x796d, // link initially up
-    [PHY_ID1] = 0x141,				[PHY_ID2] = PHY_ID2_INIT,
-    [PHY_1000T_CTRL] = 0x0e00,			[M88E1000_PHY_SPEC_CTRL] = 0x360,
-    [M88E1000_EXT_PHY_SPEC_CTRL] = 0x0d60,	[PHY_AUTONEG_ADV] = 0xde1,
-    [PHY_LP_ABILITY] = 0x1e0,			[PHY_1000T_STATUS] = 0x3c00,
-    [M88E1000_PHY_SPEC_STATUS] = 0xac00,
-};
-
-static const uint32_t mac_reg_init[] = {
-    [PBA] =     0x00100030,
-    [LEDCTL] =  0x602,
-    [CTRL] =    E1000_CTRL_SWDPIN2 | E1000_CTRL_SWDPIN0 |
-                E1000_CTRL_SPD_1000 | E1000_CTRL_SLU,
-    [STATUS] =  0x80000000 | E1000_STATUS_GIO_MASTER_ENABLE |
-                E1000_STATUS_ASDV | E1000_STATUS_MTXCKOK |
-                E1000_STATUS_SPEED_1000 | E1000_STATUS_FD |
-                E1000_STATUS_LU,
-    [MANC] =    E1000_MANC_EN_MNG2HOST | E1000_MANC_RCV_TCO_EN |
-                E1000_MANC_ARP_EN | E1000_MANC_0298_EN |
-                E1000_MANC_RMCP_EN,
-};
-
 /* PCI interface */
 
 static void
@@ -1123,23 +1140,6 @@ pci_e1000_uninit(PCIDevice *dev)
     return 0;
 }
 
-static void e1000_reset(void *opaque)
-{
-    E1000State *d = opaque;
-
-    memset(d->phy_reg, 0, sizeof d->phy_reg);
-    memmove(d->phy_reg, phy_reg_init, sizeof phy_reg_init);
-    memset(d->mac_reg, 0, sizeof d->mac_reg);
-    memmove(d->mac_reg, mac_reg_init, sizeof mac_reg_init);
-    d->rxbuf_min_shift = 1;
-    memset(&d->tx, 0, sizeof d->tx);
-
-    if (d->nic->nc.link_down) {
-        d->mac_reg[STATUS] &= ~E1000_STATUS_LU;
-        d->phy_reg[PHY_STATUS] &= ~MII_SR_LINK_STATUS;
-    }
-}
-
 static NetClientInfo net_e1000_info = {
     .type = NET_CLIENT_TYPE_NIC,
     .size = sizeof(NICState),

From a281ebc11a6917fbc27e1a93bb5772cd14e241fc Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Sun, 22 Apr 2012 16:45:53 +0300
Subject: [PATCH 02/12] virtio: add missing mb() on notification

During normal operation, virtio first writes a used index
and then checks whether it should interrupt the guest
by reading guest avail index/flag values.

Guest does the reverse: writes the index/flag,
then checks the used ring.

The ordering is important: if host avail flag read bypasses the used
index write, we could in effect get this timing:

host avail flag read
		guest enable interrupts: avail flag write
		guest check used ring: ring is empty
host used index write

which results in a lost interrupt: guest will never be notified
about the used ring update.

This actually can happen when using kvm with an io thread,
such that the guest vcpu and qemu run on different host cpus,
and this has actually been observed in the field
(but only seems to trigger on very specific processor types)
with userspace virtio: vhost has the necessary smp_mb()
in place to prevent the regordering, so the same workload stalls
forever waiting for an interrupt with vhost=off but works
fine with vhost=on.

Insert an smp_mb barrier operation in userspace virtio to
ensure the correct ordering.
Applying this patch fixed the race condition we have observed.
Tested on x86_64. I checked the code generated by the new macro
for i386 and ppc but didn't run virtio.

Note: mb could in theory be implemented by __sync_synchronize, but this
would make us hit old GCC bugs. Besides old GCC
not implementing __sync_synchronize at all, there were bugs
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36793
in this functionality as recently as in 4.3.

As we need asm for rmb,wmb anyway, it's just as well to
use it for mb.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio.c    |  2 ++
 qemu-barrier.h | 23 ++++++++++++++++++++---
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/hw/virtio.c b/hw/virtio.c
index 314abf8a18..bb994c9b51 100644
--- a/hw/virtio.c
+++ b/hw/virtio.c
@@ -700,6 +700,8 @@ static bool vring_notify(VirtIODevice *vdev, VirtQueue *vq)
 {
     uint16_t old, new;
     bool v;
+    /* We need to expose used array entries before checking used event. */
+    smp_mb();
     /* Always notify when queue is empty (when feature acknowledge) */
     if (((vdev->guest_features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY)) &&
          !vq->inuse && vring_avail_idx(vq) == vq->last_avail_idx)) {
diff --git a/qemu-barrier.h b/qemu-barrier.h
index c11bb2b59f..f0b842e5b2 100644
--- a/qemu-barrier.h
+++ b/qemu-barrier.h
@@ -4,7 +4,7 @@
 /* Compiler barrier */
 #define barrier()   asm volatile("" ::: "memory")
 
-#if defined(__i386__) || defined(__x86_64__)
+#if defined(__i386__)
 
 /*
  * Because of the strongly ordered x86 storage model, wmb() is a nop
@@ -13,15 +13,31 @@
  * load/stores from C code.
  */
 #define smp_wmb()   barrier()
+/*
+ * We use GCC builtin if it's available, as that can use
+ * mfence on 32 bit as well, e.g. if built with -march=pentium-m.
+ * However, on i386, there seem to be known bugs as recently as 4.3.
+ * */
+#if defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 4
+#define smp_mb() __sync_synchronize()
+#else
+#define smp_mb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
+#endif
+
+#elif defined(__x86_64__)
+
+#define smp_wmb()   barrier()
+#define smp_mb() asm volatile("mfence" ::: "memory")
 
 #elif defined(_ARCH_PPC)
 
 /*
- * We use an eieio() for a wmb() on powerpc.  This assumes we don't
+ * We use an eieio() for wmb() on powerpc.  This assumes we don't
  * need to order cacheable and non-cacheable stores with respect to
  * each other
  */
 #define smp_wmb()   asm volatile("eieio" ::: "memory")
+#define smp_mb()   asm volatile("sync" ::: "memory")
 
 #else
 
@@ -29,9 +45,10 @@
  * For (host) platforms we don't have explicit barrier definitions
  * for, we use the gcc __sync_synchronize() primitive to generate a
  * full barrier.  This should be safe on all platforms, though it may
- * be overkill.
+ * be overkill for wmb().
  */
 #define smp_wmb()   __sync_synchronize()
+#define smp_mb()   __sync_synchronize()
 
 #endif
 

From 92045d80badc43c9f95897aad675dc7ef17a3b3f Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Mon, 23 Apr 2012 14:11:14 +0300
Subject: [PATCH 03/12] virtio: add missing mb() on enable notification

This fixes an issue dual to the one fixed by
patch 'virtio: add missing mb() on notification'
and applies on top.

In this case, to enable vq kick to exit to host,
qemu writes out used flag then reads the
avail index. if these are reordered we get a race:

    host avail index read: ring is empty
    		guest avail index write
    		guest flag read: exit disabled
    host used flag write: enable exit

which results in a lost exit: host will never be notified about the
avail index update.  Again, happens in the field but only seems to
trigger on some specific hardware.

Insert an smp_mb barrier operation to ensure the correct ordering.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/hw/virtio.c b/hw/virtio.c
index bb994c9b51..5615b59a6c 100644
--- a/hw/virtio.c
+++ b/hw/virtio.c
@@ -209,6 +209,10 @@ void virtio_queue_set_notification(VirtQueue *vq, int enable)
     } else {
         vring_used_flags_set_bit(vq, VRING_USED_F_NO_NOTIFY);
     }
+    if (enable) {
+        /* Expose avail event/used flags before caller checks the avail idx. */
+        smp_mb();
+    }
 }
 
 int virtio_queue_ready(VirtQueue *vq)

From a821ce59338c79bb72dc844dd44ea53701965b2b Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Mon, 23 Apr 2012 15:46:22 +0300
Subject: [PATCH 04/12] virtio: order index/descriptor reads

virtio has the equivalent of:

	if (vq->last_avail_index != vring_avail_idx(vq)) {
		read descriptor head at vq->last_avail_index;
	}

In theory, processor can reorder descriptor head
read to happen speculatively before the index read.
this would trigger the following race:

	host descriptor head read <- reads invalid head from ring
		guest writes valid descriptor head
		guest writes avail index
	host avail index read <- observes valid index

as a result host will use an invalid head value.
This was not observed in the field by me but after
the experience with the previous two races
I think it is prudent to address this theoretical race condition.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio.c    |  5 +++++
 qemu-barrier.h | 14 ++++++++++++--
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/hw/virtio.c b/hw/virtio.c
index 5615b59a6c..168abe4864 100644
--- a/hw/virtio.c
+++ b/hw/virtio.c
@@ -287,6 +287,11 @@ static int virtqueue_num_heads(VirtQueue *vq, unsigned int idx)
                      idx, vring_avail_idx(vq));
         exit(1);
     }
+    /* On success, callers read a descriptor at vq->last_avail_idx.
+     * Make sure descriptor read does not bypass avail index read. */
+    if (num_heads) {
+        smp_rmb();
+    }
 
     return num_heads;
 }
diff --git a/qemu-barrier.h b/qemu-barrier.h
index f0b842e5b2..7e11197814 100644
--- a/qemu-barrier.h
+++ b/qemu-barrier.h
@@ -7,12 +7,13 @@
 #if defined(__i386__)
 
 /*
- * Because of the strongly ordered x86 storage model, wmb() is a nop
+ * Because of the strongly ordered x86 storage model, wmb() and rmb() are nops
  * on x86(well, a compiler barrier only).  Well, at least as long as
  * qemu doesn't do accesses to write-combining memory or non-temporal
  * load/stores from C code.
  */
 #define smp_wmb()   barrier()
+#define smp_rmb()   barrier()
 /*
  * We use GCC builtin if it's available, as that can use
  * mfence on 32 bit as well, e.g. if built with -march=pentium-m.
@@ -27,6 +28,7 @@
 #elif defined(__x86_64__)
 
 #define smp_wmb()   barrier()
+#define smp_rmb()   barrier()
 #define smp_mb() asm volatile("mfence" ::: "memory")
 
 #elif defined(_ARCH_PPC)
@@ -37,6 +39,13 @@
  * each other
  */
 #define smp_wmb()   asm volatile("eieio" ::: "memory")
+
+#if defined(__powerpc64__)
+#define smp_rmb()   asm volatile("lwsync" ::: "memory")
+#else
+#define smp_rmb()   asm volatile("sync" ::: "memory")
+#endif
+
 #define smp_mb()   asm volatile("sync" ::: "memory")
 
 #else
@@ -45,10 +54,11 @@
  * For (host) platforms we don't have explicit barrier definitions
  * for, we use the gcc __sync_synchronize() primitive to generate a
  * full barrier.  This should be safe on all platforms, though it may
- * be overkill for wmb().
+ * be overkill for wmb() and rmb().
  */
 #define smp_wmb()   __sync_synchronize()
 #define smp_mb()   __sync_synchronize()
+#define smp_rmb()   __sync_synchronize()
 
 #endif
 

From 69f3ce78cc0b6941fc76dcbe2118269a16ddf043 Mon Sep 17 00:00:00 2001
From: Stefan Weil <sw@weilnetz.de>
Date: Tue, 10 Apr 2012 20:48:54 +0200
Subject: [PATCH 05/12] eepro100: Fix multicast regression

Commit 7fc8d918b9674c3e9233d6d25da2457345d414a0 removed code from
eepro100.c and replaced it by different code: the code in net.c
returns bits 31...26, but eepro100 needs bits 7...2.

This patch partially reverts 7fc8d918b9674c3e9233d6d25da2457345d414a0.
To avoid future problems, I renamed the function and changed the comment.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/eepro100.c | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/hw/eepro100.c b/hw/eepro100.c
index 02e6f7ef31..6279ae36ec 100644
--- a/hw/eepro100.c
+++ b/hw/eepro100.c
@@ -322,8 +322,32 @@ static const uint16_t eepro100_mdi_mask[] = {
     0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
 };
 
+#define POLYNOMIAL 0x04c11db6
+
 static E100PCIDeviceInfo *eepro100_get_class(EEPRO100State *s);
 
+/* From FreeBSD (locally modified). */
+static unsigned e100_compute_mcast_idx(const uint8_t *ep)
+{
+    uint32_t crc;
+    int carry, i, j;
+    uint8_t b;
+
+    crc = 0xffffffff;
+    for (i = 0; i < 6; i++) {
+        b = *ep++;
+        for (j = 0; j < 8; j++) {
+            carry = ((crc & 0x80000000L) ? 1 : 0) ^ (b & 0x01);
+            crc <<= 1;
+            b >>= 1;
+            if (carry) {
+                crc = ((crc ^ POLYNOMIAL) | carry);
+            }
+        }
+    }
+    return (crc & BITS(7, 2)) >> 2;
+}
+
 /* Read a 16 bit control/status (CSR) register. */
 static uint16_t e100_read_reg2(EEPRO100State *s, E100RegisterOffset addr)
 {
@@ -823,7 +847,7 @@ static void set_multicast_list(EEPRO100State *s)
         uint8_t multicast_addr[6];
         pci_dma_read(&s->dev, s->cb_address + 10 + i, multicast_addr, 6);
         TRACE(OTHER, logout("multicast entry %s\n", nic_dump(multicast_addr, 6)));
-        unsigned mcast_idx = compute_mcast_idx(multicast_addr);
+        unsigned mcast_idx = e100_compute_mcast_idx(multicast_addr);
         assert(mcast_idx < 64);
         s->mult[mcast_idx >> 3] |= (1 << (mcast_idx & 7));
     }
@@ -1650,7 +1674,7 @@ static ssize_t nic_receive(VLANClientState *nc, const uint8_t * buf, size_t size
         if (s->configuration[21] & BIT(3)) {
           /* Multicast all bit is set, receive all multicast frames. */
         } else {
-          unsigned mcast_idx = compute_mcast_idx(buf);
+          unsigned mcast_idx = e100_compute_mcast_idx(buf);
           assert(mcast_idx < 64);
           if (s->mult[mcast_idx >> 3] & (1 << (mcast_idx & 7))) {
             /* Multicast frame is allowed in hash table. */

From 2e54cc21ad60219229df569a857a453285795565 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Thu, 22 Mar 2012 18:01:42 +0800
Subject: [PATCH 06/12] e1000: introduce bits of PHY control register

This would be used be following patches.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/e1000_hw.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/hw/e1000_hw.h b/hw/e1000_hw.h
index 9e29af8c82..c9cb79e64d 100644
--- a/hw/e1000_hw.h
+++ b/hw/e1000_hw.h
@@ -349,6 +349,18 @@
 #define M88E1000_PHY_VCO_REG_BIT8  0x100 /* Bits 8 & 11 are adjusted for */
 #define M88E1000_PHY_VCO_REG_BIT11 0x800    /* improved BER performance */
 
+/* PHY Control Register */
+#define MII_CR_SPEED_SELECT_MSB 0x0040 /* bits 6,13: 10=1000, 01=100, 00=10 */
+#define MII_CR_COLL_TEST_ENABLE 0x0080 /* Collision test enable */
+#define MII_CR_FULL_DUPLEX      0x0100 /* FDX =1, half duplex =0 */
+#define MII_CR_RESTART_AUTO_NEG 0x0200 /* Restart auto negotiation */
+#define MII_CR_ISOLATE          0x0400 /* Isolate PHY from MII */
+#define MII_CR_POWER_DOWN       0x0800 /* Power down */
+#define MII_CR_AUTO_NEG_EN      0x1000 /* Auto Neg Enable */
+#define MII_CR_SPEED_SELECT_LSB 0x2000 /* bits 6,13: 10=1000, 01=100, 00=10 */
+#define MII_CR_LOOPBACK         0x4000 /* 0 = normal, 1 = loopback */
+#define MII_CR_RESET            0x8000 /* 0 = normal, 1 = PHY reset */
+
 /* PHY Status Register */
 #define MII_SR_EXTENDED_CAPS     0x0001	/* Extended register capabilities */
 #define MII_SR_JABBER_DETECT     0x0002	/* Jabber Detected */

From 17fbbb0b3d0b09f07d288991248c81f441bf6941 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Thu, 22 Mar 2012 18:01:50 +0800
Subject: [PATCH 07/12] e1000: conditionally raise irq at the end of MDI cycle

According to the spec:

"When set to 1b by software, it causes an Interrupt to be
asserted to indicate the end of an MDI cycle."

We need check the Interrupt Enable bit and raise irq only when it is
set.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/e1000.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/hw/e1000.c b/hw/e1000.c
index dd067680c9..4e787bc2cf 100644
--- a/hw/e1000.c
+++ b/hw/e1000.c
@@ -274,7 +274,10 @@ set_mdic(E1000State *s, int index, uint32_t val)
             s->phy_reg[addr] = data;
     }
     s->mac_reg[MDIC] = val | E1000_MDIC_READY;
-    set_ics(s, 0, E1000_ICR_MDAC);
+
+    if (val & E1000_MDIC_INT_EN) {
+        set_ics(s, 0, E1000_ICR_MDAC);
+    }
 }
 
 static uint32_t

From 93e37d769074bebfd04c6704b7a7ae92736876e3 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Thu, 22 Mar 2012 18:01:59 +0800
Subject: [PATCH 08/12] e1000: PHY loopback mode support

The missing of loopback mode prevent the running of self diagnosis
program in guest. This patch adds this support.

After this patch, loopback test of ethtool were passed in guest.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/e1000.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/hw/e1000.c b/hw/e1000.c
index 4e787bc2cf..4a09e39975 100644
--- a/hw/e1000.c
+++ b/hw/e1000.c
@@ -391,6 +391,16 @@ fcs_len(E1000State *s)
     return (s->mac_reg[RCTL] & E1000_RCTL_SECRC) ? 0 : 4;
 }
 
+static void
+e1000_send_packet(E1000State *s, const uint8_t *buf, int size)
+{
+    if (s->phy_reg[PHY_CTRL] & MII_CR_LOOPBACK) {
+        s->nic->nc.info->receive(&s->nic->nc, buf, size);
+    } else {
+        qemu_send_packet(&s->nic->nc, buf, size);
+    }
+}
+
 static void
 xmit_seg(E1000State *s)
 {
@@ -440,9 +450,9 @@ xmit_seg(E1000State *s)
         memmove(tp->vlan, tp->data, 4);
         memmove(tp->data, tp->data + 4, 8);
         memcpy(tp->data + 8, tp->vlan_header, 4);
-        qemu_send_packet(&s->nic->nc, tp->vlan, tp->size + 4);
+        e1000_send_packet(s, tp->vlan, tp->size + 4);
     } else
-        qemu_send_packet(&s->nic->nc, tp->data, tp->size);
+        e1000_send_packet(s, tp->data, tp->size);
     s->mac_reg[TPT]++;
     s->mac_reg[GPTC]++;
     n = s->mac_reg[TOTL];

From 71aadd3cd2b2f8bd3113a08a12d4a06d48b6c39c Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Thu, 22 Mar 2012 18:02:07 +0800
Subject: [PATCH 09/12] e1000: introduce helpers to manipulate link status

This patch introduces helpers to change link status bit for phy/mac
register. This would help to reduce code duplication and would be used
by following patches.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/e1000.c | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/hw/e1000.c b/hw/e1000.c
index 4a09e39975..6abec6083b 100644
--- a/hw/e1000.c
+++ b/hw/e1000.c
@@ -142,6 +142,20 @@ enum {
     defreg(VET),
 };
 
+static void
+e1000_link_down(E1000State *s)
+{
+    s->mac_reg[STATUS] &= ~E1000_STATUS_LU;
+    s->phy_reg[PHY_STATUS] &= ~MII_SR_LINK_STATUS;
+}
+
+static void
+e1000_link_up(E1000State *s)
+{
+    s->mac_reg[STATUS] |= E1000_STATUS_LU;
+    s->phy_reg[PHY_STATUS] |= MII_SR_LINK_STATUS;
+}
+
 enum { PHY_R = 1, PHY_W = 2, PHY_RW = PHY_R | PHY_W };
 static const char phy_regcap[0x20] = {
     [PHY_STATUS] = PHY_R,	[M88E1000_EXT_PHY_SPEC_CTRL] = PHY_RW,
@@ -228,8 +242,7 @@ static void e1000_reset(void *opaque)
     memset(&d->tx, 0, sizeof d->tx);
 
     if (d->nic->nc.link_down) {
-        d->mac_reg[STATUS] &= ~E1000_STATUS_LU;
-        d->phy_reg[PHY_STATUS] &= ~MII_SR_LINK_STATUS;
+        e1000_link_down(d);
     }
 }
 
@@ -675,11 +688,9 @@ e1000_set_link_status(VLANClientState *nc)
     uint32_t old_status = s->mac_reg[STATUS];
 
     if (nc->link_down) {
-        s->mac_reg[STATUS] &= ~E1000_STATUS_LU;
-        s->phy_reg[PHY_STATUS] &= ~MII_SR_LINK_STATUS;
+        e1000_link_down(s);
     } else {
-        s->mac_reg[STATUS] |= E1000_STATUS_LU;
-        s->phy_reg[PHY_STATUS] |= MII_SR_LINK_STATUS;
+        e1000_link_up(s);
     }
 
     if (s->mac_reg[STATUS] != old_status)

From f9c1cdf4928e6c036a6373e683d1774129a10311 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Thu, 22 Mar 2012 18:02:16 +0800
Subject: [PATCH 10/12] e1000: introduce bit for debugging PHY emulation

Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/e1000.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/e1000.c b/hw/e1000.c
index 6abec6083b..5f0c2f6d30 100644
--- a/hw/e1000.c
+++ b/hw/e1000.c
@@ -42,7 +42,7 @@ enum {
     DEBUG_GENERAL,	DEBUG_IO,	DEBUG_MMIO,	DEBUG_INTERRUPT,
     DEBUG_RX,		DEBUG_TX,	DEBUG_MDIC,	DEBUG_EEPROM,
     DEBUG_UNKNOWN,	DEBUG_TXSUM,	DEBUG_TXERR,	DEBUG_RXERR,
-    DEBUG_RXFILTER,	DEBUG_NOTYET,
+    DEBUG_RXFILTER,     DEBUG_PHY,      DEBUG_NOTYET,
 };
 #define DBGBIT(x)	(1<<DEBUG_##x)
 static int debugflags = DBGBIT(TXERR) | DBGBIT(GENERAL);

From b9d03e352cb6b31a66545763f6a1e20c9abf0c2c Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Thu, 22 Mar 2012 18:02:24 +0800
Subject: [PATCH 11/12] e1000: link auto-negotiation emulation

Indeed, there's nothing else except for the time spent on the
negotiation needs to be emulated. This is needed for resuming windows
guest from hibernation, as without a proper delay, qemu would send the
packet too early ( guest even does not have a proper intr handler),
which could lead windows guest hang.

This patch first introduces an array of function pointers to make it
possible to emulate per-register write behavior. Then traps the
PHY_CTRL register write and when guest want to restart the link auto
negotiation, we would down the link and mark the auto negotiation in
progress in PHY_STATUS register. After time, a timer with 500 ms (
which is the minimum timeout of auto-negotation specified in 802.3
spec). The link would be up when timer expired.

Test with resuming windows guest plus flood ping and linux ethtool
linkstatus test.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/e1000.c | 45 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 43 insertions(+), 2 deletions(-)

diff --git a/hw/e1000.c b/hw/e1000.c
index 5f0c2f6d30..b5c26e0dd8 100644
--- a/hw/e1000.c
+++ b/hw/e1000.c
@@ -125,6 +125,8 @@ typedef struct E1000State_st {
         uint16_t reading;
         uint32_t old_eecd;
     } eecd_state;
+
+    QEMUTimer *autoneg_timer;
 } E1000State;
 
 #define	defreg(x)	x = (E1000_##x>>2)
@@ -156,6 +158,34 @@ e1000_link_up(E1000State *s)
     s->phy_reg[PHY_STATUS] |= MII_SR_LINK_STATUS;
 }
 
+static void
+set_phy_ctrl(E1000State *s, int index, uint16_t val)
+{
+    if ((val & MII_CR_AUTO_NEG_EN) && (val & MII_CR_RESTART_AUTO_NEG)) {
+        s->nic->nc.link_down = true;
+        e1000_link_down(s);
+        s->phy_reg[PHY_STATUS] &= ~MII_SR_AUTONEG_COMPLETE;
+        DBGOUT(PHY, "Start link auto negotiation\n");
+        qemu_mod_timer(s->autoneg_timer, qemu_get_clock_ms(vm_clock) + 500);
+    }
+}
+
+static void
+e1000_autoneg_timer(void *opaque)
+{
+    E1000State *s = opaque;
+    s->nic->nc.link_down = false;
+    e1000_link_up(s);
+    s->phy_reg[PHY_STATUS] |= MII_SR_AUTONEG_COMPLETE;
+    DBGOUT(PHY, "Auto negotiation is completed\n");
+}
+
+static void (*phyreg_writeops[])(E1000State *, int, uint16_t) = {
+    [PHY_CTRL] = set_phy_ctrl,
+};
+
+enum { NPHYWRITEOPS = ARRAY_SIZE(phyreg_writeops) };
+
 enum { PHY_R = 1, PHY_W = 2, PHY_RW = PHY_R | PHY_W };
 static const char phy_regcap[0x20] = {
     [PHY_STATUS] = PHY_R,	[M88E1000_EXT_PHY_SPEC_CTRL] = PHY_RW,
@@ -167,7 +197,8 @@ static const char phy_regcap[0x20] = {
 };
 
 static const uint16_t phy_reg_init[] = {
-    [PHY_CTRL] = 0x1140,			[PHY_STATUS] = 0x796d, // link initially up
+    [PHY_CTRL] = 0x1140,
+    [PHY_STATUS] = 0x794d, /* link initially up with not completed autoneg */
     [PHY_ID1] = 0x141,				[PHY_ID2] = PHY_ID2_INIT,
     [PHY_1000T_CTRL] = 0x0e00,			[M88E1000_PHY_SPEC_CTRL] = 0x360,
     [M88E1000_EXT_PHY_SPEC_CTRL] = 0x0d60,	[PHY_AUTONEG_ADV] = 0xde1,
@@ -234,6 +265,7 @@ static void e1000_reset(void *opaque)
 {
     E1000State *d = opaque;
 
+    qemu_del_timer(d->autoneg_timer);
     memset(d->phy_reg, 0, sizeof d->phy_reg);
     memmove(d->phy_reg, phy_reg_init, sizeof phy_reg_init);
     memset(d->mac_reg, 0, sizeof d->mac_reg);
@@ -283,8 +315,12 @@ set_mdic(E1000State *s, int index, uint32_t val)
         if (!(phy_regcap[addr] & PHY_W)) {
             DBGOUT(MDIC, "MDIC write reg %x unhandled\n", addr);
             val |= E1000_MDIC_ERROR;
-        } else
+        } else {
+            if (addr < NPHYWRITEOPS && phyreg_writeops[addr]) {
+                phyreg_writeops[addr](s, index, data);
+            }
             s->phy_reg[addr] = data;
+        }
     }
     s->mac_reg[MDIC] = val | E1000_MDIC_READY;
 
@@ -965,6 +1001,7 @@ static void (*macreg_writeops[])(E1000State *, int, uint32_t) = {
     [MTA ... MTA+127] = &mac_writereg,
     [VFTA ... VFTA+127] = &mac_writereg,
 };
+
 enum { NWRITEOPS = ARRAY_SIZE(macreg_writeops) };
 
 static void
@@ -1158,6 +1195,8 @@ pci_e1000_uninit(PCIDevice *dev)
 {
     E1000State *d = DO_UPCAST(E1000State, dev, dev);
 
+    qemu_del_timer(d->autoneg_timer);
+    qemu_free_timer(d->autoneg_timer);
     memory_region_destroy(&d->mmio);
     memory_region_destroy(&d->io);
     qemu_del_vlan_client(&d->nic->nc);
@@ -1212,6 +1251,8 @@ static int pci_e1000_init(PCIDevice *pci_dev)
 
     add_boot_device_path(d->conf.bootindex, &pci_dev->qdev, "/ethernet-phy@0");
 
+    d->autoneg_timer = qemu_new_timer_ms(vm_clock, e1000_autoneg_timer, d);
+
     return 0;
 }
 

From f1219091edd20e3b92544025c2b6dd5e4d98b61b Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Thu, 22 Mar 2012 18:02:34 +0800
Subject: [PATCH 12/12] e1000: set E1000_ICR_INT_ASSERTED only for 8257x

E1000_ICR_INT_ASSERTED were introduced only for 8257x, so we need to
check the E1000_DEVID before setting this bit in ICS.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/e1000.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/e1000.c b/hw/e1000.c
index b5c26e0dd8..4573f1301e 100644
--- a/hw/e1000.c
+++ b/hw/e1000.c
@@ -223,8 +223,10 @@ static const uint32_t mac_reg_init[] = {
 static void
 set_interrupt_cause(E1000State *s, int index, uint32_t val)
 {
-    if (val)
+    if (val && (E1000_DEVID >= E1000_DEV_ID_82547EI_MOBILE)) {
+        /* Only for 8257x */
         val |= E1000_ICR_INT_ASSERTED;
+    }
     s->mac_reg[ICR] = val;
     s->mac_reg[ICS] = val;
     qemu_set_irq(s->dev.irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0);