From fc02e8cb0300c0146e1d4668a75663eb225d8182 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Sat, 9 Oct 2021 10:52:50 -0400
Subject: [PATCH 01/45] virtio_net: clarify tailroom logic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make tailroom math follow same logic as everything else, subtracing
values in the order in which things are laid out in the buffer.

Tested-by: Corentin Noël <corentin.noel@collabora.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/net/virtio_net.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 4ad25a8b0870..6d8c8745bf24 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -406,12 +406,13 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 	 * add_recvbuf_mergeable() + get_mergeable_buf_len()
 	 */
 	truesize = headroom ? PAGE_SIZE : truesize;
-	tailroom = truesize - len - headroom - (hdr_padded_len - hdr_len);
+	tailroom = truesize - headroom;
 	buf = p - headroom;
 
 	len -= hdr_len;
 	offset += hdr_padded_len;
 	p += hdr_padded_len;
+	tailroom -= hdr_padded_len + len;
 
 	shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 

From 02746e26c39ee473b975e0f68d1295abc92672ed Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <mgurtovoy@nvidia.com>
Date: Wed, 1 Sep 2021 16:14:34 +0300
Subject: [PATCH 02/45] virtio-blk: avoid preallocating big SGL for data

No need to pre-allocate a big buffer for the IO SGL anymore. If a device
has lots of deep queues, preallocation for the sg list can consume
substantial amounts of memory. For HW virtio-blk device, nr_hw_queues
can be 64 or 128 and each queue's depth might be 128. This means the
resulting preallocation for the data SGLs is big.

Switch to runtime allocation for SGL for lists longer than 2 entries.
This is the approach used by NVMe drivers so it should be reasonable for
virtio block as well. Runtime SGL allocation has always been the case
for the legacy I/O path so this is nothing new.

The preallocated small SGL depends on SG_CHAIN so if the ARCH doesn't
support SG_CHAIN, use only runtime allocation for the SGL.

Re-organize the setup of the IO request to fit the new sg chain
mechanism.

No performance degradation was seen (fio libaio engine with 16 jobs and
128 iodepth):

IO size      IOPs Rand Read (before/after)         IOPs Rand Write (before/after)
--------     ---------------------------------    ----------------------------------
512B          318K/316K                                    329K/325K

4KB           323K/321K                                    353K/349K

16KB          199K/208K                                    250K/275K

128KB         36K/36.1K                                    39.2K/41.7K

Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Reviewed-by: Israel Rukshin <israelr@nvidia.com>
Link: https://lore.kernel.org/r/20210901131434.31158-1-mgurtovoy@nvidia.com
Reviewed-by: Feng Li <lifeng1519@gmail.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Arnd Bergmann <arnd@arndb.de> # kconfig fixups
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/block/Kconfig      |   1 +
 drivers/block/virtio_blk.c | 156 ++++++++++++++++++++++++-------------
 2 files changed, 101 insertions(+), 56 deletions(-)

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index ab3e37aa1830..33a3f77dce27 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -394,6 +394,7 @@ config XEN_BLKDEV_BACKEND
 config VIRTIO_BLK
 	tristate "Virtio block driver"
 	depends on VIRTIO
+	select SG_POOL
 	help
 	  This is the virtual block driver for virtio.  It can be used with
           QEMU based VMMs (like KVM or Xen).  Say Y or M.
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 303caf2d17d0..6767fb709d25 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -24,6 +24,12 @@
 /* The maximum number of sg elements that fit into a virtqueue */
 #define VIRTIO_BLK_MAX_SG_ELEMS 32768
 
+#ifdef CONFIG_ARCH_NO_SG_CHAIN
+#define VIRTIO_BLK_INLINE_SG_CNT	0
+#else
+#define VIRTIO_BLK_INLINE_SG_CNT	2
+#endif
+
 static int major;
 static DEFINE_IDA(vd_index_ida);
 
@@ -77,6 +83,7 @@ struct virtio_blk {
 struct virtblk_req {
 	struct virtio_blk_outhdr out_hdr;
 	u8 status;
+	struct sg_table sg_table;
 	struct scatterlist sg[];
 };
 
@@ -162,12 +169,92 @@ static int virtblk_setup_discard_write_zeroes(struct request *req, bool unmap)
 	return 0;
 }
 
+static void virtblk_unmap_data(struct request *req, struct virtblk_req *vbr)
+{
+	if (blk_rq_nr_phys_segments(req))
+		sg_free_table_chained(&vbr->sg_table,
+				      VIRTIO_BLK_INLINE_SG_CNT);
+}
+
+static int virtblk_map_data(struct blk_mq_hw_ctx *hctx, struct request *req,
+		struct virtblk_req *vbr)
+{
+	int err;
+
+	if (!blk_rq_nr_phys_segments(req))
+		return 0;
+
+	vbr->sg_table.sgl = vbr->sg;
+	err = sg_alloc_table_chained(&vbr->sg_table,
+				     blk_rq_nr_phys_segments(req),
+				     vbr->sg_table.sgl,
+				     VIRTIO_BLK_INLINE_SG_CNT);
+	if (unlikely(err))
+		return -ENOMEM;
+
+	return blk_rq_map_sg(hctx->queue, req, vbr->sg_table.sgl);
+}
+
+static void virtblk_cleanup_cmd(struct request *req)
+{
+	if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
+		kfree(bvec_virt(&req->special_vec));
+}
+
+static int virtblk_setup_cmd(struct virtio_device *vdev, struct request *req,
+		struct virtblk_req *vbr)
+{
+	bool unmap = false;
+	u32 type;
+
+	vbr->out_hdr.sector = 0;
+
+	switch (req_op(req)) {
+	case REQ_OP_READ:
+		type = VIRTIO_BLK_T_IN;
+		vbr->out_hdr.sector = cpu_to_virtio64(vdev,
+						      blk_rq_pos(req));
+		break;
+	case REQ_OP_WRITE:
+		type = VIRTIO_BLK_T_OUT;
+		vbr->out_hdr.sector = cpu_to_virtio64(vdev,
+						      blk_rq_pos(req));
+		break;
+	case REQ_OP_FLUSH:
+		type = VIRTIO_BLK_T_FLUSH;
+		break;
+	case REQ_OP_DISCARD:
+		type = VIRTIO_BLK_T_DISCARD;
+		break;
+	case REQ_OP_WRITE_ZEROES:
+		type = VIRTIO_BLK_T_WRITE_ZEROES;
+		unmap = !(req->cmd_flags & REQ_NOUNMAP);
+		break;
+	case REQ_OP_DRV_IN:
+		type = VIRTIO_BLK_T_GET_ID;
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return BLK_STS_IOERR;
+	}
+
+	vbr->out_hdr.type = cpu_to_virtio32(vdev, type);
+	vbr->out_hdr.ioprio = cpu_to_virtio32(vdev, req_get_ioprio(req));
+
+	if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES) {
+		if (virtblk_setup_discard_write_zeroes(req, unmap))
+			return BLK_STS_RESOURCE;
+	}
+
+	return 0;
+}
+
 static inline void virtblk_request_done(struct request *req)
 {
 	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
 
-	if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
-		kfree(bvec_virt(&req->special_vec));
+	virtblk_unmap_data(req, vbr);
+	virtblk_cleanup_cmd(req);
 	blk_mq_end_request(req, virtblk_result(vbr));
 }
 
@@ -225,57 +312,23 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
 	int qid = hctx->queue_num;
 	int err;
 	bool notify = false;
-	bool unmap = false;
-	u32 type;
 
 	BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
 
-	switch (req_op(req)) {
-	case REQ_OP_READ:
-	case REQ_OP_WRITE:
-		type = 0;
-		break;
-	case REQ_OP_FLUSH:
-		type = VIRTIO_BLK_T_FLUSH;
-		break;
-	case REQ_OP_DISCARD:
-		type = VIRTIO_BLK_T_DISCARD;
-		break;
-	case REQ_OP_WRITE_ZEROES:
-		type = VIRTIO_BLK_T_WRITE_ZEROES;
-		unmap = !(req->cmd_flags & REQ_NOUNMAP);
-		break;
-	case REQ_OP_DRV_IN:
-		type = VIRTIO_BLK_T_GET_ID;
-		break;
-	default:
-		WARN_ON_ONCE(1);
-		return BLK_STS_IOERR;
-	}
-
-	vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, type);
-	vbr->out_hdr.sector = type ?
-		0 : cpu_to_virtio64(vblk->vdev, blk_rq_pos(req));
-	vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(req));
+	err = virtblk_setup_cmd(vblk->vdev, req, vbr);
+	if (unlikely(err))
+		return err;
 
 	blk_mq_start_request(req);
 
-	if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES) {
-		err = virtblk_setup_discard_write_zeroes(req, unmap);
-		if (err)
-			return BLK_STS_RESOURCE;
-	}
-
-	num = blk_rq_map_sg(hctx->queue, req, vbr->sg);
-	if (num) {
-		if (rq_data_dir(req) == WRITE)
-			vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_OUT);
-		else
-			vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_IN);
+	num = virtblk_map_data(hctx, req, vbr);
+	if (unlikely(num < 0)) {
+		virtblk_cleanup_cmd(req);
+		return BLK_STS_RESOURCE;
 	}
 
 	spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
-	err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num);
+	err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg_table.sgl, num);
 	if (err) {
 		virtqueue_kick(vblk->vqs[qid].vq);
 		/* Don't stop the queue if -ENOMEM: we may have failed to
@@ -284,6 +337,8 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
 		if (err == -ENOSPC)
 			blk_mq_stop_hw_queue(hctx);
 		spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
+		virtblk_unmap_data(req, vbr);
+		virtblk_cleanup_cmd(req);
 		switch (err) {
 		case -ENOSPC:
 			return BLK_STS_DEV_RESOURCE;
@@ -660,16 +715,6 @@ static const struct attribute_group *virtblk_attr_groups[] = {
 	NULL,
 };
 
-static int virtblk_init_request(struct blk_mq_tag_set *set, struct request *rq,
-		unsigned int hctx_idx, unsigned int numa_node)
-{
-	struct virtio_blk *vblk = set->driver_data;
-	struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);
-
-	sg_init_table(vbr->sg, vblk->sg_elems);
-	return 0;
-}
-
 static int virtblk_map_queues(struct blk_mq_tag_set *set)
 {
 	struct virtio_blk *vblk = set->driver_data;
@@ -682,7 +727,6 @@ static const struct blk_mq_ops virtio_mq_ops = {
 	.queue_rq	= virtio_queue_rq,
 	.commit_rqs	= virtio_commit_rqs,
 	.complete	= virtblk_request_done,
-	.init_request	= virtblk_init_request,
 	.map_queues	= virtblk_map_queues,
 };
 
@@ -762,7 +806,7 @@ static int virtblk_probe(struct virtio_device *vdev)
 	vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
 	vblk->tag_set.cmd_size =
 		sizeof(struct virtblk_req) +
-		sizeof(struct scatterlist) * sg_elems;
+		sizeof(struct scatterlist) * VIRTIO_BLK_INLINE_SG_CNT;
 	vblk->tag_set.driver_data = vblk;
 	vblk->tag_set.nr_hw_queues = vblk->num_vqs;
 

From 0989c41bed96e5dcf7939c6303e3759f02c4c16f Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <mgurtovoy@nvidia.com>
Date: Thu, 2 Sep 2021 23:46:22 +0300
Subject: [PATCH 03/45] virtio-blk: add num_request_queues module parameter

Sometimes a user would like to control the amount of request queues to
be created for a block device. For example, for limiting the memory
footprint of virtio-blk devices.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Link: https://lore.kernel.org/r/20210902204622.54354-1-mgurtovoy@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/block/virtio_blk.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 6767fb709d25..a33fe0743672 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -30,6 +30,25 @@
 #define VIRTIO_BLK_INLINE_SG_CNT	2
 #endif
 
+static int virtblk_queue_count_set(const char *val,
+		const struct kernel_param *kp)
+{
+	return param_set_uint_minmax(val, kp, 1, nr_cpu_ids);
+}
+
+static const struct kernel_param_ops queue_count_ops = {
+	.set = virtblk_queue_count_set,
+	.get = param_get_uint,
+};
+
+static unsigned int num_request_queues;
+module_param_cb(num_request_queues, &queue_count_ops, &num_request_queues,
+		0644);
+MODULE_PARM_DESC(num_request_queues,
+		 "Limit the number of request queues to use for blk device. "
+		 "0 for no limit. "
+		 "Values > nr_cpu_ids truncated to nr_cpu_ids.");
+
 static int major;
 static DEFINE_IDA(vd_index_ida);
 
@@ -553,7 +572,9 @@ static int init_vq(struct virtio_blk *vblk)
 	if (err)
 		num_vqs = 1;
 
-	num_vqs = min_t(unsigned int, nr_cpu_ids, num_vqs);
+	num_vqs = min_t(unsigned int,
+			min_not_zero(num_request_queues, nr_cpu_ids),
+			num_vqs);
 
 	vblk->vqs = kmalloc_array(num_vqs, sizeof(*vblk->vqs), GFP_KERNEL);
 	if (!vblk->vqs)

From d89c8169bd7052c78731137da4c4c06986409c62 Mon Sep 17 00:00:00 2001
From: Wu Zongyong <wuzongyong@linux.alibaba.com>
Date: Fri, 29 Oct 2021 17:14:42 +0800
Subject: [PATCH 04/45] virtio-pci: introduce legacy device module

Split common codes from virtio-pci-legacy so vDPA driver can reuse it
later.

Signed-off-by: Wu Zongyong <wuzongyong@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/71605acde5e97fcb2760a6973e406279fb1bbd33.1635493219.git.wuzongyong@linux.alibaba.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/Kconfig                 |  10 ++
 drivers/virtio/Makefile                |   1 +
 drivers/virtio/virtio_pci_common.c     |  10 +-
 drivers/virtio/virtio_pci_common.h     |   9 +-
 drivers/virtio/virtio_pci_legacy.c     | 101 +++---------
 drivers/virtio/virtio_pci_legacy_dev.c | 220 +++++++++++++++++++++++++
 include/linux/virtio_pci_legacy.h      |  42 +++++
 7 files changed, 310 insertions(+), 83 deletions(-)
 create mode 100644 drivers/virtio/virtio_pci_legacy_dev.c
 create mode 100644 include/linux/virtio_pci_legacy.h

diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index ce1b3f6ec325..8fcf94cd2c96 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -20,6 +20,15 @@ config VIRTIO_PCI_LIB
 	  PCI device with possible vendor specific extensions. Any
 	  module that selects this module must depend on PCI.
 
+config VIRTIO_PCI_LIB_LEGACY
+	tristate
+	help
+	  Legacy PCI device (Virtio PCI Card 0.9.x Draft and older device)
+	  implementation.
+	  This module implements the basic probe and control for devices
+	  which are based on legacy PCI device. Any module that selects this
+	  module must depend on PCI.
+
 menuconfig VIRTIO_MENU
 	bool "Virtio drivers"
 	default y
@@ -43,6 +52,7 @@ config VIRTIO_PCI_LEGACY
 	bool "Support for legacy virtio draft 0.9.X and older devices"
 	default y
 	depends on VIRTIO_PCI
+	select VIRTIO_PCI_LIB_LEGACY
 	help
           Virtio PCI Card 0.9.X Draft (circa 2014) and older device support.
 
diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
index 699bbea0465f..0a82d0873248 100644
--- a/drivers/virtio/Makefile
+++ b/drivers/virtio/Makefile
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_VIRTIO) += virtio.o virtio_ring.o
 obj-$(CONFIG_VIRTIO_PCI_LIB) += virtio_pci_modern_dev.o
+obj-$(CONFIG_VIRTIO_PCI_LIB_LEGACY) += virtio_pci_legacy_dev.o
 obj-$(CONFIG_VIRTIO_MMIO) += virtio_mmio.o
 obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o
 virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o
diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index b35bb2d57f62..d724f676608b 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -549,6 +549,8 @@ static int virtio_pci_probe(struct pci_dev *pci_dev,
 
 	pci_set_master(pci_dev);
 
+	vp_dev->is_legacy = vp_dev->ldev.ioaddr ? true : false;
+
 	rc = register_virtio_device(&vp_dev->vdev);
 	reg_dev = vp_dev;
 	if (rc)
@@ -557,10 +559,10 @@ static int virtio_pci_probe(struct pci_dev *pci_dev,
 	return 0;
 
 err_register:
-	if (vp_dev->ioaddr)
-	     virtio_pci_legacy_remove(vp_dev);
+	if (vp_dev->is_legacy)
+		virtio_pci_legacy_remove(vp_dev);
 	else
-	     virtio_pci_modern_remove(vp_dev);
+		virtio_pci_modern_remove(vp_dev);
 err_probe:
 	pci_disable_device(pci_dev);
 err_enable_device:
@@ -587,7 +589,7 @@ static void virtio_pci_remove(struct pci_dev *pci_dev)
 
 	unregister_virtio_device(&vp_dev->vdev);
 
-	if (vp_dev->ioaddr)
+	if (vp_dev->is_legacy)
 		virtio_pci_legacy_remove(vp_dev);
 	else
 		virtio_pci_modern_remove(vp_dev);
diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h
index beec047a8f8d..eb17a29fc7ef 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -25,6 +25,7 @@
 #include <linux/virtio_config.h>
 #include <linux/virtio_ring.h>
 #include <linux/virtio_pci.h>
+#include <linux/virtio_pci_legacy.h>
 #include <linux/virtio_pci_modern.h>
 #include <linux/highmem.h>
 #include <linux/spinlock.h>
@@ -44,16 +45,14 @@ struct virtio_pci_vq_info {
 struct virtio_pci_device {
 	struct virtio_device vdev;
 	struct pci_dev *pci_dev;
+	struct virtio_pci_legacy_device ldev;
 	struct virtio_pci_modern_device mdev;
 
-	/* In legacy mode, these two point to within ->legacy. */
+	bool is_legacy;
+
 	/* Where to read and clear interrupt */
 	u8 __iomem *isr;
 
-	/* Legacy only field */
-	/* the IO mapping for the PCI config space */
-	void __iomem *ioaddr;
-
 	/* a list of queues so we can dispatch IRQs */
 	spinlock_t lock;
 	struct list_head virtqueues;
diff --git a/drivers/virtio/virtio_pci_legacy.c b/drivers/virtio/virtio_pci_legacy.c
index d62e9835aeec..82eb437ad920 100644
--- a/drivers/virtio/virtio_pci_legacy.c
+++ b/drivers/virtio/virtio_pci_legacy.c
@@ -14,6 +14,7 @@
  *  Michael S. Tsirkin <mst@redhat.com>
  */
 
+#include "linux/virtio_pci_legacy.h"
 #include "virtio_pci_common.h"
 
 /* virtio config->get_features() implementation */
@@ -23,7 +24,7 @@ static u64 vp_get_features(struct virtio_device *vdev)
 
 	/* When someone needs more than 32 feature bits, we'll need to
 	 * steal a bit to indicate that the rest are somewhere else. */
-	return ioread32(vp_dev->ioaddr + VIRTIO_PCI_HOST_FEATURES);
+	return vp_legacy_get_features(&vp_dev->ldev);
 }
 
 /* virtio config->finalize_features() implementation */
@@ -38,7 +39,7 @@ static int vp_finalize_features(struct virtio_device *vdev)
 	BUG_ON((u32)vdev->features != vdev->features);
 
 	/* We only support 32 feature bits. */
-	iowrite32(vdev->features, vp_dev->ioaddr + VIRTIO_PCI_GUEST_FEATURES);
+	vp_legacy_set_features(&vp_dev->ldev, vdev->features);
 
 	return 0;
 }
@@ -48,7 +49,7 @@ static void vp_get(struct virtio_device *vdev, unsigned offset,
 		   void *buf, unsigned len)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	void __iomem *ioaddr = vp_dev->ioaddr +
+	void __iomem *ioaddr = vp_dev->ldev.ioaddr +
 			VIRTIO_PCI_CONFIG_OFF(vp_dev->msix_enabled) +
 			offset;
 	u8 *ptr = buf;
@@ -64,7 +65,7 @@ static void vp_set(struct virtio_device *vdev, unsigned offset,
 		   const void *buf, unsigned len)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	void __iomem *ioaddr = vp_dev->ioaddr +
+	void __iomem *ioaddr = vp_dev->ldev.ioaddr +
 			VIRTIO_PCI_CONFIG_OFF(vp_dev->msix_enabled) +
 			offset;
 	const u8 *ptr = buf;
@@ -78,7 +79,7 @@ static void vp_set(struct virtio_device *vdev, unsigned offset,
 static u8 vp_get_status(struct virtio_device *vdev)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	return ioread8(vp_dev->ioaddr + VIRTIO_PCI_STATUS);
+	return vp_legacy_get_status(&vp_dev->ldev);
 }
 
 static void vp_set_status(struct virtio_device *vdev, u8 status)
@@ -86,28 +87,24 @@ static void vp_set_status(struct virtio_device *vdev, u8 status)
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 	/* We should never be setting status to 0. */
 	BUG_ON(status == 0);
-	iowrite8(status, vp_dev->ioaddr + VIRTIO_PCI_STATUS);
+	vp_legacy_set_status(&vp_dev->ldev, status);
 }
 
 static void vp_reset(struct virtio_device *vdev)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 	/* 0 status means a reset. */
-	iowrite8(0, vp_dev->ioaddr + VIRTIO_PCI_STATUS);
+	vp_legacy_set_status(&vp_dev->ldev, 0);
 	/* Flush out the status write, and flush in device writes,
 	 * including MSi-X interrupts, if any. */
-	ioread8(vp_dev->ioaddr + VIRTIO_PCI_STATUS);
+	vp_legacy_get_status(&vp_dev->ldev);
 	/* Flush pending VQ/configuration callbacks. */
 	vp_synchronize_vectors(vdev);
 }
 
 static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector)
 {
-	/* Setup the vector used for configuration events */
-	iowrite16(vector, vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
-	/* Verify we had enough resources to assign the vector */
-	/* Will also flush the write out to device */
-	return ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
+	return vp_legacy_config_vector(&vp_dev->ldev, vector);
 }
 
 static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
@@ -123,12 +120,9 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
 	int err;
 	u64 q_pfn;
 
-	/* Select the queue we're interested in */
-	iowrite16(index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
-
 	/* Check if queue is either not available or already active. */
-	num = ioread16(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NUM);
-	if (!num || ioread32(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN))
+	num = vp_legacy_get_queue_size(&vp_dev->ldev, index);
+	if (!num || vp_legacy_get_queue_enable(&vp_dev->ldev, index))
 		return ERR_PTR(-ENOENT);
 
 	info->msix_vector = msix_vec;
@@ -151,13 +145,12 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
 	}
 
 	/* activate the queue */
-	iowrite32(q_pfn, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
+	vp_legacy_set_queue_address(&vp_dev->ldev, index, q_pfn);
 
-	vq->priv = (void __force *)vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY;
+	vq->priv = (void __force *)vp_dev->ldev.ioaddr + VIRTIO_PCI_QUEUE_NOTIFY;
 
 	if (msix_vec != VIRTIO_MSI_NO_VECTOR) {
-		iowrite16(msix_vec, vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
-		msix_vec = ioread16(vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
+		msix_vec = vp_legacy_queue_vector(&vp_dev->ldev, index, msix_vec);
 		if (msix_vec == VIRTIO_MSI_NO_VECTOR) {
 			err = -EBUSY;
 			goto out_deactivate;
@@ -167,7 +160,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev,
 	return vq;
 
 out_deactivate:
-	iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
+	vp_legacy_set_queue_address(&vp_dev->ldev, index, 0);
 out_del_vq:
 	vring_del_virtqueue(vq);
 	return ERR_PTR(err);
@@ -178,17 +171,15 @@ static void del_vq(struct virtio_pci_vq_info *info)
 	struct virtqueue *vq = info->vq;
 	struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
 
-	iowrite16(vq->index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
-
 	if (vp_dev->msix_enabled) {
-		iowrite16(VIRTIO_MSI_NO_VECTOR,
-			  vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
+		vp_legacy_queue_vector(&vp_dev->ldev, vq->index,
+				VIRTIO_MSI_NO_VECTOR);
 		/* Flush the write out to device */
-		ioread8(vp_dev->ioaddr + VIRTIO_PCI_ISR);
+		ioread8(vp_dev->ldev.ioaddr + VIRTIO_PCI_ISR);
 	}
 
 	/* Select and deactivate the queue */
-	iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
+	vp_legacy_set_queue_address(&vp_dev->ldev, vq->index, 0);
 
 	vring_del_virtqueue(vq);
 }
@@ -211,51 +202,18 @@ static const struct virtio_config_ops virtio_pci_config_ops = {
 /* the PCI probing function */
 int virtio_pci_legacy_probe(struct virtio_pci_device *vp_dev)
 {
+	struct virtio_pci_legacy_device *ldev = &vp_dev->ldev;
 	struct pci_dev *pci_dev = vp_dev->pci_dev;
 	int rc;
 
-	/* We only own devices >= 0x1000 and <= 0x103f: leave the rest. */
-	if (pci_dev->device < 0x1000 || pci_dev->device > 0x103f)
-		return -ENODEV;
+	ldev->pci_dev = pci_dev;
 
-	if (pci_dev->revision != VIRTIO_PCI_ABI_VERSION) {
-		printk(KERN_ERR "virtio_pci: expected ABI version %d, got %d\n",
-		       VIRTIO_PCI_ABI_VERSION, pci_dev->revision);
-		return -ENODEV;
-	}
-
-	rc = dma_set_mask(&pci_dev->dev, DMA_BIT_MASK(64));
-	if (rc) {
-		rc = dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(32));
-	} else {
-		/*
-		 * The virtio ring base address is expressed as a 32-bit PFN,
-		 * with a page size of 1 << VIRTIO_PCI_QUEUE_ADDR_SHIFT.
-		 */
-		dma_set_coherent_mask(&pci_dev->dev,
-				DMA_BIT_MASK(32 + VIRTIO_PCI_QUEUE_ADDR_SHIFT));
-	}
-
-	if (rc)
-		dev_warn(&pci_dev->dev, "Failed to enable 64-bit or 32-bit DMA.  Trying to continue, but this might not work.\n");
-
-	rc = pci_request_region(pci_dev, 0, "virtio-pci-legacy");
+	rc = vp_legacy_probe(ldev);
 	if (rc)
 		return rc;
 
-	rc = -ENOMEM;
-	vp_dev->ioaddr = pci_iomap(pci_dev, 0, 0);
-	if (!vp_dev->ioaddr)
-		goto err_iomap;
-
-	vp_dev->isr = vp_dev->ioaddr + VIRTIO_PCI_ISR;
-
-	/* we use the subsystem vendor/device id as the virtio vendor/device
-	 * id.  this allows us to use the same PCI vendor/device id for all
-	 * virtio devices and to identify the particular virtio driver by
-	 * the subsystem ids */
-	vp_dev->vdev.id.vendor = pci_dev->subsystem_vendor;
-	vp_dev->vdev.id.device = pci_dev->subsystem_device;
+	vp_dev->isr = ldev->isr;
+	vp_dev->vdev.id = ldev->id;
 
 	vp_dev->vdev.config = &virtio_pci_config_ops;
 
@@ -264,16 +222,11 @@ int virtio_pci_legacy_probe(struct virtio_pci_device *vp_dev)
 	vp_dev->del_vq = del_vq;
 
 	return 0;
-
-err_iomap:
-	pci_release_region(pci_dev, 0);
-	return rc;
 }
 
 void virtio_pci_legacy_remove(struct virtio_pci_device *vp_dev)
 {
-	struct pci_dev *pci_dev = vp_dev->pci_dev;
+	struct virtio_pci_legacy_device *ldev = &vp_dev->ldev;
 
-	pci_iounmap(pci_dev, vp_dev->ioaddr);
-	pci_release_region(pci_dev, 0);
+	vp_legacy_remove(ldev);
 }
diff --git a/drivers/virtio/virtio_pci_legacy_dev.c b/drivers/virtio/virtio_pci_legacy_dev.c
new file mode 100644
index 000000000000..9b97680dd02b
--- /dev/null
+++ b/drivers/virtio/virtio_pci_legacy_dev.c
@@ -0,0 +1,220 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "linux/virtio_pci.h"
+#include <linux/virtio_pci_legacy.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+
+/*
+ * vp_legacy_probe: probe the legacy virtio pci device, note that the
+ * caller is required to enable PCI device before calling this function.
+ * @ldev: the legacy virtio-pci device
+ *
+ * Return 0 on succeed otherwise fail
+ */
+int vp_legacy_probe(struct virtio_pci_legacy_device *ldev)
+{
+	struct pci_dev *pci_dev = ldev->pci_dev;
+	int rc;
+
+	/* We only own devices >= 0x1000 and <= 0x103f: leave the rest. */
+	if (pci_dev->device < 0x1000 || pci_dev->device > 0x103f)
+		return -ENODEV;
+
+	if (pci_dev->revision != VIRTIO_PCI_ABI_VERSION)
+		return -ENODEV;
+
+	rc = dma_set_mask(&pci_dev->dev, DMA_BIT_MASK(64));
+	if (rc) {
+		rc = dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(32));
+	} else {
+		/*
+		 * The virtio ring base address is expressed as a 32-bit PFN,
+		 * with a page size of 1 << VIRTIO_PCI_QUEUE_ADDR_SHIFT.
+		 */
+		dma_set_coherent_mask(&pci_dev->dev,
+				DMA_BIT_MASK(32 + VIRTIO_PCI_QUEUE_ADDR_SHIFT));
+	}
+
+	if (rc)
+		dev_warn(&pci_dev->dev, "Failed to enable 64-bit or 32-bit DMA.  Trying to continue, but this might not work.\n");
+
+	rc = pci_request_region(pci_dev, 0, "virtio-pci-legacy");
+	if (rc)
+		return rc;
+
+	ldev->ioaddr = pci_iomap(pci_dev, 0, 0);
+	if (!ldev->ioaddr)
+		goto err_iomap;
+
+	ldev->isr = ldev->ioaddr + VIRTIO_PCI_ISR;
+
+	ldev->id.vendor = pci_dev->subsystem_vendor;
+	ldev->id.device = pci_dev->subsystem_device;
+
+	return 0;
+err_iomap:
+	pci_release_region(pci_dev, 0);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(vp_legacy_probe);
+
+/*
+ * vp_legacy_probe: remove and cleanup the legacy virtio pci device
+ * @ldev: the legacy virtio-pci device
+ */
+void vp_legacy_remove(struct virtio_pci_legacy_device *ldev)
+{
+	struct pci_dev *pci_dev = ldev->pci_dev;
+
+	pci_iounmap(pci_dev, ldev->ioaddr);
+	pci_release_region(pci_dev, 0);
+}
+EXPORT_SYMBOL_GPL(vp_legacy_remove);
+
+/*
+ * vp_legacy_get_features - get features from device
+ * @ldev: the legacy virtio-pci device
+ *
+ * Returns the features read from the device
+ */
+u64 vp_legacy_get_features(struct virtio_pci_legacy_device *ldev)
+{
+
+	return ioread32(ldev->ioaddr + VIRTIO_PCI_HOST_FEATURES);
+}
+EXPORT_SYMBOL_GPL(vp_legacy_get_features);
+
+/*
+ * vp_legacy_get_driver_features - get driver features from device
+ * @ldev: the legacy virtio-pci device
+ *
+ * Returns the driver features read from the device
+ */
+u64 vp_legacy_get_driver_features(struct virtio_pci_legacy_device *ldev)
+{
+	return ioread32(ldev->ioaddr + VIRTIO_PCI_GUEST_FEATURES);
+}
+EXPORT_SYMBOL_GPL(vp_legacy_get_driver_features);
+
+/*
+ * vp_legacy_set_features - set features to device
+ * @ldev: the legacy virtio-pci device
+ * @features: the features set to device
+ */
+void vp_legacy_set_features(struct virtio_pci_legacy_device *ldev,
+			    u32 features)
+{
+	iowrite32(features, ldev->ioaddr + VIRTIO_PCI_GUEST_FEATURES);
+}
+EXPORT_SYMBOL_GPL(vp_legacy_set_features);
+
+/*
+ * vp_legacy_get_status - get the device status
+ * @ldev: the legacy virtio-pci device
+ *
+ * Returns the status read from device
+ */
+u8 vp_legacy_get_status(struct virtio_pci_legacy_device *ldev)
+{
+	return ioread8(ldev->ioaddr + VIRTIO_PCI_STATUS);
+}
+EXPORT_SYMBOL_GPL(vp_legacy_get_status);
+
+/*
+ * vp_legacy_set_status - set status to device
+ * @ldev: the legacy virtio-pci device
+ * @status: the status set to device
+ */
+void vp_legacy_set_status(struct virtio_pci_legacy_device *ldev,
+				 u8 status)
+{
+	iowrite8(status, ldev->ioaddr + VIRTIO_PCI_STATUS);
+}
+EXPORT_SYMBOL_GPL(vp_legacy_set_status);
+
+/*
+ * vp_legacy_queue_vector - set the MSIX vector for a specific virtqueue
+ * @ldev: the legacy virtio-pci device
+ * @index: queue index
+ * @vector: the config vector
+ *
+ * Returns the config vector read from the device
+ */
+u16 vp_legacy_queue_vector(struct virtio_pci_legacy_device *ldev,
+			   u16 index, u16 vector)
+{
+	iowrite16(index, ldev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
+	iowrite16(vector, ldev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
+	/* Flush the write out to device */
+	return ioread16(ldev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
+}
+EXPORT_SYMBOL_GPL(vp_legacy_queue_vector);
+
+/*
+ * vp_legacy_config_vector - set the vector for config interrupt
+ * @ldev: the legacy virtio-pci device
+ * @vector: the config vector
+ *
+ * Returns the config vector read from the device
+ */
+u16 vp_legacy_config_vector(struct virtio_pci_legacy_device *ldev,
+			    u16 vector)
+{
+	/* Setup the vector used for configuration events */
+	iowrite16(vector, ldev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
+	/* Verify we had enough resources to assign the vector */
+	/* Will also flush the write out to device */
+	return ioread16(ldev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
+}
+EXPORT_SYMBOL_GPL(vp_legacy_config_vector);
+
+/*
+ * vp_legacy_set_queue_address - set the virtqueue address
+ * @ldev: the legacy virtio-pci device
+ * @index: the queue index
+ * @queue_pfn: pfn of the virtqueue
+ */
+void vp_legacy_set_queue_address(struct virtio_pci_legacy_device *ldev,
+			     u16 index, u32 queue_pfn)
+{
+	iowrite16(index, ldev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
+	iowrite32(queue_pfn, ldev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
+}
+EXPORT_SYMBOL_GPL(vp_legacy_set_queue_address);
+
+/*
+ * vp_legacy_get_queue_enable - enable a virtqueue
+ * @ldev: the legacy virtio-pci device
+ * @index: the queue index
+ *
+ * Returns whether a virtqueue is enabled or not
+ */
+bool vp_legacy_get_queue_enable(struct virtio_pci_legacy_device *ldev,
+				u16 index)
+{
+	iowrite16(index, ldev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
+	return ioread32(ldev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
+}
+EXPORT_SYMBOL_GPL(vp_legacy_get_queue_enable);
+
+/*
+ * vp_legacy_get_queue_size - get size for a virtqueue
+ * @ldev: the legacy virtio-pci device
+ * @index: the queue index
+ *
+ * Returns the size of the virtqueue
+ */
+u16 vp_legacy_get_queue_size(struct virtio_pci_legacy_device *ldev,
+			     u16 index)
+{
+	iowrite16(index, ldev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
+	return ioread16(ldev->ioaddr + VIRTIO_PCI_QUEUE_NUM);
+}
+EXPORT_SYMBOL_GPL(vp_legacy_get_queue_size);
+
+MODULE_VERSION("0.1");
+MODULE_DESCRIPTION("Legacy Virtio PCI Device");
+MODULE_AUTHOR("Wu Zongyong <wuzongyong@linux.alibaba.com>");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/virtio_pci_legacy.h b/include/linux/virtio_pci_legacy.h
new file mode 100644
index 000000000000..e5d665faf00e
--- /dev/null
+++ b/include/linux/virtio_pci_legacy.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_VIRTIO_PCI_LEGACY_H
+#define _LINUX_VIRTIO_PCI_LEGACY_H
+
+#include "linux/mod_devicetable.h"
+#include <linux/pci.h>
+#include <linux/virtio_pci.h>
+
+struct virtio_pci_legacy_device {
+	struct pci_dev *pci_dev;
+
+	/* Where to read and clear interrupt */
+	u8 __iomem *isr;
+	/* The IO mapping for the PCI config space (legacy mode only) */
+	void __iomem *ioaddr;
+
+	struct virtio_device_id id;
+};
+
+u64 vp_legacy_get_features(struct virtio_pci_legacy_device *ldev);
+u64 vp_legacy_get_driver_features(struct virtio_pci_legacy_device *ldev);
+void vp_legacy_set_features(struct virtio_pci_legacy_device *ldev,
+			u32 features);
+u8 vp_legacy_get_status(struct virtio_pci_legacy_device *ldev);
+void vp_legacy_set_status(struct virtio_pci_legacy_device *ldev,
+			u8 status);
+u16 vp_legacy_queue_vector(struct virtio_pci_legacy_device *ldev,
+			   u16 idx, u16 vector);
+u16 vp_legacy_config_vector(struct virtio_pci_legacy_device *ldev,
+		     u16 vector);
+void vp_legacy_set_queue_address(struct virtio_pci_legacy_device *ldev,
+			     u16 index, u32 queue_pfn);
+bool vp_legacy_get_queue_enable(struct virtio_pci_legacy_device *ldev,
+				u16 idx);
+void vp_legacy_set_queue_size(struct virtio_pci_legacy_device *ldev,
+			      u16 idx, u16 size);
+u16 vp_legacy_get_queue_size(struct virtio_pci_legacy_device *ldev,
+			     u16 idx);
+int vp_legacy_probe(struct virtio_pci_legacy_device *ldev);
+void vp_legacy_remove(struct virtio_pci_legacy_device *ldev);
+
+#endif

From d0ae1fbfcff48e889bf993ba16890e30f6615593 Mon Sep 17 00:00:00 2001
From: Wu Zongyong <wuzongyong@linux.alibaba.com>
Date: Fri, 29 Oct 2021 17:14:43 +0800
Subject: [PATCH 05/45] vdpa: fix typo

Signed-off-by: Wu Zongyong <wuzongyong@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/4b5153262e4ba64986bb567d7425ad4829ca7bcc.1635493219.git.wuzongyong@linux.alibaba.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/vdpa.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 3972ab765de1..a896ee021e5f 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -257,7 +257,7 @@ struct vdpa_config_ops {
 	struct vdpa_notification_area
 	(*get_vq_notification)(struct vdpa_device *vdev, u16 idx);
 	/* vq irq is not expected to be changed once DRIVER_OK is set */
-	int (*get_vq_irq)(struct vdpa_device *vdv, u16 idx);
+	int (*get_vq_irq)(struct vdpa_device *vdev, u16 idx);
 
 	/* Device ops */
 	u32 (*get_vq_align)(struct vdpa_device *vdev);

From 5bbfea1eacdf584d0d159e38c01ee190162706d9 Mon Sep 17 00:00:00 2001
From: Wu Zongyong <wuzongyong@linux.alibaba.com>
Date: Fri, 29 Oct 2021 17:14:44 +0800
Subject: [PATCH 06/45] vp_vdpa: add vq irq offloading support

This patch implements the get_vq_irq() callback for virtio pci devices
to allow irq offloading.

Signed-off-by: Wu Zongyong <wuzongyong@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/bb091e5505db704dd620f8854a7aebc921d2a752.1635493219.git.wuzongyong@linux.alibaba.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/virtio_pci/vp_vdpa.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c
index 5bcd00246d2e..e3ff7875e123 100644
--- a/drivers/vdpa/virtio_pci/vp_vdpa.c
+++ b/drivers/vdpa/virtio_pci/vp_vdpa.c
@@ -76,6 +76,17 @@ static u8 vp_vdpa_get_status(struct vdpa_device *vdpa)
 	return vp_modern_get_status(mdev);
 }
 
+static int vp_vdpa_get_vq_irq(struct vdpa_device *vdpa, u16 idx)
+{
+	struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa);
+	int irq = vp_vdpa->vring[idx].irq;
+
+	if (irq == VIRTIO_MSI_NO_VECTOR)
+		return -EINVAL;
+
+	return irq;
+}
+
 static void vp_vdpa_free_irq(struct vp_vdpa *vp_vdpa)
 {
 	struct virtio_pci_modern_device *mdev = &vp_vdpa->mdev;
@@ -427,6 +438,7 @@ static const struct vdpa_config_ops vp_vdpa_ops = {
 	.get_config	= vp_vdpa_get_config,
 	.set_config	= vp_vdpa_set_config,
 	.set_config_cb  = vp_vdpa_set_config_cb,
+	.get_vq_irq	= vp_vdpa_get_vq_irq,
 };
 
 static void vp_vdpa_free_irq_vectors(void *data)

From 3b970a5842c9114c82e60744c84a7d06ee51b6f9 Mon Sep 17 00:00:00 2001
From: Wu Zongyong <wuzongyong@linux.alibaba.com>
Date: Fri, 29 Oct 2021 17:14:45 +0800
Subject: [PATCH 07/45] vdpa: add new callback get_vq_num_min in
 vdpa_config_ops

This callback is optional. For vdpa devices that not support to change
virtqueue size, get_vq_num_min and get_vq_num_max will return the same
value, so that users can choose a correct value for that device.

Suggested-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Wu Zongyong <wuzongyong@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/f4af5b0abd660d9a29ab6b2f67bd6df10284a230.1635493219.git.wuzongyong@linux.alibaba.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/vdpa.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index a896ee021e5f..30864848950b 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -171,6 +171,9 @@ struct vdpa_map_file {
  * @get_vq_num_max:		Get the max size of virtqueue
  *				@vdev: vdpa device
  *				Returns u16: max size of virtqueue
+ * @get_vq_num_min:		Get the min size of virtqueue (optional)
+ *				@vdev: vdpa device
+ *				Returns u16: min size of virtqueue
  * @get_device_id:		Get virtio device id
  *				@vdev: vdpa device
  *				Returns u32: virtio device id
@@ -266,6 +269,7 @@ struct vdpa_config_ops {
 	void (*set_config_cb)(struct vdpa_device *vdev,
 			      struct vdpa_callback *cb);
 	u16 (*get_vq_num_max)(struct vdpa_device *vdev);
+	u16 (*get_vq_num_min)(struct vdpa_device *vdev);
 	u32 (*get_device_id)(struct vdpa_device *vdev);
 	u32 (*get_vendor_id)(struct vdpa_device *vdev);
 	u8 (*get_status)(struct vdpa_device *vdev);

From c53e5d1b5ea46cfd6acb4a51c324b2ec03e89e76 Mon Sep 17 00:00:00 2001
From: Wu Zongyong <wuzongyong@linux.alibaba.com>
Date: Fri, 29 Oct 2021 17:14:46 +0800
Subject: [PATCH 08/45] vdpa: min vq num of vdpa device cannot be greater than
 max vq num

Just failed to probe the vdpa device if the min virtqueue num returned
by get_vq_num_min is greater than the max virtqueue num returned by
get_vq_num_max.

Signed-off-by: Wu Zongyong <wuzongyong@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/21199b62cc10b2a9f2cf90eeb63ad080645d881f.1635493219.git.wuzongyong@linux.alibaba.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/vdpa.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index 1dc121a07a93..d783a943647d 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -26,8 +26,16 @@ static int vdpa_dev_probe(struct device *d)
 {
 	struct vdpa_device *vdev = dev_to_vdpa(d);
 	struct vdpa_driver *drv = drv_to_vdpa(vdev->dev.driver);
+	const struct vdpa_config_ops *ops = vdev->config;
+	u32 max_num, min_num = 1;
 	int ret = 0;
 
+	max_num = ops->get_vq_num_max(vdev);
+	if (ops->get_vq_num_min)
+		min_num = ops->get_vq_num_min(vdev);
+	if (max_num < min_num)
+		return -EINVAL;
+
 	if (drv && drv->probe)
 		ret = drv->probe(vdev);
 

From 30a03dfcbbdac22ade72a38b953e0709fbf35baa Mon Sep 17 00:00:00 2001
From: Wu Zongyong <wuzongyong@linux.alibaba.com>
Date: Fri, 29 Oct 2021 17:14:47 +0800
Subject: [PATCH 09/45] virtio_vdpa: setup correct vq size with callbacks
 get_vq_num_{max,min}

For the devices which implement the get_vq_num_min callback, the driver
should not negotiate with virtqueue size with the backend vdpa device if
the value returned by get_vq_num_min equals to the value returned by
get_vq_num_max.
This is useful for vdpa devices based on legacy virtio specfication.

Signed-off-by: Wu Zongyong <wuzongyong@linux.alibaba.com>
Link: https://lore.kernel.org/r/bc0551cec6c3f3dd9424b678b7c22d882aebab3a.1635493219.git.wuzongyong@linux.alibaba.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_vdpa.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
index 72eaef2caeb1..6b62aaf08cc5 100644
--- a/drivers/virtio/virtio_vdpa.c
+++ b/drivers/virtio/virtio_vdpa.c
@@ -145,7 +145,8 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index,
 	/* Assume split virtqueue, switch to packed if necessary */
 	struct vdpa_vq_state state = {0};
 	unsigned long flags;
-	u32 align, num;
+	u32 align, max_num, min_num = 1;
+	bool may_reduce_num = true;
 	int err;
 
 	if (!name)
@@ -163,16 +164,21 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index,
 	if (!info)
 		return ERR_PTR(-ENOMEM);
 
-	num = ops->get_vq_num_max(vdpa);
-	if (num == 0) {
+	max_num = ops->get_vq_num_max(vdpa);
+	if (max_num == 0) {
 		err = -ENOENT;
 		goto error_new_virtqueue;
 	}
 
+	if (ops->get_vq_num_min)
+		min_num = ops->get_vq_num_min(vdpa);
+
+	may_reduce_num = (max_num == min_num) ? false : true;
+
 	/* Create the vring */
 	align = ops->get_vq_align(vdpa);
-	vq = vring_create_virtqueue(index, num, align, vdev,
-				    true, true, ctx,
+	vq = vring_create_virtqueue(index, max_num, align, vdev,
+				    true, may_reduce_num, ctx,
 				    virtio_vdpa_notify, callback, name);
 	if (!vq) {
 		err = -ENOMEM;

From e47be840e87ea15677bca2043ee7b696ccacf56a Mon Sep 17 00:00:00 2001
From: Wu Zongyong <wuzongyong@linux.alibaba.com>
Date: Fri, 29 Oct 2021 17:14:48 +0800
Subject: [PATCH 10/45] vdpa: add new attribute VDPA_ATTR_DEV_MIN_VQ_SIZE

This attribute advertises the min value of virtqueue size. The value is
1 by default.

Signed-off-by: Wu Zongyong <wuzongyong@linux.alibaba.com>
Link: https://lore.kernel.org/r/2bbc417355c4d22298050b1ba887cecfbde3e85d.1635493219.git.wuzongyong@linux.alibaba.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/vdpa.c       | 5 +++++
 include/uapi/linux/vdpa.h | 1 +
 2 files changed, 6 insertions(+)

diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index d783a943647d..fcf02a364878 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -500,6 +500,7 @@ vdpa_dev_fill(struct vdpa_device *vdev, struct sk_buff *msg, u32 portid, u32 seq
 	      int flags, struct netlink_ext_ack *extack)
 {
 	u16 max_vq_size;
+	u16 min_vq_size = 1;
 	u32 device_id;
 	u32 vendor_id;
 	void *hdr;
@@ -516,6 +517,8 @@ vdpa_dev_fill(struct vdpa_device *vdev, struct sk_buff *msg, u32 portid, u32 seq
 	device_id = vdev->config->get_device_id(vdev);
 	vendor_id = vdev->config->get_vendor_id(vdev);
 	max_vq_size = vdev->config->get_vq_num_max(vdev);
+	if (vdev->config->get_vq_num_min)
+		min_vq_size = vdev->config->get_vq_num_min(vdev);
 
 	err = -EMSGSIZE;
 	if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev)))
@@ -528,6 +531,8 @@ vdpa_dev_fill(struct vdpa_device *vdev, struct sk_buff *msg, u32 portid, u32 seq
 		goto msg_err;
 	if (nla_put_u16(msg, VDPA_ATTR_DEV_MAX_VQ_SIZE, max_vq_size))
 		goto msg_err;
+	if (nla_put_u16(msg, VDPA_ATTR_DEV_MIN_VQ_SIZE, min_vq_size))
+		goto msg_err;
 
 	genlmsg_end(msg, hdr);
 	return 0;
diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h
index 66a41e4ec163..e3b87879514c 100644
--- a/include/uapi/linux/vdpa.h
+++ b/include/uapi/linux/vdpa.h
@@ -32,6 +32,7 @@ enum vdpa_attr {
 	VDPA_ATTR_DEV_VENDOR_ID,		/* u32 */
 	VDPA_ATTR_DEV_MAX_VQS,			/* u32 */
 	VDPA_ATTR_DEV_MAX_VQ_SIZE,		/* u16 */
+	VDPA_ATTR_DEV_MIN_VQ_SIZE,		/* u16 */
 
 	/* new attributes must be added above here */
 	VDPA_ATTR_MAX,

From e85087beedcae97e81e5d361d7d9337aa0db6f4c Mon Sep 17 00:00:00 2001
From: Wu Zongyong <wuzongyong@linux.alibaba.com>
Date: Fri, 29 Oct 2021 17:14:49 +0800
Subject: [PATCH 11/45] eni_vdpa: add vDPA driver for Alibaba ENI

This patch adds a new vDPA driver for Alibaba ENI(Elastic Network
Interface) which is build upon virtio 0.9.5 specification.
And this driver is only enabled on X86 host currently.

Link: https://lore.kernel.org/r/6a9f32c00609af16bbb2ea32e633b3beb1cbf84b.1635493219.git.wuzongyong@linux.alibaba.com
Signed-off-by: Wu Zongyong <wuzongyong@linux.alibaba.com>
Link: https://lore.kernel.org/r/20211026083214.3375383-1-arnd@kernel.org
Signed-off-by: Arnd Bergmann <arnd@arndb.de> # fix Kconfig typo
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/Kconfig            |   8 +
 drivers/vdpa/Makefile           |   1 +
 drivers/vdpa/alibaba/Makefile   |   3 +
 drivers/vdpa/alibaba/eni_vdpa.c | 553 ++++++++++++++++++++++++++++++++
 4 files changed, 565 insertions(+)
 create mode 100644 drivers/vdpa/alibaba/Makefile
 create mode 100644 drivers/vdpa/alibaba/eni_vdpa.c

diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig
index 3d91982d8371..50f45d037611 100644
--- a/drivers/vdpa/Kconfig
+++ b/drivers/vdpa/Kconfig
@@ -78,4 +78,12 @@ config VP_VDPA
 	help
 	  This kernel module bridges virtio PCI device to vDPA bus.
 
+config ALIBABA_ENI_VDPA
+	tristate "vDPA driver for Alibaba ENI"
+	select VIRTIO_PCI_LIB_LEGACY
+	depends on PCI_MSI && X86
+	help
+	  VDPA driver for Alibaba ENI (Elastic Network Interface) which is built upon
+	  virtio 0.9.5 specification.
+
 endif # VDPA
diff --git a/drivers/vdpa/Makefile b/drivers/vdpa/Makefile
index f02ebed33f19..15665563a7f4 100644
--- a/drivers/vdpa/Makefile
+++ b/drivers/vdpa/Makefile
@@ -5,3 +5,4 @@ obj-$(CONFIG_VDPA_USER) += vdpa_user/
 obj-$(CONFIG_IFCVF)    += ifcvf/
 obj-$(CONFIG_MLX5_VDPA) += mlx5/
 obj-$(CONFIG_VP_VDPA)    += virtio_pci/
+obj-$(CONFIG_ALIBABA_ENI_VDPA) += alibaba/
diff --git a/drivers/vdpa/alibaba/Makefile b/drivers/vdpa/alibaba/Makefile
new file mode 100644
index 000000000000..ef4aae69f87a
--- /dev/null
+++ b/drivers/vdpa/alibaba/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_ALIBABA_ENI_VDPA) += eni_vdpa.o
+
diff --git a/drivers/vdpa/alibaba/eni_vdpa.c b/drivers/vdpa/alibaba/eni_vdpa.c
new file mode 100644
index 000000000000..3f788794571a
--- /dev/null
+++ b/drivers/vdpa/alibaba/eni_vdpa.c
@@ -0,0 +1,553 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vDPA bridge driver for Alibaba ENI(Elastic Network Interface)
+ *
+ * Copyright (c) 2021, Alibaba Inc. All rights reserved.
+ * Author: Wu Zongyong <wuzongyong@linux.alibaba.com>
+ *
+ */
+
+#include "linux/bits.h"
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/vdpa.h>
+#include <linux/virtio.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_ring.h>
+#include <linux/virtio_pci.h>
+#include <linux/virtio_pci_legacy.h>
+#include <uapi/linux/virtio_net.h>
+
+#define ENI_MSIX_NAME_SIZE 256
+
+#define ENI_ERR(pdev, fmt, ...)	\
+	dev_err(&pdev->dev, "%s"fmt, "eni_vdpa: ", ##__VA_ARGS__)
+#define ENI_DBG(pdev, fmt, ...)	\
+	dev_dbg(&pdev->dev, "%s"fmt, "eni_vdpa: ", ##__VA_ARGS__)
+#define ENI_INFO(pdev, fmt, ...) \
+	dev_info(&pdev->dev, "%s"fmt, "eni_vdpa: ", ##__VA_ARGS__)
+
+struct eni_vring {
+	void __iomem *notify;
+	char msix_name[ENI_MSIX_NAME_SIZE];
+	struct vdpa_callback cb;
+	int irq;
+};
+
+struct eni_vdpa {
+	struct vdpa_device vdpa;
+	struct virtio_pci_legacy_device ldev;
+	struct eni_vring *vring;
+	struct vdpa_callback config_cb;
+	char msix_name[ENI_MSIX_NAME_SIZE];
+	int config_irq;
+	int queues;
+	int vectors;
+};
+
+static struct eni_vdpa *vdpa_to_eni(struct vdpa_device *vdpa)
+{
+	return container_of(vdpa, struct eni_vdpa, vdpa);
+}
+
+static struct virtio_pci_legacy_device *vdpa_to_ldev(struct vdpa_device *vdpa)
+{
+	struct eni_vdpa *eni_vdpa = vdpa_to_eni(vdpa);
+
+	return &eni_vdpa->ldev;
+}
+
+static u64 eni_vdpa_get_features(struct vdpa_device *vdpa)
+{
+	struct virtio_pci_legacy_device *ldev = vdpa_to_ldev(vdpa);
+	u64 features = vp_legacy_get_features(ldev);
+
+	features |= BIT_ULL(VIRTIO_F_ACCESS_PLATFORM);
+	features |= BIT_ULL(VIRTIO_F_ORDER_PLATFORM);
+
+	return features;
+}
+
+static int eni_vdpa_set_features(struct vdpa_device *vdpa, u64 features)
+{
+	struct virtio_pci_legacy_device *ldev = vdpa_to_ldev(vdpa);
+
+	if (!(features & BIT_ULL(VIRTIO_NET_F_MRG_RXBUF)) && features) {
+		ENI_ERR(ldev->pci_dev,
+			"VIRTIO_NET_F_MRG_RXBUF is not negotiated\n");
+		return -EINVAL;
+	}
+
+	vp_legacy_set_features(ldev, (u32)features);
+
+	return 0;
+}
+
+static u8 eni_vdpa_get_status(struct vdpa_device *vdpa)
+{
+	struct virtio_pci_legacy_device *ldev = vdpa_to_ldev(vdpa);
+
+	return vp_legacy_get_status(ldev);
+}
+
+static int eni_vdpa_get_vq_irq(struct vdpa_device *vdpa, u16 idx)
+{
+	struct eni_vdpa *eni_vdpa = vdpa_to_eni(vdpa);
+	int irq = eni_vdpa->vring[idx].irq;
+
+	if (irq == VIRTIO_MSI_NO_VECTOR)
+		return -EINVAL;
+
+	return irq;
+}
+
+static void eni_vdpa_free_irq(struct eni_vdpa *eni_vdpa)
+{
+	struct virtio_pci_legacy_device *ldev = &eni_vdpa->ldev;
+	struct pci_dev *pdev = ldev->pci_dev;
+	int i;
+
+	for (i = 0; i < eni_vdpa->queues; i++) {
+		if (eni_vdpa->vring[i].irq != VIRTIO_MSI_NO_VECTOR) {
+			vp_legacy_queue_vector(ldev, i, VIRTIO_MSI_NO_VECTOR);
+			devm_free_irq(&pdev->dev, eni_vdpa->vring[i].irq,
+				      &eni_vdpa->vring[i]);
+			eni_vdpa->vring[i].irq = VIRTIO_MSI_NO_VECTOR;
+		}
+	}
+
+	if (eni_vdpa->config_irq != VIRTIO_MSI_NO_VECTOR) {
+		vp_legacy_config_vector(ldev, VIRTIO_MSI_NO_VECTOR);
+		devm_free_irq(&pdev->dev, eni_vdpa->config_irq, eni_vdpa);
+		eni_vdpa->config_irq = VIRTIO_MSI_NO_VECTOR;
+	}
+
+	if (eni_vdpa->vectors) {
+		pci_free_irq_vectors(pdev);
+		eni_vdpa->vectors = 0;
+	}
+}
+
+static irqreturn_t eni_vdpa_vq_handler(int irq, void *arg)
+{
+	struct eni_vring *vring = arg;
+
+	if (vring->cb.callback)
+		return vring->cb.callback(vring->cb.private);
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t eni_vdpa_config_handler(int irq, void *arg)
+{
+	struct eni_vdpa *eni_vdpa = arg;
+
+	if (eni_vdpa->config_cb.callback)
+		return eni_vdpa->config_cb.callback(eni_vdpa->config_cb.private);
+
+	return IRQ_HANDLED;
+}
+
+static int eni_vdpa_request_irq(struct eni_vdpa *eni_vdpa)
+{
+	struct virtio_pci_legacy_device *ldev = &eni_vdpa->ldev;
+	struct pci_dev *pdev = ldev->pci_dev;
+	int i, ret, irq;
+	int queues = eni_vdpa->queues;
+	int vectors = queues + 1;
+
+	ret = pci_alloc_irq_vectors(pdev, vectors, vectors, PCI_IRQ_MSIX);
+	if (ret != vectors) {
+		ENI_ERR(pdev,
+			"failed to allocate irq vectors want %d but %d\n",
+			vectors, ret);
+		return ret;
+	}
+
+	eni_vdpa->vectors = vectors;
+
+	for (i = 0; i < queues; i++) {
+		snprintf(eni_vdpa->vring[i].msix_name, ENI_MSIX_NAME_SIZE,
+			 "eni-vdpa[%s]-%d\n", pci_name(pdev), i);
+		irq = pci_irq_vector(pdev, i);
+		ret = devm_request_irq(&pdev->dev, irq,
+				       eni_vdpa_vq_handler,
+				       0, eni_vdpa->vring[i].msix_name,
+				       &eni_vdpa->vring[i]);
+		if (ret) {
+			ENI_ERR(pdev, "failed to request irq for vq %d\n", i);
+			goto err;
+		}
+		vp_legacy_queue_vector(ldev, i, i);
+		eni_vdpa->vring[i].irq = irq;
+	}
+
+	snprintf(eni_vdpa->msix_name, ENI_MSIX_NAME_SIZE, "eni-vdpa[%s]-config\n",
+		 pci_name(pdev));
+	irq = pci_irq_vector(pdev, queues);
+	ret = devm_request_irq(&pdev->dev, irq, eni_vdpa_config_handler, 0,
+			       eni_vdpa->msix_name, eni_vdpa);
+	if (ret) {
+		ENI_ERR(pdev, "failed to request irq for config vq %d\n", i);
+		goto err;
+	}
+	vp_legacy_config_vector(ldev, queues);
+	eni_vdpa->config_irq = irq;
+
+	return 0;
+err:
+	eni_vdpa_free_irq(eni_vdpa);
+	return ret;
+}
+
+static void eni_vdpa_set_status(struct vdpa_device *vdpa, u8 status)
+{
+	struct eni_vdpa *eni_vdpa = vdpa_to_eni(vdpa);
+	struct virtio_pci_legacy_device *ldev = &eni_vdpa->ldev;
+	u8 s = eni_vdpa_get_status(vdpa);
+
+	if (status & VIRTIO_CONFIG_S_DRIVER_OK &&
+	    !(s & VIRTIO_CONFIG_S_DRIVER_OK)) {
+		eni_vdpa_request_irq(eni_vdpa);
+	}
+
+	vp_legacy_set_status(ldev, status);
+
+	if (!(status & VIRTIO_CONFIG_S_DRIVER_OK) &&
+	    (s & VIRTIO_CONFIG_S_DRIVER_OK))
+		eni_vdpa_free_irq(eni_vdpa);
+}
+
+static int eni_vdpa_reset(struct vdpa_device *vdpa)
+{
+	struct eni_vdpa *eni_vdpa = vdpa_to_eni(vdpa);
+	struct virtio_pci_legacy_device *ldev = &eni_vdpa->ldev;
+	u8 s = eni_vdpa_get_status(vdpa);
+
+	vp_legacy_set_status(ldev, 0);
+
+	if (s & VIRTIO_CONFIG_S_DRIVER_OK)
+		eni_vdpa_free_irq(eni_vdpa);
+
+	return 0;
+}
+
+static u16 eni_vdpa_get_vq_num_max(struct vdpa_device *vdpa)
+{
+	struct virtio_pci_legacy_device *ldev = vdpa_to_ldev(vdpa);
+
+	return vp_legacy_get_queue_size(ldev, 0);
+}
+
+static u16 eni_vdpa_get_vq_num_min(struct vdpa_device *vdpa)
+{
+	struct virtio_pci_legacy_device *ldev = vdpa_to_ldev(vdpa);
+
+	return vp_legacy_get_queue_size(ldev, 0);
+}
+
+static int eni_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 qid,
+				struct vdpa_vq_state *state)
+{
+	return -EOPNOTSUPP;
+}
+
+static int eni_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 qid,
+				 const struct vdpa_vq_state *state)
+{
+	struct virtio_pci_legacy_device *ldev = vdpa_to_ldev(vdpa);
+	const struct vdpa_vq_state_split *split = &state->split;
+
+	/* ENI is build upon virtio-pci specfication which not support
+	 * to set state of virtqueue. But if the state is equal to the
+	 * device initial state by chance, we can let it go.
+	 */
+	if (!vp_legacy_get_queue_enable(ldev, qid)
+	    && split->avail_index == 0)
+		return 0;
+
+	return -EOPNOTSUPP;
+}
+
+
+static void eni_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 qid,
+			       struct vdpa_callback *cb)
+{
+	struct eni_vdpa *eni_vdpa = vdpa_to_eni(vdpa);
+
+	eni_vdpa->vring[qid].cb = *cb;
+}
+
+static void eni_vdpa_set_vq_ready(struct vdpa_device *vdpa, u16 qid,
+				  bool ready)
+{
+	struct virtio_pci_legacy_device *ldev = vdpa_to_ldev(vdpa);
+
+	/* ENI is a legacy virtio-pci device. This is not supported
+	 * by specification. But we can disable virtqueue by setting
+	 * address to 0.
+	 */
+	if (!ready)
+		vp_legacy_set_queue_address(ldev, qid, 0);
+}
+
+static bool eni_vdpa_get_vq_ready(struct vdpa_device *vdpa, u16 qid)
+{
+	struct virtio_pci_legacy_device *ldev = vdpa_to_ldev(vdpa);
+
+	return vp_legacy_get_queue_enable(ldev, qid);
+}
+
+static void eni_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 qid,
+			       u32 num)
+{
+	struct virtio_pci_legacy_device *ldev = vdpa_to_ldev(vdpa);
+	struct pci_dev *pdev = ldev->pci_dev;
+	u16 n = vp_legacy_get_queue_size(ldev, qid);
+
+	/* ENI is a legacy virtio-pci device which not allow to change
+	 * virtqueue size. Just report a error if someone tries to
+	 * change it.
+	 */
+	if (num != n)
+		ENI_ERR(pdev,
+			"not support to set vq %u fixed num %u to %u\n",
+			qid, n, num);
+}
+
+static int eni_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 qid,
+				   u64 desc_area, u64 driver_area,
+				   u64 device_area)
+{
+	struct virtio_pci_legacy_device *ldev = vdpa_to_ldev(vdpa);
+	u32 pfn = desc_area >> VIRTIO_PCI_QUEUE_ADDR_SHIFT;
+
+	vp_legacy_set_queue_address(ldev, qid, pfn);
+
+	return 0;
+}
+
+static void eni_vdpa_kick_vq(struct vdpa_device *vdpa, u16 qid)
+{
+	struct eni_vdpa *eni_vdpa = vdpa_to_eni(vdpa);
+
+	iowrite16(qid, eni_vdpa->vring[qid].notify);
+}
+
+static u32 eni_vdpa_get_device_id(struct vdpa_device *vdpa)
+{
+	struct virtio_pci_legacy_device *ldev = vdpa_to_ldev(vdpa);
+
+	return ldev->id.device;
+}
+
+static u32 eni_vdpa_get_vendor_id(struct vdpa_device *vdpa)
+{
+	struct virtio_pci_legacy_device *ldev = vdpa_to_ldev(vdpa);
+
+	return ldev->id.vendor;
+}
+
+static u32 eni_vdpa_get_vq_align(struct vdpa_device *vdpa)
+{
+	return VIRTIO_PCI_VRING_ALIGN;
+}
+
+static size_t eni_vdpa_get_config_size(struct vdpa_device *vdpa)
+{
+	return sizeof(struct virtio_net_config);
+}
+
+
+static void eni_vdpa_get_config(struct vdpa_device *vdpa,
+				unsigned int offset,
+				void *buf, unsigned int len)
+{
+	struct eni_vdpa *eni_vdpa = vdpa_to_eni(vdpa);
+	struct virtio_pci_legacy_device *ldev = &eni_vdpa->ldev;
+	void __iomem *ioaddr = ldev->ioaddr +
+		VIRTIO_PCI_CONFIG_OFF(eni_vdpa->vectors) +
+		offset;
+	u8 *p = buf;
+	int i;
+
+	for (i = 0; i < len; i++)
+		*p++ = ioread8(ioaddr + i);
+}
+
+static void eni_vdpa_set_config(struct vdpa_device *vdpa,
+				unsigned int offset, const void *buf,
+				unsigned int len)
+{
+	struct eni_vdpa *eni_vdpa = vdpa_to_eni(vdpa);
+	struct virtio_pci_legacy_device *ldev = &eni_vdpa->ldev;
+	void __iomem *ioaddr = ldev->ioaddr +
+		VIRTIO_PCI_CONFIG_OFF(eni_vdpa->vectors) +
+		offset;
+	const u8 *p = buf;
+	int i;
+
+	for (i = 0; i < len; i++)
+		iowrite8(*p++, ioaddr + i);
+}
+
+static void eni_vdpa_set_config_cb(struct vdpa_device *vdpa,
+				   struct vdpa_callback *cb)
+{
+	struct eni_vdpa *eni_vdpa = vdpa_to_eni(vdpa);
+
+	eni_vdpa->config_cb = *cb;
+}
+
+static const struct vdpa_config_ops eni_vdpa_ops = {
+	.get_features	= eni_vdpa_get_features,
+	.set_features	= eni_vdpa_set_features,
+	.get_status	= eni_vdpa_get_status,
+	.set_status	= eni_vdpa_set_status,
+	.reset		= eni_vdpa_reset,
+	.get_vq_num_max	= eni_vdpa_get_vq_num_max,
+	.get_vq_num_min	= eni_vdpa_get_vq_num_min,
+	.get_vq_state	= eni_vdpa_get_vq_state,
+	.set_vq_state	= eni_vdpa_set_vq_state,
+	.set_vq_cb	= eni_vdpa_set_vq_cb,
+	.set_vq_ready	= eni_vdpa_set_vq_ready,
+	.get_vq_ready	= eni_vdpa_get_vq_ready,
+	.set_vq_num	= eni_vdpa_set_vq_num,
+	.set_vq_address	= eni_vdpa_set_vq_address,
+	.kick_vq	= eni_vdpa_kick_vq,
+	.get_device_id	= eni_vdpa_get_device_id,
+	.get_vendor_id	= eni_vdpa_get_vendor_id,
+	.get_vq_align	= eni_vdpa_get_vq_align,
+	.get_config_size = eni_vdpa_get_config_size,
+	.get_config	= eni_vdpa_get_config,
+	.set_config	= eni_vdpa_set_config,
+	.set_config_cb  = eni_vdpa_set_config_cb,
+	.get_vq_irq	= eni_vdpa_get_vq_irq,
+};
+
+
+static u16 eni_vdpa_get_num_queues(struct eni_vdpa *eni_vdpa)
+{
+	struct virtio_pci_legacy_device *ldev = &eni_vdpa->ldev;
+	u32 features = vp_legacy_get_features(ldev);
+	u16 num = 2;
+
+	if (features & BIT_ULL(VIRTIO_NET_F_MQ)) {
+		__virtio16 max_virtqueue_pairs;
+
+		eni_vdpa_get_config(&eni_vdpa->vdpa,
+			offsetof(struct virtio_net_config, max_virtqueue_pairs),
+			&max_virtqueue_pairs,
+			sizeof(max_virtqueue_pairs));
+		num = 2 * __virtio16_to_cpu(virtio_legacy_is_little_endian(),
+				max_virtqueue_pairs);
+	}
+
+	if (features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ))
+		num += 1;
+
+	return num;
+}
+
+static void eni_vdpa_free_irq_vectors(void *data)
+{
+	pci_free_irq_vectors(data);
+}
+
+static int eni_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct device *dev = &pdev->dev;
+	struct eni_vdpa *eni_vdpa;
+	struct virtio_pci_legacy_device *ldev;
+	int ret, i;
+
+	ret = pcim_enable_device(pdev);
+	if (ret)
+		return ret;
+
+	eni_vdpa = vdpa_alloc_device(struct eni_vdpa, vdpa,
+				     dev, &eni_vdpa_ops, NULL, false);
+	if (IS_ERR(eni_vdpa)) {
+		ENI_ERR(pdev, "failed to allocate vDPA structure\n");
+		return PTR_ERR(eni_vdpa);
+	}
+
+	ldev = &eni_vdpa->ldev;
+	ldev->pci_dev = pdev;
+
+	ret = vp_legacy_probe(ldev);
+	if (ret) {
+		ENI_ERR(pdev, "failed to probe legacy PCI device\n");
+		goto err;
+	}
+
+	pci_set_master(pdev);
+	pci_set_drvdata(pdev, eni_vdpa);
+
+	eni_vdpa->vdpa.dma_dev = &pdev->dev;
+	eni_vdpa->queues = eni_vdpa_get_num_queues(eni_vdpa);
+
+	ret = devm_add_action_or_reset(dev, eni_vdpa_free_irq_vectors, pdev);
+	if (ret) {
+		ENI_ERR(pdev,
+			"failed for adding devres for freeing irq vectors\n");
+		goto err;
+	}
+
+	eni_vdpa->vring = devm_kcalloc(&pdev->dev, eni_vdpa->queues,
+				      sizeof(*eni_vdpa->vring),
+				      GFP_KERNEL);
+	if (!eni_vdpa->vring) {
+		ret = -ENOMEM;
+		ENI_ERR(pdev, "failed to allocate virtqueues\n");
+		goto err;
+	}
+
+	for (i = 0; i < eni_vdpa->queues; i++) {
+		eni_vdpa->vring[i].irq = VIRTIO_MSI_NO_VECTOR;
+		eni_vdpa->vring[i].notify = ldev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY;
+	}
+	eni_vdpa->config_irq = VIRTIO_MSI_NO_VECTOR;
+
+	ret = vdpa_register_device(&eni_vdpa->vdpa, eni_vdpa->queues);
+	if (ret) {
+		ENI_ERR(pdev, "failed to register to vdpa bus\n");
+		goto err;
+	}
+
+	return 0;
+
+err:
+	put_device(&eni_vdpa->vdpa.dev);
+	return ret;
+}
+
+static void eni_vdpa_remove(struct pci_dev *pdev)
+{
+	struct eni_vdpa *eni_vdpa = pci_get_drvdata(pdev);
+
+	vdpa_unregister_device(&eni_vdpa->vdpa);
+	vp_legacy_remove(&eni_vdpa->ldev);
+}
+
+static struct pci_device_id eni_pci_ids[] = {
+	{ PCI_DEVICE_SUB(PCI_VENDOR_ID_REDHAT_QUMRANET,
+			 VIRTIO_TRANS_ID_NET,
+			 PCI_SUBVENDOR_ID_REDHAT_QUMRANET,
+			 VIRTIO_ID_NET) },
+	{ 0 },
+};
+
+static struct pci_driver eni_vdpa_driver = {
+	.name		= "alibaba-eni-vdpa",
+	.id_table	= eni_pci_ids,
+	.probe		= eni_vdpa_probe,
+	.remove		= eni_vdpa_remove,
+};
+
+module_pci_driver(eni_vdpa_driver);
+
+MODULE_AUTHOR("Wu Zongyong <wuzongyong@linux.alibaba.com>");
+MODULE_DESCRIPTION("Alibaba ENI vDPA driver");
+MODULE_LICENSE("GPL v2");

From 246fd1caf0f4424a79fd7cb0a5fe69103ea7995b Mon Sep 17 00:00:00 2001
From: Eli Cohen <elic@nvidia.com>
Date: Thu, 9 Sep 2021 15:36:33 +0300
Subject: [PATCH 12/45] vdpa/mlx5: Remove mtu field from vdpa net device

No need to save the mtu int the net device struct. We can save it in the
config struct which cannot be modified.

Moreover, move the initialization to. mlx5_vdpa_set_features() callback
is not the right place to put it.

Signed-off-by: Eli Cohen <elic@nvidia.com>
Link: https://lore.kernel.org/r/20210909123635.30884-2-elic@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index bd56de7484dc..570bd5a8e068 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -159,7 +159,6 @@ struct mlx5_vdpa_net {
 	struct mlx5_fc *rx_counter;
 	struct mlx5_flow_handle *rx_rule;
 	bool setup;
-	u16 mtu;
 	u32 cur_num_vqs;
 };
 
@@ -1942,8 +1941,6 @@ static int mlx5_vdpa_set_features(struct vdpa_device *vdev, u64 features)
 		return err;
 
 	ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features;
-	ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, ndev->mtu);
-	ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
 	update_cvq_info(mvdev);
 	return err;
 }
@@ -2413,6 +2410,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name)
 	struct mlx5_vdpa_net *ndev;
 	struct mlx5_core_dev *mdev;
 	u32 max_vqs;
+	u16 mtu;
 	int err;
 
 	if (mgtdev->ndev)
@@ -2440,10 +2438,13 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name)
 	init_mvqs(ndev);
 	mutex_init(&ndev->reslock);
 	config = &ndev->config;
-	err = query_mtu(mdev, &ndev->mtu);
+	err = query_mtu(mdev, &mtu);
 	if (err)
 		goto err_mtu;
 
+	ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, mtu);
+	ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
+
 	err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, config->mac);
 	if (err)
 		goto err_mtu;

From 218bdd20e56cab41a68481bc10c551ae3e0a24fb Mon Sep 17 00:00:00 2001
From: Eli Cohen <elic@nvidia.com>
Date: Thu, 9 Sep 2021 15:36:34 +0300
Subject: [PATCH 13/45] vdpa/mlx5: Rename control VQ workqueue to vdpa wq

A subesequent patch will use the same workqueue for executing other
work not related to control VQ. Rename the workqueue and the work queue
entry used to convey information to the workqueue.

Signed-off-by: Eli Cohen <elic@nvidia.com>
Link: https://lore.kernel.org/r/20210909123635.30884-3-elic@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vdpa/mlx5/core/mlx5_vdpa.h | 2 +-
 drivers/vdpa/mlx5/net/mlx5_vnet.c  | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index 01a848adf590..81dc3d88d3dd 100644
--- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -63,7 +63,7 @@ struct mlx5_control_vq {
 	unsigned short head;
 };
 
-struct mlx5_ctrl_wq_ent {
+struct mlx5_vdpa_wq_ent {
 	struct work_struct work;
 	struct mlx5_vdpa_dev *mvdev;
 };
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 570bd5a8e068..b17310911ba9 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1556,14 +1556,14 @@ static void mlx5_cvq_kick_handler(struct work_struct *work)
 {
 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
 	struct virtio_net_ctrl_hdr ctrl;
-	struct mlx5_ctrl_wq_ent *wqent;
+	struct mlx5_vdpa_wq_ent *wqent;
 	struct mlx5_vdpa_dev *mvdev;
 	struct mlx5_control_vq *cvq;
 	struct mlx5_vdpa_net *ndev;
 	size_t read, write;
 	int err;
 
-	wqent = container_of(work, struct mlx5_ctrl_wq_ent, work);
+	wqent = container_of(work, struct mlx5_vdpa_wq_ent, work);
 	mvdev = wqent->mvdev;
 	ndev = to_mlx5_vdpa_ndev(mvdev);
 	cvq = &mvdev->cvq;
@@ -1615,7 +1615,7 @@ static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
 	struct mlx5_vdpa_virtqueue *mvq;
-	struct mlx5_ctrl_wq_ent *wqent;
+	struct mlx5_vdpa_wq_ent *wqent;
 
 	if (!is_index_valid(mvdev, idx))
 		return;
@@ -2474,7 +2474,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name)
 	if (err)
 		goto err_mr;
 
-	mvdev->wq = create_singlethread_workqueue("mlx5_vdpa_ctrl_wq");
+	mvdev->wq = create_singlethread_workqueue("mlx5_vdpa_wq");
 	if (!mvdev->wq) {
 		err = -ENOMEM;
 		goto err_res2;

From edf747affc41a18ccc3a616813d4c2b6d38b46ce Mon Sep 17 00:00:00 2001
From: Eli Cohen <elic@nvidia.com>
Date: Thu, 9 Sep 2021 15:36:35 +0300
Subject: [PATCH 14/45] vdpa/mlx5: Propagate link status from device to vdpa
 driver

Add code to register to hardware asynchronous events. Use this
mechanism to track link status events coming from the device and update
the config struct.

After doing link status change, call the vdpa callback to notify of the
link status change.

Signed-off-by: Eli Cohen <elic@nvidia.com>
Link: https://lore.kernel.org/r/20210909123635.30884-4-elic@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 94 ++++++++++++++++++++++++++++++-
 1 file changed, 92 insertions(+), 2 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index b17310911ba9..b5bd1a553256 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -160,6 +160,8 @@ struct mlx5_vdpa_net {
 	struct mlx5_flow_handle *rx_rule;
 	bool setup;
 	u32 cur_num_vqs;
+	struct notifier_block nb;
+	struct vdpa_callback config_cb;
 };
 
 static void free_resources(struct mlx5_vdpa_net *ndev);
@@ -1851,6 +1853,7 @@ static u64 mlx5_vdpa_get_features(struct vdpa_device *vdev)
 	ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_CTRL_VQ);
 	ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR);
 	ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_MQ);
+	ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_STATUS);
 
 	print_features(mvdev, ndev->mvdev.mlx_features, false);
 	return ndev->mvdev.mlx_features;
@@ -1947,8 +1950,10 @@ static int mlx5_vdpa_set_features(struct vdpa_device *vdev, u64 features)
 
 static void mlx5_vdpa_set_config_cb(struct vdpa_device *vdev, struct vdpa_callback *cb)
 {
-	/* not implemented */
-	mlx5_vdpa_warn(to_mvdev(vdev), "set config callback not supported\n");
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+
+	ndev->config_cb = *cb;
 }
 
 #define MLX5_VDPA_MAX_VQ_ENTRIES 256
@@ -2401,6 +2406,82 @@ struct mlx5_vdpa_mgmtdev {
 	struct mlx5_vdpa_net *ndev;
 };
 
+static u8 query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport)
+{
+	u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {};
+	int err;
+
+	MLX5_SET(query_vport_state_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_STATE);
+	MLX5_SET(query_vport_state_in, in, op_mod, opmod);
+	MLX5_SET(query_vport_state_in, in, vport_number, vport);
+	if (vport)
+		MLX5_SET(query_vport_state_in, in, other_vport, 1);
+
+	err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out);
+	if (err)
+		return 0;
+
+	return MLX5_GET(query_vport_state_out, out, state);
+}
+
+static bool get_link_state(struct mlx5_vdpa_dev *mvdev)
+{
+	if (query_vport_state(mvdev->mdev, MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, 0) ==
+	    VPORT_STATE_UP)
+		return true;
+
+	return false;
+}
+
+static void update_carrier(struct work_struct *work)
+{
+	struct mlx5_vdpa_wq_ent *wqent;
+	struct mlx5_vdpa_dev *mvdev;
+	struct mlx5_vdpa_net *ndev;
+
+	wqent = container_of(work, struct mlx5_vdpa_wq_ent, work);
+	mvdev = wqent->mvdev;
+	ndev = to_mlx5_vdpa_ndev(mvdev);
+	if (get_link_state(mvdev))
+		ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
+	else
+		ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP);
+
+	if (ndev->config_cb.callback)
+		ndev->config_cb.callback(ndev->config_cb.private);
+
+	kfree(wqent);
+}
+
+static int event_handler(struct notifier_block *nb, unsigned long event, void *param)
+{
+	struct mlx5_vdpa_net *ndev = container_of(nb, struct mlx5_vdpa_net, nb);
+	struct mlx5_eqe *eqe = param;
+	int ret = NOTIFY_DONE;
+	struct mlx5_vdpa_wq_ent *wqent;
+
+	if (event == MLX5_EVENT_TYPE_PORT_CHANGE) {
+		switch (eqe->sub_type) {
+		case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
+		case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
+			wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC);
+			if (!wqent)
+				return NOTIFY_DONE;
+
+			wqent->mvdev = &ndev->mvdev;
+			INIT_WORK(&wqent->work, update_carrier);
+			queue_work(ndev->mvdev.wq, &wqent->work);
+			ret = NOTIFY_OK;
+			break;
+		default:
+			return NOTIFY_DONE;
+		}
+		return ret;
+	}
+	return ret;
+}
+
 static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name)
 {
 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
@@ -2449,6 +2530,11 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name)
 	if (err)
 		goto err_mtu;
 
+	if (get_link_state(mvdev))
+		ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
+	else
+		ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP);
+
 	if (!is_zero_ether_addr(config->mac)) {
 		pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev));
 		err = mlx5_mpfs_add_mac(pfmdev, config->mac);
@@ -2480,6 +2566,8 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name)
 		goto err_res2;
 	}
 
+	ndev->nb.notifier_call = event_handler;
+	mlx5_notifier_register(mdev, &ndev->nb);
 	ndev->cur_num_vqs = 2 * mlx5_vdpa_max_qps(max_vqs);
 	mvdev->vdev.mdev = &mgtdev->mgtdev;
 	err = _vdpa_register_device(&mvdev->vdev, ndev->cur_num_vqs + 1);
@@ -2510,7 +2598,9 @@ static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *
 {
 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
 	struct mlx5_vdpa_dev *mvdev = to_mvdev(dev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
 
+	mlx5_notifier_unregister(mvdev->mdev, &ndev->nb);
 	destroy_workqueue(mvdev->wq);
 	_vdpa_unregister_device(dev);
 	mgtdev->ndev = NULL;

From bf3175bc50a3754dc427e2f5046e17a9fafc8be7 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Thu, 28 Oct 2021 12:11:08 +0200
Subject: [PATCH 15/45] hwrng: virtio - add an internal buffer

hwrng core uses two buffers that can be mixed in the
virtio-rng queue.

If the buffer is provided with wait=0 it is enqueued in the
virtio-rng queue but unused by the caller.
On the next call, core provides another buffer but the
first one is filled instead and the new one queued.
And the caller reads the data from the new one that is not
updated, and the data in the first one are lost.

To avoid this mix, virtio-rng needs to use its own unique
internal buffer at a cost of a data copy to the caller buffer.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Link: https://lore.kernel.org/r/20211028101111.128049-2-lvivier@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/char/hw_random/virtio-rng.c | 43 ++++++++++++++++++++++-------
 1 file changed, 33 insertions(+), 10 deletions(-)

diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c
index a90001e02bf7..208c547dcac1 100644
--- a/drivers/char/hw_random/virtio-rng.c
+++ b/drivers/char/hw_random/virtio-rng.c
@@ -18,13 +18,20 @@ static DEFINE_IDA(rng_index_ida);
 struct virtrng_info {
 	struct hwrng hwrng;
 	struct virtqueue *vq;
-	struct completion have_data;
 	char name[25];
-	unsigned int data_avail;
 	int index;
 	bool busy;
 	bool hwrng_register_done;
 	bool hwrng_removed;
+	/* data transfer */
+	struct completion have_data;
+	unsigned int data_avail;
+	/* minimal size returned by rng_buffer_size() */
+#if SMP_CACHE_BYTES < 32
+	u8 data[32];
+#else
+	u8 data[SMP_CACHE_BYTES];
+#endif
 };
 
 static void random_recv_done(struct virtqueue *vq)
@@ -39,14 +46,14 @@ static void random_recv_done(struct virtqueue *vq)
 }
 
 /* The host will fill any buffer we give it with sweet, sweet randomness. */
-static void register_buffer(struct virtrng_info *vi, u8 *buf, size_t size)
+static void register_buffer(struct virtrng_info *vi)
 {
 	struct scatterlist sg;
 
-	sg_init_one(&sg, buf, size);
+	sg_init_one(&sg, vi->data, sizeof(vi->data));
 
 	/* There should always be room for one buffer. */
-	virtqueue_add_inbuf(vi->vq, &sg, 1, buf, GFP_KERNEL);
+	virtqueue_add_inbuf(vi->vq, &sg, 1, vi->data, GFP_KERNEL);
 
 	virtqueue_kick(vi->vq);
 }
@@ -55,6 +62,8 @@ static int virtio_read(struct hwrng *rng, void *buf, size_t size, bool wait)
 {
 	int ret;
 	struct virtrng_info *vi = (struct virtrng_info *)rng->priv;
+	unsigned int chunk;
+	size_t read;
 
 	if (vi->hwrng_removed)
 		return -ENODEV;
@@ -62,19 +71,33 @@ static int virtio_read(struct hwrng *rng, void *buf, size_t size, bool wait)
 	if (!vi->busy) {
 		vi->busy = true;
 		reinit_completion(&vi->have_data);
-		register_buffer(vi, buf, size);
+		register_buffer(vi);
 	}
 
 	if (!wait)
 		return 0;
 
-	ret = wait_for_completion_killable(&vi->have_data);
-	if (ret < 0)
-		return ret;
+	read = 0;
+	while (size != 0) {
+		ret = wait_for_completion_killable(&vi->have_data);
+		if (ret < 0)
+			return ret;
+
+		chunk = min_t(unsigned int, size, vi->data_avail);
+		memcpy(buf + read, vi->data, chunk);
+		read += chunk;
+		size -= chunk;
+		vi->data_avail = 0;
+
+		if (size != 0) {
+			reinit_completion(&vi->have_data);
+			register_buffer(vi);
+		}
+	}
 
 	vi->busy = false;
 
-	return vi->data_avail;
+	return read;
 }
 
 static void virtio_cleanup(struct hwrng *rng)

From 2bb31abdbe55742c89f4dc0cc26fcbc8467364f6 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Thu, 28 Oct 2021 12:11:09 +0200
Subject: [PATCH 16/45] hwrng: virtio - don't wait on cleanup

When virtio-rng device was dropped by the hwrng core we were forced
to wait the buffer to come back from the device to not have
remaining ongoing operation that could spoil the buffer.

But now, as the buffer is internal to the virtio-rng we can release
the waiting loop immediately, the buffer will be retrieve and use
when the virtio-rng driver will be selected again.

This avoids to hang on an rng_current write command if the virtio-rng
device is blocked by a lack of entropy. This allows to select
another entropy source if the current one is empty.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Link: https://lore.kernel.org/r/20211028101111.128049-3-lvivier@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/char/hw_random/virtio-rng.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c
index 208c547dcac1..173aeea835bb 100644
--- a/drivers/char/hw_random/virtio-rng.c
+++ b/drivers/char/hw_random/virtio-rng.c
@@ -82,6 +82,11 @@ static int virtio_read(struct hwrng *rng, void *buf, size_t size, bool wait)
 		ret = wait_for_completion_killable(&vi->have_data);
 		if (ret < 0)
 			return ret;
+		/* if vi->data_avail is 0, we have been interrupted
+		 * by a cleanup, but buffer stays in the queue
+		 */
+		if (vi->data_avail == 0)
+			return read;
 
 		chunk = min_t(unsigned int, size, vi->data_avail);
 		memcpy(buf + read, vi->data, chunk);
@@ -105,7 +110,7 @@ static void virtio_cleanup(struct hwrng *rng)
 	struct virtrng_info *vi = (struct virtrng_info *)rng->priv;
 
 	if (vi->busy)
-		wait_for_completion(&vi->have_data);
+		complete(&vi->have_data);
 }
 
 static int probe_common(struct virtio_device *vdev)

From 5c8e933050044d6dd2a000f9a5756ae73cbe7c44 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Thu, 28 Oct 2021 12:11:10 +0200
Subject: [PATCH 17/45] hwrng: virtio - don't waste entropy

if we don't use all the entropy available in the buffer, keep it
and use it later.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Link: https://lore.kernel.org/r/20211028101111.128049-4-lvivier@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/char/hw_random/virtio-rng.c | 52 +++++++++++++++++++----------
 1 file changed, 35 insertions(+), 17 deletions(-)

diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c
index 173aeea835bb..8ba97cf4ca8f 100644
--- a/drivers/char/hw_random/virtio-rng.c
+++ b/drivers/char/hw_random/virtio-rng.c
@@ -26,6 +26,7 @@ struct virtrng_info {
 	/* data transfer */
 	struct completion have_data;
 	unsigned int data_avail;
+	unsigned int data_idx;
 	/* minimal size returned by rng_buffer_size() */
 #if SMP_CACHE_BYTES < 32
 	u8 data[32];
@@ -42,6 +43,9 @@ static void random_recv_done(struct virtqueue *vq)
 	if (!virtqueue_get_buf(vi->vq, &vi->data_avail))
 		return;
 
+	vi->data_idx = 0;
+	vi->busy = false;
+
 	complete(&vi->have_data);
 }
 
@@ -58,6 +62,16 @@ static void register_buffer(struct virtrng_info *vi)
 	virtqueue_kick(vi->vq);
 }
 
+static unsigned int copy_data(struct virtrng_info *vi, void *buf,
+			      unsigned int size)
+{
+	size = min_t(unsigned int, size, vi->data_avail);
+	memcpy(buf, vi->data + vi->data_idx, size);
+	vi->data_idx += size;
+	vi->data_avail -= size;
+	return size;
+}
+
 static int virtio_read(struct hwrng *rng, void *buf, size_t size, bool wait)
 {
 	int ret;
@@ -68,17 +82,29 @@ static int virtio_read(struct hwrng *rng, void *buf, size_t size, bool wait)
 	if (vi->hwrng_removed)
 		return -ENODEV;
 
-	if (!vi->busy) {
-		vi->busy = true;
-		reinit_completion(&vi->have_data);
-		register_buffer(vi);
+	read = 0;
+
+	/* copy available data */
+	if (vi->data_avail) {
+		chunk = copy_data(vi, buf, size);
+		size -= chunk;
+		read += chunk;
 	}
 
 	if (!wait)
-		return 0;
+		return read;
 
-	read = 0;
+	/* We have already copied available entropy,
+	 * so either size is 0 or data_avail is 0
+	 */
 	while (size != 0) {
+		/* data_avail is 0 */
+		if (!vi->busy) {
+			/* no pending request, ask for more */
+			vi->busy = true;
+			reinit_completion(&vi->have_data);
+			register_buffer(vi);
+		}
 		ret = wait_for_completion_killable(&vi->have_data);
 		if (ret < 0)
 			return ret;
@@ -88,20 +114,11 @@ static int virtio_read(struct hwrng *rng, void *buf, size_t size, bool wait)
 		if (vi->data_avail == 0)
 			return read;
 
-		chunk = min_t(unsigned int, size, vi->data_avail);
-		memcpy(buf + read, vi->data, chunk);
-		read += chunk;
+		chunk = copy_data(vi, buf + read, size);
 		size -= chunk;
-		vi->data_avail = 0;
-
-		if (size != 0) {
-			reinit_completion(&vi->have_data);
-			register_buffer(vi);
-		}
+		read += chunk;
 	}
 
-	vi->busy = false;
-
 	return read;
 }
 
@@ -161,6 +178,7 @@ static void remove_common(struct virtio_device *vdev)
 
 	vi->hwrng_removed = true;
 	vi->data_avail = 0;
+	vi->data_idx = 0;
 	complete(&vi->have_data);
 	vdev->config->reset(vdev);
 	vi->busy = false;

From 9a4b612d675b03f7fc9fa1957ca399c8223f3954 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <lvivier@redhat.com>
Date: Thu, 28 Oct 2021 12:11:11 +0200
Subject: [PATCH 18/45] hwrng: virtio - always add a pending request

If we ensure we have already some data available by enqueuing
again the buffer once data are exhausted, we can return what we
have without waiting for the device answer.

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
Link: https://lore.kernel.org/r/20211028101111.128049-5-lvivier@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/char/hw_random/virtio-rng.c | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c
index 8ba97cf4ca8f..0a7dde135db1 100644
--- a/drivers/char/hw_random/virtio-rng.c
+++ b/drivers/char/hw_random/virtio-rng.c
@@ -20,7 +20,6 @@ struct virtrng_info {
 	struct virtqueue *vq;
 	char name[25];
 	int index;
-	bool busy;
 	bool hwrng_register_done;
 	bool hwrng_removed;
 	/* data transfer */
@@ -44,16 +43,18 @@ static void random_recv_done(struct virtqueue *vq)
 		return;
 
 	vi->data_idx = 0;
-	vi->busy = false;
 
 	complete(&vi->have_data);
 }
 
-/* The host will fill any buffer we give it with sweet, sweet randomness. */
-static void register_buffer(struct virtrng_info *vi)
+static void request_entropy(struct virtrng_info *vi)
 {
 	struct scatterlist sg;
 
+	reinit_completion(&vi->have_data);
+	vi->data_avail = 0;
+	vi->data_idx = 0;
+
 	sg_init_one(&sg, vi->data, sizeof(vi->data));
 
 	/* There should always be room for one buffer. */
@@ -69,6 +70,8 @@ static unsigned int copy_data(struct virtrng_info *vi, void *buf,
 	memcpy(buf, vi->data + vi->data_idx, size);
 	vi->data_idx += size;
 	vi->data_avail -= size;
+	if (vi->data_avail == 0)
+		request_entropy(vi);
 	return size;
 }
 
@@ -98,13 +101,7 @@ static int virtio_read(struct hwrng *rng, void *buf, size_t size, bool wait)
 	 * so either size is 0 or data_avail is 0
 	 */
 	while (size != 0) {
-		/* data_avail is 0 */
-		if (!vi->busy) {
-			/* no pending request, ask for more */
-			vi->busy = true;
-			reinit_completion(&vi->have_data);
-			register_buffer(vi);
-		}
+		/* data_avail is 0 but a request is pending */
 		ret = wait_for_completion_killable(&vi->have_data);
 		if (ret < 0)
 			return ret;
@@ -126,8 +123,7 @@ static void virtio_cleanup(struct hwrng *rng)
 {
 	struct virtrng_info *vi = (struct virtrng_info *)rng->priv;
 
-	if (vi->busy)
-		complete(&vi->have_data);
+	complete(&vi->have_data);
 }
 
 static int probe_common(struct virtio_device *vdev)
@@ -163,6 +159,9 @@ static int probe_common(struct virtio_device *vdev)
 		goto err_find;
 	}
 
+	/* we always have a pending entropy request */
+	request_entropy(vi);
+
 	return 0;
 
 err_find:
@@ -181,7 +180,6 @@ static void remove_common(struct virtio_device *vdev)
 	vi->data_idx = 0;
 	complete(&vi->have_data);
 	vdev->config->reset(vdev);
-	vi->busy = false;
 	if (vi->hwrng_register_done)
 		hwrng_unregister(&vi->hwrng);
 	vdev->config->del_vqs(vdev);

From 8d7670f3734eed45bb9d00d8fd1201f662bc667f Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Wed, 20 Oct 2021 19:23:22 +0800
Subject: [PATCH 19/45] virtio_ring: make virtqueue_add_indirect_packed
 prettier

Align the arguments of virtqueue_add_indirect_packed() to the open ( to
make it look prettier.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20211020112323.67466-2-xuanzhuo@linux.alibaba.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_ring.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 3035bb6f5458..73594c164cbc 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -1050,12 +1050,12 @@ static struct vring_packed_desc *alloc_indirect_packed(unsigned int total_sg,
 }
 
 static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
-				       struct scatterlist *sgs[],
-				       unsigned int total_sg,
-				       unsigned int out_sgs,
-				       unsigned int in_sgs,
-				       void *data,
-				       gfp_t gfp)
+					 struct scatterlist *sgs[],
+					 unsigned int total_sg,
+					 unsigned int out_sgs,
+					 unsigned int in_sgs,
+					 void *data,
+					 gfp_t gfp)
 {
 	struct vring_packed_desc *desc;
 	struct scatterlist *sg;

From fc6d70f40b3d0b3219e2026d05be0409695f620d Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Wed, 20 Oct 2021 19:23:23 +0800
Subject: [PATCH 20/45] virtio_ring: check desc == NULL when using indirect
 with packed

When using indirect with packed, we don't check for allocation failures.
This patch checks that and fall back on direct.

Fixes: 1ce9e6055fa0 ("virtio_ring: introduce packed ring support")
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Link: https://lore.kernel.org/r/20211020112323.67466-3-xuanzhuo@linux.alibaba.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_ring.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 73594c164cbc..058b18f397cf 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -1065,6 +1065,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
 
 	head = vq->packed.next_avail_idx;
 	desc = alloc_indirect_packed(total_sg, gfp);
+	if (!desc)
+		return -ENOMEM;
 
 	if (unlikely(vq->vq.num_free < 1)) {
 		pr_debug("Can't add buf len 1 - avail = 0\n");
@@ -1176,6 +1178,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
 	unsigned int i, n, c, descs_used, err_idx;
 	__le16 head_flags, flags;
 	u16 head, id, prev, curr, avail_used_flags;
+	int err;
 
 	START_USE(vq);
 
@@ -1191,9 +1194,14 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
 
 	BUG_ON(total_sg == 0);
 
-	if (virtqueue_use_indirect(_vq, total_sg))
-		return virtqueue_add_indirect_packed(vq, sgs, total_sg,
-				out_sgs, in_sgs, data, gfp);
+	if (virtqueue_use_indirect(_vq, total_sg)) {
+		err = virtqueue_add_indirect_packed(vq, sgs, total_sg, out_sgs,
+						    in_sgs, data, gfp);
+		if (err != -ENOMEM)
+			return err;
+
+		/* fall back on direct */
+	}
 
 	head = vq->packed.next_avail_idx;
 	avail_used_flags = vq->packed.avail_used_flags;

From 601695aa8eaffb8833a2a9971927f7492466f0a5 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Wed, 29 Sep 2021 14:15:04 -0500
Subject: [PATCH 21/45] ALSA: virtio: Replace zero-length array with
 flexible-array member
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is a regular need in the kernel to provide a way to declare
having a dynamically sized set of trailing elements in a structure.
Kernel code should always use “flexible array members”[1] for these
cases. The older style of one-element or zero-length arrays should
no longer be used[2].

Also, make use of the struct_size() helper in kzalloc().

[1] https://en.wikipedia.org/wiki/Flexible_array_member
[2] https://www.kernel.org/doc/html/v5.10/process/deprecated.html#zero-length-and-one-element-arrays

Link: https://github.com/KSPP/linux/issues/78
Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/20210929191504.GA337268@embeddedor
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 sound/virtio/virtio_pcm_msg.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/sound/virtio/virtio_pcm_msg.c b/sound/virtio/virtio_pcm_msg.c
index f88c8f29cbd8..aca2dc1989ba 100644
--- a/sound/virtio/virtio_pcm_msg.c
+++ b/sound/virtio/virtio_pcm_msg.c
@@ -20,7 +20,7 @@ struct virtio_pcm_msg {
 	struct virtio_snd_pcm_xfer xfer;
 	struct virtio_snd_pcm_status status;
 	size_t length;
-	struct scatterlist sgs[0];
+	struct scatterlist sgs[];
 };
 
 /**
@@ -146,8 +146,7 @@ int virtsnd_pcm_msg_alloc(struct virtio_pcm_substream *vss,
 		int sg_num = virtsnd_pcm_sg_num(data, period_bytes);
 		struct virtio_pcm_msg *msg;
 
-		msg = kzalloc(sizeof(*msg) + sizeof(*msg->sgs) * (sg_num + 2),
-			      GFP_KERNEL);
+		msg = kzalloc(struct_size(msg, sgs, sg_num + 2), GFP_KERNEL);
 		if (!msg)
 			return -ENOMEM;
 

From f1429e6c36f5d12c9ea6edf6d704445fb048e8a6 Mon Sep 17 00:00:00 2001
From: Pankaj Gupta <pankaj.gupta@ionos.com>
Date: Sat, 16 Oct 2021 11:06:46 +0200
Subject: [PATCH 22/45] virtio-pmem: add myself as virtio-pmem maintainer

Adding myself as virtio-pmem maintainer and also adding virtualization
mailing list entry for virtio specific bits. Helps to get notified for
appropriate bug fixes & enhancements.

Signed-off-by: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
Link: https://lore.kernel.org/r/20211016090646.371145-1-pankaj.gupta.linux@gmail.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 MAINTAINERS | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 3b79fd441dde..663ddec86ab7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -19952,6 +19952,13 @@ S:	Maintained
 F:	drivers/i2c/busses/i2c-virtio.c
 F:	include/uapi/linux/virtio_i2c.h
 
+VIRTIO PMEM DRIVER
+M:	Pankaj Gupta <pankaj.gupta.linux@gmail.com>
+L:	virtualization@lists.linux-foundation.org
+S:	Maintained
+F:	drivers/nvdimm/virtio_pmem.c
+F:	drivers/nvdimm/nd_virtio.c
+
 VIRTUAL BOX GUEST DEVICE DRIVER
 M:	Hans de Goede <hdegoede@redhat.com>
 M:	Arnd Bergmann <arnd@arndb.de>

From 6ae6ff6f6e7d2f304a12a53af8298e4f16ad633e Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Tue, 19 Oct 2021 15:01:43 +0800
Subject: [PATCH 23/45] virtio-blk: validate num_queues during probe

If an untrusted device neogitates BLK_F_MQ but advertises a zero
num_queues, the driver may end up trying to allocating zero size
buffers where ZERO_SIZE_PTR is returned which may pass the checking
against the NULL. This will lead unexpected results.

Fixing this by failing the probe in this case.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Stefan Hajnoczi <stefanha@redhat.com>
Cc: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20211019070152.8236-2-jasowang@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 drivers/block/virtio_blk.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index a33fe0743672..dbcf2a7e4a00 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -571,6 +571,10 @@ static int init_vq(struct virtio_blk *vblk)
 				   &num_vqs);
 	if (err)
 		num_vqs = 1;
+	if (!err && !num_vqs) {
+		dev_err(&vdev->dev, "MQ advertisted but zero queues reported\n");
+		return -EINVAL;
+	}
 
 	num_vqs = min_t(unsigned int,
 			min_not_zero(num_request_queues, nr_cpu_ids),

From 63b4ffa4fad0b14e9ebfedd7f7bb709b1c92472f Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.i.king@googlemail.com>
Date: Mon, 25 Oct 2021 11:22:40 +0100
Subject: [PATCH 24/45] virtio_blk: Fix spelling mistake: "advertisted" ->
 "advertised"

There is a spelling mistake in a dev_err error message. Fix it.

Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Link: https://lore.kernel.org/r/20211025102240.22801-1-colin.i.king@gmail.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 drivers/block/virtio_blk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index dbcf2a7e4a00..20f230bfe5b3 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -572,7 +572,7 @@ static int init_vq(struct virtio_blk *vblk)
 	if (err)
 		num_vqs = 1;
 	if (!err && !num_vqs) {
-		dev_err(&vdev->dev, "MQ advertisted but zero queues reported\n");
+		dev_err(&vdev->dev, "MQ advertised but zero queues reported\n");
 		return -EINVAL;
 	}
 

From 28962ec595d701ec9d39369f9774895dfa408273 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Tue, 19 Oct 2021 15:01:44 +0800
Subject: [PATCH 25/45] virtio_console: validate max_nr_ports before trying to
 use it

We calculate nr_ports based on the max_nr_ports:

nr_queues = use_multiport(portdev) ? (nr_ports + 1) * 2 : 2;

If the device advertises a large max_nr_ports, we will end up with a
integer overflow. Fixing this by validating the max_nr_ports and fail
the probe for invalid max_nr_ports in this case.

Cc: Amit Shah <amit@kernel.org>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20211019070152.8236-3-jasowang@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/char/virtio_console.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index 7eaf303a7a86..660c5c388c29 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -28,6 +28,7 @@
 #include "../tty/hvc/hvc_console.h"
 
 #define is_rproc_enabled IS_ENABLED(CONFIG_REMOTEPROC)
+#define VIRTCONS_MAX_PORTS 0x8000
 
 /*
  * This is a global struct for storing common data for all the devices
@@ -2036,6 +2037,14 @@ static int virtcons_probe(struct virtio_device *vdev)
 	    virtio_cread_feature(vdev, VIRTIO_CONSOLE_F_MULTIPORT,
 				 struct virtio_console_config, max_nr_ports,
 				 &portdev->max_nr_ports) == 0) {
+		if (portdev->max_nr_ports == 0 ||
+		    portdev->max_nr_ports > VIRTCONS_MAX_PORTS) {
+			dev_err(&vdev->dev,
+				"Invalidate max_nr_ports %d",
+				portdev->max_nr_ports);
+			err = -EINVAL;
+			goto free;
+		}
 		multiport = true;
 	}
 

From d50497eb4e554e1f0351e1836ee7241c059592e6 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Tue, 19 Oct 2021 15:01:45 +0800
Subject: [PATCH 26/45] virtio_config: introduce a new .enable_cbs method

This patch introduces a new method to enable the callbacks for config
and virtqueues. This will be used for making sure the virtqueue
callbacks are only enabled after virtio_device_ready() if transport
implements this method.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20211019070152.8236-4-jasowang@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio_config.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 8519b3ae5d52..4d107ad31149 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -23,6 +23,8 @@ struct virtio_shm_region {
  *       any of @get/@set, @get_status/@set_status, or @get_features/
  *       @finalize_features are NOT safe to be called from an atomic
  *       context.
+ * @enable_cbs: enable the callbacks
+ *      vdev: the virtio_device
  * @get: read the value of a configuration field
  *	vdev: the virtio_device
  *	offset: the offset of the configuration field
@@ -75,6 +77,7 @@ struct virtio_shm_region {
  */
 typedef void vq_callback_t(struct virtqueue *);
 struct virtio_config_ops {
+	void (*enable_cbs)(struct virtio_device *vdev);
 	void (*get)(struct virtio_device *vdev, unsigned offset,
 		    void *buf, unsigned len);
 	void (*set)(struct virtio_device *vdev, unsigned offset,
@@ -229,6 +232,9 @@ void virtio_device_ready(struct virtio_device *dev)
 {
 	unsigned status = dev->config->get_status(dev);
 
+	if (dev->config->enable_cbs)
+                  dev->config->enable_cbs(dev);
+
 	BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
 	dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
 }

From 9e35276a5344f74d4a3600fc4100b3dd251d5c56 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Tue, 19 Oct 2021 15:01:46 +0800
Subject: [PATCH 27/45] virtio_pci: harden MSI-X interrupts

We used to synchronize pending MSI-X irq handlers via
synchronize_irq(), this may not work for the untrusted device which
may keep sending interrupts after reset which may lead unexpected
results. Similarly, we should not enable MSI-X interrupt until the
device is ready. So this patch fixes those two issues by:

1) switching to use disable_irq() to prevent the virtio interrupt
   handlers to be called after the device is reset.
2) using IRQF_NO_AUTOEN and enable the MSI-X irq during .ready()

This can make sure the virtio interrupt handler won't be called before
virtio_device_ready() and after reset.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20211019070152.8236-5-jasowang@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_pci_common.c | 27 +++++++++++++++++++++------
 drivers/virtio/virtio_pci_common.h |  6 ++++--
 drivers/virtio/virtio_pci_legacy.c |  5 +++--
 drivers/virtio/virtio_pci_modern.c |  6 ++++--
 4 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index d724f676608b..3f51fdb7be45 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -24,8 +24,8 @@ MODULE_PARM_DESC(force_legacy,
 		 "Force legacy mode for transitional virtio 1 devices");
 #endif
 
-/* wait for pending irq handlers */
-void vp_synchronize_vectors(struct virtio_device *vdev)
+/* disable irq handlers */
+void vp_disable_cbs(struct virtio_device *vdev)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 	int i;
@@ -34,7 +34,20 @@ void vp_synchronize_vectors(struct virtio_device *vdev)
 		synchronize_irq(vp_dev->pci_dev->irq);
 
 	for (i = 0; i < vp_dev->msix_vectors; ++i)
-		synchronize_irq(pci_irq_vector(vp_dev->pci_dev, i));
+		disable_irq(pci_irq_vector(vp_dev->pci_dev, i));
+}
+
+/* enable irq handlers */
+void vp_enable_cbs(struct virtio_device *vdev)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+	int i;
+
+	if (vp_dev->intx_enabled)
+		return;
+
+	for (i = 0; i < vp_dev->msix_vectors; ++i)
+		enable_irq(pci_irq_vector(vp_dev->pci_dev, i));
 }
 
 /* the notify function used when creating a virt queue */
@@ -141,7 +154,8 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
 	snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
 		 "%s-config", name);
 	err = request_irq(pci_irq_vector(vp_dev->pci_dev, v),
-			  vp_config_changed, 0, vp_dev->msix_names[v],
+			  vp_config_changed, IRQF_NO_AUTOEN,
+			  vp_dev->msix_names[v],
 			  vp_dev);
 	if (err)
 		goto error;
@@ -160,7 +174,8 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
 		snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
 			 "%s-virtqueues", name);
 		err = request_irq(pci_irq_vector(vp_dev->pci_dev, v),
-				  vp_vring_interrupt, 0, vp_dev->msix_names[v],
+				  vp_vring_interrupt, IRQF_NO_AUTOEN,
+				  vp_dev->msix_names[v],
 				  vp_dev);
 		if (err)
 			goto error;
@@ -337,7 +352,7 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned nvqs,
 			 "%s-%s",
 			 dev_name(&vp_dev->vdev.dev), names[i]);
 		err = request_irq(pci_irq_vector(vp_dev->pci_dev, msix_vec),
-				  vring_interrupt, 0,
+				  vring_interrupt, IRQF_NO_AUTOEN,
 				  vp_dev->msix_names[msix_vec],
 				  vqs[i]);
 		if (err)
diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h
index eb17a29fc7ef..d3c6f72c7390 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -101,8 +101,10 @@ static struct virtio_pci_device *to_vp_device(struct virtio_device *vdev)
 	return container_of(vdev, struct virtio_pci_device, vdev);
 }
 
-/* wait for pending irq handlers */
-void vp_synchronize_vectors(struct virtio_device *vdev);
+/* disable irq handlers */
+void vp_disable_cbs(struct virtio_device *vdev);
+/* enable irq handlers */
+void vp_enable_cbs(struct virtio_device *vdev);
 /* the notify function used when creating a virt queue */
 bool vp_notify(struct virtqueue *vq);
 /* the config->del_vqs() implementation */
diff --git a/drivers/virtio/virtio_pci_legacy.c b/drivers/virtio/virtio_pci_legacy.c
index 82eb437ad920..b3f8128b7983 100644
--- a/drivers/virtio/virtio_pci_legacy.c
+++ b/drivers/virtio/virtio_pci_legacy.c
@@ -98,8 +98,8 @@ static void vp_reset(struct virtio_device *vdev)
 	/* Flush out the status write, and flush in device writes,
 	 * including MSi-X interrupts, if any. */
 	vp_legacy_get_status(&vp_dev->ldev);
-	/* Flush pending VQ/configuration callbacks. */
-	vp_synchronize_vectors(vdev);
+	/* Disable VQ/configuration callbacks. */
+	vp_disable_cbs(vdev);
 }
 
 static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector)
@@ -185,6 +185,7 @@ static void del_vq(struct virtio_pci_vq_info *info)
 }
 
 static const struct virtio_config_ops virtio_pci_config_ops = {
+	.enable_cbs	= vp_enable_cbs,
 	.get		= vp_get,
 	.set		= vp_set,
 	.get_status	= vp_get_status,
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index 30654d3a0b41..5455bc041fb6 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -172,8 +172,8 @@ static void vp_reset(struct virtio_device *vdev)
 	 */
 	while (vp_modern_get_status(mdev))
 		msleep(1);
-	/* Flush pending VQ/configuration callbacks. */
-	vp_synchronize_vectors(vdev);
+	/* Disable VQ/configuration callbacks. */
+	vp_disable_cbs(vdev);
 }
 
 static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector)
@@ -380,6 +380,7 @@ static bool vp_get_shm_region(struct virtio_device *vdev,
 }
 
 static const struct virtio_config_ops virtio_pci_config_nodev_ops = {
+	.enable_cbs	= vp_enable_cbs,
 	.get		= NULL,
 	.set		= NULL,
 	.generation	= vp_generation,
@@ -397,6 +398,7 @@ static const struct virtio_config_ops virtio_pci_config_nodev_ops = {
 };
 
 static const struct virtio_config_ops virtio_pci_config_ops = {
+	.enable_cbs	= vp_enable_cbs,
 	.get		= vp_get,
 	.set		= vp_set,
 	.generation	= vp_generation,

From 080cd7c3ac8701081d143a15ba17dd9475313188 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Tue, 19 Oct 2021 15:01:47 +0800
Subject: [PATCH 28/45] virtio-pci: harden INTX interrupts

This patch tries to make sure the virtio interrupt handler for INTX
won't be called after a reset and before virtio_device_ready(). We
can't use IRQF_NO_AUTOEN since we're using shared interrupt
(IRQF_SHARED). So this patch tracks the INTX enabling status in a new
intx_soft_enabled variable and toggle it during in
vp_disable/enable_vectors(). The INTX interrupt handler will check
intx_soft_enabled before processing the actual interrupt.

Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20211019070152.8236-6-jasowang@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_pci_common.c | 23 +++++++++++++++++++++--
 drivers/virtio/virtio_pci_common.h |  1 +
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index 3f51fdb7be45..fdbde1db5ec5 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -30,8 +30,16 @@ void vp_disable_cbs(struct virtio_device *vdev)
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 	int i;
 
-	if (vp_dev->intx_enabled)
+	if (vp_dev->intx_enabled) {
+		/*
+		 * The below synchronize() guarantees that any
+		 * interrupt for this line arriving after
+		 * synchronize_irq() has completed is guaranteed to see
+		 * intx_soft_enabled == false.
+		 */
+		WRITE_ONCE(vp_dev->intx_soft_enabled, false);
 		synchronize_irq(vp_dev->pci_dev->irq);
+	}
 
 	for (i = 0; i < vp_dev->msix_vectors; ++i)
 		disable_irq(pci_irq_vector(vp_dev->pci_dev, i));
@@ -43,8 +51,16 @@ void vp_enable_cbs(struct virtio_device *vdev)
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 	int i;
 
-	if (vp_dev->intx_enabled)
+	if (vp_dev->intx_enabled) {
+		disable_irq(vp_dev->pci_dev->irq);
+		/*
+		 * The above disable_irq() provides TSO ordering and
+		 * as such promotes the below store to store-release.
+		 */
+		WRITE_ONCE(vp_dev->intx_soft_enabled, true);
+		enable_irq(vp_dev->pci_dev->irq);
 		return;
+	}
 
 	for (i = 0; i < vp_dev->msix_vectors; ++i)
 		enable_irq(pci_irq_vector(vp_dev->pci_dev, i));
@@ -97,6 +113,9 @@ static irqreturn_t vp_interrupt(int irq, void *opaque)
 	struct virtio_pci_device *vp_dev = opaque;
 	u8 isr;
 
+	if (!READ_ONCE(vp_dev->intx_soft_enabled))
+		return IRQ_NONE;
+
 	/* reading the ISR has the effect of also clearing it so it's very
 	 * important to save off the value. */
 	isr = ioread8(vp_dev->isr);
diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h
index d3c6f72c7390..23f6c5c678d5 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -63,6 +63,7 @@ struct virtio_pci_device {
 	/* MSI-X support */
 	int msix_enabled;
 	int intx_enabled;
+	bool intx_soft_enabled;
 	cpumask_var_t *msix_affinity_masks;
 	/* Name strings for interrupts. This size should be enough,
 	 * and I'm too lazy to allocate each name separately. */

From ef5c366fea30f64d52bb1c4c1e2959a5e6b66e88 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Tue, 19 Oct 2021 15:01:48 +0800
Subject: [PATCH 29/45] virtio_ring: fix typos in vring_desc_extra

We're actually tracking descriptor address and length instead of the
buffer.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20211019070152.8236-7-jasowang@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_ring.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 058b18f397cf..6d2614e34470 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -79,8 +79,8 @@ struct vring_desc_state_packed {
 };
 
 struct vring_desc_extra {
-	dma_addr_t addr;		/* Buffer DMA addr. */
-	u32 len;			/* Buffer length. */
+	dma_addr_t addr;		/* Descriptor DMA addr. */
+	u32 len;			/* Descriptor length. */
 	u16 flags;			/* Descriptor flags. */
 	u16 next;			/* The next desc state in a list. */
 };

From f1aa12f53529ccd67722241792f39248bc89d353 Mon Sep 17 00:00:00 2001
From: Ye Guojin <ye.guojin@zte.com.cn>
Date: Thu, 21 Oct 2021 06:51:11 +0000
Subject: [PATCH 30/45] virtio-blk: fixup coccinelle warnings

coccicheck complains about the use of snprintf() in sysfs show
functions:
WARNING  use scnprintf or sprintf

Use sysfs_emit instead of scnprintf or sprintf makes more sense.

Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: Ye Guojin <ye.guojin@zte.com.cn>
Link: https://lore.kernel.org/r/20211021065111.1047824-1-ye.guojin@zte.com.cn
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
---
 drivers/block/virtio_blk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 20f230bfe5b3..bcb02e4a0809 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -704,7 +704,7 @@ cache_type_show(struct device *dev, struct device_attribute *attr, char *buf)
 	u8 writeback = virtblk_get_cache_mode(vblk->vdev);
 
 	BUG_ON(writeback >= ARRAY_SIZE(virtblk_cache_types));
-	return snprintf(buf, 40, "%s\n", virtblk_cache_types[writeback]);
+	return sysfs_emit(buf, "%s\n", virtblk_cache_types[writeback]);
 }
 
 static DEVICE_ATTR_RW(cache_type);

From dcce162559ee1ce5f64992c4c65197f9270e3d4f Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 21 Oct 2021 15:17:49 +0530
Subject: [PATCH 31/45] i2c: virtio: Add support for zero-length requests

The virtio specification received a new mandatory feature
(VIRTIO_I2C_F_ZERO_LENGTH_REQUEST) for zero length requests. Fail if the
feature isn't offered by the device.

For each read-request, set the VIRTIO_I2C_FLAGS_M_RD flag, as required
by the VIRTIO_I2C_F_ZERO_LENGTH_REQUEST feature.

This allows us to support zero length requests, like SMBUS Quick, where
the buffer need not be sent anymore.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Link: https://lore.kernel.org/r/7c58868cd26d2fc4bd82d0d8b0dfb55636380110.1634808714.git.viresh.kumar@linaro.org
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jie Deng <jie.deng@intel.com> # once the spec is merged
---
 drivers/i2c/busses/i2c-virtio.c | 56 ++++++++++++++++++---------------
 include/uapi/linux/virtio_i2c.h |  6 ++++
 2 files changed, 36 insertions(+), 26 deletions(-)

diff --git a/drivers/i2c/busses/i2c-virtio.c b/drivers/i2c/busses/i2c-virtio.c
index f10a603b13fb..1ed4daa918a0 100644
--- a/drivers/i2c/busses/i2c-virtio.c
+++ b/drivers/i2c/busses/i2c-virtio.c
@@ -62,35 +62,33 @@ static int virtio_i2c_prepare_reqs(struct virtqueue *vq,
 	for (i = 0; i < num; i++) {
 		int outcnt = 0, incnt = 0;
 
-		/*
-		 * We don't support 0 length messages and so filter out
-		 * 0 length transfers by using i2c_adapter_quirks.
-		 */
-		if (!msgs[i].len)
-			break;
-
 		/*
 		 * Only 7-bit mode supported for this moment. For the address
 		 * format, Please check the Virtio I2C Specification.
 		 */
 		reqs[i].out_hdr.addr = cpu_to_le16(msgs[i].addr << 1);
 
+		if (msgs[i].flags & I2C_M_RD)
+			reqs[i].out_hdr.flags |= cpu_to_le32(VIRTIO_I2C_FLAGS_M_RD);
+
 		if (i != num - 1)
-			reqs[i].out_hdr.flags = cpu_to_le32(VIRTIO_I2C_FLAGS_FAIL_NEXT);
+			reqs[i].out_hdr.flags |= cpu_to_le32(VIRTIO_I2C_FLAGS_FAIL_NEXT);
 
 		sg_init_one(&out_hdr, &reqs[i].out_hdr, sizeof(reqs[i].out_hdr));
 		sgs[outcnt++] = &out_hdr;
 
-		reqs[i].buf = i2c_get_dma_safe_msg_buf(&msgs[i], 1);
-		if (!reqs[i].buf)
-			break;
+		if (msgs[i].len) {
+			reqs[i].buf = i2c_get_dma_safe_msg_buf(&msgs[i], 1);
+			if (!reqs[i].buf)
+				break;
 
-		sg_init_one(&msg_buf, reqs[i].buf, msgs[i].len);
+			sg_init_one(&msg_buf, reqs[i].buf, msgs[i].len);
 
-		if (msgs[i].flags & I2C_M_RD)
-			sgs[outcnt + incnt++] = &msg_buf;
-		else
-			sgs[outcnt++] = &msg_buf;
+			if (msgs[i].flags & I2C_M_RD)
+				sgs[outcnt + incnt++] = &msg_buf;
+			else
+				sgs[outcnt++] = &msg_buf;
+		}
 
 		sg_init_one(&in_hdr, &reqs[i].in_hdr, sizeof(reqs[i].in_hdr));
 		sgs[outcnt + incnt++] = &in_hdr;
@@ -191,7 +189,7 @@ static int virtio_i2c_setup_vqs(struct virtio_i2c *vi)
 
 static u32 virtio_i2c_func(struct i2c_adapter *adap)
 {
-	return I2C_FUNC_I2C | (I2C_FUNC_SMBUS_EMUL & ~I2C_FUNC_SMBUS_QUICK);
+	return I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL;
 }
 
 static struct i2c_algorithm virtio_algorithm = {
@@ -199,15 +197,16 @@ static struct i2c_algorithm virtio_algorithm = {
 	.functionality = virtio_i2c_func,
 };
 
-static const struct i2c_adapter_quirks virtio_i2c_quirks = {
-	.flags = I2C_AQ_NO_ZERO_LEN,
-};
-
 static int virtio_i2c_probe(struct virtio_device *vdev)
 {
 	struct virtio_i2c *vi;
 	int ret;
 
+	if (!virtio_has_feature(vdev, VIRTIO_I2C_F_ZERO_LENGTH_REQUEST)) {
+		dev_err(&vdev->dev, "Zero-length request feature is mandatory\n");
+		return -EINVAL;
+	}
+
 	vi = devm_kzalloc(&vdev->dev, sizeof(*vi), GFP_KERNEL);
 	if (!vi)
 		return -ENOMEM;
@@ -225,7 +224,6 @@ static int virtio_i2c_probe(struct virtio_device *vdev)
 	snprintf(vi->adap.name, sizeof(vi->adap.name),
 		 "i2c_virtio at virtio bus %d", vdev->index);
 	vi->adap.algo = &virtio_algorithm;
-	vi->adap.quirks = &virtio_i2c_quirks;
 	vi->adap.dev.parent = &vdev->dev;
 	vi->adap.dev.of_node = vdev->dev.of_node;
 	i2c_set_adapdata(&vi->adap, vi);
@@ -270,11 +268,17 @@ static int virtio_i2c_restore(struct virtio_device *vdev)
 }
 #endif
 
+static const unsigned int features[] = {
+	VIRTIO_I2C_F_ZERO_LENGTH_REQUEST,
+};
+
 static struct virtio_driver virtio_i2c_driver = {
-	.id_table	= id_table,
-	.probe		= virtio_i2c_probe,
-	.remove		= virtio_i2c_remove,
-	.driver	= {
+	.feature_table		= features,
+	.feature_table_size	= ARRAY_SIZE(features),
+	.id_table		= id_table,
+	.probe			= virtio_i2c_probe,
+	.remove			= virtio_i2c_remove,
+	.driver			= {
 		.name	= "i2c_virtio",
 	},
 #ifdef CONFIG_PM_SLEEP
diff --git a/include/uapi/linux/virtio_i2c.h b/include/uapi/linux/virtio_i2c.h
index 7c6a6fc01ad6..acf3b6069136 100644
--- a/include/uapi/linux/virtio_i2c.h
+++ b/include/uapi/linux/virtio_i2c.h
@@ -11,9 +11,15 @@
 #include <linux/const.h>
 #include <linux/types.h>
 
+/* Virtio I2C Feature bits */
+#define VIRTIO_I2C_F_ZERO_LENGTH_REQUEST	0
+
 /* The bit 0 of the @virtio_i2c_out_hdr.@flags, used to group the requests */
 #define VIRTIO_I2C_FLAGS_FAIL_NEXT	_BITUL(0)
 
+/* The bit 1 of the @virtio_i2c_out_hdr.@flags, used to mark a buffer as read */
+#define VIRTIO_I2C_FLAGS_M_RD		_BITUL(1)
+
 /**
  * struct virtio_i2c_out_hdr - the virtio I2C message OUT header
  * @addr: the controlled device address

From ead65f76958258c4c349c6a2b9577d80cc23133f Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Sun, 24 Oct 2021 09:41:40 -0400
Subject: [PATCH 32/45] virtio_blk: allow 0 as num_request_queues

The default value is 0 meaning "no limit". However if 0
is specified on the command line it is instead silently
converted to 1. Further, the value is already validated
at point of use, there's no point in duplicating code
validating the value when it is set.

Simplify the code while making the behaviour more consistent
by using plain module_param.

Fixes: 1a662cf6cb9a ("virtio-blk: add num_request_queues module parameter")
Cc: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/block/virtio_blk.c | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index bcb02e4a0809..9dd0099d2bd2 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -30,20 +30,8 @@
 #define VIRTIO_BLK_INLINE_SG_CNT	2
 #endif
 
-static int virtblk_queue_count_set(const char *val,
-		const struct kernel_param *kp)
-{
-	return param_set_uint_minmax(val, kp, 1, nr_cpu_ids);
-}
-
-static const struct kernel_param_ops queue_count_ops = {
-	.set = virtblk_queue_count_set,
-	.get = param_get_uint,
-};
-
 static unsigned int num_request_queues;
-module_param_cb(num_request_queues, &queue_count_ops, &num_request_queues,
-		0644);
+module_param(num_request_queues, uint, 0644);
 MODULE_PARM_DESC(num_request_queues,
 		 "Limit the number of request queues to use for blk device. "
 		 "0 for no limit. "

From f0839372478ec53ff6d728422a6088f8a35b3a28 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Mon, 25 Oct 2021 03:54:03 -0400
Subject: [PATCH 33/45] virtio_blk: correct types for status handling

virtblk_setup_cmd returns blk_status_t in an int, callers then assign it
back to a blk_status_t variable. blk_status_t is either u32 or (more
typically) u8 so it works, but is inelegant and causes sparse warnings.

Pass the status in blk_status_t in a consistent way.

Reported-by: kernel test robot <lkp@intel.com>
Fixes: b2c5221fd074 ("virtio-blk: avoid preallocating big SGL for data")
Cc: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 drivers/block/virtio_blk.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 9dd0099d2bd2..ec6ed24d1061 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -208,8 +208,9 @@ static void virtblk_cleanup_cmd(struct request *req)
 		kfree(bvec_virt(&req->special_vec));
 }
 
-static int virtblk_setup_cmd(struct virtio_device *vdev, struct request *req,
-		struct virtblk_req *vbr)
+static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev,
+				      struct request *req,
+				      struct virtblk_req *vbr)
 {
 	bool unmap = false;
 	u32 type;
@@ -317,14 +318,15 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
 	unsigned long flags;
 	unsigned int num;
 	int qid = hctx->queue_num;
-	int err;
 	bool notify = false;
+	blk_status_t status;
+	int err;
 
 	BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
 
-	err = virtblk_setup_cmd(vblk->vdev, req, vbr);
-	if (unlikely(err))
-		return err;
+	status = virtblk_setup_cmd(vblk->vdev, req, vbr);
+	if (unlikely(status))
+		return status;
 
 	blk_mq_start_request(req);
 

From 939779f5152d161b34f612af29e7dc1ac4472fcf Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Wed, 27 Oct 2021 10:21:04 +0800
Subject: [PATCH 34/45] virtio_ring: validate used buffer length

This patch validate the used buffer length provided by the device
before trying to use it. This is done by record the in buffer length
in a new field in desc_state structure during virtqueue_add(), then we
can fail the virtqueue_get_buf() when we find the device is trying to
give us a used buffer length which is greater than the in buffer
length.

Since some drivers have already done the validation by themselves,
this patch tries to makes the core validation optional. For the driver
that doesn't want the validation, it can set the
suppress_used_validation to be true (which could be overridden by
force_used_validation module parameter). To be more efficient, a
dedicate array is used for storing the validate used length, this
helps to eliminate the cache stress if validation is done by the
driver.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20211027022107.14357-2-jasowang@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_ring.c | 60 ++++++++++++++++++++++++++++++++++++
 include/linux/virtio.h       |  2 ++
 2 files changed, 62 insertions(+)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 6d2614e34470..00f64f2f8b72 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -14,6 +14,9 @@
 #include <linux/spinlock.h>
 #include <xen/xen.h>
 
+static bool force_used_validation = false;
+module_param(force_used_validation, bool, 0444);
+
 #ifdef DEBUG
 /* For development, we want to crash whenever the ring is screwed. */
 #define BAD_RING(_vq, fmt, args...)				\
@@ -182,6 +185,9 @@ struct vring_virtqueue {
 		} packed;
 	};
 
+	/* Per-descriptor in buffer length */
+	u32 *buflen;
+
 	/* How to notify other side. FIXME: commonalize hcalls! */
 	bool (*notify)(struct virtqueue *vq);
 
@@ -490,6 +496,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
 	unsigned int i, n, avail, descs_used, prev, err_idx;
 	int head;
 	bool indirect;
+	u32 buflen = 0;
 
 	START_USE(vq);
 
@@ -571,6 +578,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
 						     VRING_DESC_F_NEXT |
 						     VRING_DESC_F_WRITE,
 						     indirect);
+			buflen += sg->length;
 		}
 	}
 	/* Last one doesn't continue. */
@@ -610,6 +618,10 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
 	else
 		vq->split.desc_state[head].indir_desc = ctx;
 
+	/* Store in buffer length if necessary */
+	if (vq->buflen)
+		vq->buflen[head] = buflen;
+
 	/* Put entry in available array (but don't update avail->idx until they
 	 * do sync). */
 	avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1);
@@ -784,6 +796,11 @@ static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq,
 		BAD_RING(vq, "id %u is not a head!\n", i);
 		return NULL;
 	}
+	if (vq->buflen && unlikely(*len > vq->buflen[i])) {
+		BAD_RING(vq, "used len %d is larger than in buflen %u\n",
+			*len, vq->buflen[i]);
+		return NULL;
+	}
 
 	/* detach_buf_split clears data, so grab it now. */
 	ret = vq->split.desc_state[i].data;
@@ -1062,6 +1079,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
 	unsigned int i, n, err_idx;
 	u16 head, id;
 	dma_addr_t addr;
+	u32 buflen = 0;
 
 	head = vq->packed.next_avail_idx;
 	desc = alloc_indirect_packed(total_sg, gfp);
@@ -1091,6 +1109,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
 			desc[i].addr = cpu_to_le64(addr);
 			desc[i].len = cpu_to_le32(sg->length);
 			i++;
+			if (n >= out_sgs)
+				buflen += sg->length;
 		}
 	}
 
@@ -1144,6 +1164,10 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq,
 	vq->packed.desc_state[id].indir_desc = desc;
 	vq->packed.desc_state[id].last = id;
 
+	/* Store in buffer length if necessary */
+	if (vq->buflen)
+		vq->buflen[id] = buflen;
+
 	vq->num_added += 1;
 
 	pr_debug("Added buffer head %i to %p\n", head, vq);
@@ -1179,6 +1203,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
 	__le16 head_flags, flags;
 	u16 head, id, prev, curr, avail_used_flags;
 	int err;
+	u32 buflen = 0;
 
 	START_USE(vq);
 
@@ -1258,6 +1283,8 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
 					1 << VRING_PACKED_DESC_F_AVAIL |
 					1 << VRING_PACKED_DESC_F_USED;
 			}
+			if (n >= out_sgs)
+				buflen += sg->length;
 		}
 	}
 
@@ -1277,6 +1304,10 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
 	vq->packed.desc_state[id].indir_desc = ctx;
 	vq->packed.desc_state[id].last = prev;
 
+	/* Store in buffer length if necessary */
+	if (vq->buflen)
+		vq->buflen[id] = buflen;
+
 	/*
 	 * A driver MUST NOT make the first descriptor in the list
 	 * available before all subsequent descriptors comprising
@@ -1463,6 +1494,11 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
 		BAD_RING(vq, "id %u is not a head!\n", id);
 		return NULL;
 	}
+	if (vq->buflen && unlikely(*len > vq->buflen[id])) {
+		BAD_RING(vq, "used len %d is larger than in buflen %u\n",
+			*len, vq->buflen[id]);
+		return NULL;
+	}
 
 	/* detach_buf_packed clears data, so grab it now. */
 	ret = vq->packed.desc_state[id].data;
@@ -1668,6 +1704,7 @@ static struct virtqueue *vring_create_virtqueue_packed(
 	struct vring_virtqueue *vq;
 	struct vring_packed_desc *ring;
 	struct vring_packed_desc_event *driver, *device;
+	struct virtio_driver *drv = drv_to_virtio(vdev->dev.driver);
 	dma_addr_t ring_dma_addr, driver_event_dma_addr, device_event_dma_addr;
 	size_t ring_size_in_bytes, event_size_in_bytes;
 
@@ -1757,6 +1794,15 @@ static struct virtqueue *vring_create_virtqueue_packed(
 	if (!vq->packed.desc_extra)
 		goto err_desc_extra;
 
+	if (!drv->suppress_used_validation || force_used_validation) {
+		vq->buflen = kmalloc_array(num, sizeof(*vq->buflen),
+					   GFP_KERNEL);
+		if (!vq->buflen)
+			goto err_buflen;
+	} else {
+		vq->buflen = NULL;
+	}
+
 	/* No callback?  Tell other side not to bother us. */
 	if (!callback) {
 		vq->packed.event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE;
@@ -1769,6 +1815,8 @@ static struct virtqueue *vring_create_virtqueue_packed(
 	spin_unlock(&vdev->vqs_list_lock);
 	return &vq->vq;
 
+err_buflen:
+	kfree(vq->packed.desc_extra);
 err_desc_extra:
 	kfree(vq->packed.desc_state);
 err_desc_state:
@@ -2176,6 +2224,7 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
 					void (*callback)(struct virtqueue *),
 					const char *name)
 {
+	struct virtio_driver *drv = drv_to_virtio(vdev->dev.driver);
 	struct vring_virtqueue *vq;
 
 	if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED))
@@ -2235,6 +2284,15 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
 	if (!vq->split.desc_extra)
 		goto err_extra;
 
+	if (!drv->suppress_used_validation || force_used_validation) {
+		vq->buflen = kmalloc_array(vring.num, sizeof(*vq->buflen),
+					   GFP_KERNEL);
+		if (!vq->buflen)
+			goto err_buflen;
+	} else {
+		vq->buflen = NULL;
+	}
+
 	/* Put everything in free lists. */
 	vq->free_head = 0;
 	memset(vq->split.desc_state, 0, vring.num *
@@ -2245,6 +2303,8 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
 	spin_unlock(&vdev->vqs_list_lock);
 	return &vq->vq;
 
+err_buflen:
+	kfree(vq->split.desc_extra);
 err_extra:
 	kfree(vq->split.desc_state);
 err_state:
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 41edbc01ffa4..44d0e09da2d9 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -152,6 +152,7 @@ size_t virtio_max_dma_size(struct virtio_device *vdev);
  * @feature_table_size: number of entries in the feature table array.
  * @feature_table_legacy: same as feature_table but when working in legacy mode.
  * @feature_table_size_legacy: number of entries in feature table legacy array.
+ * @suppress_used_validation: set to not have core validate used length
  * @probe: the function to call when a device is found.  Returns 0 or -errno.
  * @scan: optional function to call after successful probe; intended
  *    for virtio-scsi to invoke a scan.
@@ -168,6 +169,7 @@ struct virtio_driver {
 	unsigned int feature_table_size;
 	const unsigned int *feature_table_legacy;
 	unsigned int feature_table_size_legacy;
+	bool suppress_used_validation;
 	int (*validate)(struct virtio_device *dev);
 	int (*probe)(struct virtio_device *dev);
 	void (*scan)(struct virtio_device *dev);

From 816625c13652cef5b2c49082d652875da6f2ad7a Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Wed, 27 Oct 2021 10:21:05 +0800
Subject: [PATCH 35/45] virtio-net: don't let virtio core to validate used
 length

For RX virtuqueue, the used length is validated in all the three paths
(big, small and mergeable). For control vq, we never tries to use used
length. So this patch forbids the core to validate the used length.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20211027022107.14357-3-jasowang@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/net/virtio_net.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 6d8c8745bf24..7c43bfc1ce44 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -3385,6 +3385,7 @@ static struct virtio_driver virtio_net_driver = {
 	.feature_table_size = ARRAY_SIZE(features),
 	.feature_table_legacy = features_legacy,
 	.feature_table_size_legacy = ARRAY_SIZE(features_legacy),
+	.suppress_used_validation = true,
 	.driver.name =	KBUILD_MODNAME,
 	.driver.owner =	THIS_MODULE,
 	.id_table =	id_table,

From a40392edf1b2c7822bc0ce68413106661a9d4232 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Wed, 27 Oct 2021 10:21:06 +0800
Subject: [PATCH 36/45] virtio-blk: don't let virtio core to validate used
 length

We never tries to use used length, so the patch prevents the virtio
core from validating used length.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20211027022107.14357-4-jasowang@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/block/virtio_blk.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index ec6ed24d1061..e1f4f63ed246 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -1041,6 +1041,7 @@ static struct virtio_driver virtio_blk = {
 	.feature_table_size		= ARRAY_SIZE(features),
 	.feature_table_legacy		= features_legacy,
 	.feature_table_size_legacy	= ARRAY_SIZE(features_legacy),
+	.suppress_used_validation	= true,
 	.driver.name			= KBUILD_MODNAME,
 	.driver.owner			= THIS_MODULE,
 	.id_table			= id_table,

From c57911ebfbfe745cb95da2bcf547c5bae000590f Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Wed, 27 Oct 2021 10:21:07 +0800
Subject: [PATCH 37/45] virtio-scsi: don't let virtio core to validate used
 buffer length

We never tries to use used length, so the patch prevents the virtio
core from validating used length.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20211027022107.14357-5-jasowang@redhat.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/scsi/virtio_scsi.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 07d0250f17c3..03b09ecea42d 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -977,6 +977,7 @@ static unsigned int features[] = {
 static struct virtio_driver virtio_scsi_driver = {
 	.feature_table = features,
 	.feature_table_size = ARRAY_SIZE(features),
+	.suppress_used_validation = true,
 	.driver.name = KBUILD_MODNAME,
 	.driver.owner = THIS_MODULE,
 	.id_table = id_table,

From 6dbb1f1687a2ccdfc5b84b0a35bbc6dfefc4de3b Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 26 Oct 2021 20:55:12 +0300
Subject: [PATCH 38/45] vdpa: Introduce and use vdpa device get, set config
 helpers

Subsequent patches enable get and set configuration either
via management device or via vdpa device' config ops.

This requires synchronization between multiple callers to get and set
config callbacks. Features setting also influence the layout of the
configuration fields endianness.

To avoid exposing synchronization primitives to callers, introduce
helper for setting the configuration and use it.

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Eli Cohen <elic@nvidia.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://lore.kernel.org/r/20211026175519.87795-2-parav@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/vdpa.c          | 36 ++++++++++++++++++++++++++++++++++++
 drivers/vhost/vdpa.c         |  3 +--
 drivers/virtio/virtio_vdpa.c |  3 +--
 include/linux/vdpa.h         | 19 ++++---------------
 4 files changed, 42 insertions(+), 19 deletions(-)

diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index fcf02a364878..cbc8fc69cf9b 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -297,6 +297,42 @@ void vdpa_mgmtdev_unregister(struct vdpa_mgmt_dev *mdev)
 }
 EXPORT_SYMBOL_GPL(vdpa_mgmtdev_unregister);
 
+/**
+ * vdpa_get_config - Get one or more device configuration fields.
+ * @vdev: vdpa device to operate on
+ * @offset: starting byte offset of the field
+ * @buf: buffer pointer to read to
+ * @len: length of the configuration fields in bytes
+ */
+void vdpa_get_config(struct vdpa_device *vdev, unsigned int offset,
+		     void *buf, unsigned int len)
+{
+	const struct vdpa_config_ops *ops = vdev->config;
+
+	/*
+	 * Config accesses aren't supposed to trigger before features are set.
+	 * If it does happen we assume a legacy guest.
+	 */
+	if (!vdev->features_valid)
+		vdpa_set_features(vdev, 0);
+	ops->get_config(vdev, offset, buf, len);
+}
+EXPORT_SYMBOL_GPL(vdpa_get_config);
+
+/**
+ * vdpa_set_config - Set one or more device configuration fields.
+ * @vdev: vdpa device to operate on
+ * @offset: starting byte offset of the field
+ * @buf: buffer pointer to read from
+ * @length: length of the configuration fields in bytes
+ */
+void vdpa_set_config(struct vdpa_device *vdev, unsigned int offset,
+		     const void *buf, unsigned int length)
+{
+	vdev->config->set_config(vdev, offset, buf, length);
+}
+EXPORT_SYMBOL_GPL(vdpa_set_config);
+
 static bool mgmtdev_handle_match(const struct vdpa_mgmt_dev *mdev,
 				 const char *busname, const char *devname)
 {
diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 39039e046117..01c59ce7e250 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -237,7 +237,6 @@ static long vhost_vdpa_set_config(struct vhost_vdpa *v,
 				  struct vhost_vdpa_config __user *c)
 {
 	struct vdpa_device *vdpa = v->vdpa;
-	const struct vdpa_config_ops *ops = vdpa->config;
 	struct vhost_vdpa_config config;
 	unsigned long size = offsetof(struct vhost_vdpa_config, buf);
 	u8 *buf;
@@ -251,7 +250,7 @@ static long vhost_vdpa_set_config(struct vhost_vdpa *v,
 	if (IS_ERR(buf))
 		return PTR_ERR(buf);
 
-	ops->set_config(vdpa, config.off, buf, config.len);
+	vdpa_set_config(vdpa, config.off, buf, config.len);
 
 	kvfree(buf);
 	return 0;
diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
index 6b62aaf08cc5..f85f860bc10b 100644
--- a/drivers/virtio/virtio_vdpa.c
+++ b/drivers/virtio/virtio_vdpa.c
@@ -65,9 +65,8 @@ static void virtio_vdpa_set(struct virtio_device *vdev, unsigned offset,
 			    const void *buf, unsigned len)
 {
 	struct vdpa_device *vdpa = vd_get_vdpa(vdev);
-	const struct vdpa_config_ops *ops = vdpa->config;
 
-	ops->set_config(vdpa, offset, buf, len);
+	vdpa_set_config(vdpa, offset, buf, len);
 }
 
 static u32 virtio_vdpa_generation(struct virtio_device *vdev)
diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 30864848950b..267236aab34c 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -386,21 +386,10 @@ static inline int vdpa_set_features(struct vdpa_device *vdev, u64 features)
 	return ops->set_features(vdev, features);
 }
 
-static inline void vdpa_get_config(struct vdpa_device *vdev,
-				   unsigned int offset, void *buf,
-				   unsigned int len)
-{
-	const struct vdpa_config_ops *ops = vdev->config;
-
-	/*
-	 * Config accesses aren't supposed to trigger before features are set.
-	 * If it does happen we assume a legacy guest.
-	 */
-	if (!vdev->features_valid)
-		vdpa_set_features(vdev, 0);
-	ops->get_config(vdev, offset, buf, len);
-}
-
+void vdpa_get_config(struct vdpa_device *vdev, unsigned int offset,
+		     void *buf, unsigned int len);
+void vdpa_set_config(struct vdpa_device *dev, unsigned int offset,
+		     const void *buf, unsigned int length);
 /**
  * struct vdpa_mgmtdev_ops - vdpa device ops
  * @dev_add: Add a vdpa device using alloc and register

From ad69dd0bf26b88ec6ab26f8bbe5cd74fbed7672a Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 26 Oct 2021 20:55:13 +0300
Subject: [PATCH 39/45] vdpa: Introduce query of device config layout

Introduce a command to query a device config layout.

An example query of network vdpa device:

$ vdpa dev add name bar mgmtdev vdpasim_net

$ vdpa dev config show
bar: mac 00:35:09:19:48:05 link up link_announce false mtu 1500

$ vdpa dev config show -jp
{
    "config": {
        "bar": {
            "mac": "00:35:09:19:48:05",
            "link ": "up",
            "link_announce ": false,
            "mtu": 1500,
        }
    }
}

Signed-off-by: Parav Pandit <parav@nvidia.com>
Signed-off-by: Eli Cohen <elic@nvidia.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20211026175519.87795-3-parav@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/vdpa.c       | 176 ++++++++++++++++++++++++++++++++++++++
 include/linux/vdpa.h      |   2 +
 include/uapi/linux/vdpa.h |   6 ++
 3 files changed, 184 insertions(+)

diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index cbc8fc69cf9b..8fcbdda8590c 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -14,6 +14,8 @@
 #include <uapi/linux/vdpa.h>
 #include <net/genetlink.h>
 #include <linux/mod_devicetable.h>
+#include <linux/virtio_net.h>
+#include <linux/virtio_ids.h>
 
 static LIST_HEAD(mdev_head);
 /* A global mutex that protects vdpa management device and device level operations. */
@@ -66,6 +68,7 @@ static void vdpa_release_dev(struct device *d)
 		ops->free(vdev);
 
 	ida_simple_remove(&vdpa_index_ida, vdev->index);
+	mutex_destroy(&vdev->cf_mutex);
 	kfree(vdev);
 }
 
@@ -127,6 +130,7 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent,
 	if (err)
 		goto err_name;
 
+	mutex_init(&vdev->cf_mutex);
 	device_initialize(&vdev->dev);
 
 	return vdev;
@@ -309,6 +313,7 @@ void vdpa_get_config(struct vdpa_device *vdev, unsigned int offset,
 {
 	const struct vdpa_config_ops *ops = vdev->config;
 
+	mutex_lock(&vdev->cf_mutex);
 	/*
 	 * Config accesses aren't supposed to trigger before features are set.
 	 * If it does happen we assume a legacy guest.
@@ -316,6 +321,7 @@ void vdpa_get_config(struct vdpa_device *vdev, unsigned int offset,
 	if (!vdev->features_valid)
 		vdpa_set_features(vdev, 0);
 	ops->get_config(vdev, offset, buf, len);
+	mutex_unlock(&vdev->cf_mutex);
 }
 EXPORT_SYMBOL_GPL(vdpa_get_config);
 
@@ -329,7 +335,9 @@ EXPORT_SYMBOL_GPL(vdpa_get_config);
 void vdpa_set_config(struct vdpa_device *vdev, unsigned int offset,
 		     const void *buf, unsigned int length)
 {
+	mutex_lock(&vdev->cf_mutex);
 	vdev->config->set_config(vdev, offset, buf, length);
+	mutex_unlock(&vdev->cf_mutex);
 }
 EXPORT_SYMBOL_GPL(vdpa_set_config);
 
@@ -661,6 +669,168 @@ static int vdpa_nl_cmd_dev_get_dumpit(struct sk_buff *msg, struct netlink_callba
 	return msg->len;
 }
 
+static int vdpa_dev_net_mq_config_fill(struct vdpa_device *vdev,
+				       struct sk_buff *msg, u64 features,
+				       const struct virtio_net_config *config)
+{
+	u16 val_u16;
+
+	if ((features & (1ULL << VIRTIO_NET_F_MQ)) == 0)
+		return 0;
+
+	val_u16 = le16_to_cpu(config->max_virtqueue_pairs);
+	return nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MAX_VQP, val_u16);
+}
+
+static int vdpa_dev_net_config_fill(struct vdpa_device *vdev, struct sk_buff *msg)
+{
+	struct virtio_net_config config = {};
+	u64 features;
+	u16 val_u16;
+
+	vdpa_get_config(vdev, 0, &config, sizeof(config));
+
+	if (nla_put(msg, VDPA_ATTR_DEV_NET_CFG_MACADDR, sizeof(config.mac),
+		    config.mac))
+		return -EMSGSIZE;
+
+	val_u16 = le16_to_cpu(config.status);
+	if (nla_put_u16(msg, VDPA_ATTR_DEV_NET_STATUS, val_u16))
+		return -EMSGSIZE;
+
+	val_u16 = le16_to_cpu(config.mtu);
+	if (nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MTU, val_u16))
+		return -EMSGSIZE;
+
+	features = vdev->config->get_features(vdev);
+
+	return vdpa_dev_net_mq_config_fill(vdev, msg, features, &config);
+}
+
+static int
+vdpa_dev_config_fill(struct vdpa_device *vdev, struct sk_buff *msg, u32 portid, u32 seq,
+		     int flags, struct netlink_ext_ack *extack)
+{
+	u32 device_id;
+	void *hdr;
+	int err;
+
+	hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags,
+			  VDPA_CMD_DEV_CONFIG_GET);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev))) {
+		err = -EMSGSIZE;
+		goto msg_err;
+	}
+
+	device_id = vdev->config->get_device_id(vdev);
+	if (nla_put_u32(msg, VDPA_ATTR_DEV_ID, device_id)) {
+		err = -EMSGSIZE;
+		goto msg_err;
+	}
+
+	switch (device_id) {
+	case VIRTIO_ID_NET:
+		err = vdpa_dev_net_config_fill(vdev, msg);
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+	if (err)
+		goto msg_err;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+msg_err:
+	genlmsg_cancel(msg, hdr);
+	return err;
+}
+
+static int vdpa_nl_cmd_dev_config_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct vdpa_device *vdev;
+	struct sk_buff *msg;
+	const char *devname;
+	struct device *dev;
+	int err;
+
+	if (!info->attrs[VDPA_ATTR_DEV_NAME])
+		return -EINVAL;
+	devname = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]);
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	mutex_lock(&vdpa_dev_mutex);
+	dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match);
+	if (!dev) {
+		NL_SET_ERR_MSG_MOD(info->extack, "device not found");
+		err = -ENODEV;
+		goto dev_err;
+	}
+	vdev = container_of(dev, struct vdpa_device, dev);
+	if (!vdev->mdev) {
+		NL_SET_ERR_MSG_MOD(info->extack, "unmanaged vdpa device");
+		err = -EINVAL;
+		goto mdev_err;
+	}
+	err = vdpa_dev_config_fill(vdev, msg, info->snd_portid, info->snd_seq,
+				   0, info->extack);
+	if (!err)
+		err = genlmsg_reply(msg, info);
+
+mdev_err:
+	put_device(dev);
+dev_err:
+	mutex_unlock(&vdpa_dev_mutex);
+	if (err)
+		nlmsg_free(msg);
+	return err;
+}
+
+static int vdpa_dev_config_dump(struct device *dev, void *data)
+{
+	struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev);
+	struct vdpa_dev_dump_info *info = data;
+	int err;
+
+	if (!vdev->mdev)
+		return 0;
+	if (info->idx < info->start_idx) {
+		info->idx++;
+		return 0;
+	}
+	err = vdpa_dev_config_fill(vdev, info->msg, NETLINK_CB(info->cb->skb).portid,
+				   info->cb->nlh->nlmsg_seq, NLM_F_MULTI,
+				   info->cb->extack);
+	if (err)
+		return err;
+
+	info->idx++;
+	return 0;
+}
+
+static int
+vdpa_nl_cmd_dev_config_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb)
+{
+	struct vdpa_dev_dump_info info;
+
+	info.msg = msg;
+	info.cb = cb;
+	info.start_idx = cb->args[0];
+	info.idx = 0;
+
+	mutex_lock(&vdpa_dev_mutex);
+	bus_for_each_dev(&vdpa_bus, NULL, &info, vdpa_dev_config_dump);
+	mutex_unlock(&vdpa_dev_mutex);
+	cb->args[0] = info.idx;
+	return msg->len;
+}
+
 static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = {
 	[VDPA_ATTR_MGMTDEV_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[VDPA_ATTR_MGMTDEV_DEV_NAME] = { .type = NLA_STRING },
@@ -692,6 +862,12 @@ static const struct genl_ops vdpa_nl_ops[] = {
 		.doit = vdpa_nl_cmd_dev_get_doit,
 		.dumpit = vdpa_nl_cmd_dev_get_dumpit,
 	},
+	{
+		.cmd = VDPA_CMD_DEV_CONFIG_GET,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = vdpa_nl_cmd_dev_config_get_doit,
+		.dumpit = vdpa_nl_cmd_dev_config_get_dumpit,
+	},
 };
 
 static struct genl_family vdpa_nl_family __ro_after_init = {
diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 267236aab34c..5cc5e501397f 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -63,6 +63,7 @@ struct vdpa_mgmt_dev;
  * @dev: underlying device
  * @dma_dev: the actual device that is performing DMA
  * @config: the configuration ops for this device.
+ * @cf_mutex: Protects get and set access to configuration layout.
  * @index: device index
  * @features_valid: were features initialized? for legacy guests
  * @use_va: indicate whether virtual address must be used by this device
@@ -74,6 +75,7 @@ struct vdpa_device {
 	struct device dev;
 	struct device *dma_dev;
 	const struct vdpa_config_ops *config;
+	struct mutex cf_mutex; /* Protects get/set config */
 	unsigned int index;
 	bool features_valid;
 	bool use_va;
diff --git a/include/uapi/linux/vdpa.h b/include/uapi/linux/vdpa.h
index e3b87879514c..a252f06f9dfd 100644
--- a/include/uapi/linux/vdpa.h
+++ b/include/uapi/linux/vdpa.h
@@ -17,6 +17,7 @@ enum vdpa_command {
 	VDPA_CMD_DEV_NEW,
 	VDPA_CMD_DEV_DEL,
 	VDPA_CMD_DEV_GET,		/* can dump */
+	VDPA_CMD_DEV_CONFIG_GET,	/* can dump */
 };
 
 enum vdpa_attr {
@@ -34,6 +35,11 @@ enum vdpa_attr {
 	VDPA_ATTR_DEV_MAX_VQ_SIZE,		/* u16 */
 	VDPA_ATTR_DEV_MIN_VQ_SIZE,		/* u16 */
 
+	VDPA_ATTR_DEV_NET_CFG_MACADDR,		/* binary */
+	VDPA_ATTR_DEV_NET_STATUS,		/* u8 */
+	VDPA_ATTR_DEV_NET_CFG_MAX_VQP,		/* u16 */
+	VDPA_ATTR_DEV_NET_CFG_MTU,		/* u16 */
+
 	/* new attributes must be added above here */
 	VDPA_ATTR_MAX,
 };

From 960deb33be3d08e55a39e40e0286a51c7448e053 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 26 Oct 2021 20:55:14 +0300
Subject: [PATCH 40/45] vdpa: Use kernel coding style for structure comments

As subsequent patch adds new structure field with comment, move the
structure comment to follow kernel coding style.

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Eli Cohen <elic@nvidia.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://lore.kernel.org/r/20211026175519.87795-4-parav@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/vdpa.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 5cc5e501397f..fafb7202482c 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -411,10 +411,17 @@ struct vdpa_mgmtdev_ops {
 	void (*dev_del)(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev);
 };
 
+/**
+ * struct vdpa_mgmt_dev - vdpa management device
+ * @device: Management parent device
+ * @ops: operations supported by management device
+ * @id_table: Pointer to device id table of supported ids
+ * @list: list entry
+ */
 struct vdpa_mgmt_dev {
 	struct device *device;
 	const struct vdpa_mgmtdev_ops *ops;
-	const struct virtio_device_id *id_table; /* supported ids */
+	const struct virtio_device_id *id_table;
 	struct list_head list;
 };
 

From d8ca2fa5be1bdb9d08cfe1f831cddb622a01dfd4 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 26 Oct 2021 20:55:15 +0300
Subject: [PATCH 41/45] vdpa: Enable user to set mac and mtu of vdpa device

$ vdpa dev add name bar mgmtdev vdpasim_net mac 00:11:22:33:44:55 mtu 9000

$ vdpa dev config show
bar: mac 00:11:22:33:44:55 link up link_announce false mtu 9000

$ vdpa dev config show -jp
{
    "config": {
        "bar": {
            "mac": "00:11:22:33:44:55",
            "link ": "up",
            "link_announce ": false,
            "mtu": 9000,
        }
    }
}

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Eli Cohen <elic@nvidia.com>
Acked-by: Jason Wang <jasowang@redhat.com>

Link: https://lore.kernel.org/r/20211026175519.87795-5-parav@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
---
 drivers/vdpa/ifcvf/ifcvf_main.c      |  3 ++-
 drivers/vdpa/mlx5/net/mlx5_vnet.c    |  3 ++-
 drivers/vdpa/vdpa.c                  | 38 ++++++++++++++++++++++++++--
 drivers/vdpa/vdpa_sim/vdpa_sim_blk.c |  3 ++-
 drivers/vdpa/vdpa_sim/vdpa_sim_net.c |  3 ++-
 drivers/vdpa/vdpa_user/vduse_dev.c   |  3 ++-
 include/linux/vdpa.h                 | 17 ++++++++++++-
 7 files changed, 62 insertions(+), 8 deletions(-)

diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
index dcd648e1f7e7..6dc75ca70b37 100644
--- a/drivers/vdpa/ifcvf/ifcvf_main.c
+++ b/drivers/vdpa/ifcvf/ifcvf_main.c
@@ -499,7 +499,8 @@ static u32 get_dev_type(struct pci_dev *pdev)
 	return dev_type;
 }
 
-static int ifcvf_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name)
+static int ifcvf_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
+			      const struct vdpa_dev_set_config *config)
 {
 	struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev;
 	struct ifcvf_adapter *adapter;
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index b5bd1a553256..6bbdc0ece707 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -2482,7 +2482,8 @@ static int event_handler(struct notifier_block *nb, unsigned long event, void *p
 	return ret;
 }
 
-static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name)
+static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
+			     const struct vdpa_dev_set_config *add_config)
 {
 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
 	struct virtio_net_config *config;
diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index 8fcbdda8590c..7332a74a4b00 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -14,7 +14,6 @@
 #include <uapi/linux/vdpa.h>
 #include <net/genetlink.h>
 #include <linux/mod_devicetable.h>
-#include <linux/virtio_net.h>
 #include <linux/virtio_ids.h>
 
 static LIST_HEAD(mdev_head);
@@ -480,9 +479,15 @@ out:
 	return msg->len;
 }
 
+#define VDPA_DEV_NET_ATTRS_MASK ((1 << VDPA_ATTR_DEV_NET_CFG_MACADDR) | \
+				 (1 << VDPA_ATTR_DEV_NET_CFG_MTU))
+
 static int vdpa_nl_cmd_dev_add_set_doit(struct sk_buff *skb, struct genl_info *info)
 {
+	struct vdpa_dev_set_config config = {};
+	struct nlattr **nl_attrs = info->attrs;
 	struct vdpa_mgmt_dev *mdev;
+	const u8 *macaddr;
 	const char *name;
 	int err = 0;
 
@@ -491,6 +496,26 @@ static int vdpa_nl_cmd_dev_add_set_doit(struct sk_buff *skb, struct genl_info *i
 
 	name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]);
 
+	if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR]) {
+		macaddr = nla_data(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR]);
+		memcpy(config.net.mac, macaddr, sizeof(config.net.mac));
+		config.mask |= (1 << VDPA_ATTR_DEV_NET_CFG_MACADDR);
+	}
+	if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MTU]) {
+		config.net.mtu =
+			nla_get_u16(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MTU]);
+		config.mask |= (1 << VDPA_ATTR_DEV_NET_CFG_MTU);
+	}
+
+	/* Skip checking capability if user didn't prefer to configure any
+	 * device networking attributes. It is likely that user might have used
+	 * a device specific method to configure such attributes or using device
+	 * default attributes.
+	 */
+	if ((config.mask & VDPA_DEV_NET_ATTRS_MASK) &&
+	    !netlink_capable(skb, CAP_NET_ADMIN))
+		return -EPERM;
+
 	mutex_lock(&vdpa_dev_mutex);
 	mdev = vdpa_mgmtdev_get_from_attr(info->attrs);
 	if (IS_ERR(mdev)) {
@@ -498,8 +523,14 @@ static int vdpa_nl_cmd_dev_add_set_doit(struct sk_buff *skb, struct genl_info *i
 		err = PTR_ERR(mdev);
 		goto err;
 	}
+	if ((config.mask & mdev->config_attr_mask) != config.mask) {
+		NL_SET_ERR_MSG_MOD(info->extack,
+				   "All provided attributes are not supported");
+		err = -EOPNOTSUPP;
+		goto err;
+	}
 
-	err = mdev->ops->dev_add(mdev, name);
+	err = mdev->ops->dev_add(mdev, name, &config);
 err:
 	mutex_unlock(&vdpa_dev_mutex);
 	return err;
@@ -835,6 +866,9 @@ static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = {
 	[VDPA_ATTR_MGMTDEV_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[VDPA_ATTR_MGMTDEV_DEV_NAME] = { .type = NLA_STRING },
 	[VDPA_ATTR_DEV_NAME] = { .type = NLA_STRING },
+	[VDPA_ATTR_DEV_NET_CFG_MACADDR] = NLA_POLICY_ETH_ADDR,
+	/* virtio spec 1.1 section 5.1.4.1 for valid MTU range */
+	[VDPA_ATTR_DEV_NET_CFG_MTU] = NLA_POLICY_MIN(NLA_U16, 68),
 };
 
 static const struct genl_ops vdpa_nl_ops[] = {
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
index a790903f243e..42d401d43911 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
@@ -248,7 +248,8 @@ static struct device vdpasim_blk_mgmtdev = {
 	.release = vdpasim_blk_mgmtdev_release,
 };
 
-static int vdpasim_blk_dev_add(struct vdpa_mgmt_dev *mdev, const char *name)
+static int vdpasim_blk_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
+			       const struct vdpa_dev_set_config *config)
 {
 	struct vdpasim_dev_attr dev_attr = {};
 	struct vdpasim *simdev;
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c
index a1ab6163f7d1..d681e423e64f 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c
@@ -126,7 +126,8 @@ static struct device vdpasim_net_mgmtdev = {
 	.release = vdpasim_net_mgmtdev_release,
 };
 
-static int vdpasim_net_dev_add(struct vdpa_mgmt_dev *mdev, const char *name)
+static int vdpasim_net_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
+			       const struct vdpa_dev_set_config *config)
 {
 	struct vdpasim_dev_attr dev_attr = {};
 	struct vdpasim *simdev;
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index 841667a896dd..c9204c62f339 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -1503,7 +1503,8 @@ static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name)
 	return 0;
 }
 
-static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name)
+static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
+			const struct vdpa_dev_set_config *config)
 {
 	struct vduse_dev *dev;
 	int ret;
diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index fafb7202482c..c3011ccda430 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -6,6 +6,8 @@
 #include <linux/device.h>
 #include <linux/interrupt.h>
 #include <linux/vhost_iotlb.h>
+#include <linux/virtio_net.h>
+#include <linux/if_ether.h>
 
 /**
  * struct vdpa_calllback - vDPA callback definition.
@@ -93,6 +95,14 @@ struct vdpa_iova_range {
 	u64 last;
 };
 
+struct vdpa_dev_set_config {
+	struct {
+		u8 mac[ETH_ALEN];
+		u16 mtu;
+	} net;
+	u64 mask;
+};
+
 /**
  * Corresponding file area for device memory mapping
  * @file: vma->vm_file for the mapping
@@ -397,6 +407,7 @@ void vdpa_set_config(struct vdpa_device *dev, unsigned int offset,
  * @dev_add: Add a vdpa device using alloc and register
  *	     @mdev: parent device to use for device addition
  *	     @name: name of the new vdpa device
+ *	     @config: config attributes to apply to the device under creation
  *	     Driver need to add a new device using _vdpa_register_device()
  *	     after fully initializing the vdpa device. Driver must return 0
  *	     on success or appropriate error code.
@@ -407,7 +418,8 @@ void vdpa_set_config(struct vdpa_device *dev, unsigned int offset,
  *	     _vdpa_unregister_device().
  */
 struct vdpa_mgmtdev_ops {
-	int (*dev_add)(struct vdpa_mgmt_dev *mdev, const char *name);
+	int (*dev_add)(struct vdpa_mgmt_dev *mdev, const char *name,
+		       const struct vdpa_dev_set_config *config);
 	void (*dev_del)(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev);
 };
 
@@ -416,12 +428,15 @@ struct vdpa_mgmtdev_ops {
  * @device: Management parent device
  * @ops: operations supported by management device
  * @id_table: Pointer to device id table of supported ids
+ * @config_attr_mask: bit mask of attributes of type enum vdpa_attr that
+ *		      management device support during dev_add callback
  * @list: list entry
  */
 struct vdpa_mgmt_dev {
 	struct device *device;
 	const struct vdpa_mgmtdev_ops *ops;
 	const struct virtio_device_id *id_table;
+	u64 config_attr_mask;
 	struct list_head list;
 };
 

From 1138b9818efa8a3a9670680a1d75134a4f2758ce Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 26 Oct 2021 20:55:16 +0300
Subject: [PATCH 42/45] vdpa_sim_net: Enable user to set mac address and mtu

Enable user to set the mac address and mtu so that each vdpa device
can have its own user specified mac address and mtu.

Now that user is enabled to set the mac address, remove the module
parameter for same.

And example of setting mac addr and mtu and view the configuration:
$ vdpa mgmtdev show
vdpasim_net:
  supported_classes net

$ vdpa dev add name bar mgmtdev vdpasim_net mac 00:11:22:33:44:55 mtu 9000

$ vdpa dev config show
bar: mac 00:11:22:33:44:55 link up link_announce false mtu 9000

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Eli Cohen <elic@nvidia.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://lore.kernel.org/r/20211026175519.87795-6-parav@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 35 +++++++++++++++-------------
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c
index d681e423e64f..76dd24abc791 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c
@@ -16,6 +16,7 @@
 #include <linux/vringh.h>
 #include <linux/vdpa.h>
 #include <uapi/linux/virtio_net.h>
+#include <uapi/linux/vdpa.h>
 
 #include "vdpa_sim.h"
 
@@ -29,12 +30,6 @@
 
 #define VDPASIM_NET_VQ_NUM	2
 
-static char *macaddr;
-module_param(macaddr, charp, 0);
-MODULE_PARM_DESC(macaddr, "Ethernet MAC address");
-
-static u8 macaddr_buf[ETH_ALEN];
-
 static void vdpasim_net_work(struct work_struct *work)
 {
 	struct vdpasim *vdpasim = container_of(work, struct vdpasim, work);
@@ -112,9 +107,21 @@ static void vdpasim_net_get_config(struct vdpasim *vdpasim, void *config)
 {
 	struct virtio_net_config *net_config = config;
 
-	net_config->mtu = cpu_to_vdpasim16(vdpasim, 1500);
 	net_config->status = cpu_to_vdpasim16(vdpasim, VIRTIO_NET_S_LINK_UP);
-	memcpy(net_config->mac, macaddr_buf, ETH_ALEN);
+}
+
+static void vdpasim_net_setup_config(struct vdpasim *vdpasim,
+				     const struct vdpa_dev_set_config *config)
+{
+	struct virtio_net_config *vio_config = vdpasim->config;
+
+	if (config->mask & (1 << VDPA_ATTR_DEV_NET_CFG_MACADDR))
+		memcpy(vio_config->mac, config->net.mac, ETH_ALEN);
+	if (config->mask & (1 << VDPA_ATTR_DEV_NET_CFG_MTU))
+		vio_config->mtu = cpu_to_vdpasim16(vdpasim, config->net.mtu);
+	else
+		/* Setup default MTU to be 1500 */
+		vio_config->mtu = cpu_to_vdpasim16(vdpasim, 1500);
 }
 
 static void vdpasim_net_mgmtdev_release(struct device *dev)
@@ -147,6 +154,8 @@ static int vdpasim_net_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
 	if (IS_ERR(simdev))
 		return PTR_ERR(simdev);
 
+	vdpasim_net_setup_config(simdev, config);
+
 	ret = _vdpa_register_device(&simdev->vdpa, VDPASIM_NET_VQ_NUM);
 	if (ret)
 		goto reg_err;
@@ -180,20 +189,14 @@ static struct vdpa_mgmt_dev mgmt_dev = {
 	.device = &vdpasim_net_mgmtdev,
 	.id_table = id_table,
 	.ops = &vdpasim_net_mgmtdev_ops,
+	.config_attr_mask = (1 << VDPA_ATTR_DEV_NET_CFG_MACADDR |
+			     1 << VDPA_ATTR_DEV_NET_CFG_MTU),
 };
 
 static int __init vdpasim_net_init(void)
 {
 	int ret;
 
-	if (macaddr) {
-		mac_pton(macaddr, macaddr_buf);
-		if (!is_valid_ether_addr(macaddr_buf))
-			return -EADDRNOTAVAIL;
-	} else {
-		eth_random_addr(macaddr_buf);
-	}
-
 	ret = device_register(&vdpasim_net_mgmtdev);
 	if (ret)
 		return ret;

From ef76eb83a17e803a66b64ac95b36ae48b3d17c22 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Tue, 26 Oct 2021 20:55:17 +0300
Subject: [PATCH 43/45] vdpa/mlx5: Fix clearing of VIRTIO_NET_F_MAC feature bit

Cited patch in the fixes tag clears the features bit during reset.
mlx5 vdpa device feature bits are static decided by device capabilities.
These feature bits (including VIRTIO_NET_F_MAC) are initialized during
device addition time.

Clearing features bit in reset callback cleared the VIRTIO_NET_F_MAC. Due
to this, MAC address provided by the device is not honored.

Fix it by not clearing the static feature bits during reset.

Fixes: 0686082dbf7a ("vdpa: Add reset callback in vdpa_config_ops")
Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Eli Cohen <elic@nvidia.com>
Link: https://lore.kernel.org/r/20211026175519.87795-7-parav@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 6bbdc0ece707..8d1539728a59 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -2194,7 +2194,6 @@ static int mlx5_vdpa_reset(struct vdpa_device *vdev)
 	clear_vqs_ready(ndev);
 	mlx5_vdpa_destroy_mr(&ndev->mvdev);
 	ndev->mvdev.status = 0;
-	ndev->mvdev.mlx_features = 0;
 	memset(ndev->event_cbs, 0, sizeof(ndev->event_cbs));
 	ndev->mvdev.actual_features = 0;
 	++mvdev->generation;

From a007d940040c0b3d1fcb5f39159c2b141b85eae0 Mon Sep 17 00:00:00 2001
From: Eli Cohen <elic@nvidia.com>
Date: Tue, 26 Oct 2021 20:55:18 +0300
Subject: [PATCH 44/45] vdpa/mlx5: Support configuration of MAC

Add code to accept MAC configuration through vdpa tool. The MAC is
written into the config struct and later can be retrieved through
get_config().

Examples:
1. Configure MAC while adding a device:
$ vdpa dev add mgmtdev pci/0000:06:00.2 name vdpa0 mac 00:11:22:33:44:55

2. Show configured params:
$ vdpa dev config show
vdpa0: mac 00:11:22:33:44:55 link down link_announce false max_vq_pairs 8 mtu 1500

Signed-off-by: Eli Cohen <elic@nvidia.com>
Reviewed-by: Parav Pandit <parav@nvidia.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20211026175519.87795-8-parav@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 8d1539728a59..cc027499c4d3 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -6,6 +6,7 @@
 #include <linux/vringh.h>
 #include <uapi/linux/virtio_net.h>
 #include <uapi/linux/virtio_ids.h>
+#include <uapi/linux/vdpa.h>
 #include <linux/virtio_config.h>
 #include <linux/auxiliary_bus.h>
 #include <linux/mlx5/cq.h>
@@ -2524,17 +2525,20 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
 		goto err_mtu;
 
 	ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, mtu);
-	ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
-
-	err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, config->mac);
-	if (err)
-		goto err_mtu;
 
 	if (get_link_state(mvdev))
 		ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
 	else
 		ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP);
 
+	if (add_config->mask & (1 << VDPA_ATTR_DEV_NET_CFG_MACADDR)) {
+		memcpy(ndev->config.mac, add_config->net.mac, ETH_ALEN);
+	} else {
+		err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, config->mac);
+		if (err)
+			goto err_mtu;
+	}
+
 	if (!is_zero_ether_addr(config->mac)) {
 		pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev));
 		err = mlx5_mpfs_add_mac(pfmdev, config->mac);
@@ -2632,6 +2636,7 @@ static int mlx5v_probe(struct auxiliary_device *adev,
 	mgtdev->mgtdev.ops = &mdev_ops;
 	mgtdev->mgtdev.device = mdev->device;
 	mgtdev->mgtdev.id_table = id_table;
+	mgtdev->mgtdev.config_attr_mask = (1 << VDPA_ATTR_DEV_NET_CFG_MACADDR);
 	mgtdev->madev = madev;
 
 	err = vdpa_mgmtdev_register(&mgtdev->mgtdev);

From 540061ac79f0302ae91e44e6cd216cbaa3af1757 Mon Sep 17 00:00:00 2001
From: Eli Cohen <elic@nvidia.com>
Date: Tue, 26 Oct 2021 20:55:19 +0300
Subject: [PATCH 45/45] vdpa/mlx5: Forward only packets with allowed MAC
 address

Add rules to forward packets to the net device's TIR only if the
destination MAC is equal to the configured MAC. This is required to
prevent the netdevice from receiving traffic not destined to its
configured MAC.

Signed-off-by: Eli Cohen <elic@nvidia.com>
Reviewed-by: Parav Pandit <parav@nvidia.com>
Link: https://lore.kernel.org/r/20211026175519.87795-9-parav@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 76 +++++++++++++++++++++++--------
 1 file changed, 58 insertions(+), 18 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index cc027499c4d3..9b7d8c721354 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -158,7 +158,8 @@ struct mlx5_vdpa_net {
 	struct mutex reslock;
 	struct mlx5_flow_table *rxft;
 	struct mlx5_fc *rx_counter;
-	struct mlx5_flow_handle *rx_rule;
+	struct mlx5_flow_handle *rx_rule_ucast;
+	struct mlx5_flow_handle *rx_rule_mcast;
 	bool setup;
 	u32 cur_num_vqs;
 	struct notifier_block nb;
@@ -1383,21 +1384,33 @@ static int add_fwd_to_tir(struct mlx5_vdpa_net *ndev)
 	struct mlx5_flow_table_attr ft_attr = {};
 	struct mlx5_flow_act flow_act = {};
 	struct mlx5_flow_namespace *ns;
+	struct mlx5_flow_spec *spec;
+	void *headers_c;
+	void *headers_v;
+	u8 *dmac_c;
+	u8 *dmac_v;
 	int err;
 
-	/* for now, one entry, match all, forward to tir */
-	ft_attr.max_fte = 1;
-	ft_attr.autogroup.max_num_groups = 1;
+	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
+	if (!spec)
+		return -ENOMEM;
+
+	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+	ft_attr.max_fte = 2;
+	ft_attr.autogroup.max_num_groups = 2;
 
 	ns = mlx5_get_flow_namespace(ndev->mvdev.mdev, MLX5_FLOW_NAMESPACE_BYPASS);
 	if (!ns) {
-		mlx5_vdpa_warn(&ndev->mvdev, "get flow namespace\n");
-		return -EOPNOTSUPP;
+		mlx5_vdpa_warn(&ndev->mvdev, "failed to get flow namespace\n");
+		err = -EOPNOTSUPP;
+		goto err_ns;
 	}
 
 	ndev->rxft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
-	if (IS_ERR(ndev->rxft))
-		return PTR_ERR(ndev->rxft);
+	if (IS_ERR(ndev->rxft)) {
+		err = PTR_ERR(ndev->rxft);
+		goto err_ns;
+	}
 
 	ndev->rx_counter = mlx5_fc_create(ndev->mvdev.mdev, false);
 	if (IS_ERR(ndev->rx_counter)) {
@@ -1405,37 +1418,64 @@ static int add_fwd_to_tir(struct mlx5_vdpa_net *ndev)
 		goto err_fc;
 	}
 
+	headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, outer_headers);
+	dmac_c = MLX5_ADDR_OF(fte_match_param, headers_c, outer_headers.dmac_47_16);
+	memset(dmac_c, 0xff, ETH_ALEN);
+	headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers);
+	dmac_v = MLX5_ADDR_OF(fte_match_param, headers_v, outer_headers.dmac_47_16);
+	ether_addr_copy(dmac_v, ndev->config.mac);
+
 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_COUNT;
 	dest[0].type = MLX5_FLOW_DESTINATION_TYPE_TIR;
 	dest[0].tir_num = ndev->res.tirn;
 	dest[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
 	dest[1].counter_id = mlx5_fc_id(ndev->rx_counter);
-	ndev->rx_rule = mlx5_add_flow_rules(ndev->rxft, NULL, &flow_act, dest, 2);
-	if (IS_ERR(ndev->rx_rule)) {
-		err = PTR_ERR(ndev->rx_rule);
-		ndev->rx_rule = NULL;
-		goto err_rule;
+	ndev->rx_rule_ucast = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dest, 2);
+
+	if (IS_ERR(ndev->rx_rule_ucast)) {
+		err = PTR_ERR(ndev->rx_rule_ucast);
+		ndev->rx_rule_ucast = NULL;
+		goto err_rule_ucast;
 	}
 
+	memset(dmac_c, 0, ETH_ALEN);
+	memset(dmac_v, 0, ETH_ALEN);
+	dmac_c[0] = 1;
+	dmac_v[0] = 1;
+	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+	ndev->rx_rule_mcast = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dest, 1);
+	if (IS_ERR(ndev->rx_rule_mcast)) {
+		err = PTR_ERR(ndev->rx_rule_mcast);
+		ndev->rx_rule_mcast = NULL;
+		goto err_rule_mcast;
+	}
+
+	kvfree(spec);
 	return 0;
 
-err_rule:
+err_rule_mcast:
+	mlx5_del_flow_rules(ndev->rx_rule_ucast);
+	ndev->rx_rule_ucast = NULL;
+err_rule_ucast:
 	mlx5_fc_destroy(ndev->mvdev.mdev, ndev->rx_counter);
 err_fc:
 	mlx5_destroy_flow_table(ndev->rxft);
+err_ns:
+	kvfree(spec);
 	return err;
 }
 
 static void remove_fwd_to_tir(struct mlx5_vdpa_net *ndev)
 {
-	if (!ndev->rx_rule)
+	if (!ndev->rx_rule_ucast)
 		return;
 
-	mlx5_del_flow_rules(ndev->rx_rule);
+	mlx5_del_flow_rules(ndev->rx_rule_mcast);
+	ndev->rx_rule_mcast = NULL;
+	mlx5_del_flow_rules(ndev->rx_rule_ucast);
+	ndev->rx_rule_ucast = NULL;
 	mlx5_fc_destroy(ndev->mvdev.mdev, ndev->rx_counter);
 	mlx5_destroy_flow_table(ndev->rxft);
-
-	ndev->rx_rule = NULL;
 }
 
 static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd)