mirror of
https://mirrors.bfsu.edu.cn/git/linux.git
synced 2024-12-02 16:44:10 +08:00
7fa005caa3
Now that vfio_pci has been split into two source modules, one focusing on the "struct pci_driver" (vfio_pci.c) and a toolbox library of code (vfio_pci_core.c), complete the split and move them into two different kernel modules. As before vfio_pci.ko continues to present the same interface under sysfs and this change will have no functional impact. Splitting into another module and adding exports allows creating new HW specific VFIO PCI drivers that can implement device specific functionality, such as VFIO migration interfaces or specialized device requirements. Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Link: https://lore.kernel.org/r/20210826103912.128972-14-yishaih@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
501 lines
11 KiB
C
501 lines
11 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* VFIO PCI I/O Port & MMIO access
|
|
*
|
|
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
|
|
* Author: Alex Williamson <alex.williamson@redhat.com>
|
|
*
|
|
* Derived from original vfio:
|
|
* Copyright 2010 Cisco Systems, Inc. All rights reserved.
|
|
* Author: Tom Lyon, pugs@cisco.com
|
|
*/
|
|
|
|
#include <linux/fs.h>
|
|
#include <linux/pci.h>
|
|
#include <linux/uaccess.h>
|
|
#include <linux/io.h>
|
|
#include <linux/vfio.h>
|
|
#include <linux/vgaarb.h>
|
|
|
|
#include <linux/vfio_pci_core.h>
|
|
|
|
/*
 * Width-specific accessor aliases used by the generated read/write helpers.
 * Little-endian builds map onto the plain io{read,write}N accessors; all
 * other builds map the multi-byte widths onto the big-endian "be" variants.
 * Single-byte accesses have no byte order and always use the plain accessors.
 */
#define vfio_ioread8	ioread8
#define vfio_iowrite8	iowrite8

#ifdef __LITTLE_ENDIAN
#define vfio_ioread16	ioread16
#define vfio_iowrite16	iowrite16
#define vfio_ioread32	ioread32
#define vfio_iowrite32	iowrite32
#define vfio_ioread64	ioread64
#define vfio_iowrite64	iowrite64
#else
#define vfio_ioread16	ioread16be
#define vfio_iowrite16	iowrite16be
#define vfio_ioread32	ioread32be
#define vfio_iowrite32	iowrite32be
#define vfio_ioread64	ioread64be
#define vfio_iowrite64	iowrite64be
#endif
|
|
|
|
#define VFIO_IOWRITE(size) \
|
|
static int vfio_pci_iowrite##size(struct vfio_pci_core_device *vdev, \
|
|
bool test_mem, u##size val, void __iomem *io) \
|
|
{ \
|
|
if (test_mem) { \
|
|
down_read(&vdev->memory_lock); \
|
|
if (!__vfio_pci_memory_enabled(vdev)) { \
|
|
up_read(&vdev->memory_lock); \
|
|
return -EIO; \
|
|
} \
|
|
} \
|
|
\
|
|
vfio_iowrite##size(val, io); \
|
|
\
|
|
if (test_mem) \
|
|
up_read(&vdev->memory_lock); \
|
|
\
|
|
return 0; \
|
|
}
|
|
|
|
VFIO_IOWRITE(8)
|
|
VFIO_IOWRITE(16)
|
|
VFIO_IOWRITE(32)
|
|
#ifdef iowrite64
|
|
VFIO_IOWRITE(64)
|
|
#endif
|
|
|
|
#define VFIO_IOREAD(size) \
|
|
static int vfio_pci_ioread##size(struct vfio_pci_core_device *vdev, \
|
|
bool test_mem, u##size *val, void __iomem *io) \
|
|
{ \
|
|
if (test_mem) { \
|
|
down_read(&vdev->memory_lock); \
|
|
if (!__vfio_pci_memory_enabled(vdev)) { \
|
|
up_read(&vdev->memory_lock); \
|
|
return -EIO; \
|
|
} \
|
|
} \
|
|
\
|
|
*val = vfio_ioread##size(io); \
|
|
\
|
|
if (test_mem) \
|
|
up_read(&vdev->memory_lock); \
|
|
\
|
|
return 0; \
|
|
}
|
|
|
|
VFIO_IOREAD(8)
|
|
VFIO_IOREAD(16)
|
|
VFIO_IOREAD(32)
|
|
|
|
/*
|
|
* Read or write from an __iomem region (MMIO or I/O port) with an excluded
|
|
* range which is inaccessible. The excluded range drops writes and fills
|
|
* reads with -1. This is intended for handling MSI-X vector tables and
|
|
* leftover space for ROM BARs.
|
|
*/
|
|
static ssize_t do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
|
|
void __iomem *io, char __user *buf,
|
|
loff_t off, size_t count, size_t x_start,
|
|
size_t x_end, bool iswrite)
|
|
{
|
|
ssize_t done = 0;
|
|
int ret;
|
|
|
|
while (count) {
|
|
size_t fillable, filled;
|
|
|
|
if (off < x_start)
|
|
fillable = min(count, (size_t)(x_start - off));
|
|
else if (off >= x_end)
|
|
fillable = count;
|
|
else
|
|
fillable = 0;
|
|
|
|
if (fillable >= 4 && !(off % 4)) {
|
|
u32 val;
|
|
|
|
if (iswrite) {
|
|
if (copy_from_user(&val, buf, 4))
|
|
return -EFAULT;
|
|
|
|
ret = vfio_pci_iowrite32(vdev, test_mem,
|
|
val, io + off);
|
|
if (ret)
|
|
return ret;
|
|
} else {
|
|
ret = vfio_pci_ioread32(vdev, test_mem,
|
|
&val, io + off);
|
|
if (ret)
|
|
return ret;
|
|
|
|
if (copy_to_user(buf, &val, 4))
|
|
return -EFAULT;
|
|
}
|
|
|
|
filled = 4;
|
|
} else if (fillable >= 2 && !(off % 2)) {
|
|
u16 val;
|
|
|
|
if (iswrite) {
|
|
if (copy_from_user(&val, buf, 2))
|
|
return -EFAULT;
|
|
|
|
ret = vfio_pci_iowrite16(vdev, test_mem,
|
|
val, io + off);
|
|
if (ret)
|
|
return ret;
|
|
} else {
|
|
ret = vfio_pci_ioread16(vdev, test_mem,
|
|
&val, io + off);
|
|
if (ret)
|
|
return ret;
|
|
|
|
if (copy_to_user(buf, &val, 2))
|
|
return -EFAULT;
|
|
}
|
|
|
|
filled = 2;
|
|
} else if (fillable) {
|
|
u8 val;
|
|
|
|
if (iswrite) {
|
|
if (copy_from_user(&val, buf, 1))
|
|
return -EFAULT;
|
|
|
|
ret = vfio_pci_iowrite8(vdev, test_mem,
|
|
val, io + off);
|
|
if (ret)
|
|
return ret;
|
|
} else {
|
|
ret = vfio_pci_ioread8(vdev, test_mem,
|
|
&val, io + off);
|
|
if (ret)
|
|
return ret;
|
|
|
|
if (copy_to_user(buf, &val, 1))
|
|
return -EFAULT;
|
|
}
|
|
|
|
filled = 1;
|
|
} else {
|
|
/* Fill reads with -1, drop writes */
|
|
filled = min(count, (size_t)(x_end - off));
|
|
if (!iswrite) {
|
|
u8 val = 0xFF;
|
|
size_t i;
|
|
|
|
for (i = 0; i < filled; i++)
|
|
if (copy_to_user(buf + i, &val, 1))
|
|
return -EFAULT;
|
|
}
|
|
}
|
|
|
|
count -= filled;
|
|
done += filled;
|
|
off += filled;
|
|
buf += filled;
|
|
}
|
|
|
|
return done;
|
|
}
|
|
|
|
static int vfio_pci_setup_barmap(struct vfio_pci_core_device *vdev, int bar)
|
|
{
|
|
struct pci_dev *pdev = vdev->pdev;
|
|
int ret;
|
|
void __iomem *io;
|
|
|
|
if (vdev->barmap[bar])
|
|
return 0;
|
|
|
|
ret = pci_request_selected_regions(pdev, 1 << bar, "vfio");
|
|
if (ret)
|
|
return ret;
|
|
|
|
io = pci_iomap(pdev, bar, 0);
|
|
if (!io) {
|
|
pci_release_selected_regions(pdev, 1 << bar);
|
|
return -ENOMEM;
|
|
}
|
|
|
|
vdev->barmap[bar] = io;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Read from or write to a PCI BAR (or the ROM) on behalf of userspace.
 *
 * *@ppos encodes both the region index and the offset within it; they are
 * extracted with VFIO_PCI_OFFSET_TO_INDEX() and VFIO_PCI_OFFSET_MASK.
 * @count is clamped so the access never runs past the end of the region.
 *
 * For the ROM the mapping is created for the duration of the call, and
 * everything past the size reported by pci_map_rom() is treated as an
 * excluded range.  For the MSI-X BAR the excluded range is the span
 * [msix_offset, msix_offset + msix_size).  Other BARs use the cached
 * mapping from vfio_pci_setup_barmap().
 *
 * Returns the number of bytes transferred (and advances *@ppos by that
 * amount), or a negative errno: -EINVAL for an unusable region or an
 * offset past its end, -ENOMEM if the ROM cannot be mapped, or whatever
 * vfio_pci_setup_barmap() / do_io_rw() report.
 */
ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
			size_t count, loff_t *ppos, bool iswrite)
{
	struct pci_dev *pdev = vdev->pdev;
	int bar = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
	struct resource *bar_res = &vdev->pdev->resource[bar];
	bool is_rom = (bar == PCI_ROM_RESOURCE);
	size_t x_start = 0, x_end = 0;
	resource_size_t end;
	void __iomem *io;
	ssize_t done;

	/*
	 * A region with a non-zero start uses its resource length.  A ROM
	 * flagged IORESOURCE_ROM_SHADOW is accepted without a start address
	 * and given a fixed length of 0x20000 bytes.
	 */
	if (pci_resource_start(pdev, bar))
		end = pci_resource_len(pdev, bar);
	else if (is_rom &&
		 (pdev->resource[bar].flags & IORESOURCE_ROM_SHADOW))
		end = 0x20000;
	else
		return -EINVAL;

	if (pos >= end)
		return -EINVAL;

	count = min(count, (size_t)(end - pos));

	if (is_rom) {
		/*
		 * The ROM can fill less space than the BAR, so we start the
		 * excluded range at the end of the actual ROM.  This makes
		 * filling large ROM BARs much faster.
		 */
		io = pci_map_rom(pdev, &x_start);
		if (!io)
			return -ENOMEM;

		x_end = end;
	} else {
		int ret = vfio_pci_setup_barmap(vdev, bar);

		if (ret)
			return ret;

		io = vdev->barmap[bar];
	}

	/* Keep userspace away from the MSI-X vector table */
	if (bar == vdev->msix_bar) {
		x_start = vdev->msix_offset;
		x_end = vdev->msix_offset + vdev->msix_size;
	}

	/* Only memory resources get the memory-enabled check in do_io_rw() */
	done = do_io_rw(vdev, bar_res->flags & IORESOURCE_MEM, io, buf, pos,
			count, x_start, x_end, iswrite);
	if (done >= 0)
		*ppos += done;

	/* The ROM mapping is only kept for the duration of this call */
	if (is_rom)
		pci_unmap_rom(pdev, io);

	return done;
}
|
|
|
|
/*
 * Read from or write to the legacy VGA ranges on behalf of userspace.
 *
 * The offset taken from *@ppos must fall inside one of three windows:
 * memory 0xa0000-0xbffff, or I/O ports 0x3b0-0x3bb or 0x3c0-0x3df.
 * @count is clamped to the end of the window that was hit.  The window is
 * mapped for the duration of the call and the matching VGA resource is
 * held via vga_get_interruptible() / vga_put() around the access.
 *
 * Returns the number of bytes transferred (and advances *@ppos by that
 * amount), or a negative errno: -EINVAL if the device has no VGA or the
 * offset is outside every window, -ENOMEM if the window cannot be mapped,
 * or the error from vga_get_interruptible() / do_io_rw().
 */
ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf,
			size_t count, loff_t *ppos, bool iswrite)
{
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
	void __iomem *iomem;
	unsigned int rsrc;
	bool is_ioport;
	ssize_t done;
	loff_t off;
	u32 where;
	int ret;

	if (!vdev->has_vga)
		return -EINVAL;

	if (pos > 0xbfffful)
		return -EINVAL;

	where = (u32)pos;

	if (where >= 0xa0000 && where <= 0xbffff) {
		/* Legacy VGA memory window */
		count = min(count, (size_t)(0xc0000 - pos));
		iomem = ioremap(0xa0000, 0xbffff - 0xa0000 + 1);
		off = pos - 0xa0000;
		rsrc = VGA_RSRC_LEGACY_MEM;
		is_ioport = false;
	} else if (where >= 0x3b0 && where <= 0x3bb) {
		/* First legacy VGA I/O port window */
		count = min(count, (size_t)(0x3bc - pos));
		iomem = ioport_map(0x3b0, 0x3bb - 0x3b0 + 1);
		off = pos - 0x3b0;
		rsrc = VGA_RSRC_LEGACY_IO;
		is_ioport = true;
	} else if (where >= 0x3c0 && where <= 0x3df) {
		/* Second legacy VGA I/O port window */
		count = min(count, (size_t)(0x3e0 - pos));
		iomem = ioport_map(0x3c0, 0x3df - 0x3c0 + 1);
		off = pos - 0x3c0;
		rsrc = VGA_RSRC_LEGACY_IO;
		is_ioport = true;
	} else {
		return -EINVAL;
	}

	if (!iomem)
		return -ENOMEM;

	ret = vga_get_interruptible(vdev->pdev, rsrc);
	if (ret) {
		if (is_ioport)
			ioport_unmap(iomem);
		else
			iounmap(iomem);

		return ret;
	}

	/*
	 * VGA MMIO is a legacy, non-BAR resource that hopefully allows
	 * probing, so we don't currently worry about access in relation
	 * to the memory enable bit in the command register.
	 */
	done = do_io_rw(vdev, false, iomem, buf, off, count, 0, 0, iswrite);

	vga_put(vdev->pdev, rsrc);

	if (is_ioport)
		ioport_unmap(iomem);
	else
		iounmap(iomem);

	if (done >= 0)
		*ppos += done;

	return done;
}
|
|
|
|
static void vfio_pci_ioeventfd_do_write(struct vfio_pci_ioeventfd *ioeventfd,
|
|
bool test_mem)
|
|
{
|
|
switch (ioeventfd->count) {
|
|
case 1:
|
|
vfio_pci_iowrite8(ioeventfd->vdev, test_mem,
|
|
ioeventfd->data, ioeventfd->addr);
|
|
break;
|
|
case 2:
|
|
vfio_pci_iowrite16(ioeventfd->vdev, test_mem,
|
|
ioeventfd->data, ioeventfd->addr);
|
|
break;
|
|
case 4:
|
|
vfio_pci_iowrite32(ioeventfd->vdev, test_mem,
|
|
ioeventfd->data, ioeventfd->addr);
|
|
break;
|
|
#ifdef iowrite64
|
|
case 8:
|
|
vfio_pci_iowrite64(ioeventfd->vdev, test_mem,
|
|
ioeventfd->data, ioeventfd->addr);
|
|
break;
|
|
#endif
|
|
}
|
|
}
|
|
|
|
static int vfio_pci_ioeventfd_handler(void *opaque, void *unused)
|
|
{
|
|
struct vfio_pci_ioeventfd *ioeventfd = opaque;
|
|
struct vfio_pci_core_device *vdev = ioeventfd->vdev;
|
|
|
|
if (ioeventfd->test_mem) {
|
|
if (!down_read_trylock(&vdev->memory_lock))
|
|
return 1; /* Lock contended, use thread */
|
|
if (!__vfio_pci_memory_enabled(vdev)) {
|
|
up_read(&vdev->memory_lock);
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
vfio_pci_ioeventfd_do_write(ioeventfd, false);
|
|
|
|
if (ioeventfd->test_mem)
|
|
up_read(&vdev->memory_lock);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void vfio_pci_ioeventfd_thread(void *opaque, void *unused)
|
|
{
|
|
struct vfio_pci_ioeventfd *ioeventfd = opaque;
|
|
|
|
vfio_pci_ioeventfd_do_write(ioeventfd, ioeventfd->test_mem);
|
|
}
|
|
|
|
long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset,
|
|
uint64_t data, int count, int fd)
|
|
{
|
|
struct pci_dev *pdev = vdev->pdev;
|
|
loff_t pos = offset & VFIO_PCI_OFFSET_MASK;
|
|
int ret, bar = VFIO_PCI_OFFSET_TO_INDEX(offset);
|
|
struct vfio_pci_ioeventfd *ioeventfd;
|
|
|
|
/* Only support ioeventfds into BARs */
|
|
if (bar > VFIO_PCI_BAR5_REGION_INDEX)
|
|
return -EINVAL;
|
|
|
|
if (pos + count > pci_resource_len(pdev, bar))
|
|
return -EINVAL;
|
|
|
|
/* Disallow ioeventfds working around MSI-X table writes */
|
|
if (bar == vdev->msix_bar &&
|
|
!(pos + count <= vdev->msix_offset ||
|
|
pos >= vdev->msix_offset + vdev->msix_size))
|
|
return -EINVAL;
|
|
|
|
#ifndef iowrite64
|
|
if (count == 8)
|
|
return -EINVAL;
|
|
#endif
|
|
|
|
ret = vfio_pci_setup_barmap(vdev, bar);
|
|
if (ret)
|
|
return ret;
|
|
|
|
mutex_lock(&vdev->ioeventfds_lock);
|
|
|
|
list_for_each_entry(ioeventfd, &vdev->ioeventfds_list, next) {
|
|
if (ioeventfd->pos == pos && ioeventfd->bar == bar &&
|
|
ioeventfd->data == data && ioeventfd->count == count) {
|
|
if (fd == -1) {
|
|
vfio_virqfd_disable(&ioeventfd->virqfd);
|
|
list_del(&ioeventfd->next);
|
|
vdev->ioeventfds_nr--;
|
|
kfree(ioeventfd);
|
|
ret = 0;
|
|
} else
|
|
ret = -EEXIST;
|
|
|
|
goto out_unlock;
|
|
}
|
|
}
|
|
|
|
if (fd < 0) {
|
|
ret = -ENODEV;
|
|
goto out_unlock;
|
|
}
|
|
|
|
if (vdev->ioeventfds_nr >= VFIO_PCI_IOEVENTFD_MAX) {
|
|
ret = -ENOSPC;
|
|
goto out_unlock;
|
|
}
|
|
|
|
ioeventfd = kzalloc(sizeof(*ioeventfd), GFP_KERNEL);
|
|
if (!ioeventfd) {
|
|
ret = -ENOMEM;
|
|
goto out_unlock;
|
|
}
|
|
|
|
ioeventfd->vdev = vdev;
|
|
ioeventfd->addr = vdev->barmap[bar] + pos;
|
|
ioeventfd->data = data;
|
|
ioeventfd->pos = pos;
|
|
ioeventfd->bar = bar;
|
|
ioeventfd->count = count;
|
|
ioeventfd->test_mem = vdev->pdev->resource[bar].flags & IORESOURCE_MEM;
|
|
|
|
ret = vfio_virqfd_enable(ioeventfd, vfio_pci_ioeventfd_handler,
|
|
vfio_pci_ioeventfd_thread, NULL,
|
|
&ioeventfd->virqfd, fd);
|
|
if (ret) {
|
|
kfree(ioeventfd);
|
|
goto out_unlock;
|
|
}
|
|
|
|
list_add(&ioeventfd->next, &vdev->ioeventfds_list);
|
|
vdev->ioeventfds_nr++;
|
|
|
|
out_unlock:
|
|
mutex_unlock(&vdev->ioeventfds_lock);
|
|
|
|
return ret;
|
|
}
|